// ocrinterfaces.h -- source listing from the ocropus package
// (extracted from the Doxygen-generated source browser).
#ifndef h_ocrinterfaces__
#define h_ocrinterfaces__

// Copyright 2006 Deutsches Forschungszentrum fuer Kuenstliche Intelligenz 
// or its licensors, as applicable.
// 
// You may not use this file except under the terms of the accompanying license.
// 
// Licensed under the Apache License, Version 2.0 (the "License"); you
// may not use this file except in compliance with the License. You may
// obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
// 
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// 
// Project: iupr common header files
// File: ocrinterfaces.h
// Purpose: interfaces to OCR system components
// Responsible: tmb
// Reviewer: 
// Primary Repository: 
// Web Sites: www.iupr.org, www.dfki.de

/// \file ocrinterfaces.h
/// \brief Interfaces to OCR system components


#include <stdlib.h>
#include "narray.h"
#include "narray-util.h"
#include "smartptr.h"
#include "misc.h"
#include "coords.h"
#include "nustring.h"

namespace colib {

    /// \brief Base class for OCR interfaces.
    ///
    /// Contains some minimal information
    /// and ways of interacting with an OCR component:
    /// a description plus optional string/number properties addressed by key.
    /// The property accessors have default implementations that throw a
    /// C string (<tt>const char *</tt>), so a subclass only needs to
    /// override the ones it actually supports.
    struct IComponent {
        /// Return a description of this component (contents are up to the subclass).
        /// Pure virtual: every concrete component must implement it.
        virtual const char *description() = 0;

        // virtual methods for getting and setting parameters

        /// Set a string property or throw an exception if not implemented.
        /// \param key    name of the property
        /// \param value  new string value
        /// \throws const char* always, unless overridden by the subclass
        virtual void set(const char *key,const char *value) {
            throw "IComponent::set(char*,char*) unimplemented by subclass";
        }

        /// Set a number property or throw an exception if not implemented.
        /// \param key    name of the property
        /// \param value  new numeric value
        /// \throws const char* always, unless overridden by the subclass
        virtual void set(const char *key,double value) {
            throw "IComponent::set(char*,double) unimplemented by subclass";
        }

        /// Get a string property or throw an exception if not implemented.
        /// \param key  name of the property
        /// \throws const char* always, unless overridden by the subclass
        virtual const char *gets(const char *key) {
            throw "IComponent::gets(char*) unimplemented by subclass";
        }

        /// Get a number property or throw an exception if not implemented.
        /// \param key  name of the property
        /// \throws const char* always, unless overridden by the subclass
        virtual double getd(const char *key) {
            throw "IComponent::getd(char*) unimplemented by subclass";
        }

        /// Virtual destructor so components can be deleted through this interface.
        virtual ~IComponent() {}
    };

    /// Cleanup for gray scale document images.

    /// Should work for both gray scale and binary images.
    ///
00074     struct ICleanupGray : IComponent {
        /// Clean up a gray image.
        virtual void cleanup(bytearray &out,bytearray &in) = 0;
    };

    /// Cleanup for binary document images.

    /// Should throw an error when applied to grayscale.
    ///
00083     struct ICleanupBinary : IComponent {
        /// Clean up a binary image.
        virtual void cleanup(bytearray &out,bytearray &in) = 0;
    };

    /// Compute text/image probabilities
    
    /// The output is in the standard RGB format 
    /// for text/image segmentation (see ocropus.org)

00093     struct ITextImageClassification : IComponent {
        /// Compute text/image probabilities.
        virtual void textImageProbabilities(intarray &out,bytearray &in) = 0;
    };

    /// Perform binarization of grayscale images.

00100     struct IBinarize : IComponent {
        /// Binarize an image stored in a floatarray. Override this.
        virtual void binarize(bytearray &out,floatarray &in) = 0;
        /// \brief Binarize an image stored in a bytearray.
        /// Most likely you don't want to override this.
00105         virtual void binarize(bytearray &out,bytearray &in) {
            floatarray temp;
            copy(temp,in);
            binarize(out,temp);
        }
    };

    /// Compute page segmentation into columns, lines, etc.
    
    /// The output is in the standard RGB format
    /// for page segmentation (see ocropus.org)

00117     struct ISegmentPage : IComponent {
        /// Segment the page.
        virtual void segment(intarray &out,bytearray &in) = 0;
    };

    /// Compute line segmentation into character hypotheses.
    //
    /// The output is in the standard RGB format
    /// for page segmentation (see ocropus.org)

00127     struct ISegmentLine : IComponent {
        /// Segment a line.
        virtual void charseg(intarray &out,bytearray &in) = 0;
    };

    /// \brief A generic interface for language models.

    /// An IGenericFst is a directed graph
    /// with output/cost/id written on arcs,
    /// accept cost written on vertices and
    /// a fixed start vertice.
00138     struct IGenericFst : virtual IComponent {
        /// Clear the language model
        virtual void clear() = 0;

        /// Get a single new state
        virtual int newState() = 0;

        /// Add a transition between the given states
        virtual void addTransition(int from,int to,int output,float cost,int input) = 0;
        
        /// A variant of addTransition() with equal input and output.
00149         virtual void addTransition(int from,int to,int symbol,float cost) {
            addTransition(from, to, symbol, cost, symbol);
        }

        /// Set the start state
        virtual void setStart(int node) = 0;

        /// Set a state as an accept state
        virtual void setAccept(int node,float cost=0.0) = 0;

        /// Obtain codes for "specials" (language model dependent)
        virtual int special(const char *s) = 0;

        /// \brief Compute the best path through the language model.
        /// Useful for simple OCR tasks and for debugging.
        virtual void bestpath(nustring &result) = 0;

        /// destroy the language model
00167         virtual ~IGenericFst() {}

        /// simple interface for line recognizers
00170         virtual void setString(nustring &text,floatarray &costs,intarray &ids) {
            int n = text.length();
            intarray states;
            states.clear();
            for(int i=0;i<n+1;i++)
                states.push(newState());
            for(int i=0;i<n;i++)
                addTransition(states[i],states[i+1],text[i].ord(),costs[i],ids[i]);
            setStart(states[0]);
            setAccept(states[n]);
        }

        // reading methods

        /// Get the number of states.
00185         virtual int nStates() { throw "unimplemented"; }
        
        /// Get the starting state.
00188         virtual int getStart() { throw "unimplemented"; }
        
        /// Get the accept cost of a given vertex (a cost to finish the line and quit).
00191         virtual float getAcceptCost(int node) { throw "unimplemented"; }

        /// Return an array of arcs leading from the given node.
00194         virtual void arcs(colib::intarray &ids,
                          colib::intarray &targets,
                          colib::intarray &outputs,
                          colib::floatarray &costs, 
                          int from) { throw "unimplemented"; }
    };

    /// A generic interface for isolated character recognizers.
    /// Note that this is not the preferred interface for character recognition,
    /// since feature extraction is quite inefficient if it's done a character at a time.

00205     struct ICharacterClassifier : IComponent {
        /// \brief Classify a character without any information about position on the line.
        ///
        /// May throw an exception if it's not implemented.
        virtual void set(bytearray &input_image) = 0;

        /// \brief Classify a character with information about position on the line.
        //
        /// May throw an exception if it's not implemented.
        virtual void set(bytearray &image,int base_y, int xheight_y, int descender_y, int ascender_y) = 0;

        /// Get the number of classes returned. Corresponds to indices to cls() and cost().
        virtual int length() = 0;

        /// Unicode character or character string.
        // 
        /// Note that some classifiers may return multiple characters per class
        virtual void cls(nustring &result, int i) = 0;

        /// cost value for this classification; lower costs = better
        /// should aim to return negative log likelihoods
        virtual float cost(int i) = 0;

      /// "adaptation" means temporary adaptation of the classifier
      /// to all the characters between startTraining and finishTraining
      /// other types of training are recognizer-dependent
00231       virtual void startTraining(const char *type="adaptation") { throw "unimplemented"; }

        /// \brief Train a character.
        //
        /// (Commonly, this only stores data in the model; training is via an external program).
        /// This may be also train on ligatures (if supported),
        /// that's why `characters' is a nustring.
00238         virtual void addTrainingChar(bytearray &input_image,nustring &characters) 
          { throw "unimplemented"; }

        /// Train a character.
00242         virtual void addTrainingChar(bytearray &image,int base_y, int xheight_y, int descender_y,
            int ascender_y,nustring &characters) { throw "unimplemented"; }

        /// Train a character in context (think about this some more).
00246         virtual void addTrainingChar(bytearray &image,bytearray &mask,nustring &characters)
            { throw "unimplemented"; }

        /// Finish training and switch back to recognition; this method may
        /// take a long time to complete.
00251       virtual void finishTraining() { throw "unimplemented"; }

        /// Save a trained model to the stream.
00254       virtual void save(FILE *stream) { throw "unimplemented"; }
      void save(const char *path) { save(stdio(path, "wb")); }

        /// Load a trained model from the stream.
00258       virtual void load(FILE *stream) { throw "unimplemented"; }
      void load(const char *path) { load(stdio(path, "rb")); }

        /// \brief Convenience function for getting the best output 
        //
        /// (useful for debugging)
00264         virtual void best(nustring &result) {
            int mi = -1;
            float mc = 1e30;
            for(int i=0;i<length();i++) {
                if(cost(i)<mc) {
                    mi = i;
                    mc = cost(i);
                }
            }
            if(mi>=0)
                cls(result, mi);
            else
                result.clear();
        }
        
        /// destructor
00280         virtual ~ICharacterClassifier() {}
    };


    /// A generic interface for text line recognition.

00286     struct IRecognizeLine : IComponent {
        /// \brief Recognize a text line and return a lattice representing
        /// the recognition alternatives.
        virtual void recognizeLine(IGenericFst &result,bytearray &image) = 0;

        /// \brief Start training of the given type.

        /// "adaptation" means temporary adaptation of the classifier
      /// to all the lines between startTraining and finishTraining
      /// other types of training are recognizer-dependent
00296       virtual void startTraining(const char *type="adaptation") { throw "unimplemented"; }

        /// \brief Train on a text line.
      
        /// Usage is: call addTrainingLine with training data, then call finishTraining 
      /// The state of the object is undefined between calling addTrainingLine and finishTraining, and it is
      /// an error to call recognizeLine before finishTraining completes.  This allows both batch
      /// and incemental training.
      /// NB: you might train on length 1 strings for single character training
      /// and might train on words if line alignment is not working
      /// (well, for some training data)
00307         virtual void addTrainingLine(bytearray &image,nustring &transcription) { throw "unimplemented"; }


        /// \brief Train on a text line, given a segmentation.
        /// This is analogous to addTrainingLine(bytearray,nustring) except that
        /// it takes the "ground truth" line segmentation.
00313         virtual void addTrainingLine(intarray &segmentation, bytearray &image_grayscale, nustring &transcription) { throw "unimplemented"; }


      /// Align a lattice with a transcription.
        /// \param[out] chars Non-space characters along the best path.
        /// \param[out] result Aligned segmentation, colors correspond to chars
        /// \param[out] costs Costs corresponding to chars
        /// \param[in] image Input grayscale image
        /// \param[in] transcription The "ground truth" lattice to align
00322       virtual void align(nustring &chars,intarray &result,floatarray &costs,bytearray &image,IGenericFst &transcription) { throw "unimplemented"; }

      // eventually?
        // virtual void addTrainingLine(bytearray &image,IGenericFst &transcription) { throw "unimplemented"; }

        /// \brief Finish training, possibly making complex calculations.
        
        /// Call this when training is done and the system should switch back to recognition;
        /// this method may take a long time to complete.
00331       virtual void finishTraining() { throw "unimplemented"; }

        /// Save a trained model to the stream.
00334       virtual void save(FILE *stream) { throw "unimplemented"; }

        /// Load a trained model from the stream.
00337       virtual void load(FILE *stream) { throw "unimplemented"; }

        /// Destructor
00340         virtual ~IRecognizeLine() {}

      /// this is a weird, optional method that exposes character segmentation for those line recognizers that have it
      /// segmentation contains colored pixels, and a transition in
      /// the transducer of the form * --- 1/eps --> * --- 2/a --> *
      /// means that pixels with color 1 and 2 together form the
      /// letter "a"
00347         virtual void recognizeLine(intarray &segmentation,IGenericFst &result,bytearray &image) { throw "unimplemented"; }
    };
}

#endif

// Listing generated by Doxygen 1.6.0