tesseract  3.04.00
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
cube_reco_context.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: cube_reco_context.cpp
3  * Description: Implementation of the Cube Recognition Context Class
4  * Author: Ahmad Abdulkader
5  * Created: 2007
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <string>
21 #include <limits.h>
22 
23 #include "cube_reco_context.h"
24 
25 #include "classifier_factory.h"
26 #include "cube_tuning_params.h"
27 #include "dict.h"
28 #include "feature_bmp.h"
29 #include "tessdatamanager.h"
30 #include "tesseractclass.h"
31 #include "tess_lang_model.h"
32 
33 namespace tesseract {
34 
35 // Instantiate a CubeRecoContext object using a Tesseract object.
36 // CubeRecoContext will not take ownership of tess_obj, but will
37 // record the pointer to it and will make use of various Tesseract
38 // components (language model, flags, etc). Thus the caller should
39 // keep tess_obj alive so long as the instantiated CubeRecoContext is used.
41  tess_obj_ = tess_obj;
42  lang_ = "";
43  loaded_ = false;
44  lang_mod_ = NULL;
45  params_ = NULL;
46  char_classifier_ = NULL;
47  char_set_ = NULL;
48  word_size_model_ = NULL;
49  char_bigrams_ = NULL;
50  word_unigrams_ = NULL;
51  noisy_input_ = false;
52  size_normalization_ = false;
53 }
54 
56  if (char_classifier_ != NULL) {
57  delete char_classifier_;
58  char_classifier_ = NULL;
59  }
60 
61  if (word_size_model_ != NULL) {
62  delete word_size_model_;
63  word_size_model_ = NULL;
64  }
65 
66  if (char_set_ != NULL) {
67  delete char_set_;
68  char_set_ = NULL;
69  }
70 
71  if (char_bigrams_ != NULL) {
72  delete char_bigrams_;
73  char_bigrams_ = NULL;
74  }
75 
76  if (word_unigrams_ != NULL) {
77  delete word_unigrams_;
78  word_unigrams_ = NULL;
79  }
80 
81  if (lang_mod_ != NULL) {
82  delete lang_mod_;
83  lang_mod_ = NULL;
84  }
85 
86  if (params_ != NULL) {
87  delete params_;
88  params_ = NULL;
89  }
90 }
91 
92 // Returns the path of the data files by looking up the TESSDATA_PREFIX
93 // environment variable and appending a "tessdata" directory to it
94 bool CubeRecoContext::GetDataFilePath(string *path) const {
95  *path = tess_obj_->datadir.string();
96  return true;
97 }
98 
99 // The object initialization function that loads all the necessary
100 // components of a RecoContext. TessdataManager is used to load the
101 // data from [lang].traineddata file. If TESSDATA_CUBE_UNICHARSET
102 // component is present, Cube will be instantiated with the unicharset
103 // specified in this component and the corresponding dictionary
104 // (TESSDATA_CUBE_SYSTEM_DAWG), and will map Cube's unicharset to
105 // Tesseract's. Otherwise, TessdataManager will assume that Cube will
106 // be using Tesseract's unicharset and dawgs, and will load the
107 // unicharset from the TESSDATA_UNICHARSET component and will load the
108 // dawgs from TESSDATA_*_DAWG components.
109 bool CubeRecoContext::Load(TessdataManager *tessdata_manager,
110  UNICHARSET *tess_unicharset) {
111  ASSERT_HOST(tess_obj_ != NULL);
112  tess_unicharset_ = tess_unicharset;
113  string data_file_path;
114 
115  // Get the data file path.
116  if (GetDataFilePath(&data_file_path) == false) {
117  fprintf(stderr, "Unable to get data file path\n");
118  return false;
119  }
120 
121  // Get the language from the Tesseract object.
122  lang_ = tess_obj_->lang.string();
123 
124  // Create the char set.
125  if ((char_set_ =
126  CharSet::Create(tessdata_manager, tess_unicharset)) == NULL) {
127  fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to load "
128  "CharSet\n");
129  return false;
130  }
131  // Create the language model.
132  string lm_file_name = data_file_path + lang_ + ".cube.lm";
133  string lm_params;
134  if (!CubeUtils::ReadFileToString(lm_file_name, &lm_params)) {
135  fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to read cube "
136  "language model params from %s\n", lm_file_name.c_str());
137  return false;
138  }
139  lang_mod_ = new TessLangModel(lm_params, data_file_path,
140  tess_obj_->getDict().load_system_dawg,
141  tessdata_manager, this);
142  if (lang_mod_ == NULL) {
143  fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to create "
144  "TessLangModel\n");
145  return false;
146  }
147 
148  // Create the optional char bigrams object.
149  char_bigrams_ = CharBigrams::Create(data_file_path, lang_);
150 
151  // Create the optional word unigrams object.
152  word_unigrams_ = WordUnigrams::Create(data_file_path, lang_);
153 
154  // Create the optional size model.
155  word_size_model_ = WordSizeModel::Create(data_file_path, lang_,
156  char_set_, Contextual());
157 
158  // Load tuning params.
159  params_ = CubeTuningParams::Create(data_file_path, lang_);
160  if (params_ == NULL) {
161  fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to read "
162  "CubeTuningParams from %s\n", data_file_path.c_str());
163  return false;
164  }
165 
166  // Create the char classifier.
167  char_classifier_ = CharClassifierFactory::Create(data_file_path, lang_,
168  lang_mod_, char_set_,
169  params_);
170  if (char_classifier_ == NULL) {
171  fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to load "
172  "CharClassifierFactory object from %s\n", data_file_path.c_str());
173  return false;
174  }
175 
176  loaded_ = true;
177 
178  return true;
179 }
180 
181 // Creates a CubeRecoContext object using a tesseract object
183  TessdataManager *tessdata_manager,
184  UNICHARSET *tess_unicharset) {
185  // create the object
186  CubeRecoContext *cntxt = new CubeRecoContext(tess_obj);
187  if (cntxt == NULL) {
188  fprintf(stderr, "Cube ERROR (CubeRecoContext::Create): unable to create "
189  "CubeRecoContext object\n");
190  return NULL;
191  }
192  // load the necessary components
193  if (cntxt->Load(tessdata_manager, tess_unicharset) == false) {
194  fprintf(stderr, "Cube ERROR (CubeRecoContext::Create): unable to init "
195  "CubeRecoContext object\n");
196  delete cntxt;
197  return NULL;
198  }
199  // success
200  return cntxt;
201 }
202 } // namespace tesseract
static bool ReadFileToString(const string &file_name, string *str)
Definition: cube_utils.cpp:177
static CharBigrams * Create(const string &data_file_path, const string &lang)
#define ASSERT_HOST(x)
Definition: errcode.h:84
CubeRecoContext(Tesseract *tess_obj)
STRING datadir
Definition: ccutil.h:67
static WordUnigrams * Create(const string &data_file_path, const string &lang)
static WordSizeModel * Create(const string &data_file_path, const string &lang, CharSet *char_set, bool contextual)
static CharClassifier * Create(const string &data_file_path, const string &lang, LangModel *lang_mod, CharSet *char_set, TuningParams *params)
Dict & getDict()
Definition: classify.h:65
bool load_system_dawg
Definition: dict.h:554
STRING lang
Definition: ccutil.h:69
static CubeRecoContext * Create(Tesseract *tess_obj, TessdataManager *tessdata_manager, UNICHARSET *tess_unicharset)
#define NULL
Definition: host.h:144
static CharSet * Create(TessdataManager *tessdata_manager, UNICHARSET *tess_unicharset)
Definition: char_set.cpp:54
const char * string() const
Definition: strngs.cpp:193
bool GetDataFilePath(string *path) const
static CubeTuningParams * Create(const string &data_file, const string &lang)