tesseract  3.04.00
mftraining.cpp File Reference
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "classify.h"
#include "cluster.h"
#include "clusttool.h"
#include "commontraining.h"
#include "danerror.h"
#include "efio.h"
#include "emalloc.h"
#include "featdefs.h"
#include "fontinfo.h"
#include "genericvector.h"
#include "indexmapbidi.h"
#include "intproto.h"
#include "mastertrainer.h"
#include "mergenf.h"
#include "mf.h"
#include "ndminx.h"
#include "ocrfeatures.h"
#include "oldlist.h"
#include "protos.h"
#include "shapetable.h"
#include "tessopt.h"
#include "tprintf.h"
#include "unicity_table.h"

Go to the source code of this file.

Macros

#define _USE_MATH_DEFINES
 
#define PROGRAM_FEATURE_TYPE   "mf"
 

Functions

 DECLARE_STRING_PARAM_FLAG (test_ch)
 
int main (int argc, char **argv)
 

Variables

const int kMaxShapeLabelLength = 10
 

Macro Definition Documentation

#define _USE_MATH_DEFINES

Include Files and Type Defines

Definition at line 35 of file mftraining.cpp.

#define PROGRAM_FEATURE_TYPE   "mf"

Definition at line 75 of file mftraining.cpp.

Function Documentation

DECLARE_STRING_PARAM_FLAG ( test_ch  )
int main ( int  argc,
char **  argv 
)

Public Function Prototypes

Definition at line 212 of file mftraining.cpp.

212  {
213 /*
214 ** Parameters:
215 ** argc number of command line arguments
216 ** argv array of command line arguments
217 ** Globals: none
218 ** Operation:
219 ** This program reads in a text file consisting of feature
220 ** samples from a training page in the following format:
221 **
222 ** FontName UTF8-char-str xmin ymin xmax ymax page-number
223 ** NumberOfFeatureTypes(N)
224 ** FeatureTypeName1 NumberOfFeatures(M)
225 ** Feature1
226 ** ...
227 ** FeatureM
228 ** FeatureTypeName2 NumberOfFeatures(M)
229 ** Feature1
230 ** ...
231 ** FeatureM
232 ** ...
233 ** FeatureTypeNameN NumberOfFeatures(M)
234 ** Feature1
235 ** ...
236 ** FeatureM
237 ** FontName CharName ...
238 **
239 ** The result of this program is a binary inttemp file used by
240 ** the OCR engine.
241 ** Return: 0 on success, 1 if the training data could not be loaded
242 ** Exceptions: none
243 ** History: Fri Aug 18 08:56:17 1989, DSJ, Created.
244 ** Mon May 18 1998, Christy Russson, Revision started.
245 */
246  ParseArguments(&argc, &argv);
247 
248  ShapeTable* shape_table = NULL;
249  STRING file_prefix;
250  // Load the training data.
251  MasterTrainer* trainer = tesseract::LoadTrainingData(argc, argv,
252  false,
253  &shape_table,
254  &file_prefix);
255  if (trainer == NULL)
256  return 1; // Failed.
257 
258  // Setup an index mapping from the shapes in the shape table to the classes
259  // that will be trained. In keeping with the original design, each shape
260  // with the same list of unichars becomes a different class and the configs
261  // represent the different combinations of fonts.
262  IndexMapBiDi config_map;
263  SetupConfigMap(shape_table, &config_map);
264 
265  WriteShapeTable(file_prefix, *shape_table);
266  // If the shape_table is flat, then either we didn't run shape clustering, or
267  // it did nothing, so we just output the trainer's unicharset.
268  // Otherwise shape_set will hold a fake unicharset with an entry for each
269  // shape in the shape table, and we will output that instead.
270  UNICHARSET shape_set;
271  const UNICHARSET* unicharset = &trainer->unicharset();
272  // If we ran shapeclustering (and it worked) then at least one shape will
273  // have multiple unichars, so we have to build a fake unicharset.
274  if (shape_table->AnyMultipleUnichars()) {
275  unicharset = &shape_set;
276  // Now build a fake unicharset for the compact shape space to keep the
277  // output modules happy that we are doing things correctly.
278  int num_shapes = config_map.CompactSize();
279  for (int s = 0; s < num_shapes; ++s) {
280  char shape_label[kMaxShapeLabelLength + 1];
281  snprintf(shape_label, kMaxShapeLabelLength, "sh%04d", s);
282  shape_set.unichar_insert(shape_label);
283  }
284  }
285 
286  // Now train each config separately.
287  int num_configs = shape_table->NumShapes();
288  LIST mf_classes = NIL_LIST;
289  for (int s = 0; s < num_configs; ++s) {
290  int unichar_id, font_id;
291  if (unicharset == &shape_set) {
292  // Using fake unichar_ids from the config_map/shape_set.
293  unichar_id = config_map.SparseToCompact(s);
294  } else {
295  // Get the real unichar_id from the shape table/unicharset.
296  shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id);
297  }
298  const char* class_label = unicharset->id_to_unichar(unichar_id);
299  mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table,
300  trainer);
301  }
302  STRING inttemp_file = file_prefix;
303  inttemp_file += "inttemp";
304  STRING pffmtable_file = file_prefix;
305  pffmtable_file += "pffmtable";
306  CLASS_STRUCT* float_classes = SetUpForFloat2Int(*unicharset, mf_classes);
307  // Now write the inttemp and pffmtable.
308  trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset,
309  *shape_table, float_classes,
310  inttemp_file.string(),
311  pffmtable_file.string());
312  delete [] float_classes;
313  FreeLabeledClassList(mf_classes);
314  delete trainer;
315  delete shape_table;
316  printf("Done!\n");
317  if (!FLAGS_test_ch.empty()) {
318  // If we are displaying debug window(s), wait for the user to look at them.
319  printf("Hit return to exit...\n");
320  while (getchar() != '\n');
321  }
322  return 0;
323 } /* main */
void FreeLabeledClassList(LIST ClassList)
const UNICHARSET & unicharset() const
void WriteInttempAndPFFMTable(const UNICHARSET &unicharset, const UNICHARSET &shape_set, const ShapeTable &shape_table, CLASS_STRUCT *float_classes, const char *inttemp_file, const char *pffmtable_file)
#define NULL
Definition: host.h:144
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
int NumShapes() const
Definition: shapetable.h:278
const int kMaxShapeLabelLength
Definition: mftraining.cpp:78
void ParseArguments(int *argc, char ***argv)
int CompactSize() const
Definition: indexmapbidi.h:61
const char * string() const
Definition: strngs.cpp:193
bool AnyMultipleUnichars() const
Definition: shapetable.cpp:454
Definition: strngs.h:44
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
void WriteShapeTable(const STRING &file_prefix, const ShapeTable &shape_table)
#define NIL_LIST
Definition: oldlist.h:126
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
void GetFirstUnicharAndFont(int shape_id, int *unichar_id, int *font_id) const
Definition: shapetable.cpp:414
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612
virtual int SparseToCompact(int sparse_index) const
Definition: indexmapbidi.h:138

Variable Documentation

const int kMaxShapeLabelLength = 10

Definition at line 78 of file mftraining.cpp.