tesseract  3.04.00
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
commontraining.cpp File Reference
#include "commontraining.h"
#include "allheaders.h"
#include "ccutil.h"
#include "classify.h"
#include "cluster.h"
#include "clusttool.h"
#include "efio.h"
#include "emalloc.h"
#include "featdefs.h"
#include "fontinfo.h"
#include "freelist.h"
#include "globals.h"
#include "intfeaturespace.h"
#include "mastertrainer.h"
#include "mf.h"
#include "ndminx.h"
#include "oldlist.h"
#include "params.h"
#include "shapetable.h"
#include "tessdatamanager.h"
#include "tessopt.h"
#include "tprintf.h"
#include "unicity_table.h"
#include <math.h>

Go to the source code of this file.

Namespaces

 tesseract
 

Functions

 INT_PARAM_FLAG (debug_level, 0,"Level of Trainer debugging")
 
 INT_PARAM_FLAG (load_images, 0,"Load images with tr files")
 
 STRING_PARAM_FLAG (configfile,"","File to load more configs from")
 
 STRING_PARAM_FLAG (D,"","Directory to write output files to")
 
 STRING_PARAM_FLAG (F,"font_properties","File listing font properties")
 
 STRING_PARAM_FLAG (X,"","File listing font xheights")
 
 STRING_PARAM_FLAG (U,"unicharset","File to load unicharset from")
 
 STRING_PARAM_FLAG (O,"","File to write unicharset to")
 
 STRING_PARAM_FLAG (T,"","File to load trainer from")
 
 STRING_PARAM_FLAG (output_trainer,"","File to write trainer to")
 
 STRING_PARAM_FLAG (test_ch,"","UTF8 test character string")
 
 DOUBLE_PARAM_FLAG (clusterconfig_min_samples_fraction, Config.MinSamples,"Min number of samples per proto as % of total")
 
 DOUBLE_PARAM_FLAG (clusterconfig_max_illegal, Config.MaxIllegal,"Max percentage of samples in a cluster which have more"" than 1 feature in that cluster")
 
 DOUBLE_PARAM_FLAG (clusterconfig_independence, Config.Independence,"Desired independence between dimensions")
 
 DOUBLE_PARAM_FLAG (clusterconfig_confidence, Config.Confidence,"Desired confidence in prototypes created")
 
void ParseArguments (int *argc, char ***argv)
 
ShapeTable * tesseract::LoadShapeTable (const STRING &file_prefix)
 
void tesseract::WriteShapeTable (const STRING &file_prefix, const ShapeTable &shape_table)
 
MasterTrainer * tesseract::LoadTrainingData (int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
 
const char * GetNextFilename (int argc, const char *const *argv)
 
LABELEDLIST FindList (LIST List, char *Label)
 
LABELEDLIST NewLabeledList (const char *Label)
 
void ReadTrainingSamples (const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
 
void FreeTrainingSamples (LIST CharList)
 
void FreeLabeledList (LABELEDLIST LabeledList)
 
CLUSTERER * SetUpForClustering (const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
 
void MergeInsignificantProtos (LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
void CleanUpUnusedData (LIST ProtoList)
 
LIST RemoveInsignificantProtos (LIST ProtoList, BOOL8 KeepSigProtos, BOOL8 KeepInsigProtos, int N)
 
MERGE_CLASS FindClass (LIST List, const char *Label)
 
MERGE_CLASS NewLabeledClass (const char *Label)
 
void FreeLabeledClassList (LIST ClassList)
 
CLASS_STRUCT * SetUpForFloat2Int (const UNICHARSET &unicharset, LIST LabeledClassList)
 
void Normalize (float *Values)
 
void FreeNormProtoList (LIST CharList)
 
void AddToNormProtosList (LIST *NormProtoList, LIST ProtoList, char *CharName)
 
int NumberOfProtos (LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
 

Variables

CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 }
 
FEATURE_DEFS_STRUCT feature_defs
 
CCUtil ccutil
 

Function Documentation

void AddToNormProtosList ( LIST *  NormProtoList,
LIST  ProtoList,
char *  CharName 
)

Definition at line 870 of file commontraining.cpp.

874 {
875  PROTOTYPE* Proto;
876  LABELEDLIST LabeledProtoList;
877 
878  LabeledProtoList = NewLabeledList(CharName);
879  iterate(ProtoList)
880  {
881  Proto = (PROTOTYPE *) first_node (ProtoList);
882  LabeledProtoList->List = push(LabeledProtoList->List, Proto);
883  }
884  *NormProtoList = push(*NormProtoList, LabeledProtoList);
885 }
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159
LABELEDLIST NewLabeledList(const char *Label)
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
void CleanUpUnusedData ( LIST  ProtoList)

Definition at line 618 of file commontraining.cpp.

620 {
621  PROTOTYPE* Prototype;
622 
623  iterate(ProtoList)
624  {
625  Prototype = (PROTOTYPE *) first_node (ProtoList);
626  if(Prototype->Variance.Elliptical != NULL)
627  {
628  memfree(Prototype->Variance.Elliptical);
629  Prototype->Variance.Elliptical = NULL;
630  }
631  if(Prototype->Magnitude.Elliptical != NULL)
632  {
633  memfree(Prototype->Magnitude.Elliptical);
634  Prototype->Magnitude.Elliptical = NULL;
635  }
636  if(Prototype->Weight.Elliptical != NULL)
637  {
638  memfree(Prototype->Weight.Elliptical);
639  Prototype->Weight.Elliptical = NULL;
640  }
641  }
642 }
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159
FLOATUNION Variance
Definition: cluster.h:81
FLOATUNION Weight
Definition: cluster.h:83
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 * Elliptical
Definition: cluster.h:64
void memfree(void *element)
Definition: freelist.cpp:30
#define NULL
Definition: host.h:144
DOUBLE_PARAM_FLAG ( clusterconfig_min_samples_fraction  ,
Config.  MinSamples,
"Min number of samples per proto as % of total"   
)
DOUBLE_PARAM_FLAG ( clusterconfig_max_illegal  ,
Config.  MaxIllegal,
"Max percentage of samples in a cluster which have more"" than 1 feature in that cluster"   
)
DOUBLE_PARAM_FLAG ( clusterconfig_independence  ,
Config.  Independence,
"Desired independence between dimensions"   
)
DOUBLE_PARAM_FLAG ( clusterconfig_confidence  ,
Config.  Confidence,
"Desired confidence in prototypes created"   
)
MERGE_CLASS FindClass ( LIST  List,
const char *  Label 
)

Definition at line 713 of file commontraining.cpp.

716 {
717  MERGE_CLASS MergeClass;
718 
719  iterate (List)
720  {
721  MergeClass = (MERGE_CLASS) first_node (List);
722  if (strcmp (MergeClass->Label, Label) == 0)
723  return (MergeClass);
724  }
725  return (NULL);
726 
727 } /* FindClass */
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159
MERGE_CLASS_NODE * MERGE_CLASS
#define NULL
Definition: host.h:144
LABELEDLIST FindList ( LIST  List,
char *  Label 
)

Definition at line 320 of file commontraining.cpp.

338 {
339  LABELEDLIST LabeledList;
340 
341  iterate (List)
342  {
343  LabeledList = (LABELEDLIST) first_node (List);
344  if (strcmp (LabeledList->Label, Label) == 0)
345  return (LabeledList);
346  }
347  return (NULL);
348 
349 } /* FindList */
#define first_node(l)
Definition: oldlist.h:139
struct LABELEDLISTNODE * LABELEDLIST
#define iterate(l)
Definition: oldlist.h:159
#define NULL
Definition: host.h:144
void FreeLabeledClassList ( LIST  ClassList)

Definition at line 744 of file commontraining.cpp.

759 {
760  MERGE_CLASS MergeClass;
761 
762  iterate (ClassList) /* iterate thru all of the fonts */
763  {
764  MergeClass = (MERGE_CLASS) first_node (ClassList);
765  free (MergeClass->Label);
766  FreeClass(MergeClass->Class);
767  delete MergeClass;
768  }
769  destroy (ClassList);
770 
771 } /* FreeLabeledClassList */
#define first_node(l)
Definition: oldlist.h:139
LIST destroy(LIST list)
Definition: oldlist.cpp:187
#define iterate(l)
Definition: oldlist.h:159
void FreeClass(CLASS_TYPE Class)
Definition: protos.cpp:215
MERGE_CLASS_NODE * MERGE_CLASS
CLASS_TYPE Class
void FreeLabeledList ( LABELEDLIST  LabeledList)

Definition at line 483 of file commontraining.cpp.

483  {
484 /*
485  ** Parameters:
486  ** LabeledList labeled list to be freed
487  ** Globals: none
488  ** Operation:
489  ** This routine deallocates all of the memory consumed by
490  ** a labeled list. It does not free any memory which may be
491  ** consumed by the items in the list.
492  ** Return: none
493  ** Exceptions: none
494  ** History: Fri Aug 18 17:52:45 1989, DSJ, Created.
495  */
496  destroy(LabeledList->List);
497  free(LabeledList->Label);
498  free(LabeledList);
499 } /* FreeLabeledList */
LIST destroy(LIST list)
Definition: oldlist.cpp:187
void FreeNormProtoList ( LIST  CharList)

Definition at line 854 of file commontraining.cpp.

857 {
858  LABELEDLIST char_sample;
859 
860  iterate (CharList) /* iterate thru all of the fonts */
861  {
862  char_sample = (LABELEDLIST) first_node (CharList);
863  FreeLabeledList (char_sample);
864  }
865  destroy (CharList);
866 
867 } // FreeNormProtoList
#define first_node(l)
Definition: oldlist.h:139
LIST destroy(LIST list)
Definition: oldlist.cpp:187
struct LABELEDLISTNODE * LABELEDLIST
#define iterate(l)
Definition: oldlist.h:159
void FreeLabeledList(LABELEDLIST LabeledList)
void FreeTrainingSamples ( LIST  CharList)

Definition at line 453 of file commontraining.cpp.

453  {
454 /*
455  ** Parameters:
456  ** FontList list of all fonts in document
457  ** Globals: none
458  ** Operation:
459  ** This routine deallocates all of the space allocated to
460  ** the specified list of training samples.
461  ** Return: none
462  ** Exceptions: none
463  ** History: Fri Aug 18 17:44:27 1989, DSJ, Created.
464  */
465  LABELEDLIST char_sample;
466  FEATURE_SET FeatureSet;
467  LIST FeatureList;
468 
469 
470  iterate(CharList) { /* iterate thru all of the fonts */
471  char_sample = (LABELEDLIST) first_node(CharList);
472  FeatureList = char_sample->List;
473  iterate(FeatureList) { /* iterate thru all of the classes */
474  FeatureSet = (FEATURE_SET) first_node(FeatureList);
475  FreeFeatureSet(FeatureSet);
476  }
477  FreeLabeledList(char_sample);
478  }
479  destroy(CharList);
480 } /* FreeTrainingSamples */
#define first_node(l)
Definition: oldlist.h:139
LIST destroy(LIST list)
Definition: oldlist.cpp:187
struct LABELEDLISTNODE * LABELEDLIST
#define iterate(l)
Definition: oldlist.h:159
void FreeLabeledList(LABELEDLIST LabeledList)
FEATURE_SET_STRUCT * FEATURE_SET
Definition: ocrfeatures.h:74
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:79
const char* GetNextFilename ( int  argc,
const char *const *  argv 
)

Definition at line 297 of file commontraining.cpp.

297  {
298  /*
299  ** Parameters: none
300  ** Globals:
301  ** tessoptind defined by tessopt sys call
302  ** Operation:
303  ** This routine returns the next command line argument. If
304  ** there are no remaining command line arguments, it returns
305  ** NULL. This routine should only be called after all option
306  ** arguments have been parsed and removed with ParseArguments.
307  ** Return: Next command line argument or NULL.
308  ** Exceptions: none
309  ** History: Fri Aug 18 09:34:12 1989, DSJ, Created.
310  */
311  if (tessoptind < argc)
312  return argv[tessoptind++];
313  else
314  return NULL;
315 } /* GetNextFilename */
#define NULL
Definition: host.h:144
int tessoptind
Definition: tessopt.cpp:24
INT_PARAM_FLAG ( debug_level  ,
0  ,
"Level of Trainer debugging"   
)
INT_PARAM_FLAG ( load_images  ,
0  ,
"Load images with tr files"   
)
void MergeInsignificantProtos ( LIST  ProtoList,
const char *  label,
CLUSTERER *  Clusterer,
CLUSTERCONFIG *  Config 
)

Definition at line 553 of file commontraining.cpp.

554  {
555  PROTOTYPE *Prototype;
556  bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
557 
558  LIST pProtoList = ProtoList;
559  iterate(pProtoList) {
560  Prototype = (PROTOTYPE *) first_node (pProtoList);
561  if (Prototype->Significant || Prototype->Merged)
562  continue;
563  FLOAT32 best_dist = 0.125;
564  PROTOTYPE* best_match = NULL;
565  // Find the nearest alive prototype.
566  LIST list_it = ProtoList;
567  iterate(list_it) {
568  PROTOTYPE* test_p = (PROTOTYPE *) first_node (list_it);
569  if (test_p != Prototype && !test_p->Merged) {
570  FLOAT32 dist = ComputeDistance(Clusterer->SampleSize,
571  Clusterer->ParamDesc,
572  Prototype->Mean, test_p->Mean);
573  if (dist < best_dist) {
574  best_match = test_p;
575  best_dist = dist;
576  }
577  }
578  }
579  if (best_match != NULL && !best_match->Significant) {
580  if (debug)
581  tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
582  best_match->NumSamples, Prototype->NumSamples,
583  best_match->Mean[0], best_match->Mean[1],
584  Prototype->Mean[0], Prototype->Mean[1]);
585  best_match->NumSamples = MergeClusters(Clusterer->SampleSize,
586  Clusterer->ParamDesc,
587  best_match->NumSamples,
588  Prototype->NumSamples,
589  best_match->Mean,
590  best_match->Mean, Prototype->Mean);
591  Prototype->NumSamples = 0;
592  Prototype->Merged = 1;
593  } else if (best_match != NULL) {
594  if (debug)
595  tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
596  Prototype->Mean[0], Prototype->Mean[1],
597  best_match->Mean[0], best_match->Mean[1]);
598  Prototype->Merged = 1;
599  }
600  }
601  // Mark significant those that now have enough samples.
602  int min_samples = (inT32) (Config->MinSamples * Clusterer->NumChar);
603  pProtoList = ProtoList;
604  iterate(pProtoList) {
605  Prototype = (PROTOTYPE *) first_node (pProtoList);
606  // Process insignificant protos that do not match a green one
607  if (!Prototype->Significant && Prototype->NumSamples >= min_samples &&
608  !Prototype->Merged) {
609  if (debug)
610  tprintf("Red proto at %g,%g becoming green\n",
611  Prototype->Mean[0], Prototype->Mean[1]);
612  Prototype->Significant = true;
613  }
614  }
615 } /* MergeInsignificantProtos */
#define first_node(l)
Definition: oldlist.h:139
float FLOAT32
Definition: host.h:111
#define tprintf(...)
Definition: tprintf.h:31
#define iterate(l)
Definition: oldlist.h:159
FLOAT32 * Mean
Definition: cluster.h:78
unsigned Significant
Definition: cluster.h:68
unsigned NumSamples
Definition: cluster.h:75
FLOAT32 MinSamples
Definition: cluster.h:50
inT32 NumChar
Definition: cluster.h:93
FLOAT32 ComputeDistance(int k, PARAM_DESC *dim, FLOAT32 p1[], FLOAT32 p2[])
Definition: kdtree.cpp:486
PARAM_DESC * ParamDesc
Definition: cluster.h:88
inT32 MergeClusters(inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
#define NULL
Definition: host.h:144
unsigned Merged
Definition: cluster.h:69
inT16 SampleSize
Definition: cluster.h:87
int inT32
Definition: host.h:102
MERGE_CLASS NewLabeledClass ( const char *  Label)

Definition at line 730 of file commontraining.cpp.

732 {
733  MERGE_CLASS MergeClass;
734 
735  MergeClass = new MERGE_CLASS_NODE;
736  MergeClass->Label = (char*)Emalloc (strlen (Label)+1);
737  strcpy (MergeClass->Label, Label);
738  MergeClass->Class = NewClass (MAX_NUM_PROTOS, MAX_NUM_CONFIGS);
739  return (MergeClass);
740 
741 } /* NewLabeledClass */
#define MAX_NUM_CONFIGS
Definition: intproto.h:46
CLASS_TYPE NewClass(int NumProtos, int NumConfigs)
Definition: protos.cpp:248
void * Emalloc(int Size)
Definition: emalloc.cpp:35
#define MAX_NUM_PROTOS
Definition: intproto.h:47
CLASS_TYPE Class
LABELEDLIST NewLabeledList ( const char *  Label)

Definition at line 352 of file commontraining.cpp.

367 {
368  LABELEDLIST LabeledList;
369 
370  LabeledList = (LABELEDLIST) Emalloc (sizeof (LABELEDLISTNODE));
371  LabeledList->Label = (char*)Emalloc (strlen (Label)+1);
372  strcpy (LabeledList->Label, Label);
373  LabeledList->List = NIL_LIST;
374  LabeledList->SampleCount = 0;
375  LabeledList->font_sample_count = 0;
376  return (LabeledList);
377 
378 } /* NewLabeledList */
struct LABELEDLISTNODE * LABELEDLIST
void * Emalloc(int Size)
Definition: emalloc.cpp:35
#define NIL_LIST
Definition: oldlist.h:126
void Normalize ( float *  Values)

Definition at line 837 of file commontraining.cpp.

839 {
840  register float Slope;
841  register float Intercept;
842  register float Normalizer;
843 
844  Slope = tan (Values [2] * 2 * PI);
845  Intercept = Values [1] - Slope * Values [0];
846  Normalizer = 1 / sqrt (Slope * Slope + 1.0);
847 
848  Values [0] = Slope * Normalizer;
849  Values [1] = - Normalizer;
850  Values [2] = Intercept * Normalizer;
851 } // Normalize
#define PI
Definition: const.h:19
int NumberOfProtos ( LIST  ProtoList,
BOOL8  CountSigProtos,
BOOL8  CountInsigProtos 
)

Definition at line 888 of file commontraining.cpp.

892 {
893  int N = 0;
894  PROTOTYPE *Proto;
895 
896  iterate(ProtoList)
897  {
898  Proto = (PROTOTYPE *) first_node ( ProtoList );
899  if (( Proto->Significant && CountSigProtos ) ||
900  ( ! Proto->Significant && CountInsigProtos ) )
901  N++;
902  }
903  return(N);
904 }
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159
unsigned Significant
Definition: cluster.h:68
void ParseArguments ( int *  argc,
char ***  argv 
)

Definition at line 89 of file commontraining.cpp.

89  {
90  STRING usage;
91  if (*argc) {
92  usage += (*argv)[0];
93  }
94  usage += " [.tr files ...]";
95  tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
96  // Record the index of the first non-flag argument to 1, since we set
97  // remove_flags to true when parsing the flags.
98  tessoptind = 1;
99  // Set some global values based on the flags.
100  Config.MinSamples =
101  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_min_samples_fraction)));
102  Config.MaxIllegal =
103  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_max_illegal)));
104  Config.Independence =
105  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_independence)));
106  Config.Confidence =
107  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_confidence)));
108  // Set additional parameters from config file if specified.
109  if (!FLAGS_configfile.empty()) {
110  tesseract::ParamUtils::ReadParamsFile(
111  FLAGS_configfile.c_str(),
112  tesseract::SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
113  ccutil.params());
114  }
115 }
#define MAX(x, y)
Definition: ndminx.h:24
#define MIN(x, y)
Definition: ndminx.h:28
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
CLUSTERCONFIG Config
CCUtil ccutil
FLOAT32 Independence
Definition: cluster.h:53
FLOAT32 MaxIllegal
Definition: cluster.h:51
FLOAT64 Confidence
Definition: cluster.h:54
FLOAT32 MinSamples
Definition: cluster.h:50
ParamsVectors * params()
Definition: ccutil.h:65
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:41
STRING
Definition: strngs.h:44
int tessoptind
Definition: tessopt.cpp:24
const char * c_str() const
Definition: strngs.cpp:204
void ReadTrainingSamples ( const FEATURE_DEFS_STRUCT &  feature_defs,
const char *  feature_name,
int  max_samples,
UNICHARSET *  unicharset,
FILE *  file,
LIST *  training_samples 
)

Definition at line 383 of file commontraining.cpp.

386  {
387 /*
388 ** Parameters:
389 ** file open text file to read samples from
390 ** Globals: none
391 ** Operation:
392 ** This routine reads training samples from a file and
393 ** places them into a data structure which organizes the
394 ** samples by FontName and CharName. It then returns this
395 ** data structure.
396 ** Return: none
397 ** Exceptions: none
398 ** History: Fri Aug 18 13:11:39 1989, DSJ, Created.
399 ** Tue May 17 1998 simplifications to structure, illiminated
400 ** font, and feature specification levels of structure.
401 */
402  char buffer[2048];
403  char unichar[UNICHAR_LEN + 1];
404  LABELEDLIST char_sample;
405  FEATURE_SET feature_samples;
406  CHAR_DESC char_desc;
407  int i;
408  int feature_type = ShortNameToFeatureType(feature_defs, feature_name);
409  // Zero out the font_sample_count for all the classes.
410  LIST it = *training_samples;
411  iterate(it) {
412  char_sample = reinterpret_cast<LABELEDLIST>(first_node(it));
413  char_sample->font_sample_count = 0;
414  }
415 
416  while (fgets(buffer, 2048, file) != NULL) {
417  if (buffer[0] == '\n')
418  continue;
419 
420  sscanf(buffer, "%*s %s", unichar);
421  if (unicharset != NULL && !unicharset->contains_unichar(unichar)) {
422  unicharset->unichar_insert(unichar);
423  if (unicharset->size() > MAX_NUM_CLASSES) {
424  tprintf("Error: Size of unicharset in training is "
425  "greater than MAX_NUM_CLASSES\n");
426  exit(1);
427  }
428  }
429  char_sample = FindList(*training_samples, unichar);
430  if (char_sample == NULL) {
431  char_sample = NewLabeledList(unichar);
432  *training_samples = push(*training_samples, char_sample);
433  }
434  char_desc = ReadCharDescription(feature_defs, file);
435  feature_samples = char_desc->FeatureSets[feature_type];
436  if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
437  char_sample->List = push(char_sample->List, feature_samples);
438  char_sample->SampleCount++;
439  char_sample->font_sample_count++;
440  } else {
441  FreeFeatureSet(feature_samples);
442  }
443  for (i = 0; i < char_desc->NumFeatureSets; i++) {
444  if (feature_type != i)
445  FreeFeatureSet(char_desc->FeatureSets[i]);
446  }
447  free(char_desc);
448  }
449 } // ReadTrainingSamples
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:263
#define first_node(l)
Definition: oldlist.h:139
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
#define tprintf(...)
Definition: tprintf.h:31
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
Definition: featdefs.h:44
uinT32 NumFeatureSets
Definition: featdefs.h:43
#define iterate(l)
Definition: oldlist.h:159
LABELEDLIST NewLabeledList(const char *Label)
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612
LABELEDLIST FindList(LIST List, char *Label)
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:302
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
#define NULL
Definition: host.h:144
#define UNICHAR_LEN
Definition: unichar.h:30
int size() const
Definition: unicharset.h:297
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:79
LIST RemoveInsignificantProtos ( LIST  ProtoList,
BOOL8  KeepSigProtos,
BOOL8  KeepInsigProtos,
int  N 
)

Definition at line 645 of file commontraining.cpp.

651 {
652  LIST NewProtoList = NIL_LIST;
653  LIST pProtoList;
654  PROTOTYPE* Proto;
655  PROTOTYPE* NewProto;
656  int i;
657 
658  pProtoList = ProtoList;
659  iterate(pProtoList)
660  {
661  Proto = (PROTOTYPE *) first_node (pProtoList);
662  if ((Proto->Significant && KeepSigProtos) ||
663  (!Proto->Significant && KeepInsigProtos))
664  {
665  NewProto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE));
666 
667  NewProto->Mean = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
668  NewProto->Significant = Proto->Significant;
669  NewProto->Style = Proto->Style;
670  NewProto->NumSamples = Proto->NumSamples;
671  NewProto->Cluster = NULL;
672  NewProto->Distrib = NULL;
673 
674  for (i=0; i < N; i++)
675  NewProto->Mean[i] = Proto->Mean[i];
676  if (Proto->Variance.Elliptical != NULL)
677  {
678  NewProto->Variance.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
679  for (i=0; i < N; i++)
680  NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
681  }
682  else
683  NewProto->Variance.Elliptical = NULL;
684  //---------------------------------------------
685  if (Proto->Magnitude.Elliptical != NULL)
686  {
687  NewProto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
688  for (i=0; i < N; i++)
689  NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
690  }
691  else
692  NewProto->Magnitude.Elliptical = NULL;
693  //------------------------------------------------
694  if (Proto->Weight.Elliptical != NULL)
695  {
696  NewProto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
697  for (i=0; i < N; i++)
698  NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
699  }
700  else
701  NewProto->Weight.Elliptical = NULL;
702 
703  NewProto->TotalMagnitude = Proto->TotalMagnitude;
704  NewProto->LogMagnitude = Proto->LogMagnitude;
705  NewProtoList = push_last(NewProtoList, NewProto);
706  }
707  }
708  FreeProtoList(&ProtoList);
709  return (NewProtoList);
710 } /* RemoveInsignificantProtos */
#define first_node(l)
Definition: oldlist.h:139
float FLOAT32
Definition: host.h:111
DISTRIBUTION * Distrib
Definition: cluster.h:77
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:338
#define iterate(l)
Definition: oldlist.h:159
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 * Mean
Definition: cluster.h:78
unsigned Significant
Definition: cluster.h:68
FLOATUNION Weight
Definition: cluster.h:83
FLOAT32 TotalMagnitude
Definition: cluster.h:79
unsigned NumSamples
Definition: cluster.h:75
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:564
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 * Elliptical
Definition: cluster.h:64
CLUSTER * Cluster
Definition: cluster.h:76
void * Emalloc(int Size)
Definition: emalloc.cpp:35
#define NIL_LIST
Definition: oldlist.h:126
unsigned Style
Definition: cluster.h:74
#define NULL
Definition: host.h:144
CLUSTERER* SetUpForClustering ( const FEATURE_DEFS_STRUCT &  FeatureDefs,
LABELEDLIST  char_sample,
const char *  program_feature_type 
)

Definition at line 502 of file commontraining.cpp.

504  {
505 /*
506  ** Parameters:
507  ** char_sample: LABELEDLIST that holds all the feature information for a
508  ** given character.
509  ** Globals:
510  ** None
511  ** Operation:
512  ** This routine reads samples from a LABELEDLIST and enters
513  ** those samples into a clusterer data structure. This
514  ** data structure is then returned to the caller.
515  ** Return:
516  ** Pointer to new clusterer data structure.
517  ** Exceptions:
518  ** None
519  ** History:
520  ** 8/16/89, DSJ, Created.
521  */
522  uinT16 N;
523  int i, j;
524  FLOAT32 *Sample = NULL;
525  CLUSTERER *Clusterer;
526  inT32 CharID;
527  LIST FeatureList = NULL;
528  FEATURE_SET FeatureSet = NULL;
529 
530  int desc_index = ShortNameToFeatureType(FeatureDefs, program_feature_type);
531  N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
532  Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
533 
534  FeatureList = char_sample->List;
535  CharID = 0;
536  iterate(FeatureList) {
537  FeatureSet = (FEATURE_SET) first_node(FeatureList);
538  for (i = 0; i < FeatureSet->MaxNumFeatures; i++) {
539  if (Sample == NULL)
540  Sample = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
541  for (j = 0; j < N; j++)
542  Sample[j] = FeatureSet->Features[i]->Params[j];
543  MakeSample (Clusterer, Sample, CharID);
544  }
545  CharID++;
546  }
547  if ( Sample != NULL ) free( Sample );
548  return( Clusterer );
549 
550 } /* SetUpForClustering */
#define first_node(l)
Definition: oldlist.h:139
float FLOAT32
Definition: host.h:111
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:50
CLUSTERER * MakeClusterer(inT16 SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:399
SAMPLE * MakeSample(CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
Definition: cluster.cpp:454
#define iterate(l)
Definition: oldlist.h:159
FEATURE Features[1]
Definition: ocrfeatures.h:72
FLOAT32 Params[1]
Definition: ocrfeatures.h:65
void * Emalloc(int Size)
Definition: emalloc.cpp:35
FEATURE_SET_STRUCT * FEATURE_SET
Definition: ocrfeatures.h:74
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:302
#define NULL
Definition: host.h:144
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:59
unsigned short uinT16
Definition: host.h:101
int inT32
Definition: host.h:102
CLASS_STRUCT* SetUpForFloat2Int ( const UNICHARSET &  unicharset,
LIST  LabeledClassList 
)

SetUpForFloat2Int

Definition at line 774 of file commontraining.cpp.

775  {
776  MERGE_CLASS MergeClass;
777  CLASS_TYPE Class;
778  int NumProtos;
779  int NumConfigs;
780  int NumWords;
781  int i, j;
782  float Values[3];
783  PROTO NewProto;
784  PROTO OldProto;
785  BIT_VECTOR NewConfig;
786  BIT_VECTOR OldConfig;
787 
788  // printf("Float2Int ...\n");
789 
790  CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()];
791  iterate(LabeledClassList)
792  {
793  UnicityTableEqEq<int> font_set;
794  MergeClass = (MERGE_CLASS) first_node (LabeledClassList);
795  Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label)];
796  NumProtos = MergeClass->Class->NumProtos;
797  NumConfigs = MergeClass->Class->NumConfigs;
798  font_set.move(&MergeClass->Class->font_set);
799  Class->NumProtos = NumProtos;
800  Class->MaxNumProtos = NumProtos;
801  Class->Prototypes = (PROTO) Emalloc (sizeof(PROTO_STRUCT) * NumProtos);
802  for(i=0; i < NumProtos; i++)
803  {
804  NewProto = ProtoIn(Class, i);
805  OldProto = ProtoIn(MergeClass->Class, i);
806  Values[0] = OldProto->X;
807  Values[1] = OldProto->Y;
808  Values[2] = OldProto->Angle;
809  Normalize(Values);
810  NewProto->X = OldProto->X;
811  NewProto->Y = OldProto->Y;
812  NewProto->Length = OldProto->Length;
813  NewProto->Angle = OldProto->Angle;
814  NewProto->A = Values[0];
815  NewProto->B = Values[1];
816  NewProto->C = Values[2];
817  }
818 
819  Class->NumConfigs = NumConfigs;
820  Class->MaxNumConfigs = NumConfigs;
821  Class->font_set.move(&font_set);
822  Class->Configurations = (BIT_VECTOR*) Emalloc (sizeof(BIT_VECTOR) * NumConfigs);
823  NumWords = WordsInVectorOfSize(NumProtos);
824  for(i=0; i < NumConfigs; i++)
825  {
826  NewConfig = NewBitVector(NumProtos);
827  OldConfig = MergeClass->Class->Configurations[i];
828  for(j=0; j < NumWords; j++)
829  NewConfig[j] = OldConfig[j];
830  Class->Configurations[i] = NewConfig;
831  }
832  }
833  return float_classes;
834 } // SetUpForFloat2Int
PROTO_STRUCT * PROTO
Definition: protos.h:52
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
#define first_node(l)
Definition: oldlist.h:139
#define ProtoIn(Class, Pid)
Definition: protos.h:123
inT16 NumConfigs
Definition: protos.h:62
void Normalize(float *Values)
#define iterate(l)
Definition: oldlist.h:159
UnicityTableEqEq< int > font_set
Definition: protos.h:65
inT16 NumProtos
Definition: protos.h:59
BIT_VECTOR NewBitVector(int NumBits)
Definition: bitvec.cpp:90
FLOAT32 X
Definition: protos.h:47
FLOAT32 Angle
Definition: protos.h:49
inT16 MaxNumConfigs
Definition: protos.h:63
FLOAT32 B
Definition: protos.h:45
#define WordsInVectorOfSize(NumBits)
Definition: bitvec.h:63
MERGE_CLASS_NODE * MERGE_CLASS
void * Emalloc(int Size)
Definition: emalloc.cpp:35
FLOAT32 Length
Definition: protos.h:50
uinT32 * BIT_VECTOR
Definition: bitvec.h:28
void move(UnicityTable< T > *from)
FLOAT32 C
Definition: protos.h:46
int size() const
Definition: unicharset.h:297
FLOAT32 A
Definition: protos.h:44
inT16 MaxNumProtos
Definition: protos.h:60
PROTO Prototypes
Definition: protos.h:61
CLASS_TYPE Class
CONFIGS Configurations
Definition: protos.h:64
FLOAT32 Y
Definition: protos.h:48
STRING_PARAM_FLAG ( configfile  ,
""  ,
"File to load more configs from"   
)
STRING_PARAM_FLAG ( D  ,
""  ,
"Directory to write output files to"   
)
STRING_PARAM_FLAG ( F  ,
"font_properties"  ,
"File listing font properties"   
)
STRING_PARAM_FLAG ( X  ,
""  ,
"File listing font xheights"   
)
STRING_PARAM_FLAG ( U  ,
"unicharset"  ,
"File to load unicharset from"   
)
STRING_PARAM_FLAG ( O  ,
""  ,
"File to write unicharset to"   
)
STRING_PARAM_FLAG ( T  ,
""  ,
"File to load trainer from"   
)
STRING_PARAM_FLAG ( output_trainer  ,
""  ,
"File to write trainer to"   
)
STRING_PARAM_FLAG ( test_ch  ,
""  ,
"UTF8 test character string"   
)

Variable Documentation

CCUtil ccutil

Definition at line 53 of file commontraining.cpp.

CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 }

Definition at line 51 of file commontraining.cpp.

FEATURE_DEFS_STRUCT feature_defs

Definition at line 52 of file commontraining.cpp.