tesseract  3.04.00
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
cntraining.cpp
Go to the documentation of this file.
1 /******************************************************************************
2 ** Filename: cntraining.cpp
3 ** Purpose: Generates a normproto and pffmtable.
4 ** Author: Dan Johnson
5 ** Revision: Christy Russon
6 ** History: Fri Aug 18 08:53:50 1989, DSJ, Created.
7 ** 5/25/90, DSJ, Adapted to multiple feature types.
8 ** Tuesday, May 17, 1998 Changes made to make feature specific and
9 ** simplify structures. First step in simplifying training process.
10 **
11  ** (c) Copyright Hewlett-Packard Company, 1988.
12  ** Licensed under the Apache License, Version 2.0 (the "License");
13  ** you may not use this file except in compliance with the License.
14  ** You may obtain a copy of the License at
15  ** http://www.apache.org/licenses/LICENSE-2.0
16  ** Unless required by applicable law or agreed to in writing, software
17  ** distributed under the License is distributed on an "AS IS" BASIS,
18  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19  ** See the License for the specific language governing permissions and
20  ** limitations under the License.
21 ******************************************************************************/
22 
23 
27 #include "oldlist.h"
28 #include "efio.h"
29 #include "emalloc.h"
30 #include "featdefs.h"
31 #include "tessopt.h"
32 #include "ocrfeatures.h"
33 #include "clusttool.h"
34 #include "cluster.h"
35 #include <string.h>
36 #include <stdio.h>
37 #include <math.h>
38 #include "unichar.h"
39 #include "commontraining.h"
40 
41 #define PROGRAM_FEATURE_TYPE "cn"
42 
44 
48 int main (
49  int argc,
50  char **argv);
51 
56 void WriteNormProtos (
57  const char *Directory,
58  LIST LabeledProtoList,
59  CLUSTERER *Clusterer);
60 
61 /*
62 PARAMDESC *ConvertToPARAMDESC(
63  PARAM_DESC* Param_Desc,
64  int N);
65 */
66 
67 void WriteProtos(
68  FILE *File,
69  uinT16 N,
70  LIST ProtoList,
71  BOOL8 WriteSigProtos,
72  BOOL8 WriteInsigProtos);
73 
77 /* global variable to hold configuration parameters to control clustering */
78 //-M 0.025 -B 0.05 -I 0.8 -C 1e-3
80 {
81  elliptical, 0.025, 0.05, 0.8, 1e-3, 0
82 };
83 
84 
88 /*---------------------------------------------------------------------------*/
89 int main(int argc, char* argv[])
90 
91 /*
92 ** Parameters:
93 ** argc number of command line arguments
94 ** argv array of command line arguments
95 ** Globals: none
96 ** Operation:
97 ** This program reads in a text file consisting of feature
98 ** samples from a training page in the following format:
99 **
100 ** FontName CharName NumberOfFeatureTypes(N)
101 ** FeatureTypeName1 NumberOfFeatures(M)
102 ** Feature1
103 ** ...
104 ** FeatureM
105 ** FeatureTypeName2 NumberOfFeatures(M)
106 ** Feature1
107 ** ...
108 ** FeatureM
109 ** ...
110 ** FeatureTypeNameN NumberOfFeatures(M)
111 ** Feature1
112 ** ...
113 ** FeatureM
114 ** FontName CharName ...
115 **
116 ** It then appends these samples into a separate file for each
117 ** character. The name of the file is
118 **
119 ** DirectoryName/FontName/CharName.FeatureTypeName
120 **
121 ** The DirectoryName can be specified via a command
122 ** line argument. If not specified, it defaults to the
123 ** current directory. The format of the resulting files is:
124 **
125 ** NumberOfFeatures(M)
126 ** Feature1
127 ** ...
128 ** FeatureM
129 ** NumberOfFeatures(M)
130 ** ...
131 **
132 ** The output files each have a header which describes the
133 ** type of feature which the file contains. This header is
134 ** in the format required by the clusterer. A command line
135 ** argument can also be used to specify that only the first
136 ** N samples of each class should be used.
137 ** Return: none
138 ** Exceptions: none
139 ** History: Fri Aug 18 08:56:17 1989, DSJ, Created.
140 */
141 
142 {
143  // Set the global Config parameters before parsing the command line.
144  Config = CNConfig;
145 
146  const char *PageName;
147  FILE *TrainingPage;
148  LIST CharList = NIL_LIST;
149  CLUSTERER *Clusterer = NULL;
150  LIST ProtoList = NIL_LIST;
151  LIST NormProtoList = NIL_LIST;
152  LIST pCharList;
153  LABELEDLIST CharSample;
154  FEATURE_DEFS_STRUCT FeatureDefs;
155  InitFeatureDefs(&FeatureDefs);
156 
157  ParseArguments(&argc, &argv);
158  int num_fonts = 0;
159  while ((PageName = GetNextFilename(argc, argv)) != NULL) {
160  printf("Reading %s ...\n", PageName);
161  TrainingPage = Efopen(PageName, "rb");
163  100, NULL, TrainingPage, &CharList);
164  fclose(TrainingPage);
165  ++num_fonts;
166  }
167  printf("Clustering ...\n");
168  // To allow an individual font to form a separate cluster,
169  // reduce the min samples:
170  // Config.MinSamples = 0.5 / num_fonts;
171  pCharList = CharList;
172  iterate(pCharList) {
173  //Cluster
174  CharSample = (LABELEDLIST)first_node(pCharList);
175  Clusterer =
176  SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
177  float SavedMinSamples = Config.MinSamples;
178  // To disable the tendency to produce a single cluster for all fonts,
179  // make MagicSamples an impossible to achieve number:
180  // Config.MagicSamples = CharSample->SampleCount * 10;
181  Config.MagicSamples = CharSample->SampleCount;
182  while (Config.MinSamples > 0.001) {
183  ProtoList = ClusterSamples(Clusterer, &Config);
184  if (NumberOfProtos(ProtoList, 1, 0) > 0) {
185  break;
186  } else {
187  Config.MinSamples *= 0.95;
188  printf("0 significant protos for %s."
189  " Retrying clustering with MinSamples = %f%%\n",
190  CharSample->Label, Config.MinSamples);
191  }
192  }
193  Config.MinSamples = SavedMinSamples;
194  AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
195  }
196  FreeTrainingSamples(CharList);
197  if (Clusterer == NULL) { // To avoid a SIGSEGV
198  fprintf(stderr, "Error: NULL clusterer!\n");
199  return 1;
200  }
201  WriteNormProtos(FLAGS_D.c_str(), NormProtoList, Clusterer);
202  FreeNormProtoList(NormProtoList);
203  FreeProtoList(&ProtoList);
204  FreeClusterer(Clusterer);
205  printf ("\n");
206  return 0;
207 } // main
208 
209 
214 /*----------------------------------------------------------------------------*/
216  const char *Directory,
217  LIST LabeledProtoList,
218  CLUSTERER *Clusterer)
219 
220 /*
221 ** Parameters:
222 ** Directory directory to place sample files into
223 ** Operation:
224 ** This routine writes the specified samples into files which
225 ** are organized according to the font name and character name
226 ** of the samples.
227 ** Return: none
228 ** Exceptions: none
229 ** History: Fri Aug 18 16:17:06 1989, DSJ, Created.
230 */
231 
232 {
233  FILE *File;
234  STRING Filename;
235  LABELEDLIST LabeledProto;
236  int N;
237 
238  Filename = "";
239  if (Directory != NULL && Directory[0] != '\0')
240  {
241  Filename += Directory;
242  Filename += "/";
243  }
244  Filename += "normproto";
245  printf ("\nWriting %s ...", Filename.string());
246  File = Efopen (Filename.string(), "wb");
247  fprintf(File,"%0d\n",Clusterer->SampleSize);
248  WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc);
249  iterate(LabeledProtoList)
250  {
251  LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
252  N = NumberOfProtos(LabeledProto->List, true, false);
253  if (N < 1) {
254  printf ("\nError! Not enough protos for %s: %d protos"
255  " (%d significant protos"
256  ", %d insignificant protos)\n",
257  LabeledProto->Label, N,
258  NumberOfProtos(LabeledProto->List, 1, 0),
259  NumberOfProtos(LabeledProto->List, 0, 1));
260  exit(1);
261  }
262  fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
263  WriteProtos(File, Clusterer->SampleSize, LabeledProto->List, true, false);
264  }
265  fclose (File);
266 
267 } // WriteNormProtos
268 
269 /*-------------------------------------------------------------------------*/
271  FILE *File,
272  uinT16 N,
273  LIST ProtoList,
274  BOOL8 WriteSigProtos,
275  BOOL8 WriteInsigProtos)
276 {
277  PROTOTYPE *Proto;
278 
279  // write prototypes
280  iterate(ProtoList)
281  {
282  Proto = (PROTOTYPE *) first_node ( ProtoList );
283  if (( Proto->Significant && WriteSigProtos ) ||
284  ( ! Proto->Significant && WriteInsigProtos ) )
285  WritePrototype( File, N, Proto );
286  }
287 } // WriteProtos
void WritePrototype(FILE *File, uinT16 N, PROTOTYPE *Proto)
Definition: clusttool.cpp:348
void WriteNormProtos(const char *Directory, LIST LabeledProtoList, CLUSTERER *Clusterer)
Definition: cntraining.cpp:215
#define first_node(l)
Definition: oldlist.h:139
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
int main(int argc, char **argv)
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:121
struct LABELEDLISTNODE * LABELEDLIST
void ParseArguments(int *argc, char ***argv)
DECLARE_STRING_PARAM_FLAG(D)
unsigned char BOOL8
Definition: host.h:113
#define iterate(l)
Definition: oldlist.h:159
CLUSTERCONFIG Config
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
void FreeNormProtoList(LIST CharList)
void FreeTrainingSamples(LIST CharList)
unsigned Significant
Definition: cluster.h:68
void WriteProtos(FILE *File, uinT16 N, LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
Definition: cntraining.cpp:270
int MagicSamples
Definition: cluster.h:55
#define PROGRAM_FEATURE_TYPE
Definition: cntraining.cpp:41
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:508
FILE * Efopen(const char *Name, const char *Mode)
Definition: efio.cpp:32
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:564
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
FLOAT32 MinSamples
Definition: cluster.h:50
#define NIL_LIST
Definition: oldlist.h:126
PARAM_DESC * ParamDesc
Definition: cluster.h:88
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:536
CLUSTERCONFIG CNConfig
Definition: cntraining.cpp:79
Definition: strngs.h:44
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
inT16 SampleSize
Definition: cluster.h:87
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
void WriteParamDesc(FILE *File, uinT16 N, PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:318
unsigned short uinT16
Definition: host.h:101
const char * GetNextFilename(int argc, const char *const *argv)