tesseract  3.04.00
reject.cpp File Reference
#include "tessvars.h"
#include "scanutils.h"
#include <ctype.h>
#include <string.h>
#include "genericvector.h"
#include "reject.h"
#include "control.h"
#include "docqual.h"
#include "globaloc.h"
#include "globals.h"
#include "helpers.h"
#include "tesseractclass.h"

Go to the source code of this file.

Namespaces

 tesseract
 

Functions

 CLISTIZEH (STRING) CLISTIZE(STRING) namespace tesseract
 
void reject_blanks (WERD_RES *word)
 
void reject_poor_matches (WERD_RES *word)
 
float compute_reject_threshold (WERD_CHOICE *word)
 

Function Documentation

CLISTIZEH ( STRING  )

Definition at line 48 of file reject.cpp.

56  {
// Decides whether `word` counts as finished ("done") and stores the verdict in
// word->done.
//   word - word result to evaluate; its done flag is overwritten.
//   pass - recognition pass number; the one_ell_conflict() check below is
//          applied only when pass == 1.
57 void Tesseract::set_done(WERD_RES *word, inT16 pass) {
// Initial verdict: the word was tess-accepted and its best-choice string
// contains no ' ' character.
58  word->done = word->tess_accepted &&
59  (strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
60  bool word_is_ambig = word->best_choice->dangerous_ambig_found();
// True when the best choice's permuter is one of the dictionary permuters
// tested in this expression.
61  bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
62  word->best_choice->permuter() == FREQ_DAWG_PERM ||
// NOTE(review): listing line 63 is missing from this extract, so the final
// operand of the expression above is not shown - restore it from reject.cpp.
// Pass 1 only: a done word that is non-dictionary or ambiguous loses its done
// status when one_ell_conflict(word, FALSE) reports a conflict.
64  if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
65  one_ell_conflict(word, FALSE)) {
66  if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
67  word->done = FALSE;
68  }
// On every pass: a done word is un-done when it is ambiguous, or when it is
// neither from a dictionary nor a NUMBER_PERM result.
69  if (word->done && ((!word_from_dict &&
70  word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
71  if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
72  word->done = FALSE;
73  }
// Optional debug trace of the final verdict and the best choice.
74  if (tessedit_rejection_debug) {
75  tprintf("set_done(): done=%d\n", word->done);
76  word->best_choice->print("");
77  }
78 }
79 
80 
81 /*************************************************************************
82  * make_reject_map()
83  *
84  * Sets the done flag to indicate whether the result is acceptable.
85  *
86  * Sets a reject map for the word.
87  *************************************************************************/
// Parameters:
//   word - word result whose done flag and reject_map are updated.
//   row  - not referenced in the lines visible in this listing.
//   pass - pass number, forwarded to set_done().
// Behaviour depends on tessedit_reject_mode: 0 and 5 are handled; any other
// value prints "BAD tessedit_reject_mode" and calls err_exit().
88 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) {
89  int i;
90  int offset;
91 
92  flip_0O(word);
93  check_debug_pt(word, -1); // For trap only
94  set_done(word, pass); // Set acceptance
// NOTE(review): listing line 95 is missing from this extract. The page's
// cross-reference list names REJMAP initialise(inT16 length), which presumably
// belongs here - confirm against reject.cpp.
96  reject_blanks(word);
97  /*
98  0: Rays original heuristic - the baseline
99  */
100  if (tessedit_reject_mode == 0) {
// Only words that set_done() did not mark done get per-character rejection.
101  if (!word->done)
102  reject_poor_matches(word);
103  } else if (tessedit_reject_mode == 5) {
104  /*
105  5: Reject I/1/l from words where there is no strong contextual confirmation;
106  the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
107  and the whole of any words which are very small
108  */
109  if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
// NOTE(review): listing line 110 (the body of this branch) is missing from
// this extract; presumably the rej_word_small_xht() call named in the page's
// cross-reference list - confirm against reject.cpp.
111  } else {
112  one_ell_conflict(word, TRUE);
113  /*
114  Originally the code here just used the done flag. Now I have duplicated
115  and unpacked the conditions for setting the done flag so that each
116  mechanism can be turned on or off independently. This works WITHOUT
117  affecting the done flag setting.
118  */
119  if (rej_use_tess_accepted && !word->tess_accepted)
// NOTE(review): listing line 120 (the statement controlled by the if above) is
// missing; presumably rej_word_not_tess_accepted() - confirm against reject.cpp.
121 
122  if (rej_use_tess_blanks &&
123  (strchr (word->best_choice->unichar_string().string (), ' ') != NULL))
// NOTE(review): listing line 124 (the statement controlled by the if above) is
// missing; presumably rej_word_contains_blanks() - confirm against reject.cpp.
125 
126  WERD_CHOICE* best_choice = word->best_choice;
127  if (rej_use_good_perm) {
// Passes when the permuter is one of the three DAWG permuters and either
// rej_use_sensible_wd is off or acceptable_word_string() does not return
// AC_UNACCEPTABLE.
128  if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
129  best_choice->permuter() == FREQ_DAWG_PERM ||
130  best_choice->permuter() == USER_DAWG_PERM) &&
131  (!rej_use_sensible_wd ||
132  acceptable_word_string(*word->uch_set,
133  best_choice->unichar_string().string(),
134  best_choice->unichar_lengths().string()) !=
135  AC_UNACCEPTABLE)) {
136  // PASSED TEST
137  } else if (best_choice->permuter() == NUMBER_PERM) {
138  if (rej_alphas_in_number_perm) {
// Walk the best-choice string one unichar at a time: i indexes the unichar and
// its reject_map entry, offset is the position in unichar_string(), advanced
// by the length stored in unichar_lengths()[i].
139  for (i = 0, offset = 0;
140  best_choice->unichar_string()[offset] != '\0';
141  offset += best_choice->unichar_lengths()[i++]) {
// Still-accepted alphabetic characters inside a NUMBER_PERM word are rejected.
142  if (word->reject_map[i].accepted() &&
143  word->uch_set->get_isalpha(
144  best_choice->unichar_string().string() + offset,
145  best_choice->unichar_lengths()[i]))
146  word->reject_map[i].setrej_bad_permuter();
147  // rej alpha
148  }
149  }
150  } else {
// NOTE(review): listing line 151 (the body of this else) is missing from this
// extract; presumably rej_word_bad_permuter() - confirm against reject.cpp.
152  }
153  }
154  /* Ambig word rejection was here once !!*/
155  }
156  } else {
157  tprintf("BAD tessedit_reject_mode\n");
158  err_exit();
159  }
160 
// Edge-blob rejection runs only when tessedit_image_border is greater than -1.
161  if (tessedit_image_border > -1)
162  reject_edge_blobs(word);
163 
164  check_debug_pt (word, 10);
165  if (tessedit_rejection_debug) {
166  tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
167  tprintf("Certainty: %f Rating: %f\n",
168  word->best_choice->certainty (), word->best_choice->rating ());
169  tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
170  }
171 
172  flip_hyphens(word);
173  check_debug_pt(word, 20);
174 }
175 } // namespace tesseract
float y_scale() const
Definition: normalis.h:272
#define tprintf(...)
Definition: tprintf.h:31
Definition: ocrrow.h:32
void rej_word_contains_blanks()
Definition: rejctmap.cpp:443
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:178
inT32 length() const
Definition: strngs.cpp:188
const int kBlnXHeight
Definition: normalis.h:28
#define NULL
Definition: host.h:144
float certainty() const
Definition: ratngs.h:327
Unacceptable word.
Definition: control.h:36
const UNICHARSET * uch_set
Definition: pageres.h:192
#define FALSE
Definition: capi.h:29
void err_exit()
Definition: globaloc.cpp:74
void initialise(inT16 length)
Definition: rejctmap.cpp:318
void flip_hyphens(WERD_RES *word)
float rating() const
Definition: ratngs.h:324
const char * string() const
Definition: strngs.cpp:193
DENORM denorm
Definition: pageres.h:190
const STRING & unichar_string() const
Definition: ratngs.h:524
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:434
short inT16
Definition: host.h:100
uinT8 permuter() const
Definition: ratngs.h:343
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:207
void print() const
Definition: ratngs.h:563
BOOL8 done
Definition: pageres.h:282
#define TRUE
Definition: capi.h:28
BOOL8 tess_accepted
Definition: pageres.h:280
const STRING & unichar_lengths() const
Definition: ratngs.h:531
void rej_word_small_xht()
Definition: rejctmap.cpp:416
void rej_word_bad_permuter()
Definition: rejctmap.cpp:452
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
bool dangerous_ambig_found() const
Definition: ratngs.h:360
void flip_0O(WERD_RES *word)
float compute_reject_threshold ( WERD_CHOICE *word)

Definition at line 226 of file reject.cpp.

// Derives a rejection threshold from the per-blob certainties of `word`:
// the certainties are copied into a vector and sorted, and the threshold is
// placed halfway across the largest gap between neighbouring sorted values.
// Words with fewer than 3 blobs skip the gap search and get ratings[0] - 1.
// Returns the threshold as a float.
226  {
227  float threshold; // rejection threshold
228  float bestgap = 0.0f; // biggest gap
229  float gapstart; // bottom of gap
230  // super iterator
// NOTE(review): choice_it is declared but never used in this function.
231  BLOB_CHOICE_IT choice_it; // real iterator
232 
233  int blob_count = word->length();
234  GenericVector<float> ratings;
235  ratings.init_to_size(blob_count, 0.0f);
236  for (int i = 0; i < blob_count; ++i) {
237  ratings[i] = word->certainty(i);
238  }
// The gap search below relies on neighbouring entries being in sorted order.
239  ratings.sort();
// NOTE(review): ratings[0] is read even when blob_count is 0 - confirm callers
// never pass an empty word.
240  gapstart = ratings[0] - 1; // all reject if none better
241  if (blob_count >= 3) {
// Scan adjacent pairs and remember the widest gap and the value it starts at.
242  for (int index = 0; index < blob_count - 1; index++) {
243  if (ratings[index + 1] - ratings[index] > bestgap) {
244  bestgap = ratings[index + 1] - ratings[index];
245  // find biggest
246  gapstart = ratings[index];
247  }
248  }
249  }
// Midpoint of the widest gap (bestgap stays 0 when the search was skipped).
250  threshold = gapstart + bestgap / 2;
251 
252  return threshold;
253 }
float certainty() const
Definition: ratngs.h:327
void init_to_size(int size, T t)
int length() const
Definition: ratngs.h:300
void reject_blanks ( WERD_RES *word)

Definition at line 178 of file reject.cpp.

// Marks every position of word->best_choice whose character is ' ' as a
// tess failure in word->reject_map.
178  {
179  inT16 i;
180  inT16 offset;
181 
// i indexes the unichar (and its reject_map entry); offset is the position in
// unichar_string(), advanced by the length stored in unichar_lengths()[i].
182  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
183  offset += word->best_choice->unichar_lengths()[i], i += 1) {
184  if (word->best_choice->unichar_string()[offset] == ' ')
185  //rej unrecognised blobs
186  word->reject_map[i].setrej_tess_failure ();
187  }
188 }
const STRING & unichar_string() const
Definition: ratngs.h:524
short inT16
Definition: host.h:100
const STRING & unichar_lengths() const
Definition: ratngs.h:531
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
void reject_poor_matches ( WERD_RES *word)

Definition at line 207 of file reject.cpp.

// Rejects individual characters of word->best_choice: UNICHAR_SPACE entries
// are marked as tess failures, and any other entry whose certainty is below
// the value returned by compute_reject_threshold() is marked as a poor match.
207  {
// The threshold is computed once from the whole best choice before the loop.
208  float threshold = compute_reject_threshold(word->best_choice);
209  for (int i = 0; i < word->best_choice->length(); ++i) {
210  if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
211  word->reject_map[i].setrej_tess_failure();
212  else if (word->best_choice->certainty(i) < threshold)
213  word->reject_map[i].setrej_poor_match();
214  }
215 }
float compute_reject_threshold(WERD_CHOICE *word)
Definition: reject.cpp:226
float certainty() const
Definition: ratngs.h:327
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271