tesseract  3.04.00
language_model.cpp
Go to the documentation of this file.
1 // File: language_model.cpp
3 // Description: Functions that utilize the knowledge about the properties,
4 // structure and statistics of the language to help recognition.
5 // Author: Daria Antonova
6 // Created: Mon Nov 11 11:26:43 PST 2009
7 //
8 // (C) Copyright 2009, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #include <math.h>
22 
23 #include "language_model.h"
24 
25 #include "dawg.h"
26 #include "freelist.h"
27 #include "intproto.h"
28 #include "helpers.h"
29 #include "lm_state.h"
30 #include "lm_pain_points.h"
31 #include "matrix.h"
32 #include "params.h"
34 
35 #if defined(_MSC_VER) || defined(ANDROID)
36 double log2(double n) {
37  return log(n) / log(2.0);
38 }
39 #endif  // _MSC_VER || ANDROID
40 
41 namespace tesseract {
42 
43 const float LanguageModel::kMaxAvgNgramCost = 25.0f;
44 
// LanguageModel constructor. Registers every language-model tuning parameter
// with the Params system reached through dict->getCCUtil()->params(), then
// caches the font table and dictionary pointers and zeroes per-word state.
// NOTE(review): this doxygen-derived listing is missing the opening signature
// line (orig. 45, which takes the fontinfo_table argument) and the tail of the
// body (orig. 126-128) - reconcile with upstream language_model.cpp.
46  Dict *dict)
47  : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
48  dict->getCCUtil()->params()),
49  BOOL_INIT_MEMBER(language_model_ngram_on, false,
50  "Turn on/off the use of character ngram model",
51  dict->getCCUtil()->params()),
52  INT_MEMBER(language_model_ngram_order, 8,
53  "Maximum order of the character ngram model",
54  dict->getCCUtil()->params()),
55  INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
56  "Maximum number of prunable (those for which"
57  " PrunablePath() is true) entries in each viterbi list"
58  " recorded in BLOB_CHOICEs",
59  dict->getCCUtil()->params()),
60  INT_MEMBER(language_model_viterbi_list_max_size, 500,
61  "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
62  dict->getCCUtil()->params()),
63  double_MEMBER(language_model_ngram_small_prob, 0.000001,
64  "To avoid overly small denominators use this as the "
65  "floor of the probability returned by the ngram model.",
66  dict->getCCUtil()->params()),
67  double_MEMBER(language_model_ngram_nonmatch_score, -40.0,
68  "Average classifier score of a non-matching unichar.",
69  dict->getCCUtil()->params()),
70  BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
71  "Use only the first UTF8 step of the given string"
72  " when computing log probabilities.",
73  dict->getCCUtil()->params()),
74  double_MEMBER(language_model_ngram_scale_factor, 0.03,
75  "Strength of the character ngram model relative to the"
76  " character classifier ",
77  dict->getCCUtil()->params()),
78  double_MEMBER(language_model_ngram_rating_factor, 16.0,
79  "Factor to bring log-probs into the same range as ratings"
80  " when multiplied by outline length ",
81  dict->getCCUtil()->params()),
82  BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
83  "Words are delimited by space",
84  dict->getCCUtil()->params()),
85  INT_MEMBER(language_model_min_compound_length, 3,
86  "Minimum length of compound words",
87  dict->getCCUtil()->params()),
// The penalty members below are fractional multipliers applied to path costs;
// see their individual descriptions registered with the Params system.
88  double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1,
89  "Penalty for words not in the frequent word dictionary",
90  dict->getCCUtil()->params()),
91  double_MEMBER(language_model_penalty_non_dict_word, 0.15,
92  "Penalty for non-dictionary words",
93  dict->getCCUtil()->params()),
94  double_MEMBER(language_model_penalty_punc, 0.2,
95  "Penalty for inconsistent punctuation",
96  dict->getCCUtil()->params()),
97  double_MEMBER(language_model_penalty_case, 0.1,
98  "Penalty for inconsistent case",
99  dict->getCCUtil()->params()),
100  double_MEMBER(language_model_penalty_script, 0.5,
101  "Penalty for inconsistent script",
102  dict->getCCUtil()->params()),
103  double_MEMBER(language_model_penalty_chartype, 0.3,
104  "Penalty for inconsistent character type",
105  dict->getCCUtil()->params()),
106  // TODO(daria, rays): enable font consistency checking
107  // after improving font analysis.
108  double_MEMBER(language_model_penalty_font, 0.00,
109  "Penalty for inconsistent font",
110  dict->getCCUtil()->params()),
111  double_MEMBER(language_model_penalty_spacing, 0.05,
112  "Penalty for inconsistent spacing",
113  dict->getCCUtil()->params()),
114  double_MEMBER(language_model_penalty_increment, 0.01,
115  "Penalty increment",
116  dict->getCCUtil()->params()),
117  INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations",
118  dict->getCCUtil()->params()),
119  BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false,
120  "Use sigmoidal score for certainty",
121  dict->getCCUtil()->params()),
122  fontinfo_table_(fontinfo_table), dict_(dict),
123  fixed_pitch_(false), max_char_wh_ratio_(0.0),
124  acceptable_choice_found_(false) {
125  ASSERT_HOST(dict_ != NULL);
// NOTE(review): orig. lines 126-128 (remaining constructor body, which set up
// dawg_args_ and related state in upstream) are absent from this listing.
129 }
130 
// LanguageModel destructor: releases the heap-allocated dawg argument state.
// NOTE(review): orig. lines 131-133 are missing from this listing (the
// destructor signature and, presumably, deletion of dawg_args_->active_dawgs)
// - reconcile with upstream before editing.
134  delete dawg_args_->updated_dawgs;
135  delete dawg_args_;
136 }
137 
// LanguageModel::InitForWord: resets per-word state (pitch mode, width/height
// ratio cap, rating/certainty scale, acceptable-choice flag), seeds
// prev_word_str_ from the previous word (or a single space when there is
// none), then walks it with UNICHAR::utf8_step and asserts the walk reaches
// the end, i.e. that the string is valid UTF-8.
// NOTE(review): the signature line (orig. 138) and several body lines
// (orig. 145, 148-151, 155, 158, 165, 168) are missing from this listing,
// including the "Initialize vectors with beginning DawgInfos" code its
// surviving comment refers to - reconcile with upstream.
139  bool fixed_pitch, float max_char_wh_ratio,
140  float rating_cert_scale) {
141  fixed_pitch_ = fixed_pitch;
142  max_char_wh_ratio_ = max_char_wh_ratio;
143  rating_cert_scale_ = rating_cert_scale;
144  acceptable_choice_found_ = false;
146 
147  // Initialize vectors with beginning DawgInfos.
152 
153  // Fill prev_word_str_ with the last language_model_ngram_order
154  // unichars from prev_word.
156  if (prev_word != NULL && prev_word->unichar_string() != NULL) {
157  prev_word_str_ = prev_word->unichar_string();
159  } else {
160  prev_word_str_ = " ";
161  }
162  const char *str_ptr = prev_word_str_.string();
163  const char *str_end = str_ptr + prev_word_str_.length();
164  int step;
166  while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
167  str_ptr += step;
169  }
// utf8_step() returns 0 on malformed input, so reaching str_end proves the
// whole string was consumed as valid UTF-8.
170  ASSERT_HOST(str_ptr == str_end);
171  }
172 }
173 
174 // Helper scans the collection of predecessors for competing siblings that
175 // have the same letter with the opposite case, setting competing_vse.
176 static void ScanParentsForCaseMix(const UNICHARSET& unicharset,
177  LanguageModelState* parent_node) {
178  if (parent_node == NULL) return;
179  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
180  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
181  ViterbiStateEntry* vse = vit.data();
182  vse->competing_vse = NULL;
183  UNICHAR_ID unichar_id = vse->curr_b->unichar_id();
184  if (unicharset.get_isupper(unichar_id) ||
185  unicharset.get_islower(unichar_id)) {
186  UNICHAR_ID other_case = unicharset.get_other_case(unichar_id);
187  if (other_case == unichar_id) continue; // Not in unicharset.
188  // Find other case in same list. There could be multiple entries with
189  // the same unichar_id, but in theory, they should all point to the
190  // same BLOB_CHOICE, and that is what we will be using to decide
191  // which to keep.
192  ViterbiStateEntry_IT vit2(&parent_node->viterbi_state_entries);
193  for (vit2.mark_cycle_pt(); !vit2.cycled_list() &&
194  vit2.data()->curr_b->unichar_id() != other_case;
195  vit2.forward()) {}
196  if (!vit2.cycled_list()) {
197  vse->competing_vse = vit2.data();
198  }
199  }
200  }
201 }
202 
203 // Helper returns true if the given choice has a better case variant before
204 // it in the choice_list that is not distinguishable by size.
205 static bool HasBetterCaseVariant(const UNICHARSET& unicharset,
206  const BLOB_CHOICE* choice,
207  BLOB_CHOICE_LIST* choices) {
208  UNICHAR_ID choice_id = choice->unichar_id();
209  UNICHAR_ID other_case = unicharset.get_other_case(choice_id);
210  if (other_case == choice_id || other_case == INVALID_UNICHAR_ID)
211  return false; // Not upper or lower or not in unicharset.
212  if (unicharset.SizesDistinct(choice_id, other_case))
213  return false; // Can be separated by size.
214  BLOB_CHOICE_IT bc_it(choices);
215  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
216  BLOB_CHOICE* better_choice = bc_it.data();
217  if (better_choice->unichar_id() == other_case)
218  return true; // Found an earlier instance of other_case.
219  else if (better_choice == choice)
220  return false; // Reached the original choice.
221  }
222  return false; // Should never happen, but just in case.
223 }
224 
225 // UpdateState has the job of combining the ViterbiStateEntry lists on each
226 // of the choices on parent_list with each of the blob choices in curr_list,
227 // making a new ViterbiStateEntry for each sensible path.
228 // This could be a huge set of combinations, creating a lot of work only to
229 // be truncated by some beam limit, but only certain kinds of paths will
230 // continue at the next step:
231 // paths that are liked by the language model: either a DAWG or the n-gram
232 // model, where active.
233 // paths that represent some kind of top choice. The old permuter permuted
234 // the top raw classifier score, the top upper case word and the top lower-
235 // case word. UpdateState now concentrates its top-choice paths on top
236 // lower-case, top upper-case (or caseless alpha), and top digit sequence,
237 // with allowance for continuation of these paths through blobs where such
238 // a character does not appear in the choices list.
239 // GetNextParentVSE enforces some of these models to minimize the number of
240 // calls to AddViterbiStateEntry, even prior to looking at the language model.
241 // Thus an n-blob sequence of [l1I] will produce 3n calls to
242 // AddViterbiStateEntry instead of 3^n.
243 // Of course it isn't quite that simple as Title Case is handled by allowing
244 // lower case to continue an upper case initial, but it has to be detected
245 // in the combiner so it knows which upper case letters are initial alphas.
// LanguageModel::UpdateState - see the block comment above for the full
// algorithm description (parent/child Viterbi combination with top-choice
// flag management).
// NOTE(review): this doxygen-derived listing drops several original lines
// (orig. 246 signature start, 258 and 275 debug guards, 347 part of the
// pruning condition), so some tprintf calls appear unguarded and one "if"
// condition is visibly truncated - reconcile with upstream before editing.
247  bool just_classified,
248  int curr_col, int curr_row,
249  BLOB_CHOICE_LIST *curr_list,
250  LanguageModelState *parent_node,
251  LMPainPoints *pain_points,
252  WERD_RES *word_res,
253  BestChoiceBundle *best_choice_bundle,
254  BlamerBundle *blamer_bundle) {
255  if (language_model_debug_level > 0) {
256  tprintf("\nUpdateState: col=%d row=%d %s",
257  curr_col, curr_row, just_classified ? "just_classified" : "");
259  tprintf("(parent=%p)\n", parent_node);
260  else
261  tprintf("\n");
262  }
263  // Initialize helper variables.
264  bool word_end = (curr_row+1 >= word_res->ratings->dimension());
265  bool new_changed = false;
266  float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
267  const UNICHARSET& unicharset = dict_->getUnicharset();
268  BLOB_CHOICE *first_lower = NULL;
269  BLOB_CHOICE *first_upper = NULL;
270  BLOB_CHOICE *first_digit = NULL;
271  bool has_alnum_mix = false;
272  if (parent_node != NULL) {
273  int result = SetTopParentLowerUpperDigit(parent_node);
274  if (result < 0) {
276  tprintf("No parents found to process\n");
277  return false;
278  }
279  if (result > 0)
280  has_alnum_mix = true;
281  }
282  if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper,
283  &first_digit))
// NOTE(review): stray double semicolon below (harmless no-op statement).
284  has_alnum_mix = false;;
285  ScanParentsForCaseMix(unicharset, parent_node);
286  if (language_model_debug_level > 3 && parent_node != NULL) {
287  parent_node->Print("Parent viterbi list");
288  }
289  LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];
290 
291  // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
292  ViterbiStateEntry_IT vit;
293  BLOB_CHOICE_IT c_it(curr_list);
294  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
295  BLOB_CHOICE* choice = c_it.data();
296  // TODO(antonova): make sure commenting this out is ok for ngram
297  // model scoring (I think this was introduced to fix ngram model quirks).
298  // Skip NULL unichars unless it is the only choice.
299  //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
300  UNICHAR_ID unichar_id = choice->unichar_id();
301  if (unicharset.get_fragment(unichar_id)) {
302  continue; // Skip fragments.
303  }
304  // Set top choice flags.
305  LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
306  if (c_it.at_first() || !new_changed)
307  blob_choice_flags |= kSmallestRatingFlag;
308  if (first_lower == choice) blob_choice_flags |= kLowerCaseFlag;
309  if (first_upper == choice) blob_choice_flags |= kUpperCaseFlag;
310  if (first_digit == choice) blob_choice_flags |= kDigitFlag;
311 
312  if (parent_node == NULL) {
313  // Process the beginning of a word.
314  // If there is a better case variant that is not distinguished by size,
315  // skip this blob choice, as we have no choice but to accept the result
316  // of the character classifier to distinguish between them, even if
317  // followed by an upper case.
318  // With words like iPoc, and other CamelBackWords, the lower-upper
319  // transition can only be achieved if the classifier has the correct case
320  // as the top choice, and leaving an initial I lower down the list
321  // increases the chances of choosing IPoc simply because it doesn't
322  // include such a transition. iPoc will beat iPOC and ipoc because
323  // the other words are baseline/x-height inconsistent.
324  if (HasBetterCaseVariant(unicharset, choice, curr_list))
325  continue;
326  // Upper counts as lower at the beginning of a word.
327  if (blob_choice_flags & kUpperCaseFlag)
328  blob_choice_flags |= kLowerCaseFlag;
329  new_changed |= AddViterbiStateEntry(
330  blob_choice_flags, denom, word_end, curr_col, curr_row,
331  choice, curr_state, NULL, pain_points,
332  word_res, best_choice_bundle, blamer_bundle);
333  } else {
334  // Get viterbi entries from each parent ViterbiStateEntry.
335  vit.set_to_list(&parent_node->viterbi_state_entries);
336  int vit_counter = 0;
337  vit.mark_cycle_pt();
338  ViterbiStateEntry* parent_vse = NULL;
339  LanguageModelFlagsType top_choice_flags;
340  while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix,
341  c_it.data(), blob_choice_flags,
342  unicharset, word_res, &vit,
343  &top_choice_flags)) != NULL) {
344  // Skip pruned entries and do not look at prunable entries if already
345  // examined language_model_viterbi_list_max_num_prunable of those.
// NOTE(review): orig. line 347 (the middle of this condition, counting
// vit_counter against the prunable limit) is missing from this listing.
346  if (PrunablePath(*parent_vse) &&
348  (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
349  continue;
350  }
351  // If the parent has no alnum choice, (ie choice is the first in a
352  // string of alnum), and there is a better case variant that is not
353  // distinguished by size, skip this blob choice/parent, as with the
354  // initial blob treatment above.
355  if (!parent_vse->HasAlnumChoice(unicharset) &&
356  HasBetterCaseVariant(unicharset, choice, curr_list))
357  continue;
358  // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
359  // looks good according to the Dawgs or character ngram model.
360  new_changed |= AddViterbiStateEntry(
361  top_choice_flags, denom, word_end, curr_col, curr_row,
362  c_it.data(), curr_state, parent_vse, pain_points,
363  word_res, best_choice_bundle, blamer_bundle);
364  }
365  }
366  }
367  return new_changed;
368 }
369 
370 // Finds the first lower and upper case letter and first digit in curr_list.
371 // For non-upper/lower languages, alpha counts as upper.
372 // Uses the first character in the list in place of empty results.
373 // Returns true if both alpha and digits are found.
374 bool LanguageModel::GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
375  BLOB_CHOICE **first_lower,
376  BLOB_CHOICE **first_upper,
377  BLOB_CHOICE **first_digit) const {
378  BLOB_CHOICE_IT c_it(curr_list);
379  const UNICHARSET &unicharset = dict_->getUnicharset();
380  BLOB_CHOICE *first_unichar = NULL;
381  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
382  UNICHAR_ID unichar_id = c_it.data()->unichar_id();
383  if (unicharset.get_fragment(unichar_id)) continue; // skip fragments
384  if (first_unichar == NULL) first_unichar = c_it.data();
385  if (*first_lower == NULL && unicharset.get_islower(unichar_id)) {
386  *first_lower = c_it.data();
387  }
388  if (*first_upper == NULL && unicharset.get_isalpha(unichar_id) &&
389  !unicharset.get_islower(unichar_id)) {
390  *first_upper = c_it.data();
391  }
392  if (*first_digit == NULL && unicharset.get_isdigit(unichar_id)) {
393  *first_digit = c_it.data();
394  }
395  }
396  ASSERT_HOST(first_unichar != NULL);
397  bool mixed = (*first_lower != NULL || *first_upper != NULL) &&
398  *first_digit != NULL;
399  if (*first_lower == NULL) *first_lower = first_unichar;
400  if (*first_upper == NULL) *first_upper = first_unichar;
401  if (*first_digit == NULL) *first_digit = first_unichar;
402  return mixed;
403 }
404 
405 // Forces there to be at least one entry in the overall set of the
406 // viterbi_state_entries of each element of parent_node that has the
407 // top_choice_flag set for lower, upper and digit using the same rules as
408 // GetTopLowerUpperDigit, setting the flag on the first found suitable
409 // candidate, whether or not the flag is set on some other parent.
410 // Returns 1 if both alpha and digits are found among the parents, -1 if no
411 // parents are found at all (a legitimate case), and 0 otherwise.
// LanguageModel::SetTopParentLowerUpperDigit - see the block comment above.
// Scans the parent Viterbi entries for the best-rated lower, upper/alpha and
// digit entries (skipping back over INVALID_UNICHAR_ID like a zero-width
// joiner), sets the corresponding top_choice_flags, and returns 1 when both
// alpha and digit parents exist, -1 when there are no parents, else 0.
// NOTE(review): this listing is missing orig. line 412 (return type and
// qualified name of the signature) and orig. lines 475 and 479 (the flag-mask
// expressions in the compound-marker fix-up at the bottom), leaving that
// final "if" visibly truncated - reconcile with upstream before editing.
413  LanguageModelState *parent_node) const {
414  if (parent_node == NULL) return -1;
415  UNICHAR_ID top_id = INVALID_UNICHAR_ID;
416  ViterbiStateEntry* top_lower = NULL;
417  ViterbiStateEntry* top_upper = NULL;
418  ViterbiStateEntry* top_digit = NULL;
419  ViterbiStateEntry* top_choice = NULL;
420  float lower_rating = 0.0f;
421  float upper_rating = 0.0f;
422  float digit_rating = 0.0f;
423  float top_rating = 0.0f;
424  const UNICHARSET &unicharset = dict_->getUnicharset();
425  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
426  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
427  ViterbiStateEntry* vse = vit.data();
428  // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan
429  // back to the real character if needed.
430  ViterbiStateEntry* unichar_vse = vse;
431  UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
432  float rating = unichar_vse->curr_b->rating();
433  while (unichar_id == INVALID_UNICHAR_ID &&
434  unichar_vse->parent_vse != NULL) {
435  unichar_vse = unichar_vse->parent_vse;
436  unichar_id = unichar_vse->curr_b->unichar_id();
437  rating = unichar_vse->curr_b->rating();
438  }
439  if (unichar_id != INVALID_UNICHAR_ID) {
// Lower rating is better; keep the best-rated entry per category.
440  if (unicharset.get_islower(unichar_id)) {
441  if (top_lower == NULL || lower_rating > rating) {
442  top_lower = vse;
443  lower_rating = rating;
444  }
445  } else if (unicharset.get_isalpha(unichar_id)) {
446  if (top_upper == NULL || upper_rating > rating) {
447  top_upper = vse;
448  upper_rating = rating;
449  }
450  } else if (unicharset.get_isdigit(unichar_id)) {
451  if (top_digit == NULL || digit_rating > rating) {
452  top_digit = vse;
453  digit_rating = rating;
454  }
455  }
456  }
457  if (top_choice == NULL || top_rating > rating) {
458  top_choice = vse;
459  top_rating = rating;
460  top_id = unichar_id;
461  }
462  }
463  if (top_choice == NULL) return -1;
464  bool mixed = (top_lower != NULL || top_upper != NULL) &&
465  top_digit != NULL;
// Guarantee each category flag lands on some parent: fall back to the
// overall best-rated entry when a category produced no candidate.
466  if (top_lower == NULL) top_lower = top_choice;
467  top_lower->top_choice_flags |= kLowerCaseFlag;
468  if (top_upper == NULL) top_upper = top_choice;
469  top_upper->top_choice_flags |= kUpperCaseFlag;
470  if (top_digit == NULL) top_digit = top_choice;
471  top_digit->top_choice_flags |= kDigitFlag;
472  top_choice->top_choice_flags |= kSmallestRatingFlag;
473  if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&
474  (top_choice->top_choice_flags &
476  // If the compound marker top choice carries any of the top alnum flags,
477  // then give it all of them, allowing words like I-295 to be chosen.
478  top_choice->top_choice_flags |=
480  }
481  return mixed ? 1 : 0;
482 }
483 
484 // Finds the next ViterbiStateEntry with which the given unichar_id can
485 // combine sensibly, taking into account any mixed alnum/mixed case
486 // situation, and whether this combination has been inspected before.
// LanguageModel::GetNextParentVSE - see the block comment above. Advances
// *vse_it to the next parent entry that may sensibly combine with bc,
// computing *top_choice_flags for the pairing, and returns that parent (with
// the iterator left past it) or NULL when no candidate remains.
// NOTE(review): this listing is missing orig. line 487 (return type and
// qualified name), 497 (the debug-level guard for the Print call), and 536
// and 538 (the trailing arguments of the two PosAndSizeAgree calls) -
// reconcile with upstream before editing.
488  bool just_classified, bool mixed_alnum, const BLOB_CHOICE* bc,
489  LanguageModelFlagsType blob_choice_flags, const UNICHARSET& unicharset,
490  WERD_RES* word_res, ViterbiStateEntry_IT* vse_it,
491  LanguageModelFlagsType* top_choice_flags) const {
492  for (; !vse_it->cycled_list(); vse_it->forward()) {
493  ViterbiStateEntry* parent_vse = vse_it->data();
494  // Only consider the parent if it has been updated or
495  // if the current ratings cell has just been classified.
496  if (!just_classified && !parent_vse->updated) continue;
498  parent_vse->Print("Considering");
499  // If the parent is non-alnum, then upper counts as lower.
500  *top_choice_flags = blob_choice_flags;
501  if ((blob_choice_flags & kUpperCaseFlag) &&
502  !parent_vse->HasAlnumChoice(unicharset)) {
503  *top_choice_flags |= kLowerCaseFlag;
504  }
505  *top_choice_flags &= parent_vse->top_choice_flags;
506  UNICHAR_ID unichar_id = bc->unichar_id();
507  const BLOB_CHOICE* parent_b = parent_vse->curr_b;
508  UNICHAR_ID parent_id = parent_b->unichar_id();
509  // Digits do not bind to alphas if there is a mix in both parent and current
510  // or if the alpha is not the top choice.
511  if (unicharset.get_isdigit(unichar_id) &&
512  unicharset.get_isalpha(parent_id) &&
513  (mixed_alnum || *top_choice_flags == 0))
514  continue; // Digits don't bind to alphas.
515  // Likewise alphas do not bind to digits if there is a mix in both or if
516  // the digit is not the top choice.
517  if (unicharset.get_isalpha(unichar_id) &&
518  unicharset.get_isdigit(parent_id) &&
519  (mixed_alnum || *top_choice_flags == 0))
520  continue; // Alphas don't bind to digits.
521  // If there is a case mix of the same alpha in the parent list, then
522  // competing_vse is non-null and will be used to determine whether
523  // or not to bind the current blob choice.
524  if (parent_vse->competing_vse != NULL) {
525  const BLOB_CHOICE* competing_b = parent_vse->competing_vse->curr_b;
526  UNICHAR_ID other_id = competing_b->unichar_id();
527  if (language_model_debug_level >= 5) {
528  tprintf("Parent %s has competition %s\n",
529  unicharset.id_to_unichar(parent_id),
530  unicharset.id_to_unichar(other_id));
531  }
532  if (unicharset.SizesDistinct(parent_id, other_id)) {
533  // If other_id matches bc wrt position and size, and parent_id doesn't,
534  // don't bind to the current parent.
535  if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,
537  !bc->PosAndSizeAgree(*parent_b, word_res->x_height,
539  continue; // Competing blobchoice has a better vertical match.
540  }
541  }
// Accept this parent; leave the iterator past it for the next call.
542  vse_it->forward();
543  return parent_vse; // This one is good!
544  }
545  return NULL; // Ran out of possibilities.
546 }
547 
// LanguageModel::AddViterbiStateEntry: builds a candidate ViterbiStateEntry
// for blob choice b under parent_vse, consulting the Dawg and ngram
// components, consistency checks and top-choice flags, and adds it to
// curr_state's sorted list unless one of the pruning stages rejects it.
// Returns true iff a new entry was recorded.
// NOTE(review): this doxygen-derived listing is missing many original lines
// (548, 564, 566, 574, 586, 589, 591, 614, 654, 681-682, 687, 713, 719, 739,
// 749), so several "if" guards and condition fragments below are visibly
// truncated - reconcile with upstream before editing.
549  LanguageModelFlagsType top_choice_flags,
550  float denom,
551  bool word_end,
552  int curr_col, int curr_row,
553  BLOB_CHOICE *b,
554  LanguageModelState *curr_state,
555  ViterbiStateEntry *parent_vse,
556  LMPainPoints *pain_points,
557  WERD_RES *word_res,
558  BestChoiceBundle *best_choice_bundle,
559  BlamerBundle *blamer_bundle) {
560  ViterbiStateEntry_IT vit;
561  if (language_model_debug_level > 1) {
562  tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"
563  " certainty=%.4f top_choice_flags=0x%x",
565  b->rating(), b->certainty(), top_choice_flags);
567  tprintf(" parent_vse=%p\n", parent_vse);
568  else
569  tprintf("\n");
570  }
571  // Check whether the list is full.
572  if (curr_state != NULL &&
573  curr_state->viterbi_state_entries_length >=
575  if (language_model_debug_level > 1) {
576  tprintf("AddViterbiStateEntry: viterbi list is full!\n");
577  }
578  return false;
579  }
580 
581  // Invoke Dawg language model component.
582  LanguageModelDawgInfo *dawg_info =
583  GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);
584 
585  float outline_length =
587  // Invoke Ngram language model component.
588  LanguageModelNgramInfo *ngram_info = NULL;
590  ngram_info = GenerateNgramInfo(
592  denom, curr_col, curr_row, outline_length, parent_vse);
593  ASSERT_HOST(ngram_info != NULL);
594  }
595  bool liked_by_language_model = dawg_info != NULL ||
596  (ngram_info != NULL && !ngram_info->pruned);
597  // Quick escape if not liked by the language model, can't be consistent
598  // xheight, and not top choice.
599  if (!liked_by_language_model && top_choice_flags == 0) {
600  if (language_model_debug_level > 1) {
601  tprintf("Language model components very early pruned this entry\n");
602  }
603  delete ngram_info;
604  delete dawg_info;
605  return false;
606  }
607 
608  // Check consistency of the path and set the relevant consistency_info.
609  LMConsistencyInfo consistency_info(
610  parent_vse != NULL ? &parent_vse->consistency_info : NULL);
611  // Start with just the x-height consistency, as it provides significant
612  // pruning opportunity.
613  consistency_info.ComputeXheightConsistency(
615  // Turn off xheight consistent flag if not consistent.
616  if (consistency_info.InconsistentXHeight()) {
617  top_choice_flags &= ~kXhtConsistentFlag;
618  }
619 
620  // Quick escape if not liked by the language model, not consistent xheight,
621  // and not top choice.
622  if (!liked_by_language_model && top_choice_flags == 0) {
623  if (language_model_debug_level > 1) {
624  tprintf("Language model components early pruned this entry\n");
625  }
626  delete ngram_info;
627  delete dawg_info;
628  return false;
629  }
630 
631  // Compute the rest of the consistency info.
632  FillConsistencyInfo(curr_col, word_end, b, parent_vse,
633  word_res, &consistency_info);
634  if (dawg_info != NULL && consistency_info.invalid_punc) {
635  consistency_info.invalid_punc = false; // do not penalize dict words
636  }
637 
638  // Compute cost of associating the blobs that represent the current unichar.
639  AssociateStats associate_stats;
640  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
641  parent_vse, word_res, &associate_stats);
642  if (parent_vse != NULL) {
643  associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
644  associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
645  }
646 
647  // Create the new ViterbiStateEntry compute the adjusted cost of the path.
648  ViterbiStateEntry *new_vse = new ViterbiStateEntry(
649  parent_vse, b, 0.0, outline_length,
650  consistency_info, associate_stats, top_choice_flags, dawg_info,
651  ngram_info, (language_model_debug_level > 0) ?
652  dict_->getUnicharset().id_to_unichar(b->unichar_id()) : NULL);
653  new_vse->cost = ComputeAdjustedPathCost(new_vse);
655  tprintf("Adjusted cost = %g\n", new_vse->cost);
656 
657  // Invoke Top Choice language model component to make the final adjustments
658  // to new_vse->top_choice_flags.
659  if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
660  GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);
661  }
662 
663  // If language model components did not like this unichar - return.
664  bool keep = new_vse->top_choice_flags || liked_by_language_model;
665  if (!(top_choice_flags & kSmallestRatingFlag) && // no non-top choice paths
666  consistency_info.inconsistent_script) { // with inconsistent script
667  keep = false;
668  }
669  if (!keep) {
670  if (language_model_debug_level > 1) {
671  tprintf("Language model components did not like this entry\n");
672  }
673  delete new_vse;
674  return false;
675  }
676 
677  // Discard this entry if it represents a prunable path and
678  // language_model_viterbi_list_max_num_prunable such entries with a lower
679  // cost have already been recorded.
680  if (PrunablePath(*new_vse) &&
683  new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
684  if (language_model_debug_level > 1) {
685  tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
686  new_vse->cost,
688  }
689  delete new_vse;
690  return false;
691  }
692 
693  // Update best choice if needed.
694  if (word_end) {
695  UpdateBestChoice(new_vse, pain_points, word_res,
696  best_choice_bundle, blamer_bundle);
697  // Discard the entry if UpdateBestChoice() found flaws in it.
698  if (new_vse->cost >= WERD_CHOICE::kBadRating &&
699  new_vse != best_choice_bundle->best_vse) {
700  if (language_model_debug_level > 1) {
701  tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
702  }
703  delete new_vse;
704  return false;
705  }
706  }
707 
708  // Add the new ViterbiStateEntry and to curr_state->viterbi_state_entries.
709  curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
710  false, new_vse);
711  curr_state->viterbi_state_entries_length++;
712  if (PrunablePath(*new_vse)) {
714  }
715 
716  // Update lms->viterbi_state_entries_prunable_max_cost and clear the
717  // top_choice_flags of entries whose cost exceeds new_vse->cost.
718  if ((curr_state->viterbi_state_entries_prunable_length >=
720  new_vse->top_choice_flags) {
721  ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
722  int prunable_counter = language_model_viterbi_list_max_num_prunable;
723  vit.set_to_list(&(curr_state->viterbi_state_entries));
724  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
725  ViterbiStateEntry *curr_vse = vit.data();
726  // Clear the appropriate top choice flags of the entries in the
727  // list that have cost higher than new_entry->cost
728  // (since they will not be top choices any more).
729  if (curr_vse->top_choice_flags && curr_vse != new_vse &&
730  curr_vse->cost > new_vse->cost) {
731  curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
732  }
733  if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter;
734  // Update curr_state->viterbi_state_entries_prunable_max_cost.
735  if (prunable_counter == 0) {
736  curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
737  if (language_model_debug_level > 1) {
738  tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
740  }
741  prunable_counter = -1; // stop counting
742  }
743  }
744  }
745 
746  // Print the newly created ViterbiStateEntry.
747  if (language_model_debug_level > 2) {
748  new_vse->Print("New");
750  curr_state->Print("Updated viterbi list");
751  }
752 
753  return true;
754 }
755 
// LanguageModel::GenerateTopChoiceInfo: strips from new_vse->top_choice_flags
// any flag already claimed by a cheaper (or equal-cost) entry earlier in the
// sorted viterbi list, so each top-choice flag belongs to the lowest-cost
// holder.
// NOTE(review): the opening signature line (orig. 756, naming new_vse) is
// missing from this listing - reconcile with upstream.
757  const ViterbiStateEntry *parent_vse,
758  LanguageModelState *lms) {
759  ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
760  for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags &&
761  new_vse->cost >= vit.data()->cost; vit.forward()) {
762  // Clear the appropriate flags if the list already contains
763  // a top choice entry with a lower cost.
764  new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
765  }
766  if (language_model_debug_level > 2) {
767  tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
768  new_vse->top_choice_flags);
769  }
770 }
771 
// LanguageModel::GenerateDawgInfo — NOTE(review): doxygen-extracted listing;
// the opening signature line (772) and several body lines (note the gaps in
// the embedded numbering, e.g. 780-781, 784, 791, 804, 824, 833, 836, 847,
// 851, 853-854, 857 — presumably the lines that carried hyperlinked
// identifiers) are missing. Code left byte-identical.
// Visible behavior: seed dawg_args_ from parent_vse->dawg_info (or, for a
// word start, from the beginning-of-word dawgs on a missing line), handle
// hyphenated and compound-word special cases, then run Dict::LetterIsOkay()
// over the normalized unichar ids of b. Returns NULL when no dawg accepts
// the letter; otherwise a LanguageModelDawgInfo built on a missing line.
 773  bool word_end,
 774  int curr_col, int curr_row,
 775  const BLOB_CHOICE &b,
 776  const ViterbiStateEntry *parent_vse) {
 777  // Initialize active_dawgs from parent_vse if it is not NULL.
 778  // Otherwise use very_beginning_active_dawgs_.
 779  if (parent_vse == NULL) {
 782  } else {
 783  if (parent_vse->dawg_info == NULL) return NULL; // not a dict word path
 785  dawg_args_->permuter = parent_vse->dawg_info->permuter;
 786  }
 787 
 788  // Deal with hyphenated words.
 789  if (word_end && dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
 790  if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
 792  COMPOUND_PERM);
 793  }
 794 
 795  // Deal with compound words.
 796  if (dict_->compound_marker(b.unichar_id()) &&
 797  (parent_vse == NULL || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
 798  if (language_model_debug_level > 0) tprintf("Found compound marker\n");
 799  // Do not allow compound operators at the beginning and end of the word.
 800  // Do not allow more than one compound operator per word.
 801  // Do not allow compounding of words with lengths shorter than
 802  // language_model_min_compound_length
 803  if (parent_vse == NULL || word_end ||
 805  parent_vse->length < language_model_min_compound_length) return NULL;
 806 
 807  int i;
 808  // Check that the path terminated before the current character is a word.
 809  bool has_word_ending = false;
 810  for (i = 0; i < parent_vse->dawg_info->active_dawgs->size(); ++i) {
 811  const DawgPosition &pos = (*parent_vse->dawg_info->active_dawgs)[i];
 812  const Dawg *pdawg = pos.dawg_index < 0
 813  ? NULL : dict_->GetDawg(pos.dawg_index);
 814  if (pdawg == NULL || pos.back_to_punc) continue;;
 815  if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
 816  pdawg->end_of_word(pos.dawg_ref)) {
 817  has_word_ending = true;
 818  break;
 819  }
 820  }
 821  if (!has_word_ending) return NULL;
 822 
 823  if (language_model_debug_level > 0) tprintf("Compound word found\n");
 825  } // done dealing with compound words
 826 
 827  LanguageModelDawgInfo *dawg_info = NULL;
 828 
 829  // Call LetterIsOkay().
 830  // Use the normalized IDs so that all shapes of ' can be allowed in words
 831  // like don't.
 832  const GenericVector<UNICHAR_ID>& normed_ids =
 834  DawgPositionVector tmp_active_dawgs;
 835  for (int i = 0; i < normed_ids.size(); ++i) {
 837  tprintf("Test Letter OK for unichar %d, normed %d\n",
 838  b.unichar_id(), normed_ids[i]);
 839  dict_->LetterIsOkay(dawg_args_, normed_ids[i],
 840  word_end && i == normed_ids.size() - 1);
 841  if (dawg_args_->permuter == NO_PERM) {
 842  break;
 843  } else if (i < normed_ids.size() - 1) {
 844  tmp_active_dawgs = *dawg_args_->updated_dawgs;
 845  dawg_args_->active_dawgs = &tmp_active_dawgs;
 846  }
 848  tprintf("Letter was OK for unichar %d, normed %d\n",
 849  b.unichar_id(), normed_ids[i]);
 850  }
 852  if (dawg_args_->permuter != NO_PERM) {
 855  } else if (language_model_debug_level > 3) {
 856  tprintf("Letter %s not OK!\n",
 858  }
 859 
 860  return dawg_info;
 861 }
862 
// LanguageModel::GenerateNgramInfo — NOTE(review): doxygen-extracted
// listing; the opening signature line (863) and lines 876, 894, 900, 911,
// 916 are missing from this capture. Code left byte-identical.
// Visible behavior: pick the parent ngram context (prev_word_str_ when
// there is no parent), call ComputeNgramCost() for p(unichar | context),
// scale the combined classifier+ngram cost by outline_length, accumulate
// the parent's costs, trim the context to the ngram order (the order
// constant itself is on missing line 900 — presumably
// language_model_ngram_order; TODO confirm), then build and return a new
// LanguageModelNgramInfo (constructor call is on missing line 911).
 864  const char *unichar, float certainty, float denom,
 865  int curr_col, int curr_row, float outline_length,
 866  const ViterbiStateEntry *parent_vse) {
 867  // Initialize parent context.
 868  const char *pcontext_ptr = "";
 869  int pcontext_unichar_step_len = 0;
 870  if (parent_vse == NULL) {
 871  pcontext_ptr = prev_word_str_.string();
 872  pcontext_unichar_step_len = prev_word_unichar_step_len_;
 873  } else {
 874  pcontext_ptr = parent_vse->ngram_info->context.string();
 875  pcontext_unichar_step_len =
 877  }
 878  // Compute p(unichar | parent context).
 879  int unichar_step_len = 0;
 880  bool pruned = false;
 881  float ngram_cost;
 882  float ngram_and_classifier_cost =
 883  ComputeNgramCost(unichar, certainty, denom,
 884  pcontext_ptr, &unichar_step_len,
 885  &pruned, &ngram_cost);
 886  // Normalize just the ngram_and_classifier_cost by outline_length.
 887  // The ngram_cost is used by the params_model, so it needs to be left as-is,
 888  // and the params model cost will be normalized by outline_length.
 889  ngram_and_classifier_cost *=
 890  outline_length / language_model_ngram_rating_factor;
 891  // Add the ngram_cost of the parent.
 892  if (parent_vse != NULL) {
 893  ngram_and_classifier_cost +=
 895  ngram_cost += parent_vse->ngram_info->ngram_cost;
 896  }
 897 
 898  // Shorten parent context string by unichar_step_len unichars.
 899  int num_remove = (unichar_step_len + pcontext_unichar_step_len -
 901  if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
 902  while (num_remove > 0 && *pcontext_ptr != '\0') {
 903  pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
 904  --num_remove;
 905  }
 906 
 907  // Decide whether to prune this ngram path and update changed accordingly.
 908  if (parent_vse != NULL && parent_vse->ngram_info->pruned) pruned = true;
 909 
 910  // Construct and return the new LanguageModelNgramInfo.
 912  pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
 913  ngram_and_classifier_cost);
 914  ngram_info->context += unichar;
 915  ngram_info->context_unichar_step_len += unichar_step_len;
 917  return ngram_info;
 918 }
919 
// LanguageModel::ComputeNgramCost — NOTE(review): doxygen-extracted listing;
// body lines 942 and 965 are missing from this capture (the comment on
// 944-946 suggests 942 was the use_only_first_uft8_step early break, and 965
// presumably reassigned prob after the small-prob test — TODO confirm).
// Code left byte-identical.
// Visible behavior: step through unichar one UTF-8 character at a time,
// summing Dict::ProbabilityInContext() and growing a heap-allocated
// modified context so later characters condition on earlier ones; average
// the probability over the number of steps, flag *found_small_prob when it
// falls below language_model_ngram_small_prob, and return
// -log2(CertaintyScore(certainty)/denom) + scaled ngram cost, also exported
// through *ngram_cost. The temporary context buffer is freed before return.
 920 float LanguageModel::ComputeNgramCost(const char *unichar,
 921  float certainty,
 922  float denom,
 923  const char *context,
 924  int *unichar_step_len,
 925  bool *found_small_prob,
 926  float *ngram_cost) {
 927  const char *context_ptr = context;
 928  char *modified_context = NULL;
 929  char *modified_context_end = NULL;
 930  const char *unichar_ptr = unichar;
 931  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
 932  float prob = 0.0f;
 933  int step = 0;
 934  while (unichar_ptr < unichar_end &&
 935  (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
 936  if (language_model_debug_level > 1) {
 937  tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
 938  dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
 939  }
 940  prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
 941  ++(*unichar_step_len);
 943  unichar_ptr += step;
 944  // If there are multiple UTF8 characters present in unichar, context is
 945  // updated to include the previously examined characters from str,
 946  // unless use_only_first_uft8_step is true.
 947  if (unichar_ptr < unichar_end) {
 948  if (modified_context == NULL) {
 949  int context_len = strlen(context);
 950  modified_context =
 951  new char[context_len + strlen(unichar_ptr) + step + 1];
 952  strncpy(modified_context, context, context_len);
 953  modified_context_end = modified_context + context_len;
 954  context_ptr = modified_context;
 955  }
 956  strncpy(modified_context_end, unichar_ptr - step, step);
 957  modified_context_end += step;
 958  *modified_context_end = '\0';
 959  }
 960  }
 961  prob /= static_cast<float>(*unichar_step_len); // normalize
 962  if (prob < language_model_ngram_small_prob) {
 963  if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
 964  *found_small_prob = true;
 966  }
 967  *ngram_cost = -1.0*log2(prob);
 968  float ngram_and_classifier_cost =
 969  -1.0*log2(CertaintyScore(certainty)/denom) +
 970  *ngram_cost * language_model_ngram_scale_factor;
 971  if (language_model_debug_level > 1) {
 972  tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
 973  unichar, context_ptr, CertaintyScore(certainty)/denom, prob,
 974  ngram_and_classifier_cost);
 975  }
 976  if (modified_context != NULL) delete[] modified_context;
 977  return ngram_and_classifier_cost;
 978 }
979 
// LanguageModel::ComputeDenom — NOTE(review): doxygen-extracted listing;
// line 996 (the per-missing-class score multiplied into the final sum,
// presumably CertaintyScore(language_model_ngram_nonmatch_score) — TODO
// confirm) is missing from this capture. Code left byte-identical.
// Visible behavior: sum CertaintyScore() over every BLOB_CHOICE in the
// list (returning 1.0f for an empty list), then add a crude estimate for
// the unicharset classes that have no classifier score at this position.
 980 float LanguageModel::ComputeDenom(BLOB_CHOICE_LIST *curr_list) {
 981  if (curr_list->empty()) return 1.0f;
 982  float denom = 0.0f;
 983  int len = 0;
 984  BLOB_CHOICE_IT c_it(curr_list);
 985  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
 986  ASSERT_HOST(c_it.data() != NULL);
 987  ++len;
 988  denom += CertaintyScore(c_it.data()->certainty());
 989  }
 990  assert(len != 0);
 991  // The ideal situation would be to have the classifier scores for
 992  // classifying each position as each of the characters in the unicharset.
 993  // Since we can not do this because of speed, we add a very crude estimate
 994  // of what these scores for the "missing" classifications would sum up to.
 995  denom += (dict_->getUnicharset().size() - len) *
 997 
 998  return denom;
 999 }
1000 
// LanguageModel::FillConsistencyInfo — NOTE(review): doxygen-extracted
// listing; the opening signature line (1001) and lines 1033-1034 (the punc
// dawg node lookup feeding the edge_char_of() call below) and 1068 (the
// guard opening the Hiragana/Katakana block — a matching extra '}' remains
// at 1075) are missing from this capture. Code left byte-identical.
// Visible behavior: update the running word-consistency statistics in
// consistency_info for the unichar of b given its parent entry —
// punctuation-pattern validity against the punctuation dawg, upper/lower
// case counters, script-id consistency (Hiragana/Katakana folded into Han),
// char-type counters, and font/spacing consistency using the fontinfo
// table and the measured blob gap.
 1002  int curr_col,
 1003  bool word_end,
 1004  BLOB_CHOICE *b,
 1005  ViterbiStateEntry *parent_vse,
 1006  WERD_RES *word_res,
 1007  LMConsistencyInfo *consistency_info) {
 1008  const UNICHARSET &unicharset = dict_->getUnicharset();
 1009  UNICHAR_ID unichar_id = b->unichar_id();
 1010  BLOB_CHOICE* parent_b = parent_vse != NULL ? parent_vse->curr_b : NULL;
 1011 
 1012  // Check punctuation validity.
 1013  if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
 1014  if (dict_->GetPuncDawg() != NULL && !consistency_info->invalid_punc) {
 1015  if (dict_->compound_marker(unichar_id) && parent_b != NULL &&
 1016  (unicharset.get_isalpha(parent_b->unichar_id()) ||
 1017  unicharset.get_isdigit(parent_b->unichar_id()))) {
 1018  // reset punc_ref for compound words
 1019  consistency_info->punc_ref = NO_EDGE;
 1020  } else {
 1021  bool is_apos = dict_->is_apostrophe(unichar_id);
 1022  bool prev_is_numalpha = (parent_b != NULL &&
 1023  (unicharset.get_isalpha(parent_b->unichar_id()) ||
 1024  unicharset.get_isdigit(parent_b->unichar_id())));
 1025  UNICHAR_ID pattern_unichar_id =
 1026  (unicharset.get_isalpha(unichar_id) ||
 1027  unicharset.get_isdigit(unichar_id) ||
 1028  (is_apos && prev_is_numalpha)) ?
 1029  Dawg::kPatternUnicharID : unichar_id;
 1030  if (consistency_info->punc_ref == NO_EDGE ||
 1031  pattern_unichar_id != Dawg::kPatternUnicharID ||
 1032  dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
 1035  consistency_info->punc_ref);
 1036  consistency_info->punc_ref =
 1037  (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
 1038  node, pattern_unichar_id, word_end) : NO_EDGE;
 1039  if (consistency_info->punc_ref == NO_EDGE) {
 1040  consistency_info->invalid_punc = true;
 1041  }
 1042  }
 1043  }
 1044  }
 1045 
 1046  // Update case related counters.
 1047  if (parent_vse != NULL && !word_end && dict_->compound_marker(unichar_id)) {
 1048  // Reset counters if we are dealing with a compound word.
 1049  consistency_info->num_lower = 0;
 1050  consistency_info->num_non_first_upper = 0;
 1051  }
 1052  else if (unicharset.get_islower(unichar_id)) {
 1053  consistency_info->num_lower++;
 1054  } else if ((parent_b != NULL) && unicharset.get_isupper(unichar_id)) {
 1055  if (unicharset.get_isupper(parent_b->unichar_id()) ||
 1056  consistency_info->num_lower > 0 ||
 1057  consistency_info->num_non_first_upper > 0) {
 1058  consistency_info->num_non_first_upper++;
 1059  }
 1060  }
 1061 
 1062  // Initialize consistency_info->script_id (use script of unichar_id
 1063  // if it is not Common, use script id recorded by the parent otherwise).
 1064  // Set inconsistent_script to true if the script of the current unichar
 1065  // is not consistent with that of the parent.
 1066  consistency_info->script_id = unicharset.get_script(unichar_id);
 1067  // Hiragana and Katakana can mix with Han.
 1069  if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
 1070  consistency_info->script_id == unicharset.hiragana_sid()) ||
 1071  (unicharset.katakana_sid() != unicharset.null_sid() &&
 1072  consistency_info->script_id == unicharset.katakana_sid())) {
 1073  consistency_info->script_id = dict_->getUnicharset().han_sid();
 1074  }
 1075  }
 1076 
 1077  if (parent_vse != NULL &&
 1078  (parent_vse->consistency_info.script_id !=
 1079  dict_->getUnicharset().common_sid())) {
 1080  int parent_script_id = parent_vse->consistency_info.script_id;
 1081  // If script_id is Common, use script id of the parent instead.
 1082  if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
 1083  consistency_info->script_id = parent_script_id;
 1084  }
 1085  if (consistency_info->script_id != parent_script_id) {
 1086  consistency_info->inconsistent_script = true;
 1087  }
 1088  }
 1089 
 1090  // Update chartype related counters.
 1091  if (unicharset.get_isalpha(unichar_id)) {
 1092  consistency_info->num_alphas++;
 1093  } else if (unicharset.get_isdigit(unichar_id)) {
 1094  consistency_info->num_digits++;
 1095  } else if (!unicharset.get_ispunctuation(unichar_id)) {
 1096  consistency_info->num_other++;
 1097  }
 1098 
 1099  // Check font and spacing consistency.
 1100  if (fontinfo_table_->size() > 0 && parent_b != NULL) {
 1101  int fontinfo_id = -1;
 1102  if (parent_b->fontinfo_id() == b->fontinfo_id() ||
 1103  parent_b->fontinfo_id2() == b->fontinfo_id()) {
 1104  fontinfo_id = b->fontinfo_id();
 1105  } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
 1106  parent_b->fontinfo_id2() == b->fontinfo_id2()) {
 1107  fontinfo_id = b->fontinfo_id2();
 1108  }
 1109  if(language_model_debug_level > 1) {
 1110  tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
 1111  (parent_b->fontinfo_id() >= 0) ?
 1112  fontinfo_table_->get(parent_b->fontinfo_id()).name : "" ,
 1113  (parent_b->fontinfo_id2() >= 0) ?
 1114  fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
 1115  (b->fontinfo_id() >= 0) ?
 1116  fontinfo_table_->get(b->fontinfo_id()).name : "",
 1117  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
 1118  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
 1119  fontinfo_id);
 1120  }
 1121  if (!word_res->blob_widths.empty()) { // if we have widths/gaps info
 1122  bool expected_gap_found = false;
 1123  float expected_gap;
 1124  int temp_gap;
 1125  if (fontinfo_id >= 0) { // found a common font
 1126  ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
 1127  if (fontinfo_table_->get(fontinfo_id).get_spacing(
 1128  parent_b->unichar_id(), unichar_id, &temp_gap)) {
 1129  expected_gap = temp_gap;
 1130  expected_gap_found = true;
 1131  }
 1132  } else {
 1133  consistency_info->inconsistent_font = true;
 1134  // Get an average of the expected gaps in each font
 1135  int num_addends = 0;
 1136  expected_gap = 0;
 1137  int temp_fid;
 1138  for (int i = 0; i < 4; ++i) {
 1139  if (i == 0) {
 1140  temp_fid = parent_b->fontinfo_id();
 1141  } else if (i == 1) {
 1142  temp_fid = parent_b->fontinfo_id2();
 1143  } else if (i == 2) {
 1144  temp_fid = b->fontinfo_id();
 1145  } else {
 1146  temp_fid = b->fontinfo_id2();
 1147  }
 1148  ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
 1149  if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
 1150  parent_b->unichar_id(), unichar_id, &temp_gap)) {
 1151  expected_gap += temp_gap;
 1152  num_addends++;
 1153  }
 1154  }
 1155  expected_gap_found = (num_addends > 0);
 1156  if (num_addends > 0) {
 1157  expected_gap /= static_cast<float>(num_addends);
 1158  }
 1159  }
 1160  if (expected_gap_found) {
 1161  float actual_gap =
 1162  static_cast<float>(word_res->GetBlobsGap(curr_col-1));
 1163  float gap_ratio = expected_gap / actual_gap;
 1164  // TODO(rays) The gaps seem to be way off most of the time, saved by
 1165  // the error here that the ratio was compared to 1/2, when it should
 1166  // have been 0.5f. Find the source of the gaps discrepancy and put
 1167  // the 0.5f here in place of 0.0f.
 1168  // Test on 2476595.sj, pages 0 to 6. (In French.)
 1169  if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
 1170  consistency_info->num_inconsistent_spaces++;
 1171  }
 1172  if (language_model_debug_level > 1) {
 1173  tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n",
 1174  unicharset.id_to_unichar(parent_b->unichar_id()),
 1175  parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
 1176  unichar_id, curr_col, expected_gap, actual_gap);
 1177  }
 1178  }
 1179  }
 1180  }
 1181 }
1182 
// LanguageModel::ComputeAdjustedPathCost — NOTE(review): doxygen-extracted
// listing; the opening signature line (1183) and lines 1201, 1204-1205,
// 1207, 1214 (the non-dict adjustment increments and the ngram-mode guard
// whose else-branch survives at 1217) are missing from this capture. Code
// left byte-identical.
// Visible behavior: when the trained params_model_ is initialized, score
// the path from extracted features and scale by outline_length; otherwise
// build a multiplicative adjustment (non-freq-dawg penalty, non-dict-word
// penalties on missing lines, shape cost per char, consistency adjustment)
// and apply it to either the ngram cost or the raw ratings sum.
 1184  ASSERT_HOST(vse != NULL);
 1185  if (params_model_.Initialized()) {
 1186  float features[PTRAIN_NUM_FEATURE_TYPES];
 1187  ExtractFeaturesFromPath(*vse, features);
 1188  float cost = params_model_.ComputeCost(features);
 1189  if (language_model_debug_level > 3) {
 1190  tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
 1191  if (language_model_debug_level >= 5) {
 1192  for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
 1193  tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
 1194  }
 1195  }
 1196  }
 1197  return cost * vse->outline_length;
 1198  } else {
 1199  float adjustment = 1.0f;
 1200  if (vse->dawg_info == NULL || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
 1202  }
 1203  if (vse->dawg_info == NULL) {
 1206  adjustment += ((vse->length - language_model_min_compound_length) *
 1208  }
 1209  }
 1210  if (vse->associate_stats.shape_cost > 0) {
 1211  adjustment += vse->associate_stats.shape_cost /
 1212  static_cast<float>(vse->length);
 1213  }
 1215  ASSERT_HOST(vse->ngram_info != NULL);
 1216  return vse->ngram_info->ngram_and_classifier_cost * adjustment;
 1217  } else {
 1218  adjustment += ComputeConsistencyAdjustment(vse->dawg_info,
 1219  vse->consistency_info);
 1220  return vse->ratings_sum * adjustment;
 1221  }
 1222  }
 1223 }
1224 
// LanguageModel::UpdateBestChoice — NOTE(review): doxygen-extracted listing;
// the opening signature line (1225) and lines 1288, 1295, 1309, 1316 (the
// LogNewCookedChoice-style ownership handoff call, the acceptability guard,
// the hyphen-state update call, and the blamer notification call) are
// missing from this capture. Code left byte-identical.
// Visible behavior: construct a WERD_CHOICE for the path in vse, record a
// params-training hypothesis and the best rating in blamer_bundle when
// present (deleting the word early if the guided segsearch is still going),
// update raw_choice if cheaper, set the word's rating to the adjusted path
// cost, run Dict::adjust_word() for adaption thresholds, then hand the word
// to word_res; when it becomes the new best, update best_choice_bundle,
// acceptable_choice_found_, the hyphen state, and the blamer, and
// optionally display the segmentation.
 1226  ViterbiStateEntry *vse,
 1227  LMPainPoints *pain_points,
 1228  WERD_RES *word_res,
 1229  BestChoiceBundle *best_choice_bundle,
 1230  BlamerBundle *blamer_bundle) {
 1231  bool truth_path;
 1232  WERD_CHOICE *word = ConstructWord(vse, word_res, &best_choice_bundle->fixpt,
 1233  blamer_bundle, &truth_path);
 1234  ASSERT_HOST(word != NULL);
 1235  if (dict_->stopper_debug_level >= 1) {
 1236  STRING word_str;
 1237  word->string_and_lengths(&word_str, NULL);
 1238  vse->Print(word_str.string());
 1239  }
 1240  if (language_model_debug_level > 0) {
 1241  word->print("UpdateBestChoice() constructed word");
 1242  }
 1243  // Record features from the current path if necessary.
 1244  ParamsTrainingHypothesis curr_hyp;
 1245  if (blamer_bundle != NULL) {
 1246  if (vse->dawg_info != NULL) vse->dawg_info->permuter =
 1247  static_cast<PermuterType>(word->permuter());
 1248  ExtractFeaturesFromPath(*vse, curr_hyp.features);
 1249  word->string_and_lengths(&(curr_hyp.str), NULL);
 1250  curr_hyp.cost = vse->cost; // record cost for error rate computations
 1251  if (language_model_debug_level > 0) {
 1252  tprintf("Raw features extracted from %s (cost=%g) [ ",
 1253  curr_hyp.str.string(), curr_hyp.cost);
 1254  for (int deb_i = 0; deb_i < PTRAIN_NUM_FEATURE_TYPES; ++deb_i) {
 1255  tprintf("%g ", curr_hyp.features[deb_i]);
 1256  }
 1257  tprintf("]\n");
 1258  }
 1259  // Record the current hypothesis in params_training_bundle.
 1260  blamer_bundle->AddHypothesis(curr_hyp);
 1261  if (truth_path)
 1262  blamer_bundle->UpdateBestRating(word->rating());
 1263  }
 1264  if (blamer_bundle != NULL && blamer_bundle->GuidedSegsearchStillGoing()) {
 1265  // The word was constructed solely for blamer_bundle->AddHypothesis, so
 1266  // we no longer need it.
 1267  delete word;
 1268  return;
 1269  }
 1270  if (word_res->chopped_word != NULL && !word_res->chopped_word->blobs.empty())
 1271  word->SetScriptPositions(false, word_res->chopped_word);
 1272  // Update and log new raw_choice if needed.
 1273  if (word_res->raw_choice == NULL ||
 1274  word->rating() < word_res->raw_choice->rating()) {
 1275  if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0)
 1276  tprintf("Updated raw choice\n");
 1277  }
 1278  // Set the modified rating for best choice to vse->cost and log best choice.
 1279  word->set_rating(vse->cost);
 1280  // Call LogNewChoice() for best choice from Dict::adjust_word() since it
 1281  // computes adjust_factor that is used by the adaption code (e.g. by
 1282  // ClassifyAdaptableWord() to compute adaption acceptance thresholds).
 1283  // Note: the rating of the word is not adjusted.
 1284  dict_->adjust_word(word, vse->dawg_info == NULL,
 1285  vse->consistency_info.xht_decision, 0.0,
 1286  false, language_model_debug_level > 0);
 1287  // Hand ownership of the word over to the word_res.
 1289  dict_->stopper_debug_level >= 1, word)) {
 1290  // The word was so bad that it was deleted.
 1291  return;
 1292  }
 1293  if (word_res->best_choice == word) {
 1294  // Word was the new best.
 1296  AcceptablePath(*vse)) {
 1297  acceptable_choice_found_ = true;
 1298  }
 1299  // Update best_choice_bundle.
 1300  best_choice_bundle->updated = true;
 1301  best_choice_bundle->best_vse = vse;
 1302  if (language_model_debug_level > 0) {
 1303  tprintf("Updated best choice\n");
 1304  word->print_state("New state ");
 1305  }
 1306  // Update hyphen state if we are dealing with a dictionary word.
 1307  if (vse->dawg_info != NULL) {
 1308  if (dict_->has_hyphen_end(*word)) {
 1310  } else {
 1311  dict_->reset_hyphen_vars(true);
 1312  }
 1313  }
 1314 
 1315  if (blamer_bundle != NULL) {
 1317  vse->dawg_info != NULL && vse->top_choice_flags);
 1318  }
 1319  }
 1320  if (wordrec_display_segmentations && word_res->chopped_word != NULL) {
 1321  word->DisplaySegmentation(word_res->chopped_word);
 1322  }
 1323 }
1324 
// LanguageModel::ExtractFeaturesFromPath (static) — NOTE(review):
// doxygen-extracted listing; the opening signature line (1325) and lines
// 1360-1361, 1363, 1365 (the right-hand sides of the consistency features)
// are missing from this capture. Code left byte-identical.
// Visible behavior: zero the feature vector, then record a one-hot
// dictionary-match feature bucketed by word length and permuter type,
// shape cost per char, ngram cost per char (0 when there is no ngram
// info), the surviving consistency features, and the classifier rating
// normalized by outline length.
 1326  const ViterbiStateEntry &vse, float features[]) {
 1327  memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
 1328  // Record dictionary match info.
 1329  int len = vse.length <= kMaxSmallWordUnichars ? 0 :
 1330  vse.length <= kMaxMediumWordUnichars ? 1 : 2;
 1331  if (vse.dawg_info != NULL) {
 1332  int permuter = vse.dawg_info->permuter;
 1333  if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {
 1334  if (vse.consistency_info.num_digits == vse.length) {
 1335  features[PTRAIN_DIGITS_SHORT+len] = 1.0;
 1336  } else {
 1337  features[PTRAIN_NUM_SHORT+len] = 1.0;
 1338  }
 1339  } else if (permuter == DOC_DAWG_PERM) {
 1340  features[PTRAIN_DOC_SHORT+len] = 1.0;
 1341  } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||
 1342  permuter == COMPOUND_PERM) {
 1343  features[PTRAIN_DICT_SHORT+len] = 1.0;
 1344  } else if (permuter == FREQ_DAWG_PERM) {
 1345  features[PTRAIN_FREQ_SHORT+len] = 1.0;
 1346  }
 1347  }
 1348  // Record shape cost feature (normalized by path length).
 1349  features[PTRAIN_SHAPE_COST_PER_CHAR] =
 1350  vse.associate_stats.shape_cost / static_cast<float>(vse.length);
 1351  // Record ngram cost. (normalized by the path length).
 1352  features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0;
 1353  if (vse.ngram_info != NULL) {
 1354  features[PTRAIN_NGRAM_COST_PER_CHAR] =
 1355  vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
 1356  }
 1357  // Record consistency-related features.
 1358  // Disabled this feature for now due to its poor performance.
 1359  // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();
 1362  features[PTRAIN_NUM_BAD_CHAR_TYPE] = vse.dawg_info == NULL ?
 1364  features[PTRAIN_NUM_BAD_SPACING] =
 1366  // Disabled this feature for now due to its poor performance.
 1367  // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;
 1368 
 1369  // Classifier-related features.
 1370  features[PTRAIN_RATING_PER_CHAR] =
 1371  vse.ratings_sum / static_cast<float>(vse.outline_length);
 1372 }
1373 
// LanguageModel::ConstructWord — NOTE(review): doxygen-extracted listing;
// the opening signature line (1374) and lines 1395, 1420, 1443, 1448-1449,
// 1455 (including the variance accumulation target at 1420, the shape-cost
// readjustment at 1443, the x-height call(s) at 1448-1449 and the
// TOP_CHOICE_PERM branch body at 1455) are missing from this capture.
// Code left byte-identical.
// Visible behavior: walk parent pointers from vse to rebuild the
// WERD_CHOICE right-to-left, recording blob counts per choice, updating
// the width-to-height ratio variance (skipping edge punctuation), marking
// the word compound when any entry on the path used COMPOUND_PERM, then
// set rating/certainty/permuter and the dangerous-ambiguity flag before
// returning the word; truth_path is maintained against the blamer's
// correct segmentation. Caller takes ownership of the returned word.
 1375  ViterbiStateEntry *vse,
 1376  WERD_RES *word_res,
 1377  DANGERR *fixpt,
 1378  BlamerBundle *blamer_bundle,
 1379  bool *truth_path) {
 1380  if (truth_path != NULL) {
 1381  *truth_path =
 1382  (blamer_bundle != NULL &&
 1383  vse->length == blamer_bundle->correct_segmentation_length());
 1384  }
 1385  BLOB_CHOICE *curr_b = vse->curr_b;
 1386  ViterbiStateEntry *curr_vse = vse;
 1387 
 1388  int i;
 1389  bool compound = dict_->hyphenated(); // treat hyphenated words as compound
 1390 
 1391  // Re-compute the variance of the width-to-height ratios (since we now
 1392  // can compute the mean over the whole word).
 1393  float full_wh_ratio_mean = 0.0f;
 1394  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
 1396  full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
 1397  static_cast<float>(vse->length));
 1398  vse->associate_stats.full_wh_ratio_var = 0.0f;
 1399  }
 1400 
 1401  // Construct a WERD_CHOICE by tracing parent pointers.
 1402  WERD_CHOICE *word = new WERD_CHOICE(word_res->uch_set, vse->length);
 1403  word->set_length(vse->length);
 1404  int total_blobs = 0;
 1405  for (i = (vse->length-1); i >= 0; --i) {
 1406  if (blamer_bundle != NULL && truth_path != NULL && *truth_path &&
 1407  !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
 1408  *truth_path = false;
 1409  }
 1410  // The number of blobs used for this choice is row - col + 1.
 1411  int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
 1412  total_blobs += num_blobs;
 1413  word->set_blob_choice(i, num_blobs, curr_b);
 1414  // Update the width-to-height ratio variance. Useful for non-space delimited
 1415  // languages to ensure that the blobs are of uniform width.
 1416  // Skip leading and trailing punctuation when computing the variance.
 1417  if ((full_wh_ratio_mean != 0.0f &&
 1418  ((curr_vse != vse && curr_vse->parent_vse != NULL) ||
 1419  !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
 1421  pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
 1422  if (language_model_debug_level > 2) {
 1423  tprintf("full_wh_ratio_var += (%g-%g)^2\n",
 1424  full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
 1425  }
 1426  }
 1427 
 1428  // Mark the word as compound if compound permuter was set for any of
 1429  // the unichars on the path (usually this will happen for unichars
 1430  // that are compounding operators, like "-" and "/").
 1431  if (!compound && curr_vse->dawg_info &&
 1432  curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;
 1433 
 1434  // Update curr_* pointers.
 1435  curr_vse = curr_vse->parent_vse;
 1436  if (curr_vse == NULL) break;
 1437  curr_b = curr_vse->curr_b;
 1438  }
 1439  ASSERT_HOST(i == 0); // check that we recorded all the unichar ids.
 1440  ASSERT_HOST(total_blobs == word_res->ratings->dimension());
 1441  // Re-adjust shape cost to include the updated width-to-height variance.
 1442  if (full_wh_ratio_mean != 0.0f) {
 1444  }
 1445 
 1446  word->set_rating(vse->ratings_sum);
 1447  word->set_certainty(vse->min_certainty);
 1450  if (vse->dawg_info != NULL) {
 1451  word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
 1452  } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
 1453  word->set_permuter(NGRAM_PERM);
 1454  } else if (vse->top_choice_flags) {
 1456  } else {
 1457  word->set_permuter(NO_PERM);
 1458  }
 1459  word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true,
 1460  word_res->ratings));
 1461  return word;
 1462 }
1463 
1464 } // namespace tesseract
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:472
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
bool AcceptablePath(const ViterbiStateEntry &vse)
bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
name_table name
inT16 fontinfo_id() const
Definition: ratngs.h:85
const Dawg * GetPuncDawg() const
Return the points to the punctuation dawg.
Definition: dict.h:408
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
DawgPositionVector * beginning_active_dawgs_
int size() const
Definition: genericvector.h:72
int tessedit_truncate_wordchoice_log
Definition: dict.h:618
GenericVector< int > blob_widths
Definition: pageres.h:205
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
ViterbiStateEntry_LIST viterbi_state_entries
Definition: lm_state.h:207
TWERD * chopped_word
Definition: pageres.h:201
float CertaintyScore(float cert)
void print_state(const char *msg) const
Definition: ratngs.cpp:738
ViterbiStateEntry * best_vse
Definition: lm_state.h:234
#define tprintf(...)
Definition: tprintf.h:31
int LetterIsOkay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:350
int han_sid() const
Definition: unicharset.h:836
void set_x_heights(float min_height, float max_height)
Definition: ratngs.h:339
MATRIX * ratings
Definition: pageres.h:215
LanguageModelDawgInfo * dawg_info
Definition: lm_state.h:179
DawgType type() const
Definition: dawg.h:127
const MATRIX_COORD & matrix_cell()
Definition: ratngs.h:114
LanguageModelFlagsType top_choice_flags
Definition: lm_state.h:175
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:732
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134
AssociateStats associate_stats
Definition: lm_state.h:171
inT32 length() const
Definition: strngs.cpp:188
int null_sid() const
Definition: unicharset.h:831
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:304
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:125
void ComputeXheightConsistency(const BLOB_CHOICE *b, bool is_punc)
DawgPositionVector * active_dawgs
Definition: dict.h:81
static const LanguageModelFlagsType kXhtConsistentFlag
#define NULL
Definition: host.h:144
int stopper_debug_level
Definition: dict.h:612
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:32
LanguageModel(const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:611
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:596
const UNICHARSET * uch_set
Definition: pageres.h:192
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:501
bool PrunablePath(const ViterbiStateEntry &vse)
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:125
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
EDGE_REF dawg_ref
Definition: dawg.h:362
DawgPositionVector * active_dawgs
Definition: lm_state.h:65
void set_rating(float new_val)
Definition: ratngs.h:366
float x_height
Definition: pageres.h:295
int dimension() const
Definition: matrix.h:247
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
void set_length(int len)
Definition: ratngs.h:378
ViterbiStateEntry * competing_vse
Definition: lm_state.h:161
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:142
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:316
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:116
bool language_model_ngram_use_only_first_uft8_step
static int Compare(const void *e1, const void *e2)
Definition: lm_state.h:127
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
DawgPositionVector * updated_dawgs
Definition: dict.h:82
PermuterType
Definition: ratngs.h:240
virtual bool end_of_word(EDGE_REF edge_ref) const =0
static const float kBadRating
Definition: ratngs.h:273
int language_model_viterbi_list_max_num_prunable
float viterbi_state_entries_prunable_max_cost
Definition: lm_state.h:210
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
void Print(const char *msg)
Definition: lm_state.cpp:70
float ComputeCost(const float features[]) const
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:107
float rating() const
Definition: ratngs.h:324
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:152
void SetScriptPositions(bool small_caps, TWERD *word)
Definition: ratngs.cpp:528
const char * string() const
Definition: strngs.cpp:193
int UNICHAR_ID
Definition: unichar.h:33
const STRING & unichar_string() const
Definition: ratngs.h:524
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:783
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:301
DawgPositionVector * very_beginning_active_dawgs_
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
bool language_model_ngram_space_delimited_language
PermuterType permuter
Definition: dict.h:83
double language_model_ngram_nonmatch_score
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
uinT8 permuter() const
Definition: ratngs.h:343
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:310
void Print(const char *msg) const
Definition: lm_state.cpp:27
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:412
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:625
LanguageModelNgramInfo * ngram_info
Definition: lm_state.h:183
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:363
Definition: strngs.h:44
ViterbiStateEntry * parent_vse
Definition: lm_state.h:158
static const LanguageModelFlagsType kDigitFlag
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
PointerVector< LanguageModelState > beam
Definition: lm_state.h:232
const UnicityTable< FontInfo > * fontinfo_table_
bool empty() const
Definition: genericvector.h:84
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
bool HasAlnumChoice(const UNICHARSET &unicharset)
Definition: lm_state.h:142
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
float rating() const
Definition: ratngs.h:79
unsigned char LanguageModelFlagsType
Definition: lm_state.h:37
static const LanguageModelFlagsType kUpperCaseFlag
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:290
int common_sid() const
Definition: unicharset.h:832
inT64 NODE_REF
Definition: dawg.h:55
void print() const
Definition: ratngs.h:563
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
static const LanguageModelFlagsType kSmallestRatingFlag
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
Definition: stopper.cpp:51
static const LanguageModelFlagsType kLowerCaseFlag
float features[PTRAIN_NUM_FEATURE_TYPES]
Definition: cluster.h:45
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
double language_model_penalty_non_freq_dict_word
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:406
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:540
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:427
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
WERD_CHOICE * raw_choice
Definition: pageres.h:224
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
Definition: hyphen.cpp:49
void set_dangerous_ambig_found_(bool value)
Definition: ratngs.h:363
int size() const
Definition: unicharset.h:297
int hiragana_sid() const
Definition: unicharset.h:837
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
inT16 fontinfo_id2() const
Definition: ratngs.h:88
void set_best_choice_is_dict_and_top_choice(bool value)
Definition: blamer.h:135
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:631
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
Definition: blamer.h:131
WERD_CHOICE * best_choice
Definition: pageres.h:219
void DisplaySegmentation(TWERD *word)
Definition: ratngs.cpp:747
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:612
int NumInconsistentChartype() const
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:523
static const float kMaxAvgNgramCost
double language_model_penalty_non_dict_word
#define ASSERT_HOST(x)
Definition: errcode.h:84
float certainty() const
Definition: ratngs.h:82
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
static float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b)
Definition: associate.h:82
void set_certainty(float new_val)
Definition: ratngs.h:369
int katakana_sid() const
Definition: unicharset.h:838
int correct_segmentation_length() const
Definition: blamer.h:126
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
Definition: blamer.h:154
XHeightConsistencyEnum xht_decision
LMConsistencyInfo consistency_info
Definition: lm_state.h:170
void UpdateBestRating(float rating)
Definition: blamer.h:122
void set_permuter(uinT8 perm)
Definition: ratngs.h:372
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
Definition: ratngs.cpp:132
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)