tesseract  3.04.00
dict.h
Go to the documentation of this file.
1 // File: dict.h
3 // Description: dict class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_DICT_DICT_H_
20 #define TESSERACT_DICT_DICT_H_
21 
22 #include "ambigs.h"
23 #include "dawg.h"
24 #include "dawg_cache.h"
25 #include "host.h"
26 #include "oldlist.h"
27 #include "ratngs.h"
28 #include "stopper.h"
29 #include "trie.h"
30 #include "unicharset.h"
32 
33 class MATRIX;
34 class WERD_RES;
35 
// Upper bound on the length of a word.
// NOTE(review): the unit is presumably unichars -- confirm against the users
// of this macro.
// The replacement list is parenthesized so that the cast and its operand
// always expand as a single primary expression (e.g. under sizeof or next to
// a postfix operator).
#define MAX_WERD_LENGTH ((inT64) 128)
// Presumably the placeholder used when no rating is available -- confirm
// against callers. Parenthesized so the negative literal stays one operand
// at every expansion site.
#define NO_RATING (-1)
38 
44  float rating;
45  float certainty;
46 };
47 
48 namespace tesseract {
49 
51 
52 //
53 // Constants
54 //
55 static const int kRatingPad = 4;
56 static const char kDictWildcard[] = "\u2606"; // WHITE STAR
57 static const int kDictMaxWildcards = 2; // max wildcards for a word
58 // TODO(daria): If hyphens are different in different languages and can be
59 // inferred from training data we should load their values dynamically.
60 static const char kHyphenSymbol[] = "-";
61 static const char kSlashSymbol[] = "/";
62 static const char kQuestionSymbol[] = "?";
63 static const char kApostropheSymbol[] = "'";
64 static const float kSimCertaintyScale = -10.0; // similarity matcher scaling
65 static const float kSimCertaintyOffset = -10.0; // similarity matcher offset
66 static const float kSimilarityFloor = 100.0; // worst E*L product to stop on
67 static const int kDocDictMaxRepChars = 4;
68 
69 // Enum for describing whether the x-height for the word is consistent:
70 // 0 - everything is good.
71 // 1 - there are one or two secondary (but consistent) baselines
72 // [think subscript and superscript], or there is an oversized
73 // first character.
74 // 2 - the word is inconsistent.
76 
77 struct DawgArgs {
79  : active_dawgs(d), updated_dawgs(up), permuter(p) {}
80 
84 };
85 
86 class Dict {
87  public:
88  Dict(CCUtil* image_ptr);
89  ~Dict();
90  const CCUtil* getCCUtil() const {
91  return ccutil_;
92  }
94  return ccutil_;
95  }
96  const UNICHARSET& getUnicharset() const {
97  return getCCUtil()->unicharset;
98  }
100  return getCCUtil()->unicharset;
101  }
103  return getCCUtil()->unichar_ambigs;
104  }
105 
106  // Returns true if unichar_id is a word compounding character like - or /.
108  const GenericVector<UNICHAR_ID>& normed_ids =
109  getUnicharset().normed_ids(unichar_id);
110  return normed_ids.size() == 1 &&
111  (normed_ids[0] == hyphen_unichar_id_ ||
112  normed_ids[0] == slash_unichar_id_);
113  }
114  // Returns true if unichar_id is an apostrophe-like character that may
115  // separate prefix/suffix words from a main body word.
117  const GenericVector<UNICHAR_ID>& normed_ids =
118  getUnicharset().normed_ids(unichar_id);
119  return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
120  }
121 
122  /* hyphen.cpp ************************************************************/
123 
125  inline bool hyphenated() const { return
126  !last_word_on_line_ && hyphen_word_;
127  }
129  inline int hyphen_base_size() const {
130  return this->hyphenated() ? hyphen_word_->length() : 0;
131  }
135  inline void copy_hyphen_info(WERD_CHOICE *word) const {
136  if (this->hyphenated()) {
137  *word = *hyphen_word_;
138  if (hyphen_debug_level) word->print("copy_hyphen_info: ");
139  }
140  }
142  inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const {
143  if (!last_word_on_line_ || first_pos)
144  return false;
145  const GenericVector<UNICHAR_ID>& normed_ids =
146  getUnicharset().normed_ids(unichar_id);
147  return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
148  }
150  inline bool has_hyphen_end(const WERD_CHOICE &word) const {
151  int word_index = word.length() - 1;
152  return has_hyphen_end(word.unichar_id(word_index), word_index == 0);
153  }
157  void reset_hyphen_vars(bool last_word_on_line);
160  void set_hyphen_word(const WERD_CHOICE &word,
161  const DawgPositionVector &active_dawgs);
162 
163  /* permdawg.cpp ************************************************************/
164  // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig().
165  // When this function is refactored, permdawg.cpp can be removed.
166 
169  inline void update_best_choice(const WERD_CHOICE &word,
170  WERD_CHOICE *best_choice) {
171  if (word.rating() < best_choice->rating()) {
172  *best_choice = word;
173  }
174  }
178  void init_active_dawgs(DawgPositionVector *active_dawgs,
179  bool ambigs_mode) const;
180  // Fill the given vector with the default collection of any-length dawgs
181  void default_dawgs(DawgPositionVector *anylength_dawgs,
182  bool suppress_patterns) const;
183 
184 
190  WERD_CHOICE *dawg_permute_and_select(
191  const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit);
195  void go_deeper_dawg_fxn(
196  const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
197  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
198  bool word_ending, WERD_CHOICE *word, float certainties[],
199  float *limit, WERD_CHOICE *best_choice, int *attempts_left,
200  void *void_more_args);
201 
203  void (Dict::*go_deeper_fxn_)(const char *debug,
204  const BLOB_CHOICE_LIST_VECTOR &char_choices,
205  int char_choice_index,
206  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
207  bool word_ending, WERD_CHOICE *word,
208  float certainties[], float *limit,
209  WERD_CHOICE *best_choice, int *attempts_left,
210  void *void_more_args);
211  //
212  // Helper functions for dawg_permute_and_select().
213  //
214  void permute_choices(
215  const char *debug,
216  const BLOB_CHOICE_LIST_VECTOR &char_choices,
217  int char_choice_index,
218  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
219  WERD_CHOICE *word,
220  float certainties[],
221  float *limit,
222  WERD_CHOICE *best_choice,
223  int *attempts_left,
224  void *more_args);
225 
226  void append_choices(
227  const char *debug,
228  const BLOB_CHOICE_LIST_VECTOR &char_choices,
229  const BLOB_CHOICE &blob_choice,
230  int char_choice_index,
231  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
232  WERD_CHOICE *word,
233  float certainties[],
234  float *limit,
235  WERD_CHOICE *best_choice,
236  int *attempts_left,
237  void *more_args);
238 
239  bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
240  float curr_rating, float curr_certainty,
241  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
242  const char *debug, int word_ending,
243  CHAR_FRAGMENT_INFO *char_frag_info);
244 
245  /* stopper.cpp *************************************************************/
246  bool NoDangerousAmbig(WERD_CHOICE *BestChoice,
247  DANGERR *fixpt,
248  bool fix_replaceable,
249  MATRIX* ratings);
250  // Replaces the corresponding wrong ngram in werd_choice with the correct
251  // one. The whole correct n-gram is inserted into the ratings matrix and
252  // the werd_choice: no more fragments!. Rating and certainty of new entries
253  // in matrix and werd_choice are the sum and mean of the wrong ngram
254  // respectively.
255  // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes
256  // mystring", with a new entry in the ratings matrix for ".
257  void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
258  UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
259  MATRIX *ratings);
260 
262  int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice);
270  int UniformCertainties(const WERD_CHOICE& word);
272  bool AcceptableChoice(const WERD_CHOICE& best_choice,
273  XHeightConsistencyEnum xheight_consistency);
277  bool AcceptableResult(WERD_RES* word);
278  void EndDangerousAmbigs();
280  void DebugWordChoices();
282  void SettupStopperPass1();
284  void SettupStopperPass2();
285  /* context.cpp *************************************************************/
287  int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset);
290  bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);
291 
292  /* dict.cpp ****************************************************************/
293 
296  static DawgCache *GlobalDawgCache();
297  void Load(DawgCache *dawg_cache);
298  void End();
299 
300  // Resets the document dictionary analogous to ResetAdaptiveClassifier.
302  if (pending_words_ != NULL)
303  pending_words_->clear();
304  if (document_words_ != NULL)
305  document_words_->clear();
306  }
307 
343  //
344  int def_letter_is_okay(void* void_dawg_args,
345  UNICHAR_ID unichar_id, bool word_end) const;
346 
347  int (Dict::*letter_is_okay_)(void* void_dawg_args,
348  UNICHAR_ID unichar_id, bool word_end) const;
350  int LetterIsOkay(void* void_dawg_args,
351  UNICHAR_ID unichar_id, bool word_end) const {
352  return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
353  }
354 
355 
357  double (Dict::*probability_in_context_)(const char* lang,
358  const char* context,
359  int context_bytes,
360  const char* character,
361  int character_bytes);
363  double ProbabilityInContext(const char* context,
364  int context_bytes,
365  const char* character,
366  int character_bytes) {
367  return (this->*probability_in_context_)(
368  getCCUtil()->lang.string(),
369  context, context_bytes,
370  character, character_bytes);
371  }
372 
// Default (no-op) implementation of probability in context function.
// Ignores every argument and always returns 0.0.
// The first line of the signature was missing from the listing and is
// restored from the member index.
double def_probability_in_context(
    const char* lang, const char* context, int context_bytes,
    const char* character, int character_bytes) {
  // All parameters are intentionally unused; lang is now marked as well so
  // it is treated the same as the other four.
  (void) lang;
  (void) context;
  (void) context_bytes;
  (void) character;
  (void) character_bytes;
  return 0.0;
}
383  double ngram_probability_in_context(const char* lang,
384  const char* context,
385  int context_bytes,
386  const char* character,
387  int character_bytes);
388 
389  // Interface with params model.
390  float (Dict::*params_model_classify_)(const char *lang, void *path);
391  float ParamsModelClassify(const char *lang, void *path);
392  // Call params_model_classify_ member function.
393  float CallParamsModelClassify(void *path) {
394  ASSERT_HOST(params_model_classify_ != NULL); // ASSERT_HOST -> assert
395  return (this->*params_model_classify_)(
396  getCCUtil()->lang.string(), path);
397  }
398 
399  inline void SetWildcardID(UNICHAR_ID id) { wildcard_unichar_id_ = id; }
400  inline const UNICHAR_ID WildcardID() const {
401  return wildcard_unichar_id_;
402  }
404  inline const int NumDawgs() const { return dawgs_.size(); }
406  inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
408  inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
410  inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; }
412  static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
413  if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg
414  NODE_REF node = dawg->next_node(edge_ref);
415  if (node == 0) node = NO_EDGE; // end of word
416  return node;
417  }
418 
419  // Given a unichar from a string and a given dawg, return the unichar
420  // we should use to match in that dawg type. (for example, in the number
421  // dawg, all numbers are transformed to kPatternUnicharId).
422  inline UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const {
423  if (!dawg) return ch;
424  switch (dawg->type()) {
425  case DAWG_TYPE_NUMBER:
426  return getUnicharset().get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
427  default:
428  return ch;
429  }
430  }
431 
437  void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info,
438  UNICHAR_ID unichar_id, bool word_end,
439  DawgPositionVector *updated_dawgs,
440  PermuterType *current_permuter) const;
441 
445 
447  inline static bool valid_word_permuter(uinT8 perm, bool numbers_ok) {
448  return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
449  perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
450  perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
451  (numbers_ok && perm == NUMBER_PERM));
452  }
453  int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
454  int valid_word(const WERD_CHOICE &word) const {
455  return valid_word(word, false); // return NO_PERM for words with digits
456  }
457  int valid_word_or_number(const WERD_CHOICE &word) const {
458  return valid_word(word, true); // return NUMBER_PERM for valid numbers
459  }
461  int valid_word(const char *string) const {
462  WERD_CHOICE word(string, getUnicharset());
463  return valid_word(word);
464  }
465  // Do the two WERD_CHOICEs form a meaningful bigram?
466  bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
471  bool valid_punctuation(const WERD_CHOICE &word);
473  int good_choice(const WERD_CHOICE &choice);
475  void add_document_word(const WERD_CHOICE &best_choice);
477  void adjust_word(WERD_CHOICE *word,
478  bool nonword, XHeightConsistencyEnum xheight_consistency,
479  float additional_adjust,
480  bool modify_rating,
481  bool debug);
483  inline void SetWordsegRatingAdjustFactor(float f) {
484  wordseg_rating_adjust_factor_ = f;
485  }
486 
487  private:
489  CCUtil* ccutil_;
496  UnicharAmbigs *dang_ambigs_table_;
498  UnicharAmbigs *replace_ambigs_table_;
500  FLOAT32 reject_offset_;
501  // Cached UNICHAR_IDs:
502  UNICHAR_ID wildcard_unichar_id_; // kDictWildcard.
503  UNICHAR_ID apostrophe_unichar_id_; // kApostropheSymbol.
504  UNICHAR_ID question_unichar_id_; // kQuestionSymbol.
505  UNICHAR_ID slash_unichar_id_; // kSlashSymbol.
506  UNICHAR_ID hyphen_unichar_id_; // kHyphenSymbol.
507  // Hyphen-related variables.
508  WERD_CHOICE *hyphen_word_;
509  DawgPositionVector hyphen_active_dawgs_;
510  bool last_word_on_line_;
511  // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
512  // matching. The first member of each list is taken as canonical. For
513  // example, the first list contains hyphens and dashes with the first symbol
514  // being the ASCII hyphen minus.
515  GenericVector<GenericVectorEqEq<UNICHAR_ID> > equivalent_symbols_;
516  // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
517  DawgCache *dawg_cache_;
518  bool dawg_cache_is_ours_; // we should delete our own dawg_cache_
519  // Dawgs.
520  DawgVector dawgs_;
521  SuccessorListsVector successors_;
522  Trie *pending_words_;
523  // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
524  // any of them are present on the best choices list for a word pair.
525  // the bigrams are stored as space-separated words where:
526  // (1) leading and trailing punctuation has been removed from each word and
527  // (2) any digits have been replaced with '?' marks.
528  Dawg *bigram_dawg_;
531  // TODO(daria): need to support multiple languages in the future,
532  // so maybe will need to maintain a list of dawgs of each kind.
533  Dawg *freq_dawg_;
534  Dawg *unambig_dawg_;
535  Dawg *punc_dawg_;
536  Trie *document_words_;
539  float wordseg_rating_adjust_factor_;
540  // File for recording ambiguities discovered during dictionary search.
541  FILE *output_ambig_words_file_;
542 
543  public:
547  STRING_VAR_H(user_words_file, "", "A filename of user-provided words.");
548  STRING_VAR_H(user_words_suffix, "",
549  "A suffix of user-provided words located in tessdata.");
550  STRING_VAR_H(user_patterns_file, "",
551  "A filename of user-provided patterns.");
552  STRING_VAR_H(user_patterns_suffix, "",
553  "A suffix of user-provided patterns located in tessdata.");
554  BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
555  BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
556  BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
557  BOOL_VAR_H(load_punc_dawg, true,
558  "Load dawg with punctuation patterns.");
559  BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
560  BOOL_VAR_H(load_bigram_dawg, true,
561  "Load dawg with special word bigrams.");
562  double_VAR_H(xheight_penalty_subscripts, 0.125,
563  "Score penalty (0.1 = 10%) added if there are subscripts "
564  "or superscripts in a word, but it is otherwise OK.");
565  double_VAR_H(xheight_penalty_inconsistent, 0.25,
566  "Score penalty (0.1 = 10%) added if an xheight is "
567  "inconsistent.");
568  double_VAR_H(segment_penalty_dict_frequent_word, 1.0,
569  "Score multiplier for word matches which have good case and"
570  "are frequent in the given language (lower is better).");
571 
572  double_VAR_H(segment_penalty_dict_case_ok, 1.1,
573  "Score multiplier for word matches that have good case "
574  "(lower is better).");
575 
576  double_VAR_H(segment_penalty_dict_case_bad, 1.3125,
577  "Default score multiplier for word matches, which may have "
578  "case issues (lower is better).");
579 
580  // TODO(daria): remove this param when ngram permuter is deprecated.
581  double_VAR_H(segment_penalty_ngram_best_choice, 1.24,
582  "Multipler to for the best choice from the ngram model.");
583 
584  double_VAR_H(segment_penalty_dict_nonword, 1.25,
585  "Score multiplier for glyph fragment segmentations which "
586  "do not match a dictionary word (lower is better).");
587 
588  double_VAR_H(segment_penalty_garbage, 1.50,
589  "Score multiplier for poorly cased strings that are not in"
590  " the dictionary and generally look like garbage (lower is"
591  " better).");
592  STRING_VAR_H(output_ambig_words_file, "",
593  "Output file for ambiguities found in the dictionary");
594  INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info"
595  ", to 2 for more details, to 3 to see all the debug messages");
596  INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
597  INT_VAR_H(max_viterbi_list_size, 10, "Maximum size of viterbi list.");
598  BOOL_VAR_H(use_only_first_uft8_step, false,
599  "Use only the first UTF8 step of the given string"
600  " when computing log probabilities.");
601  double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
602  double_VAR_H(stopper_nondict_certainty_base, -2.50,
603  "Certainty threshold for non-dict words");
604  double_VAR_H(stopper_phase2_certainty_rejection_offset, 1.0,
605  "Reject certainty offset");
606  INT_VAR_H(stopper_smallword_size, 2,
607  "Size of dict word to be treated as non-dict word");
608  double_VAR_H(stopper_certainty_per_char, -0.50,
609  "Certainty to add for each dict char above small word size.");
610  double_VAR_H(stopper_allowable_character_badness, 3.0,
611  "Max certaintly variation allowed in a word (in sigma)");
612  INT_VAR_H(stopper_debug_level, 0, "Stopper debug level");
613  BOOL_VAR_H(stopper_no_acceptable_choices, false,
614  "Make AcceptableChoice() always return false. Useful"
615  " when there is a need to explore all segmentations");
616  BOOL_VAR_H(save_raw_choices, false,
617  "Deprecated- backward compatability only");
618  INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
619  STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
620  " should be printed to stdout");
621  STRING_VAR_H(word_to_debug_lengths, "",
622  "Lengths of unichars in word_to_debug");
623  INT_VAR_H(fragments_debug, 0, "Debug character fragments");
624  BOOL_VAR_H(segment_nonalphabetic_script, false,
625  "Don't use any alphabetic-specific tricks."
626  "Set to true in the traineddata config file for"
627  " scripts that are cursive or inherently fixed-pitch");
628  BOOL_VAR_H(save_doc_words, 0, "Save Document Words");
629  double_VAR_H(doc_dict_pending_threshold, 0.0,
630  "Worst certainty for using pending dictionary");
631  double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty"
632  " for words that can be inserted into the document dictionary");
633  INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different"
634  " character choices to consider during permutation."
635  " This limit is especially useful when user patterns"
636  " are specified, since overly generic patterns can result in"
637  " dawg search exploring an overly large number of options.");
638 };
639 } // namespace tesseract
640 
641 #endif // TESSERACT_DICT_DICT_H_
int valid_word_or_number(const WERD_CHOICE &word) const
Definition: dict.h:457
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:374
float rating
Definition: dict.h:44
const Dawg * GetPuncDawg() const
Return the pointer to the punctuation dawg.
Definition: dict.h:408
int size() const
Definition: genericvector.h:72
UNICHAR_ID unichar_id
Definition: dict.h:41
float certainty
Definition: dict.h:45
int LetterIsOkay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:350
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:135
DawgType type() const
Definition: dawg.h:127
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:129
const int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
Definition: dict.h:404
GenericVector< Dawg * > DawgVector
Definition: dict.h:50
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:447
DawgPositionVector * active_dawgs
Definition: dict.h:81
#define NULL
Definition: host.h:144
float CallParamsModelClassify(void *path)
Definition: dict.h:393
void ResetDocumentDictionary()
Definition: dict.h:301
void SetWordsegRatingAdjustFactor(float f)
Set wordseg_rating_adjust_factor_ to the given value.
Definition: dict.h:483
const CCUtil * getCCUtil() const
Definition: dict.h:90
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:169
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:125
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const CHAR_FRAGMENT * fragment
Definition: dict.h:42
inT64 EDGE_REF
Definition: dawg.h:54
#define STRING_VAR_H(name, val, comment)
Definition: params.h:271
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:142
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:116
DawgPositionVector * updated_dawgs
Definition: dict.h:82
const Dawg * GetUnambigDawg() const
Return the pointer to the unambiguous words dawg.
Definition: dict.h:410
PermuterType
Definition: ratngs.h:240
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:107
float rating() const
Definition: ratngs.h:324
CCUtil * getCCUtil()
Definition: dict.h:93
int UNICHAR_ID
Definition: unichar.h:33
DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
Definition: dict.h:78
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
int num_fragments
Definition: dict.h:43
PermuterType permuter
Definition: dict.h:83
bool has_hyphen_end(const WERD_CHOICE &word) const
Same as above, but check the unichar at the end of the word.
Definition: dict.h:150
int valid_word(const WERD_CHOICE &word) const
Definition: dict.h:454
Definition: matrix.h:289
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:412
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:363
#define double_VAR_H(name, val, comment)
Definition: params.h:274
inT64 NODE_REF
Definition: dawg.h:55
void print() const
Definition: ratngs.h:563
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:102
const UNICHAR_ID WildcardID() const
Definition: dict.h:400
void SetWildcardID(UNICHAR_ID id)
Definition: dict.h:399
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:406
UNICHARSET & getUnicharset()
Definition: dict.h:99
unsigned char uinT8
Definition: host.h:99
int valid_word(const char *string) const
This function is used by api/tesseract_cube_combiner.cpp.
Definition: dict.h:461
UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:422
#define BOOL_VAR_H(name, val, comment)
Definition: params.h:268
int length() const
Definition: ratngs.h:300
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
float FLOAT32
Definition: host.h:111
#define INT_VAR_H(name, val, comment)
Definition: params.h:265
#define ASSERT_HOST(x)
Definition: errcode.h:84
XHeightConsistencyEnum
Definition: dict.h:75