tesseract
3.04.00
|
#include <language_model.h>
Public Member Functions | |
LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict) | |
~LanguageModel () | |
void | InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale) |
bool | UpdateState (bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
bool | AcceptableChoiceFound () |
void | SetAcceptableChoiceFound (bool val) |
ParamsModel & | getParamsModel () |
Static Public Member Functions | |
static void | ExtractFeaturesFromPath (const ViterbiStateEntry &vse, float features[]) |
Static Public Attributes | |
static const LanguageModelFlagsType | kSmallestRatingFlag = 0x1 |
static const LanguageModelFlagsType | kLowerCaseFlag = 0x2 |
static const LanguageModelFlagsType | kUpperCaseFlag = 0x4 |
static const LanguageModelFlagsType | kDigitFlag = 0x8 |
static const LanguageModelFlagsType | kXhtConsistentFlag = 0x10 |
static const float | kMaxAvgNgramCost = 25.0f |
Protected Member Functions | |
float | CertaintyScore (float cert) |
float | ComputeAdjustment (int num_problems, float penalty) |
float | ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info) |
float | ComputeAdjustedPathCost (ViterbiStateEntry *vse) |
bool | GetTopLowerUpperDigit (BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const |
int | SetTopParentLowerUpperDigit (LanguageModelState *parent_node) const |
ViterbiStateEntry * | GetNextParentVSE (bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const |
bool | AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
void | GenerateTopChoiceInfo (ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms) |
LanguageModelDawgInfo * | GenerateDawgInfo (bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse) |
LanguageModelNgramInfo * | GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse) |
float | ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob) |
float | ComputeDenom (BLOB_CHOICE_LIST *curr_list) |
void | FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info) |
void | UpdateBestChoice (ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
WERD_CHOICE * | ConstructWord (ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path) |
void | ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats) |
bool | PrunablePath (const ViterbiStateEntry &vse) |
bool | AcceptablePath (const ViterbiStateEntry &vse) |
Protected Attributes | |
DawgArgs * | dawg_args_ |
float | rating_cert_scale_ |
const UnicityTable< FontInfo > * | fontinfo_table_ |
Dict * | dict_ |
bool | fixed_pitch_ |
float | max_char_wh_ratio_ |
STRING | prev_word_str_ |
int | prev_word_unichar_step_len_ |
DawgPositionVector * | very_beginning_active_dawgs_ |
DawgPositionVector * | beginning_active_dawgs_ |
bool | acceptable_choice_found_ |
bool | correct_segmentation_explored_ |
ParamsModel | params_model_ |
Definition at line 42 of file language_model.h.
tesseract::LanguageModel::LanguageModel | ( | const UnicityTable< FontInfo > * | fontinfo_table, |
Dict * | dict | ||
) |
Definition at line 45 of file language_model.cpp.
tesseract::LanguageModel::~LanguageModel | ( | ) |
Definition at line 131 of file language_model.cpp.
|
inline |
Definition at line 95 of file language_model.h.
|
inlineprotected |
Definition at line 301 of file language_model.h.
|
protected |
Definition at line 548 of file language_model.cpp.
|
inlineprotected |
Definition at line 104 of file language_model.h.
|
protected |
Definition at line 1183 of file language_model.cpp.
|
inlineprotected |
Definition at line 116 of file language_model.h.
|
inlineprotected |
Definition at line 272 of file language_model.h.
|
inlineprotected |
Definition at line 127 of file language_model.h.
|
protected |
Definition at line 980 of file language_model.cpp.
|
protected |
Definition at line 920 of file language_model.cpp.
|
protected |
Definition at line 1374 of file language_model.cpp.
|
static |
Definition at line 1325 of file language_model.cpp.
|
protected |
Definition at line 1001 of file language_model.cpp.
|
protected |
Definition at line 772 of file language_model.cpp.
|
protected |
Definition at line 863 of file language_model.cpp.
|
protected |
Definition at line 756 of file language_model.cpp.
|
protected |
Definition at line 487 of file language_model.cpp.
|
inline |
Definition at line 100 of file language_model.h.
|
protected |
Definition at line 374 of file language_model.cpp.
void tesseract::LanguageModel::InitForWord | ( | const WERD_CHOICE * | prev_word, |
bool | fixed_pitch, | ||
float | max_char_wh_ratio, | ||
float | rating_cert_scale | ||
) |
Definition at line 138 of file language_model.cpp.
|
inlineprotected |
Definition at line 291 of file language_model.h.
|
inline |
Definition at line 96 of file language_model.h.
|
protected |
Definition at line 412 of file language_model.cpp.
|
protected |
Definition at line 1225 of file language_model.cpp.
bool tesseract::LanguageModel::UpdateState | ( | bool | just_classified, |
int | curr_col, | ||
int | curr_row, | ||
BLOB_CHOICE_LIST * | curr_list, | ||
LanguageModelState * | parent_node, | ||
LMPainPoints * | pain_points, | ||
WERD_RES * | word_res, | ||
BestChoiceBundle * | best_choice_bundle, | ||
BlamerBundle * | blamer_bundle | ||
) |
Definition at line 246 of file language_model.cpp.
|
protected |
Definition at line 408 of file language_model.h.
|
protected |
Definition at line 396 of file language_model.h.
|
protected |
Definition at line 410 of file language_model.h.
|
protected |
Definition at line 356 of file language_model.h.
|
protected |
Definition at line 375 of file language_model.h.
|
protected |
Definition at line 382 of file language_model.h.
|
protected |
Definition at line 371 of file language_model.h.
|
static |
Definition at line 48 of file language_model.h.
|
static |
Definition at line 46 of file language_model.h.
|
static |
Definition at line 53 of file language_model.h.
|
static |
Definition at line 45 of file language_model.h.
|
static |
Definition at line 47 of file language_model.h.
|
static |
Definition at line 49 of file language_model.h.
int tesseract::LanguageModel::language_model_debug_level = 0 |
"Language model debug level"
Definition at line 308 of file language_model.h.
int tesseract::LanguageModel::language_model_min_compound_length = 3 |
"Minimum length of compound words"
Definition at line 335 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_nonmatch_score = -40.0 |
"Average classifier score of a non-matching unichar"
Definition at line 322 of file language_model.h.
bool tesseract::LanguageModel::language_model_ngram_on = false |
"Turn on/off the use of character ngram model"
Definition at line 310 of file language_model.h.
int tesseract::LanguageModel::language_model_ngram_order = 8 |
"Maximum order of the character ngram model"
Definition at line 312 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_rating_factor = 16.0 |
"Factor to bring log-probs into the same range as ratings when multiplied by outline length"
Definition at line 331 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_scale_factor = 0.03 |
"Strength of the character ngram model relative to the character classifier"
Definition at line 328 of file language_model.h.
double tesseract::LanguageModel::language_model_ngram_small_prob = 0.000001 |
"To avoid overly small denominators use this as the floor of the probability returned by the ngram model"
Definition at line 320 of file language_model.h.
bool tesseract::LanguageModel::language_model_ngram_space_delimited_language = true |
"Words are delimited by space"
Definition at line 333 of file language_model.h.
bool tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step = false |
"Use only the first UTF8 step of the given string when computing log probabilities"
Definition at line 325 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_case = 0.1 |
"Penalty for inconsistent case"
Definition at line 344 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_chartype = 0.3 |
"Penalty for inconsistent character type"
Definition at line 348 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_font = 0.00 |
"Penalty for inconsistent font"
Definition at line 350 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_increment = 0.01 |
"Penalty increment"
Definition at line 353 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_non_dict_word = 0.15 |
"Penalty for non-dictionary words"
Definition at line 340 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_non_freq_dict_word = 0.1 |
"Penalty for words not in the frequent word dictionary"
Definition at line 338 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_punc = 0.2 |
"Penalty for inconsistent punctuation"
Definition at line 342 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_script = 0.5 |
"Penalty for inconsistent script"
Definition at line 346 of file language_model.h.
double tesseract::LanguageModel::language_model_penalty_spacing = 0.05 |
"Penalty for inconsistent spacing"
Definition at line 352 of file language_model.h.
bool tesseract::LanguageModel::language_model_use_sigmoidal_certainty = false |
"Use sigmoidal score for certainty"
Definition at line 356 of file language_model.h.
int tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable = 10 |
"Maximum number of prunable (those for which PrunablePath() is true) entries in each viterbi list recorded in BLOB_CHOICEs"
Definition at line 315 of file language_model.h.
int tesseract::LanguageModel::language_model_viterbi_list_max_size = 500 |
"Maximum size of viterbi lists recorded in BLOB_CHOICEs"
Definition at line 317 of file language_model.h.
|
protected |
Definition at line 385 of file language_model.h.
|
protected |
Definition at line 413 of file language_model.h.
|
protected |
Definition at line 392 of file language_model.h.
|
protected |
Definition at line 393 of file language_model.h.
|
protected |
Definition at line 366 of file language_model.h.
|
protected |
Definition at line 395 of file language_model.h.
int tesseract::LanguageModel::wordrec_display_segmentations = 0 |
"Display Segmentations"
Definition at line 354 of file language_model.h.