// Edge-count budget shared by the edge-enumeration code in this file.
// When no parent edge is supplied (tess_lm_edge == NULL) the out-parameter
// edge count is first set to NumDawgs() * max_edge_; otherwise it is set to
// max_edge_.
// NOTE(review): presumably this is the capacity of the edge array allocated
// at that point -- the allocation itself is not visible in this excerpt;
// confirm before changing the value.
42 int TessLangModel::max_edge_ = 4096;
// Sentinel value identifying the OOD pseudo-dawg (OOD presumably stands for
// "out-of-dictionary" -- confirm). It is the DAWG_OOD constant reinterpreted
// as a Dawg pointer, not the address of a Dawg object created here.
// The fan-out code in this file compares its dawg argument against this same
// cast value and, on a match, returns the result of OODEdges(...).
// NOTE(review): presumably this pointer must never be dereferenced; verify
// against the definition of DAWG_OOD.
45 const Dawg *TessLangModel::ood_dawg_ =
reinterpret_cast<Dawg *
>(
DAWG_OOD);
// Sentinel value identifying the number pseudo-dawg. It is the DAWG_NUMBER
// constant reinterpreted as a Dawg pointer, not the address of a Dawg object
// created here.
// The fan-out code in this file compares its dawg argument against this same
// cast value and, on a match, returns the result of NumberEdges(...); it is
// also passed as the dawg argument when number edges are created.
// NOTE(review): presumably this pointer must never be dereferenced; verify
// against the definition of DAWG_NUMBER.
46 const Dawg *TessLangModel::number_dawg_ =
reinterpret_cast<Dawg *
>(
DAWG_NUMBER);
// Per-state repeat limits for the number state machine, indexed by state.
// When a transition through num_state_machine_ stays in the same state, the
// repeat counter is incremented and then compared against
// num_max_repeat_[state].
// NOTE(review): what happens once the limit is exceeded is not visible in
// this excerpt, and the four initialisers assume kStateCnt == 4 -- confirm.
55 const int TessLangModel::num_max_repeat_[
kStateCnt] = {3, 32, 8, 3};
61 const string &data_file_path,
62 bool load_system_dawg,
68 LoadLangModelElements(lm_params);
72 if (load_system_dawg &&
78 cntxt_->
Lang().c_str(),
87 void TessLangModel::FreeEdges(
int edge_cnt,
LangModEdge **edge_array) {
88 if (edge_array !=
NULL) {
89 for (
int edge_idx = 0; edge_idx < edge_cnt; edge_idx++) {
90 if (edge_array[edge_idx] !=
NULL) {
91 delete edge_array[edge_idx];
111 for (
int edge_idx = 0; edge_idx < edge_cnt; edge_idx++) {
113 if (sequence[0] == edge_array[edge_idx]->EdgeString()[0]) {
115 if (sequence[1] == 0) {
117 if (eow_flag ==
false || edge_array[edge_idx]->IsEOW()) {
118 if (final_edge !=
NULL) {
119 (*final_edge) = edge_array[edge_idx];
120 edge_array[edge_idx] =
NULL;
123 FreeEdges(edge_cnt, edge_array);
129 final_edge) ==
true) {
130 FreeEdges(edge_cnt, edge_array);
137 FreeEdges(edge_cnt, edge_array);
147 if (final_edge !=
NULL) {
148 (*final_edge) =
NULL;
155 return lead_punc_.find(ch) != string::npos;
159 return trail_punc_.find(ch) != string::npos;
163 return digits_.find(ch) != string::npos;
179 if (tess_lm_edge ==
NULL) {
181 int dawg_cnt = NumDawgs();
183 (*edge_cnt) = dawg_cnt * max_edge_;
185 if (edge_array ==
NULL) {
189 for (
int dawg_idx = (*edge_cnt) = 0; dawg_idx < dawg_cnt; dawg_idx++) {
190 const Dawg *curr_dawg = GetDawg(dawg_idx);
194 (*edge_cnt) += FanOut(alt_list, curr_dawg, 0, 0,
NULL,
true,
195 edge_array + (*edge_cnt));
199 (*edge_cnt) += FanOut(alt_list, number_dawg_, 0, 0,
NULL,
true,
200 edge_array + (*edge_cnt));
204 (*edge_cnt) += FanOut(alt_list, ood_dawg_, 0, 0,
NULL,
true,
205 edge_array + (*edge_cnt));
208 for (
int edge_idx = 0; edge_idx < (*edge_cnt); edge_idx++) {
209 edge_array[edge_idx]->
SetRoot(
true);
213 (*edge_cnt) = max_edge_;
216 if (edge_array ==
NULL) {
221 (*edge_cnt) = FanOut(alt_list,
224 tess_lm_edge->
EdgeString(),
false, edge_array);
231 int TessLangModel::Edges(
const char *strng,
const Dawg *dawg,
237 for (edge_idx = 0; strng[edge_idx] != 0; edge_idx++) {
239 if (class_id != INVALID_UNICHAR_ID) {
243 if (edge_array[edge_cnt] ==
NULL) {
248 SetEdgeMask(edge_mask);
261 for (
int class_id = 0; class_id < class_cnt; class_id++) {
263 if ((alt_list ==
NULL ||
264 alt_list->
ClassCost(class_id) <= max_ood_shape_cost_)) {
267 if (edge_array[edge_cnt] ==
NULL) {
281 const char_32 *str,
bool root_flag,
287 if (dawg == reinterpret_cast<Dawg *>(
DAWG_OOD)) {
289 return OODEdges(alt_list, edge_ref, edge_mask, edge_array);
293 }
else if (dawg == reinterpret_cast<Dawg *>(
DAWG_NUMBER)) {
296 return NumberEdges(edge_ref, edge_array);
304 return Edges(trail_punc_.c_str(), dawg, edge_ref,
309 }
else if (root_flag ==
true || edge_ref == 0) {
320 bool eow_flag = (dawg->
end_of_word(edge_ref) != 0);
323 if (eow_flag ==
true) {
326 edge_cnt += Edges(trail_punc_.c_str(), dawg, edge_ref,
329 edge_cnt += Edges(
"-/", dawg, 0, 0, edge_array + edge_cnt);
335 if (next_node == 0 || next_node == NO_EDGE) {
345 edge_array + edge_cnt);
346 int strt_cnt = edge_cnt;
349 for (
int child = 0; child < child_edge_cnt; child++) {
351 SetEdgeMask(edge_mask);
355 if (root_flag ==
true) {
356 for (
int child = 0; child < child_edge_cnt; child++) {
360 if (has_case_ ==
true) {
362 if (edge_str !=
NULL && islower(edge_str[0]) != 0 &&
366 if (class_id != INVALID_UNICHAR_ID) {
371 if (edge_array[edge_cnt] !=
NULL) {
373 SetEdgeMask(edge_mask);
407 new_state = num_state_machine_[state][lit];
412 if (new_state == state) {
413 new_repeat_cnt = repeat_cnt + 1;
419 if (new_repeat_cnt > num_max_repeat_[state]) {
427 edge_cnt += Edges(literal_str_[lit]->c_str(), number_dawg_,
428 new_edge_ref, 0, edge_array + edge_cnt);
435 bool TessLangModel::LoadLangModelElements(
const string &lm_params) {
438 vector<string> str_vec;
440 for (
int entry = 0; entry < str_vec.size(); entry++) {
441 vector<string> tokens;
444 if (tokens.size() != 2)
446 if (tokens[0] ==
"LeadPunc") {
447 lead_punc_ = tokens[1];
448 }
else if (tokens[0] ==
"TrailPunc") {
449 trail_punc_ = tokens[1];
450 }
else if (tokens[0] ==
"NumLeadPunc") {
451 num_lead_punc_ = tokens[1];
452 }
else if (tokens[0] ==
"NumTrailPunc") {
453 num_trail_punc_ = tokens[1];
454 }
else if (tokens[0] ==
"Operators") {
455 operators_ = tokens[1];
456 }
else if (tokens[0] ==
"Digits") {
458 }
else if (tokens[0] ==
"Alphas") {
473 literal_str_[0] = &num_lead_punc_;
474 literal_str_[1] = &num_trail_punc_;
475 literal_str_[2] = &digits_;
476 literal_str_[3] = &operators_;
477 literal_str_[4] = &alphas_;
492 for (
int i = 0; i < len; ++i) {
494 if (class_id != INVALID_UNICHAR_ID) {
495 clean_str32[clean_len] = lm_str32[i];
499 clean_str32[clean_len] = 0;
500 if (clean_len < len) {
504 delete [] clean_str32;
507 int TessLangModel::NumDawgs()
const {
508 return (word_dawgs_ !=
NULL) ?
514 const Dawg *TessLangModel::GetDawg(
int index)
const {
515 if (word_dawgs_ !=
NULL) {
517 return (*word_dawgs_)[index];
519 ASSERT_HOST(index < cntxt_->TesseractObject()->getDict().NumDawgs());
TessLangModel(const string &lm_params, const string &data_file_path, bool load_system_dawg, TessdataManager *tessdata_manager, CubeRecoContext *cntxt)
LangModEdge ** GetEdges(CharAltList *alt_list, LangModEdge *edge, int *edge_cnt)
const string & Lang() const
const int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
bool SeekToStart(TessdataType tessdata_type)
const char_32 * EdgeString() const
GenericVector< Dawg * > DawgVector
virtual void SetRoot(bool flag)=0
static int StrLen(const char_32 *str)
#define NUMBER_LITERAL_SHIFT
int ClassID(const char_32 *str) const
EDGE_REF StartEdge() const
bool IsTrailingPunc(char_32 ch)
virtual bool end_of_word(EDGE_REF edge_ref) const =0
FILE * GetDataFilePtr() const
basic_string< char_32 > string_32
#define NUMBER_REPEAT_MASK
static int CreateChildren(CubeRecoContext *cntxt, const Dawg *edges, NODE_REF edge_reg, LangModEdge **lm_edges)
#define NUMBER_STATE_SHIFT
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)
CharSet * CharacterSet() const
EDGE_REF EdgeMask() const
bool IsLeadingPunc(char_32 ch)
int ClassCost(int class_id) const
#define IsTrailingPuncEdge(edge_mask)
#define NUMBER_REPEAT_SHIFT
#define TrailingPuncEdgeMask(Cnt)
const Dawg * GetDawg() const
bool IsValidSequence(const char_32 *sequence, bool eow_flag, LangModEdge **final_edge=NULL)
tesseract::Tesseract * TesseractObject() const
#define NUMBER_STATE_MASK
#define TrailingPuncCount(edge_mask)
const Dawg * GetDawg(int index) const
Return the i-th dawg pointer recorded in the dawgs_ vector.
void RemoveInvalidCharacters(string *lm_str)
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
static int Prob2Cost(double prob_val)
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
#define LEAD_PUNC_EDGE_REF_MASK
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)