tesseract  3.04.00
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::WordUnigrams Class Reference

#include <word_unigrams.h>

Public Member Functions

 WordUnigrams ()
 
 ~WordUnigrams ()
 
int Cost (const char_32 *str32, LangModel *lang_mod, CharSet *char_set) const
 

Static Public Member Functions

static WordUnigrams * Create (const string &data_file_path, const string &lang)
 

Protected Member Functions

int CostInternal (const char *str) const
 

Detailed Description

Definition at line 34 of file word_unigrams.h.

Constructor & Destructor Documentation

tesseract::WordUnigrams::WordUnigrams ( )

Definition at line 32 of file word_unigrams.cpp.

32  {
33  costs_ = NULL;
34  words_ = NULL;
35  word_cnt_ = 0;
36 }
#define NULL
Definition: host.h:144
tesseract::WordUnigrams::~WordUnigrams ( )

Definition at line 38 of file word_unigrams.cpp.

38  {
39  if (words_ != NULL) {
40  if (words_[0] != NULL) {
41  delete []words_[0];
42  }
43 
44  delete []words_;
45  words_ = NULL;
46  }
47 
48  if (costs_ != NULL) {
49  delete []costs_;
50  }
51 }
#define NULL
Definition: host.h:144

Member Function Documentation

int tesseract::WordUnigrams::Cost ( const char_32 * str32,
LangModel * lang_mod,
CharSet * char_set
) const

Definition at line 150 of file word_unigrams.cpp.

152  {
153  if (!key_str32)
154  return 0;
155  // convert string to UTF8 to split into space-separated words
156  string key_str;
157  CubeUtils::UTF32ToUTF8(key_str32, &key_str);
158  vector<string> words;
159  CubeUtils::SplitStringUsing(key_str, " \t", &words);
160 
161  // no words => no cost
162  if (words.size() <= 0) {
163  return 0;
164  }
165 
166  // aggregate the costs of all the words
167  int cost = 0;
168  for (int word_idx = 0; word_idx < words.size(); word_idx++) {
169  // convert each word back to UTF32 for analyzing case and punctuation
170  string_32 str32;
171  CubeUtils::UTF8ToUTF32(words[word_idx].c_str(), &str32);
172  int len = CubeUtils::StrLen(str32.c_str());
173 
174  // strip all trailing punctuation
175  string clean_str;
176  int clean_len = len;
177  bool trunc = false;
178  while (clean_len > 0 &&
179  lang_mod->IsTrailingPunc(str32.c_str()[clean_len - 1])) {
180  --clean_len;
181  trunc = true;
182  }
183 
184  // If either the original string was not truncated (no trailing
185  // punctuation) or the entire string was removed (all characters
186  // are trailing punctuation), evaluate original word as is;
187  // otherwise, copy all but the trailing punctuation characters
188  char_32 *clean_str32 = NULL;
189  if (clean_len == 0 || !trunc) {
190  clean_str32 = CubeUtils::StrDup(str32.c_str());
191  } else {
192  clean_str32 = new char_32[clean_len + 1];
193  for (int i = 0; i < clean_len; ++i) {
194  clean_str32[i] = str32[i];
195  }
196  clean_str32[clean_len] = '\0';
197  }
198  ASSERT_HOST(clean_str32 != NULL);
199 
200  string str8;
201  CubeUtils::UTF32ToUTF8(clean_str32, &str8);
202  int word_cost = CostInternal(str8.c_str());
203 
204  // if case invariant, get costs of all-upper-case and all-lower-case
205  // versions and return the min cost
206  if (clean_len >= kMinLengthNumOrCaseInvariant &&
207  CubeUtils::IsCaseInvariant(clean_str32, char_set)) {
208  char_32 *lower_32 = CubeUtils::ToLower(clean_str32, char_set);
209  if (lower_32) {
210  string lower_8;
211  CubeUtils::UTF32ToUTF8(lower_32, &lower_8);
212  word_cost = MIN(word_cost, CostInternal(lower_8.c_str()));
213  delete [] lower_32;
214  }
215  char_32 *upper_32 = CubeUtils::ToUpper(clean_str32, char_set);
216  if (upper_32) {
217  string upper_8;
218  CubeUtils::UTF32ToUTF8(upper_32, &upper_8);
219  word_cost = MIN(word_cost, CostInternal(upper_8.c_str()));
220  delete [] upper_32;
221  }
222  }
223 
224  if (clean_len >= kMinLengthNumOrCaseInvariant) {
225  // if characters are all numeric, incur 0 word cost
226  bool is_numeric = true;
227  for (int i = 0; i < clean_len; ++i) {
228  if (!lang_mod->IsDigit(clean_str32[i]))
229  is_numeric = false;
230  }
231  if (is_numeric)
232  word_cost = 0;
233  }
234  delete [] clean_str32;
235  cost += word_cost;
236  } // word_idx
237 
238  // return the mean cost
239  return static_cast<int>(cost / static_cast<double>(words.size()));
240 }
#define MIN(x, y)
Definition: ndminx.h:28
static char_32 * ToLower(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:324
#define ASSERT_HOST(x)
Definition: errcode.h:84
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
Definition: cube_utils.cpp:244
int CostInternal(const char *str) const
static int StrLen(const char_32 *str)
Definition: cube_utils.cpp:48
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)
Definition: cube_utils.cpp:258
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)
Definition: cube_utils.cpp:210
#define NULL
Definition: host.h:144
signed int char_32
Definition: string_32.h:40
static char_32 * ToUpper(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:357
static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:270
basic_string< char_32 > string_32
Definition: string_32.h:41
static char_32 * StrDup(const char_32 *str)
Definition: cube_utils.cpp:80
int tesseract::WordUnigrams::CostInternal ( const char *  str) const
protected

Definition at line 243 of file word_unigrams.cpp.

243  {
244  if (strlen(key_str) == 0)
245  return not_in_list_cost_;
246  int hi = word_cnt_ - 1;
247  int lo = 0;
248  while (lo <= hi) {
249  int current = (hi + lo) / 2;
250  int comp = strcmp(key_str, words_[current]);
251  // a match
252  if (comp == 0) {
253  return costs_[current];
254  }
255  if (comp < 0) {
256  // go lower
257  hi = current - 1;
258  } else {
259  // go higher
260  lo = current + 1;
261  }
262  }
263  return not_in_list_cost_;
264 }
WordUnigrams * tesseract::WordUnigrams::Create ( const string &  data_file_path,
const string &  lang 
)
static

Definition at line 55 of file word_unigrams.cpp.

56  {
57  string file_name;
58  string str;
59 
60  file_name = data_file_path + lang;
61  file_name += ".cube.word-freq";
62 
63  // load the string into memory
64  if (CubeUtils::ReadFileToString(file_name, &str) == false) {
65  return NULL;
66  }
67 
68  // split into lines
69  vector<string> str_vec;
70  CubeUtils::SplitStringUsing(str, "\r\n \t", &str_vec);
71  if (str_vec.size() < 2) {
72  return NULL;
73  }
74 
75  // allocate memory
76  WordUnigrams *word_unigrams_obj = new WordUnigrams();
77  if (word_unigrams_obj == NULL) {
78  fprintf(stderr, "Cube ERROR (WordUnigrams::Create): could not create "
79  "word unigrams object.\n");
80  return NULL;
81  }
82 
83  int full_len = str.length();
84  int word_cnt = str_vec.size() / 2;
85  word_unigrams_obj->words_ = new char*[word_cnt];
86  word_unigrams_obj->costs_ = new int[word_cnt];
87 
88  if (word_unigrams_obj->words_ == NULL ||
89  word_unigrams_obj->costs_ == NULL) {
90  fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error allocating "
91  "word unigram fields.\n");
92  delete word_unigrams_obj;
93  return NULL;
94  }
95 
96  word_unigrams_obj->words_[0] = new char[full_len];
97  if (word_unigrams_obj->words_[0] == NULL) {
98  fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error allocating "
99  "word unigram fields.\n");
100  delete word_unigrams_obj;
101  return NULL;
102  }
103 
104  // construct sorted list of words and costs
105  word_unigrams_obj->word_cnt_ = 0;
106  char *char_buff = word_unigrams_obj->words_[0];
107  word_cnt = 0;
108  int max_cost = 0;
109 
110  for (int wrd = 0; wrd < str_vec.size(); wrd += 2) {
111  word_unigrams_obj->words_[word_cnt] = char_buff;
112 
113  strcpy(char_buff, str_vec[wrd].c_str());
114  char_buff += (str_vec[wrd].length() + 1);
115 
116  if (sscanf(str_vec[wrd + 1].c_str(), "%d",
117  word_unigrams_obj->costs_ + word_cnt) != 1) {
118  fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error reading "
119  "word unigram data.\n");
120  delete word_unigrams_obj;
121  return NULL;
122  }
123  // update max cost
124  max_cost = MAX(max_cost, word_unigrams_obj->costs_[word_cnt]);
125  word_cnt++;
126  }
127  word_unigrams_obj->word_cnt_ = word_cnt;
128 
129  // compute the not-in-list-cost by assuming that a word not in the list
130  // [ahmadab]: This can be computed as follows:
131  // - Given that the distribution of words follow Zipf's law:
132  // (F = K / (rank ^ S)), where s is slightly > 1.0
133  // - Number of words in the list is N
134  // - The mean frequency of a word that did not appear in the list is the
135  // area under the rest of the Zipf's curve divided by 2 (the mean)
136  // - The area would be the bound integral from N to infinity =
137  // (K * S) / (N ^ (S + 1)) ~= K / (N ^ 2)
138  // - Given that cost = -LOG(prob), the cost of an unlisted word would be
139  // = max_cost + 2*LOG(N)
140  word_unigrams_obj->not_in_list_cost_ = max_cost +
141  (2 * CubeUtils::Prob2Cost(1.0 / word_cnt));
142  // success
143  return word_unigrams_obj;
144 }
#define MAX(x, y)
Definition: ndminx.h:24
static int Prob2Cost(double prob_val)
Definition: cube_utils.cpp:35
static bool ReadFileToString(const string &file_name, string *str)
Definition: cube_utils.cpp:177
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)
Definition: cube_utils.cpp:210
#define NULL
Definition: host.h:144

The documentation for this class was generated from the following files: