tesseract  3.04.00
cube_utils.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: cube_utils.cpp
3  * Description: Implementation of the Cube Utilities Class
4  * Author: Ahmad Abdulkader
5  * Created: 2008
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <math.h>
21 #include <string>
22 #include <vector>
23 #include "cube_utils.h"
24 #include "char_set.h"
25 #include "unichar.h"
26 
27 namespace tesseract {
29 }
30 
32 }
33 
34 // convert a prob to a cost (-ve log prob)
35 int CubeUtils::Prob2Cost(double prob_val) {
36  if (prob_val < MIN_PROB) {
37  return MIN_PROB_COST;
38  }
39  return static_cast<int>(-log(prob_val) * PROB2COST_SCALE);
40 }
41 
42 // converts a cost to probability
43 double CubeUtils::Cost2Prob(int cost) {
44  return exp(-cost / PROB2COST_SCALE);
45 }
46 
47 // computes the length of a NULL terminated char_32 string
48 int CubeUtils::StrLen(const char_32 *char_32_ptr) {
49  if (char_32_ptr == NULL) {
50  return 0;
51  }
52  int len = -1;
53  while (char_32_ptr[++len]);
54  return len;
55 }
56 
57 // compares two char_32 strings
58 int CubeUtils::StrCmp(const char_32 *str1, const char_32 *str2) {
59  const char_32 *pch1 = str1;
60  const char_32 *pch2 = str2;
61 
62  for (; (*pch1) != 0 && (*pch2) != 0; pch1++, pch2++) {
63  if ((*pch1) != (*pch2)) {
64  return (*pch1) - (*pch2);
65  }
66  }
67 
68  if ((*pch1) == 0) {
69  if ((*pch2) == 0) {
70  return 0;
71  } else {
72  return -1;
73  }
74  } else {
75  return 1;
76  }
77 }
78 
79 // Duplicates a 32-bit char buffer
81  int len = StrLen(str32);
82  char_32 *new_str = new char_32[len + 1];
83  if (new_str == NULL) {
84  return NULL;
85  }
86  memcpy(new_str, str32, len * sizeof(*str32));
87  new_str[len] = 0;
88  return new_str;
89 }
90 
91 // creates a char samp from a specified portion of the image
92 CharSamp *CubeUtils::CharSampleFromPix(Pix *pix, int left, int top,
93  int wid, int hgt) {
94  // get the raw img data from the image
95  unsigned char *temp_buff = GetImageData(pix, left, top, wid, hgt);
96  if (temp_buff == NULL) {
97  return NULL;
98  }
99 
100  // create a char samp from temp buffer
101  CharSamp *char_samp = CharSamp::FromRawData(left, top, wid, hgt, temp_buff);
102 
103  // clean up temp buffer
104  delete []temp_buff;
105  return char_samp;
106 }
107 
108 // create a B/W image from a char_sample
110  // parameter check
111  if (char_samp == NULL) {
112  return NULL;
113  }
114 
115  // get the raw data
116  int stride = char_samp->Stride();
117  int wid = char_samp->Width();
118  int hgt = char_samp->Height();
119 
120  Pix *pix = pixCreate(wid, hgt, 1);
121  if (pix == NULL) {
122  return NULL;
123  }
124 
125  // copy the contents
126  unsigned char *line = char_samp->RawData();
127  for (int y = 0; y < hgt ; y++, line += stride) {
128  for (int x = 0; x < wid; x++) {
129  if (line[x] != 0) {
130  pixSetPixel(pix, x, y, 0);
131  } else {
132  pixSetPixel(pix, x, y, 255);
133  }
134  }
135  }
136 
137  return pix;
138 }
139 
140 // creates a raw buffer from the specified location of the pix
141 unsigned char *CubeUtils::GetImageData(Pix *pix, int left, int top,
142  int wid, int hgt) {
143  // skip invalid dimensions
144  if (left < 0 || top < 0 || wid < 0 || hgt < 0 ||
145  (left + wid) > pix->w || (top + hgt) > pix->h ||
146  pix->d != 1) {
147  return NULL;
148  }
149 
150  // copy the char img to a temp buffer
151  unsigned char *temp_buff = new unsigned char[wid * hgt];
152  if (temp_buff == NULL) {
153  return NULL;
154  }
155  l_int32 w;
156  l_int32 h;
157  l_int32 d;
158  l_int32 wpl;
159  l_uint32 *line;
160  l_uint32 *data;
161 
162  pixGetDimensions(pix, &w, &h, &d);
163  wpl = pixGetWpl(pix);
164  data = pixGetData(pix);
165  line = data + (top * wpl);
166 
167  for (int y = 0, off = 0; y < hgt ; y++) {
168  for (int x = 0; x < wid; x++, off++) {
169  temp_buff[off] = GET_DATA_BIT(line, x + left) ? 0 : 255;
170  }
171  line += wpl;
172  }
173  return temp_buff;
174 }
175 
176 // read file contents to a string
177 bool CubeUtils::ReadFileToString(const string &file_name, string *str) {
178  str->clear();
179  FILE *fp = fopen(file_name.c_str(), "rb");
180  if (fp == NULL) {
181  return false;
182  }
183 
184  // get the size of the size
185  fseek(fp, 0, SEEK_END);
186  int file_size = ftell(fp);
187  if (file_size < 1) {
188  fclose(fp);
189  return false;
190  }
191  // adjust string size
192  str->reserve(file_size);
193  // read the contents
194  rewind(fp);
195  char *buff = new char[file_size];
196  if (buff == NULL) {
197  fclose(fp);
198  return false;
199  }
200  int read_bytes = fread(buff, 1, static_cast<int>(file_size), fp);
201  if (read_bytes == file_size) {
202  str->append(buff, file_size);
203  }
204  delete []buff;
205  fclose(fp);
206  return (read_bytes == file_size);
207 }
208 
209 // splits a string into vectors based on specified delimiters
210 void CubeUtils::SplitStringUsing(const string &str,
211  const string &delims,
212  vector<string> *str_vec) {
213  // Optimize the common case where delims is a single character.
214  if (delims[0] != '\0' && delims[1] == '\0') {
215  char c = delims[0];
216  const char* p = str.data();
217  const char* end = p + str.size();
218  while (p != end) {
219  if (*p == c) {
220  ++p;
221  } else {
222  const char* start = p;
223  while (++p != end && *p != c);
224  str_vec->push_back(string(start, p - start));
225  }
226  }
227  return;
228  }
229 
230  string::size_type begin_index, end_index;
231  begin_index = str.find_first_not_of(delims);
232  while (begin_index != string::npos) {
233  end_index = str.find_first_of(delims, begin_index);
234  if (end_index == string::npos) {
235  str_vec->push_back(str.substr(begin_index));
236  return;
237  }
238  str_vec->push_back(str.substr(begin_index, (end_index - begin_index)));
239  begin_index = str.find_first_not_of(delims, end_index);
240  }
241 }
242 
243 // UTF-8 to UTF-32 convesion functions
244 void CubeUtils::UTF8ToUTF32(const char *utf8_str, string_32 *str32) {
245  str32->clear();
246  int len = strlen(utf8_str);
247  int step = 0;
248  for (int ch = 0; ch < len; ch += step) {
249  step = UNICHAR::utf8_step(utf8_str + ch);
250  if (step > 0) {
251  UNICHAR uni_ch(utf8_str + ch, step);
252  (*str32) += uni_ch.first_uni();
253  }
254  }
255 }
256 
257 // UTF-8 to UTF-32 convesion functions
258 void CubeUtils::UTF32ToUTF8(const char_32 *utf32_str, string *str) {
259  str->clear();
260  for (const char_32 *ch_32 = utf32_str; (*ch_32) != 0; ch_32++) {
261  UNICHAR uni_ch((*ch_32));
262  char *utf8 = uni_ch.utf8_str();
263  if (utf8 != NULL) {
264  (*str) += utf8;
265  delete []utf8;
266  }
267  }
268 }
269 
270 bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set) {
271  bool all_one_case = true;
272  bool capitalized;
273  bool prev_upper;
274  bool prev_lower;
275  bool first_upper;
276  bool first_lower;
277  bool cur_upper;
278  bool cur_lower;
279 
280  string str8;
281  if (!char_set) {
282  // If cube char_set is missing, use C-locale-dependent functions
283  // on UTF8 characters to determine case properties.
284  first_upper = isupper(str32[0]);
285  first_lower = islower(str32[0]);
286  if (first_upper)
287  capitalized = true;
288  prev_upper = first_upper;
289  prev_lower = islower(str32[0]);
290  for (int c = 1; str32[c] != 0; ++c) {
291  cur_upper = isupper(str32[c]);
292  cur_lower = islower(str32[c]);
293  if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
294  all_one_case = false;
295  if (cur_upper)
296  capitalized = false;
297  prev_upper = cur_upper;
298  prev_lower = cur_lower;
299  }
300  } else {
301  UNICHARSET *unicharset = char_set->InternalUnicharset();
302  // Use UNICHARSET functions to determine case properties
303  first_upper = unicharset->get_isupper(char_set->ClassID(str32[0]));
304  first_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
305  if (first_upper)
306  capitalized = true;
307  prev_upper = first_upper;
308  prev_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
309 
310  for (int c = 1; c < StrLen(str32); ++c) {
311  cur_upper = unicharset->get_isupper(char_set->ClassID(str32[c]));
312  cur_lower = unicharset->get_islower(char_set->ClassID(str32[c]));
313  if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
314  all_one_case = false;
315  if (cur_upper)
316  capitalized = false;
317  prev_upper = cur_upper;
318  prev_lower = cur_lower;
319  }
320  }
321  return all_one_case || capitalized;
322 }
323 
324 char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set) {
325  if (!char_set) {
326  return NULL;
327  }
328  UNICHARSET *unicharset = char_set->InternalUnicharset();
329  int len = StrLen(str32);
330  char_32 *lower = new char_32[len + 1];
331  if (!lower)
332  return NULL;
333  for (int i = 0; i < len; ++i) {
334  char_32 ch = str32[i];
335  if (ch == INVALID_UNICHAR_ID) {
336  delete [] lower;
337  return NULL;
338  }
339  // convert upper-case characters to lower-case
340  if (unicharset->get_isupper(char_set->ClassID(ch))) {
341  UNICHAR_ID uid_lower = unicharset->get_other_case(char_set->ClassID(ch));
342  const char_32 *str32_lower = char_set->ClassString(uid_lower);
343  // expect lower-case version of character to be a single character
344  if (!str32_lower || StrLen(str32_lower) != 1) {
345  delete [] lower;
346  return NULL;
347  }
348  lower[i] = str32_lower[0];
349  } else {
350  lower[i] = ch;
351  }
352  }
353  lower[len] = 0;
354  return lower;
355 }
356 
357 char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set) {
358  if (!char_set) {
359  return NULL;
360  }
361  UNICHARSET *unicharset = char_set->InternalUnicharset();
362  int len = StrLen(str32);
363  char_32 *upper = new char_32[len + 1];
364  if (!upper)
365  return NULL;
366  for (int i = 0; i < len; ++i) {
367  char_32 ch = str32[i];
368  if (ch == INVALID_UNICHAR_ID) {
369  delete [] upper;
370  return NULL;
371  }
372  // convert lower-case characters to upper-case
373  if (unicharset->get_islower(char_set->ClassID(ch))) {
374  UNICHAR_ID uid_upper = unicharset->get_other_case(char_set->ClassID(ch));
375  const char_32 *str32_upper = char_set->ClassString(uid_upper);
376  // expect upper-case version of character to be a single character
377  if (!str32_upper || StrLen(str32_upper) != 1) {
378  delete [] upper;
379  return NULL;
380  }
381  upper[i] = str32_upper[0];
382  } else {
383  upper[i] = ch;
384  }
385  }
386  upper[len] = 0;
387  return upper;
388 }
389 } // namespace tesseract
const char_32 * ClassString(int class_id) const
Definition: char_set.h:104
unsigned short Height() const
Definition: bmp_8.h:50
static char_32 * ToLower(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:324
static double Cost2Prob(int cost)
Definition: cube_utils.cpp:43
static char_32 * ToUpper(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:357
#define MIN_PROB
Definition: cube_const.h:28
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134
#define NULL
Definition: host.h:144
static int StrLen(const char_32 *str)
Definition: cube_utils.cpp:48
int ClassID(const char_32 *str) const
Definition: char_set.h:54
static char_32 * StrDup(const char_32 *str)
Definition: cube_utils.cpp:80
unsigned char * RawData() const
Definition: bmp_8.h:51
unsigned short Width() const
Definition: bmp_8.h:48
char * utf8_str() const
Definition: unichar.cpp:125
int first_uni() const
Definition: unichar.cpp:97
static CharSamp * CharSampleFromPix(Pix *pix, int left, int top, int wid, int hgt)
Definition: cube_utils.cpp:92
static int StrCmp(const char_32 *str1, const char_32 *str2)
Definition: cube_utils.cpp:58
static bool ReadFileToString(const string &file_name, string *str)
Definition: cube_utils.cpp:177
basic_string< char_32 > string_32
Definition: string_32.h:41
UNICHARSET * InternalUnicharset()
Definition: char_set.h:121
static Pix * PixFromCharSample(CharSamp *char_samp)
Definition: cube_utils.cpp:109
int UNICHAR_ID
Definition: unichar.h:33
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)
Definition: cube_utils.cpp:210
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
unsigned short Stride() const
Definition: bmp_8.h:49
signed int char_32
Definition: string_32.h:40
static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:270
#define PROB2COST_SCALE
Definition: cube_const.h:24
#define MIN_PROB_COST
Definition: cube_const.h:26
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
Definition: cube_utils.cpp:244
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:631
static int Prob2Cost(double prob_val)
Definition: cube_utils.cpp:35
static CharSamp * FromRawData(int left, int top, int wid, int hgt, unsigned char *data)
Definition: char_samp.cpp:273
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)
Definition: cube_utils.cpp:258