tesseract  3.04.00
cluster.cpp File Reference
#include "const.h"
#include "cluster.h"
#include "emalloc.h"
#include "genericheap.h"
#include "helpers.h"
#include "kdpair.h"
#include "matrix.h"
#include "tprintf.h"
#include "danerror.h"
#include "freelist.h"
#include <math.h>

Go to the source code of this file.

Classes

struct  TEMPCLUSTER
 
struct  STATISTICS
 
struct  BUCKETS
 
struct  CHISTRUCT
 
struct  ClusteringContext
 

Macros

#define HOTELLING   1
 
#define FTABLE_X   10
 
#define FTABLE_Y   100
 
#define MINVARIANCE   0.0004
 
#define MINSAMPLESPERBUCKET   5
 
#define MINSAMPLES   (MINBUCKETS * MINSAMPLESPERBUCKET)
 
#define MINSAMPLESNEEDED   1
 
#define BUCKETTABLESIZE   1024
 
#define NORMALEXTENT   3.0
 
#define Odd(N)   ((N)%2)
 
#define Mirror(N, R)   ((R) - (N) - 1)
 
#define Abs(N)   ( ( (N) < 0 ) ? ( -(N) ) : (N) )
 
#define SqrtOf2Pi   2.506628275
 
#define LOOKUPTABLESIZE   8
 
#define MAXDEGREESOFFREEDOM   MAXBUCKETS
 
#define MAXNEIGHBORS   2
 
#define MAXDISTANCE   MAX_FLOAT32
 
#define CHIACCURACY   0.01
 
#define MINALPHA   (1e-200)
 
#define INITIALDELTA   0.1
 
#define DELTARATIO   0.1
 
#define ILLEGAL_CHAR   2
 

Typedefs

typedef tesseract::KDPairInc< float, TEMPCLUSTER * > ClusterPair
 
typedef tesseract::GenericHeap< ClusterPairClusterHeap
 
typedef FLOAT64(* DENSITYFUNC) (inT32)
 
typedef FLOAT64(* SOLVEFUNC) (CHISTRUCT *, double)
 

Functions

void CreateClusterTree (CLUSTERER *Clusterer)
 
void MakePotentialClusters (ClusteringContext *context, CLUSTER *Cluster, inT32 Level)
 
CLUSTERFindNearestNeighbor (KDTREE *Tree, CLUSTER *Cluster, FLOAT32 *Distance)
 
CLUSTERMakeNewCluster (CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster)
 
inT32 MergeClusters (inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
 
void ComputePrototypes (CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
PROTOTYPEMakePrototype (CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster)
 
PROTOTYPEMakeDegenerateProto (uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, inT32 MinSamples)
 
PROTOTYPETestEllipticalProto (CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPEMakeSphericalProto (CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
 
PROTOTYPEMakeEllipticalProto (CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
 
PROTOTYPEMakeMixedProto (CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *NormalBuckets, FLOAT64 Confidence)
 
void MakeDimRandom (uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc)
 
void MakeDimUniform (uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics)
 
STATISTICSComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER *Cluster)
 
PROTOTYPENewSphericalProto (uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPENewEllipticalProto (inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPENewMixedProto (inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPENewSimpleProto (inT16 N, CLUSTER *Cluster)
 
BOOL8 Independent (PARAM_DESC ParamDesc[], inT16 N, FLOAT32 *CoVariance, FLOAT32 Independence)
 
BUCKETSGetBuckets (CLUSTERER *clusterer, DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
 
BUCKETSMakeBuckets (DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
 
uinT16 OptimumNumberOfBuckets (uinT32 SampleCount)
 
FLOAT64 ComputeChiSquared (uinT16 DegreesOfFreedom, FLOAT64 Alpha)
 
FLOAT64 NormalDensity (inT32 x)
 
FLOAT64 UniformDensity (inT32 x)
 
FLOAT64 Integral (FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx)
 
void FillBuckets (BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
 
uinT16 NormalBucket (PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
 
uinT16 UniformBucket (PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
 
BOOL8 DistributionOK (BUCKETS *Buckets)
 
void FreeStatistics (STATISTICS *Statistics)
 
void FreeBuckets (BUCKETS *Buckets)
 
void FreeCluster (CLUSTER *Cluster)
 
uinT16 DegreesOfFreedom (DISTRIBUTION Distribution, uinT16 HistogramBuckets)
 
int NumBucketsMatch (void *arg1, void *arg2)
 
int ListEntryMatch (void *arg1, void *arg2)
 
void AdjustBuckets (BUCKETS *Buckets, uinT32 NewSampleCount)
 
void InitBuckets (BUCKETS *Buckets)
 
int AlphaMatch (void *arg1, void *arg2)
 
CHISTRUCTNewChiStruct (uinT16 DegreesOfFreedom, FLOAT64 Alpha)
 
FLOAT64 Solve (SOLVEFUNC Function, void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
 
FLOAT64 ChiArea (CHISTRUCT *ChiParams, FLOAT64 x)
 
BOOL8 MultipleCharSamples (CLUSTERER *Clusterer, CLUSTER *Cluster, FLOAT32 MaxIllegal)
 
double InvertMatrix (const float *input, int size, float *inv)
 
CLUSTERERMakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[])
 
SAMPLEMakeSample (CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
 
LIST ClusterSamples (CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
void FreeClusterer (CLUSTERER *Clusterer)
 
void FreeProtoList (LIST *ProtoList)
 
void FreePrototype (void *arg)
 
CLUSTERNextSample (LIST *SearchState)
 
FLOAT32 Mean (PROTOTYPE *Proto, uinT16 Dimension)
 
FLOAT32 StandardDeviation (PROTOTYPE *Proto, uinT16 Dimension)
 
inT32 MergeClusters (inT16 N, PARAM_DESC ParamDesc[], inT32 n1, inT32 n2, FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[])
 

Variables

const double FTable [FTABLE_Y][FTABLE_X]
 

Macro Definition Documentation

#define Abs (   N)    ( ( (N) < 0 ) ? ( -(N) ) : (N) )

Definition at line 208 of file cluster.cpp.

#define BUCKETTABLESIZE   1024

Definition at line 160 of file cluster.cpp.

#define CHIACCURACY   0.01
#define DELTARATIO   0.1
#define FTABLE_X   10

Definition at line 31 of file cluster.cpp.

#define FTABLE_Y   100

Definition at line 32 of file cluster.cpp.

#define HOTELLING   1

Definition at line 30 of file cluster.cpp.

#define ILLEGAL_CHAR   2
#define INITIALDELTA   0.1
#define LOOKUPTABLESIZE   8

Definition at line 228 of file cluster.cpp.

#define MAXDEGREESOFFREEDOM   MAXBUCKETS

Definition at line 229 of file cluster.cpp.

#define MAXDISTANCE   MAX_FLOAT32
#define MAXNEIGHBORS   2
#define MINALPHA   (1e-200)
#define MINSAMPLES   (MINBUCKETS * MINSAMPLESPERBUCKET)

Definition at line 151 of file cluster.cpp.

#define MINSAMPLESNEEDED   1

Definition at line 152 of file cluster.cpp.

#define MINSAMPLESPERBUCKET   5

Definition at line 150 of file cluster.cpp.

#define MINVARIANCE   0.0004

Definition at line 142 of file cluster.cpp.

#define Mirror (   N,
 
)    ((R) - (N) - 1)

Definition at line 207 of file cluster.cpp.

#define NORMALEXTENT   3.0

Definition at line 161 of file cluster.cpp.

#define Odd (   N)    ((N)%2)

Definition at line 206 of file cluster.cpp.

#define SqrtOf2Pi   2.506628275

Definition at line 218 of file cluster.cpp.

Typedef Documentation

Definition at line 169 of file cluster.cpp.

Definition at line 168 of file cluster.cpp.

typedef FLOAT64(* DENSITYFUNC) (inT32)

Definition at line 203 of file cluster.cpp.

typedef FLOAT64(* SOLVEFUNC) (CHISTRUCT *, double)

Definition at line 204 of file cluster.cpp.

Function Documentation

void AdjustBuckets ( BUCKETS Buckets,
uinT32  NewSampleCount 
)

Definition at line 2361 of file cluster.cpp.

2361  {
2362 /*
2363  ** Parameters:
2364  ** Buckets histogram data structure to adjust
2365  ** NewSampleCount new sample count to adjust to
2366  ** Operation:
2367  ** This routine multiplies each ExpectedCount histogram entry
2368  ** by NewSampleCount/OldSampleCount so that the histogram
2369  ** is now adjusted to the new sample count.
2370  ** Return: none
2371  ** Exceptions: none
2372  ** History: Thu Aug 3 14:31:14 1989, DSJ, Created.
2373  */
2374  int i;
2375  FLOAT64 AdjustFactor;
2376 
2377  AdjustFactor = (((FLOAT64) NewSampleCount) /
2378  ((FLOAT64) Buckets->SampleCount));
2379 
2380  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2381  Buckets->ExpectedCount[i] *= AdjustFactor;
2382  }
2383 
2384  Buckets->SampleCount = NewSampleCount;
2385 
2386 } // AdjustBuckets
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
FLOAT32 * ExpectedCount
Definition: cluster.cpp:186
uinT32 SampleCount
Definition: cluster.cpp:180
double FLOAT64
Definition: host.h:112
int AlphaMatch ( void *  arg1,
void *  arg2 
)

Definition at line 2411 of file cluster.cpp.

2412  { //CHISTRUCT *SearchKey)
2413 /*
2414  ** Parameters:
2415  ** ChiStruct chi-squared struct being tested for a match
2416  ** SearchKey chi-squared struct that is the search key
2417  ** Operation:
2418  ** This routine is used to search a list of structures which
2419  ** hold pre-computed chi-squared values for a chi-squared
2420  ** value whose corresponding alpha field matches the alpha
2421  ** field of SearchKey.
2422  ** It is called by the list search routines.
2423  ** Return: TRUE if ChiStruct's Alpha matches SearchKey's Alpha
2424  ** Exceptions: none
2425  ** History: Thu Aug 3 14:17:33 1989, DSJ, Created.
2426  */
2427  CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1;
2428  CHISTRUCT *SearchKey = (CHISTRUCT *) arg2;
2429 
2430  return (ChiStruct->Alpha == SearchKey->Alpha);
2431 
2432 } // AlphaMatch
FLOAT64 Alpha
Definition: cluster.cpp:191
FLOAT64 ChiArea ( CHISTRUCT ChiParams,
FLOAT64  x 
)

Definition at line 2525 of file cluster.cpp.

2525  {
2526 /*
2527  ** Parameters:
2528  ** ChiParams contains degrees of freedom and alpha
2529  ** x value of chi-squared to evaluate
2530  ** Operation:
2531  ** This routine computes the area under a chi density curve
2532  ** from 0 to x, minus the desired area under the curve. The
2533  ** number of degrees of freedom of the chi curve is specified
2534  ** in the ChiParams structure. The desired area is also
2535  ** specified in the ChiParams structure as Alpha ( or 1 minus
2536  ** the desired area ). This routine is intended to be passed
2537  ** to the Solve() function to find the value of chi-squared
2538  ** which will yield a desired area under the right tail of
2539  ** the chi density curve. The function will only work for
2540  ** even degrees of freedom. The equations are based on
2541  ** integrating the chi density curve in parts to obtain
2542  ** a series that can be used to compute the area under the
2543  ** curve.
2544  ** Return: Error between actual and desired area under the chi curve.
2545  ** Exceptions: none
2546  ** History: Fri Aug 4 12:48:41 1989, DSJ, Created.
2547  */
2548  int i, N;
2549  FLOAT64 SeriesTotal;
2550  FLOAT64 Denominator;
2551  FLOAT64 PowerOfx;
2552 
2553  N = ChiParams->DegreesOfFreedom / 2 - 1;
2554  SeriesTotal = 1;
2555  Denominator = 1;
2556  PowerOfx = 1;
2557  for (i = 1; i <= N; i++) {
2558  Denominator *= 2 * i;
2559  PowerOfx *= x;
2560  SeriesTotal += PowerOfx / Denominator;
2561  }
2562  return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);
2563 
2564 } // ChiArea
uinT16 DegreesOfFreedom
Definition: cluster.cpp:190
double FLOAT64
Definition: host.h:112
FLOAT64 Alpha
Definition: cluster.cpp:191
LIST ClusterSamples ( CLUSTERER Clusterer,
CLUSTERCONFIG Config 
)

ClusterSamples *********************************************************** Parameters: Clusterer data struct containing samples to be clustered Config parameters which control clustering process Operation: This routine first checks to see if the samples in this clusterer have already been clustered before; if so, it does not bother to recreate the cluster tree. It simply recomputes the prototypes based on the new Config info. If the samples have not been clustered before, the samples in the KD tree are formed into a cluster tree and then the prototypes are computed from the cluster tree. In either case this routine returns a pointer to a list of prototypes that best represent the samples given the constraints specified in Config. Return: Pointer to a list of prototypes Exceptions: None History: 5/29/89, DSJ, Created.

Definition at line 508 of file cluster.cpp.

508  {
509  //only create cluster tree if samples have never been clustered before
510  if (Clusterer->Root == NULL)
511  CreateClusterTree(Clusterer);
512 
513  //deallocate the old prototype list if one exists
514  FreeProtoList (&Clusterer->ProtoList);
515  Clusterer->ProtoList = NIL_LIST;
516 
517  //compute prototypes starting at the root node in the tree
518  ComputePrototypes(Clusterer, Config);
519  return (Clusterer->ProtoList);
520 } // ClusterSamples
void CreateClusterTree(CLUSTERER *Clusterer)
Definition: cluster.cpp:698
#define NULL
Definition: host.h:144
void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:931
CLUSTER * Root
Definition: cluster.h:91
#define NIL_LIST
Definition: oldlist.h:126
LIST ProtoList
Definition: cluster.h:92
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:564
FLOAT64 ComputeChiSquared ( uinT16  DegreesOfFreedom,
FLOAT64  Alpha 
)

Definition at line 1884 of file cluster.cpp.

1907 {
1908  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];
1909 
1910  CHISTRUCT *OldChiSquared;
1911  CHISTRUCT SearchKey;
1912 
1913  // limit the minimum alpha that can be used - if alpha is too small
1914  // it may not be possible to compute chi-squared.
1915  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
1916  if (Odd (DegreesOfFreedom))
1917  DegreesOfFreedom++;
1918 
1919  /* find the list of chi-squared values which have already been computed
1920  for the specified number of degrees of freedom. Search the list for
1921  the desired chi-squared. */
1922  SearchKey.Alpha = Alpha;
1923  OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom],
1924  &SearchKey, AlphaMatch));
1925 
1926  if (OldChiSquared == NULL) {
1927  OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
1928  OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,
1929  (FLOAT64) DegreesOfFreedom,
1930  (FLOAT64) CHIACCURACY);
1931  ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],
1932  OldChiSquared);
1933  }
1934  else {
1935  // further optimization might move OldChiSquared to front of list
1936  }
1937 
1938  return (OldChiSquared->ChiSquared);
1939 
1940 } // ComputeChiSquared
#define MAXDEGREESOFFREEDOM
Definition: cluster.cpp:229
#define CHIACCURACY
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
FLOAT64 ChiSquared
Definition: cluster.cpp:192
#define NULL
Definition: host.h:144
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:413
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2290
#define MINALPHA
CHISTRUCT * NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:2436
double FLOAT64
Definition: host.h:112
FLOAT64 Alpha
Definition: cluster.cpp:191
FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x)
Definition: cluster.cpp:2525
#define Odd(N)
Definition: cluster.cpp:206
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:115
#define first_node(l)
Definition: oldlist.h:139
int AlphaMatch(void *arg1, void *arg2)
Definition: cluster.cpp:2411
FLOAT64 Solve(SOLVEFUNC Function, void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
Definition: cluster.cpp:2461
void ComputePrototypes ( CLUSTERER Clusterer,
CLUSTERCONFIG Config 
)

ComputePrototypes ******************************************************* Parameters: Clusterer data structure holding cluster tree Config parameters used to control prototype generation Operation: This routine decides which clusters in the cluster tree should be represented by prototypes, forms a list of these prototypes, and places the list in the Clusterer data structure. Return: None Exceptions: None History: 5/30/89, DSJ, Created.

Definition at line 931 of file cluster.cpp.

931  {
932  LIST ClusterStack = NIL_LIST;
933  CLUSTER *Cluster;
934  PROTOTYPE *Prototype;
935 
936  // use a stack to keep track of clusters waiting to be processed
937  // initially the only cluster on the stack is the root cluster
938  if (Clusterer->Root != NULL)
939  ClusterStack = push (NIL_LIST, Clusterer->Root);
940 
941  // loop until we have analyzed all clusters which are potential prototypes
942  while (ClusterStack != NIL_LIST) {
943  // remove the next cluster to be analyzed from the stack
944  // try to make a prototype from the cluster
945  // if successful, put it on the proto list, else split the cluster
946  Cluster = (CLUSTER *) first_node (ClusterStack);
947  ClusterStack = pop (ClusterStack);
948  Prototype = MakePrototype(Clusterer, Config, Cluster);
949  if (Prototype != NULL) {
950  Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
951  }
952  else {
953  ClusterStack = push (ClusterStack, Cluster->Right);
954  ClusterStack = push (ClusterStack, Cluster->Left);
955  }
956  }
957 } // ComputePrototypes
PROTOTYPE * MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster)
Definition: cluster.cpp:978
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
#define NULL
Definition: host.h:144
LIST pop(LIST list)
Definition: oldlist.cpp:305
Definition: cluster.h:32
CLUSTER * Root
Definition: cluster.h:91
struct sample * Right
Definition: cluster.h:37
struct sample * Left
Definition: cluster.h:36
#define NIL_LIST
Definition: oldlist.h:126
#define first_node(l)
Definition: oldlist.h:139
LIST ProtoList
Definition: cluster.h:92
STATISTICS * ComputeStatistics ( inT16  N,
PARAM_DESC  ParamDesc[],
CLUSTER Cluster 
)

ComputeStatistics ********************************************************* Parameters: N number of dimensions ParamDesc array of dimension descriptions Cluster cluster whose stats are to be computed Operation: This routine searches the cluster tree for all leaf nodes which are samples in the specified cluster. It computes a full covariance matrix for these samples as well as keeping track of the ranges (min and max) for each dimension. A special data structure is allocated to return this information to the caller. An incremental algorithm for computing statistics is not used because it will not work with circular dimensions. Return: Pointer to new data structure containing statistics Exceptions: None History: 6/2/89, DSJ, Created.

Definition at line 1426 of file cluster.cpp.

1426  {
1427  STATISTICS *Statistics;
1428  int i, j;
1429  FLOAT32 *CoVariance;
1430  FLOAT32 *Distance;
1431  LIST SearchState;
1432  SAMPLE *Sample;
1433  uinT32 SampleCountAdjustedForBias;
1434 
1435  // allocate memory to hold the statistics results
1436  Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS));
1437  Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32));
1438  Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1439  Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1440 
1441  // allocate temporary memory to hold the sample to mean distances
1442  Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1443 
1444  // initialize the statistics
1445  Statistics->AvgVariance = 1.0;
1446  CoVariance = Statistics->CoVariance;
1447  for (i = 0; i < N; i++) {
1448  Statistics->Min[i] = 0.0;
1449  Statistics->Max[i] = 0.0;
1450  for (j = 0; j < N; j++, CoVariance++)
1451  *CoVariance = 0;
1452  }
1453  // find each sample in the cluster and merge it into the statistics
1454  InitSampleSearch(SearchState, Cluster);
1455  while ((Sample = NextSample (&SearchState)) != NULL) {
1456  for (i = 0; i < N; i++) {
1457  Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
1458  if (ParamDesc[i].Circular) {
1459  if (Distance[i] > ParamDesc[i].HalfRange)
1460  Distance[i] -= ParamDesc[i].Range;
1461  if (Distance[i] < -ParamDesc[i].HalfRange)
1462  Distance[i] += ParamDesc[i].Range;
1463  }
1464  if (Distance[i] < Statistics->Min[i])
1465  Statistics->Min[i] = Distance[i];
1466  if (Distance[i] > Statistics->Max[i])
1467  Statistics->Max[i] = Distance[i];
1468  }
1469  CoVariance = Statistics->CoVariance;
1470  for (i = 0; i < N; i++)
1471  for (j = 0; j < N; j++, CoVariance++)
1472  *CoVariance += Distance[i] * Distance[j];
1473  }
1474  // normalize the variances by the total number of samples
1475  // use SampleCount-1 instead of SampleCount to get an unbiased estimate
1476  // also compute the geometic mean of the diagonal variances
1477  // ensure that clusters with only 1 sample are handled correctly
1478  if (Cluster->SampleCount > 1)
1479  SampleCountAdjustedForBias = Cluster->SampleCount - 1;
1480  else
1481  SampleCountAdjustedForBias = 1;
1482  CoVariance = Statistics->CoVariance;
1483  for (i = 0; i < N; i++)
1484  for (j = 0; j < N; j++, CoVariance++) {
1485  *CoVariance /= SampleCountAdjustedForBias;
1486  if (j == i) {
1487  if (*CoVariance < MINVARIANCE)
1488  *CoVariance = MINVARIANCE;
1489  Statistics->AvgVariance *= *CoVariance;
1490  }
1491  }
1492  Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance,
1493  1.0 / N);
1494 
1495  // release temporary memory and return
1496  memfree(Distance);
1497  return (Statistics);
1498 } // ComputeStatistics
FLOAT32 Mean[1]
Definition: cluster.h:39
#define NULL
Definition: host.h:144
unsigned int uinT32
Definition: host.h:103
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:618
Definition: cluster.h:32
#define MINVARIANCE
Definition: cluster.cpp:142
FLOAT32 * Min
Definition: cluster.cpp:174
FLOAT32 AvgVariance
Definition: cluster.cpp:172
FLOAT32 * CoVariance
Definition: cluster.cpp:173
void * Emalloc(int Size)
Definition: emalloc.cpp:35
FLOAT32 * Max
Definition: cluster.cpp:175
FLOAT32 Range
Definition: ocrfeatures.h:51
void memfree(void *element)
Definition: freelist.cpp:30
#define InitSampleSearch(S, C)
Definition: cluster.h:105
float FLOAT32
Definition: host.h:111
unsigned SampleCount
Definition: cluster.h:35
void CreateClusterTree ( CLUSTERER Clusterer)

CreateClusterTree ******************************************************* Parameters: Clusterer data structure holdings samples to be clustered Operation: This routine performs a bottoms-up clustering on the samples held in the kd-tree of the Clusterer data structure. The result is a cluster tree. Each node in the tree represents a cluster which conceptually contains a subset of the samples. More precisely, the cluster contains all of the samples which are contained in its two sub-clusters. The leaves of the tree are the individual samples themselves; they have no sub-clusters. The root node of the tree conceptually contains all of the samples. Return: None (the Clusterer data structure is changed) Exceptions: None History: 5/29/89, DSJ, Created.

Definition at line 698 of file cluster.cpp.

698  {
699  ClusteringContext context;
700  ClusterPair HeapEntry;
701  TEMPCLUSTER *PotentialCluster;
702 
703  // each sample and its nearest neighbor form a "potential" cluster
704  // save these in a heap with the "best" potential clusters on top
705  context.tree = Clusterer->KDTree;
706  context.candidates = (TEMPCLUSTER *)
707  Emalloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER));
708  context.next = 0;
709  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
710  KDWalk(context.tree, (void_proc)MakePotentialClusters, &context);
711 
712  // form potential clusters into actual clusters - always do "best" first
713  while (context.heap->Pop(&HeapEntry)) {
714  PotentialCluster = HeapEntry.data;
715 
716  // if main cluster of potential cluster is already in another cluster
717  // then we don't need to worry about it
718  if (PotentialCluster->Cluster->Clustered) {
719  continue;
720  }
721 
722  // if main cluster is not yet clustered, but its nearest neighbor is
723  // then we must find a new nearest neighbor
724  else if (PotentialCluster->Neighbor->Clustered) {
725  PotentialCluster->Neighbor =
726  FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
727  &HeapEntry.key);
728  if (PotentialCluster->Neighbor != NULL) {
729  context.heap->Push(&HeapEntry);
730  }
731  }
732 
733  // if neither cluster is already clustered, form permanent cluster
734  else {
735  PotentialCluster->Cluster =
736  MakeNewCluster(Clusterer, PotentialCluster);
737  PotentialCluster->Neighbor =
738  FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
739  &HeapEntry.key);
740  if (PotentialCluster->Neighbor != NULL) {
741  context.heap->Push(&HeapEntry);
742  }
743  }
744  }
745 
746  // the root node in the cluster tree is now the only node in the kd-tree
747  Clusterer->Root = (CLUSTER *) RootOf(Clusterer->KDTree);
748 
749  // free up the memory used by the K-D tree, heap, and temp clusters
750  FreeKDTree(context.tree);
751  Clusterer->KDTree = NULL;
752  delete context.heap;
753  memfree(context.candidates);
754 } // CreateClusterTree
CLUSTER * FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, FLOAT32 *Distance)
Definition: cluster.cpp:802
CLUSTER * Cluster
Definition: cluster.cpp:164
KDTREE * KDTree
Definition: cluster.h:90
tesseract::GenericHeap< ClusterPair > ClusterHeap
Definition: cluster.cpp:169
#define NULL
Definition: host.h:144
void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, inT32 Level)
Definition: cluster.cpp:768
CLUSTER * MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster)
Definition: cluster.cpp:841
inT32 NumberOfSamples
Definition: cluster.h:89
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:346
TEMPCLUSTER * candidates
Definition: cluster.cpp:198
Definition: cluster.h:32
CLUSTER * Root
Definition: cluster.h:91
ClusterHeap * heap
Definition: cluster.cpp:197
unsigned Clustered
Definition: cluster.h:33
CLUSTER * Neighbor
Definition: cluster.cpp:165
void * Emalloc(int Size)
Definition: emalloc.cpp:35
void KDWalk(KDTREE *Tree, void_proc action, void *context)
Definition: kdtree.cpp:339
bool Pop(Pair *entry)
Definition: genericheap.h:116
void memfree(void *element)
Definition: freelist.cpp:30
void Push(Pair *entry)
Definition: genericheap.h:95
void(* void_proc)(...)
Definition: cutil.h:66
#define RootOf(T)
Definition: kdtree.h:58
uinT16 DegreesOfFreedom ( DISTRIBUTION  Distribution,
uinT16  HistogramBuckets 
)

Definition at line 2290 of file cluster.cpp.

2290  {
2291 /*
2292  ** Parameters:
2293  ** Distribution distribution being tested for
2294  ** HistogramBuckets number of buckets in chi-square test
2295  ** Operation:
2296  ** This routine computes the degrees of freedom that should
2297  ** be used in a chi-squared test with the specified number of
2298  ** histogram buckets. The result is always rounded up to
2299  ** the next even number so that the value of chi-squared can be
2300  ** computed more easily. This will cause the value of
2301  ** chi-squared to be higher than the optimum value, resulting
2302  ** in the chi-square test being more lenient than optimum.
2303  ** Return: The number of degrees of freedom for a chi-square test
2304  ** Exceptions: none
2305  ** History: Thu Aug 3 14:04:18 1989, DSJ, Created.
2306  */
2307  static uinT8 DegreeOffsets[] = { 3, 3, 1 };
2308 
2309  uinT16 AdjustedNumBuckets;
2310 
2311  AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution];
2312  if (Odd (AdjustedNumBuckets))
2313  AdjustedNumBuckets++;
2314  return (AdjustedNumBuckets);
2315 
2316 } // DegreesOfFreedom
unsigned short uinT16
Definition: host.h:101
#define Odd(N)
Definition: cluster.cpp:206
unsigned char uinT8
Definition: host.h:99
BOOL8 DistributionOK ( BUCKETS Buckets)

Definition at line 2191 of file cluster.cpp.

2191  {
2192 /*
2193  ** Parameters:
2194  ** Buckets histogram data to perform chi-square test on
2195  ** Operation:
2196  ** This routine performs a chi-square goodness of fit test
2197  ** on the histogram data in the Buckets data structure. TRUE
2198  ** is returned if the histogram matches the probability
2199  ** distribution which was specified when the Buckets
2200  ** structure was originally created. Otherwise FALSE is
2201  ** returned.
2202  ** Return:
2203  ** TRUE if samples match distribution, FALSE otherwise
2204  ** Exceptions:
2205  ** None
2206  ** History:
2207  ** 6/5/89, DSJ, Created.
2208  */
2209  FLOAT32 FrequencyDifference;
2210  FLOAT32 TotalDifference;
2211  int i;
2212 
2213  // compute how well the histogram matches the expected histogram
2214  TotalDifference = 0.0;
2215  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2216  FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
2217  TotalDifference += (FrequencyDifference * FrequencyDifference) /
2218  Buckets->ExpectedCount[i];
2219  }
2220 
2221  // test to see if the difference is more than expected
2222  if (TotalDifference > Buckets->ChiSquared)
2223  return FALSE;
2224  else
2225  return TRUE;
2226 } // DistributionOK
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
#define FALSE
Definition: capi.h:29
FLOAT32 * ExpectedCount
Definition: cluster.cpp:186
FLOAT64 ChiSquared
Definition: cluster.cpp:182
#define TRUE
Definition: capi.h:28
float FLOAT32
Definition: host.h:111
uinT32 * Count
Definition: cluster.cpp:185
void FillBuckets ( BUCKETS Buckets,
CLUSTER Cluster,
uinT16  Dim,
PARAM_DESC ParamDesc,
FLOAT32  Mean,
FLOAT32  StdDev 
)

Definition at line 2019 of file cluster.cpp.

2024  {
2025 /*
2026  ** Parameters:
2027  ** Buckets histogram buckets to count samples
2028  ** Cluster cluster whose samples are being analyzed
2029  ** Dim dimension of samples which is being analyzed
2030  ** ParamDesc description of the dimension
2031  ** Mean "mean" of the distribution
2032  ** StdDev "standard deviation" of the distribution
2033  ** Operation:
2034  ** This routine counts the number of cluster samples which
2035  ** fall within the various histogram buckets in Buckets. Only
2036  ** one dimension of each sample is examined. The exact meaning
2037  ** of the Mean and StdDev parameters depends on the
2038  ** distribution which is being analyzed (this info is in the
2039  ** Buckets data structure). For normal distributions, Mean
2040  ** and StdDev have the expected meanings. For uniform and
2041  ** random distributions the Mean is the center point of the
2042  ** range and the StdDev is 1/2 the range. A dimension with
2043  ** zero standard deviation cannot be statistically analyzed.
2044  ** In this case, a pseudo-analysis is used.
2045  ** Return:
2046  ** None (the Buckets data structure is filled in)
2047  ** Exceptions:
2048  ** None
2049  ** History:
2050  ** 6/5/89, DSJ, Created.
2051  */
2052  uinT16 BucketID;
2053  int i;
2054  LIST SearchState;
2055  SAMPLE *Sample;
2056 
2057  // initialize the histogram bucket counts to 0
2058  for (i = 0; i < Buckets->NumberOfBuckets; i++)
2059  Buckets->Count[i] = 0;
2060 
2061  if (StdDev == 0.0) {
2062  /* if the standard deviation is zero, then we can't statistically
2063  analyze the cluster. Use a pseudo-analysis: samples exactly on
2064  the mean are distributed evenly across all buckets. Samples greater
2065  than the mean are placed in the last bucket; samples less than the
2066  mean are placed in the first bucket. */
2067 
2068  InitSampleSearch(SearchState, Cluster);
2069  i = 0;
2070  while ((Sample = NextSample (&SearchState)) != NULL) {
2071  if (Sample->Mean[Dim] > Mean)
2072  BucketID = Buckets->NumberOfBuckets - 1;
2073  else if (Sample->Mean[Dim] < Mean)
2074  BucketID = 0;
2075  else
2076  BucketID = i;
2077  Buckets->Count[BucketID] += 1;
2078  i++;
2079  if (i >= Buckets->NumberOfBuckets)
2080  i = 0;
2081  }
2082  }
2083  else {
2084  // search for all samples in the cluster and add to histogram buckets
2085  InitSampleSearch(SearchState, Cluster);
2086  while ((Sample = NextSample (&SearchState)) != NULL) {
2087  switch (Buckets->Distribution) {
2088  case normal:
2089  BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],
2090  Mean, StdDev);
2091  break;
2092  case D_random:
2093  case uniform:
2094  BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],
2095  Mean, StdDev);
2096  break;
2097  default:
2098  BucketID = 0;
2099  }
2100  Buckets->Count[Buckets->Bucket[BucketID]] += 1;
2101  }
2102  }
2103 } // FillBuckets
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
uinT16 NormalBucket(PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2107
FLOAT32 Mean[1]
Definition: cluster.h:39
#define NULL
Definition: host.h:144
uinT16 Bucket[BUCKETTABLESIZE]
Definition: cluster.cpp:184
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:618
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:643
Definition: cluster.h:32
unsigned short uinT16
Definition: host.h:101
uinT16 UniformBucket(PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2149
#define InitSampleSearch(S, C)
Definition: cluster.h:105
DISTRIBUTION Distribution
Definition: cluster.cpp:179
Definition: cluster.h:59
uinT32 * Count
Definition: cluster.cpp:185
CLUSTER * FindNearestNeighbor ( KDTREE Tree,
CLUSTER Cluster,
FLOAT32 Distance 
)

FindNearestNeighbor ********************************************************* Parameters: Tree kd-tree to search in for nearest neighbor Cluster cluster whose nearest neighbor is to be found Distance ptr to variable to report distance found Operation: This routine searches the specified kd-tree for the nearest neighbor of the specified cluster. It actually uses the kd routines to find the 2 nearest neighbors since one of them will be the original cluster. A pointer to the nearest neighbor is returned, if it can be found, otherwise NULL is returned. The distance between the 2 nodes is placed in the specified variable. Return: Pointer to the nearest neighbor of Cluster, or NULL Exceptions: none History: 5/29/89, DSJ, Created. 7/13/89, DSJ, Removed visibility of kd-tree node data struct

Definition at line 802 of file cluster.cpp.

805 {
806  CLUSTER *Neighbor[MAXNEIGHBORS];
807  FLOAT32 Dist[MAXNEIGHBORS];
808  int NumberOfNeighbors;
809  inT32 i;
810  CLUSTER *BestNeighbor;
811 
812  // find the 2 nearest neighbors of the cluster
814  &NumberOfNeighbors, (void **)Neighbor, Dist);
815 
816  // search for the nearest neighbor that is not the cluster itself
817  *Distance = MAXDISTANCE;
818  BestNeighbor = NULL;
819  for (i = 0; i < NumberOfNeighbors; i++) {
820  if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
821  *Distance = Dist[i];
822  BestNeighbor = Neighbor[i];
823  }
824  }
825  return BestNeighbor;
826 } // FindNearestNeighbor
int inT32
Definition: host.h:102
#define MAXNEIGHBORS
void KDNearestNeighborSearch(KDTREE *Tree, FLOAT32 Query[], int QuerySize, FLOAT32 MaxDistance, int *NumberOfResults, void **NBuffer, FLOAT32 DBuffer[])
Definition: kdtree.cpp:307
FLOAT32 Mean[1]
Definition: cluster.h:39
#define NULL
Definition: host.h:144
Definition: cluster.h:32
#define MAXDISTANCE
float FLOAT32
Definition: host.h:111
void FreeBuckets ( BUCKETS Buckets)

Definition at line 2252 of file cluster.cpp.

2252  {
2253 /*
2254  ** Parameters:
2255  ** buckets pointer to data structure to be freed
2256  ** Operation:
2257  ** This routine properly frees the memory used by a BUCKETS.
2258  */
2259  Efree(buckets->Count);
2260  Efree(buckets->ExpectedCount);
2261  Efree(buckets);
2262 } // FreeBuckets
void Efree(void *ptr)
Definition: emalloc.cpp:85
void FreeCluster ( CLUSTER Cluster)

Definition at line 2266 of file cluster.cpp.

2266  {
2267 /*
2268  ** Parameters:
2269  ** Cluster pointer to cluster to be freed
2270  ** Operation:
2271  ** This routine frees the memory consumed by the specified
2272  ** cluster and all of its subclusters. This is done by
2273  ** recursive calls to FreeCluster().
2274  ** Return:
2275  ** None
2276  ** Exceptions:
2277  ** None
2278  ** History:
2279  ** 6/6/89, DSJ, Created.
2280  */
2281  if (Cluster != NULL) {
2282  FreeCluster (Cluster->Left);
2283  FreeCluster (Cluster->Right);
2284  memfree(Cluster);
2285  }
2286 } // FreeCluster
#define NULL
Definition: host.h:144
void FreeCluster(CLUSTER *Cluster)
Definition: cluster.cpp:2266
struct sample * Right
Definition: cluster.h:37
struct sample * Left
Definition: cluster.h:36
void memfree(void *element)
Definition: freelist.cpp:30
void FreeClusterer ( CLUSTERER Clusterer)

FreeClusterer ************************************************************* Parameters: Clusterer pointer to data structure to be freed Operation: This routine frees all of the memory allocated to the specified data structure. It will not, however, free the memory used by the prototype list. The pointers to the clusters for each prototype in the list will be set to NULL to indicate that the cluster data structures no longer exist. Any sample lists that have been obtained via calls to GetSamples are no longer valid. Return: None Exceptions: None History: 6/6/89, DSJ, Created.

Definition at line 536 of file cluster.cpp.

536  {
537  if (Clusterer != NULL) {
538  memfree (Clusterer->ParamDesc);
539  if (Clusterer->KDTree != NULL)
540  FreeKDTree (Clusterer->KDTree);
541  if (Clusterer->Root != NULL)
542  FreeCluster (Clusterer->Root);
543  // Free up all used buckets structures.
544  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
545  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
546  if (Clusterer->bucket_cache[d][c] != NULL)
547  FreeBuckets(Clusterer->bucket_cache[d][c]);
548  }
549 
550  memfree(Clusterer);
551  }
552 } // FreeClusterer
KDTREE * KDTree
Definition: cluster.h:90
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1-MINBUCKETS]
Definition: cluster.h:95
#define NULL
Definition: host.h:144
void FreeCluster(CLUSTER *Cluster)
Definition: cluster.cpp:2266
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:346
#define MAXBUCKETS
Definition: cluster.h:27
PARAM_DESC * ParamDesc
Definition: cluster.h:88
CLUSTER * Root
Definition: cluster.h:91
#define MINBUCKETS
Definition: cluster.h:26
void FreeBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2252
void memfree(void *element)
Definition: freelist.cpp:30
void FreeProtoList ( LIST ProtoList)

FreeProtoList ************************************************************ Parameters: ProtoList pointer to list of prototypes to be freed Operation: This routine frees all of the memory allocated to the specified list of prototypes. The clusters which are pointed to by the prototypes are not freed. Return: None Exceptions: None History: 6/6/89, DSJ, Created.

Definition at line 564 of file cluster.cpp.

564  {
565  destroy_nodes(*ProtoList, FreePrototype);
566 } // FreeProtoList
void FreePrototype(void *arg)
Definition: cluster.cpp:579
void destroy_nodes(LIST list, void_dest destructor)
Definition: oldlist.cpp:204
void FreePrototype ( void *  arg)

FreePrototype ************************************************************ Parameters: Prototype prototype data structure to be deallocated Operation: This routine deallocates the memory consumed by the specified prototype and modifies the corresponding cluster so that it is no longer marked as a prototype. The cluster is NOT deallocated by this routine. Return: None Exceptions: None History: 5/30/89, DSJ, Created.

Definition at line 579 of file cluster.cpp.

579  { //PROTOTYPE *Prototype)
580  PROTOTYPE *Prototype = (PROTOTYPE *) arg;
581 
582  // unmark the corresponding cluster (if there is one
583  if (Prototype->Cluster != NULL)
584  Prototype->Cluster->Prototype = FALSE;
585 
586  // deallocate the prototype statistics and then the prototype itself
587  if (Prototype->Distrib != NULL)
588  memfree (Prototype->Distrib);
589  if (Prototype->Mean != NULL)
590  memfree (Prototype->Mean);
591  if (Prototype->Style != spherical) {
592  if (Prototype->Variance.Elliptical != NULL)
593  memfree (Prototype->Variance.Elliptical);
594  if (Prototype->Magnitude.Elliptical != NULL)
595  memfree (Prototype->Magnitude.Elliptical);
596  if (Prototype->Weight.Elliptical != NULL)
597  memfree (Prototype->Weight.Elliptical);
598  }
599  memfree(Prototype);
600 } // FreePrototype
FLOAT32 * Mean
Definition: cluster.h:78
CLUSTER * Cluster
Definition: cluster.h:76
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOATUNION Variance
Definition: cluster.h:81
#define NULL
Definition: host.h:144
unsigned Prototype
Definition: cluster.h:34
#define FALSE
Definition: capi.h:29
FLOAT32 * Elliptical
Definition: cluster.h:64
FLOATUNION Weight
Definition: cluster.h:83
FLOATUNION Magnitude
Definition: cluster.h:82
unsigned Style
Definition: cluster.h:74
void memfree(void *element)
Definition: freelist.cpp:30
void FreeStatistics ( STATISTICS Statistics)

Definition at line 2230 of file cluster.cpp.

2230  {
2231 /*
2232  ** Parameters:
2233  ** Statistics pointer to data structure to be freed
2234  ** Operation:
2235  ** This routine frees the memory used by the statistics
2236  ** data structure.
2237  ** Return:
2238  ** None
2239  ** Exceptions:
2240  ** None
2241  ** History:
2242  ** 6/5/89, DSJ, Created.
2243  */
2244  memfree (Statistics->CoVariance);
2245  memfree (Statistics->Min);
2246  memfree (Statistics->Max);
2247  memfree(Statistics);
2248 } // FreeStatistics
FLOAT32 * Min
Definition: cluster.cpp:174
FLOAT32 * CoVariance
Definition: cluster.cpp:173
FLOAT32 * Max
Definition: cluster.cpp:175
void memfree(void *element)
Definition: freelist.cpp:30
BUCKETS * GetBuckets ( CLUSTERER clusterer,
DISTRIBUTION  Distribution,
uinT32  SampleCount,
FLOAT64  Confidence 
)

GetBuckets ************************************************************** Parameters: Clusterer which keeps a bucket_cache for us. Distribution type of probability distribution to test for SampleCount number of samples that are available Confidence probability of a Type I error Operation: This routine returns a histogram data structure which can be used by other routines to place samples into histogram buckets, and then apply a goodness of fit test to the histogram data to determine if the samples belong to the specified probability distribution. The routine keeps a list of bucket data structures which have already been created so that it minimizes the computation time needed to create a new bucket. Return: Bucket data structure Exceptions: none History: Thu Aug 3 12:58:10 1989, DSJ, Created.

Definition at line 1709 of file cluster.cpp.

1712  {
1713  // Get an old bucket structure with the same number of buckets.
1714  uinT16 NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
1715  BUCKETS *Buckets =
1716  clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];
1717 
1718  // If a matching bucket structure is not found, make one and save it.
1719  if (Buckets == NULL) {
1720  Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
1721  clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] =
1722  Buckets;
1723  } else {
1724  // Just adjust the existing buckets.
1725  if (SampleCount != Buckets->SampleCount)
1726  AdjustBuckets(Buckets, SampleCount);
1727  if (Confidence != Buckets->Confidence) {
1728  Buckets->Confidence = Confidence;
1729  Buckets->ChiSquared = ComputeChiSquared(
1730  DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets),
1731  Confidence);
1732  }
1733  InitBuckets(Buckets);
1734  }
1735  return Buckets;
1736 } // GetBuckets
uinT16 OptimumNumberOfBuckets(uinT32 SampleCount)
Definition: cluster.cpp:1843
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1-MINBUCKETS]
Definition: cluster.h:95
#define NULL
Definition: host.h:144
BUCKETS * MakeBuckets(DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1759
FLOAT64 Confidence
Definition: cluster.cpp:181
void InitBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2390
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2290
void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount)
Definition: cluster.cpp:2361
FLOAT64 ChiSquared
Definition: cluster.cpp:182
uinT32 SampleCount
Definition: cluster.cpp:180
#define MINBUCKETS
Definition: cluster.h:26
FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:1884
unsigned short uinT16
Definition: host.h:101
BOOL8 Independent ( PARAM_DESC  ParamDesc[],
inT16  N,
FLOAT32 CoVariance,
FLOAT32  Independence 
)

Independent *************************************************************** Parameters: ParamDesc descriptions of each feature space dimension N number of dimensions CoVariance ptr to a covariance matrix Independence max off-diagonal correlation coefficient Operation: This routine returns TRUE if the specified covariance matrix indicates that all N dimensions are independent of one another. One dimension is judged to be independent of another when the magnitude of the corresponding correlation coefficient is less than the specified Independence factor. The correlation coefficient is calculated as: (see Duda and Hart, pg. 247) coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj]) The covariance matrix is assumed to be symmetric (which should always be true). Return: TRUE if dimensions are independent, FALSE otherwise Exceptions: None History: 6/4/89, DSJ, Created.

Definition at line 1660 of file cluster.cpp.

1661  {
1662  int i, j;
1663  FLOAT32 *VARii; // points to ith on-diagonal element
1664  FLOAT32 *VARjj; // points to jth on-diagonal element
1665  FLOAT32 CorrelationCoeff;
1666 
1667  VARii = CoVariance;
1668  for (i = 0; i < N; i++, VARii += N + 1) {
1669  if (ParamDesc[i].NonEssential)
1670  continue;
1671 
1672  VARjj = VARii + N + 1;
1673  CoVariance = VARii + 1;
1674  for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
1675  if (ParamDesc[j].NonEssential)
1676  continue;
1677 
1678  if ((*VARii == 0.0) || (*VARjj == 0.0))
1679  CorrelationCoeff = 0.0;
1680  else
1681  CorrelationCoeff =
1682  sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
1683  if (CorrelationCoeff > Independence)
1684  return (FALSE);
1685  }
1686  }
1687  return (TRUE);
1688 } // Independent
#define FALSE
Definition: capi.h:29
#define TRUE
Definition: capi.h:28
float FLOAT32
Definition: host.h:111
void InitBuckets ( BUCKETS Buckets)

Definition at line 2390 of file cluster.cpp.

2390  {
2391 /*
2392  ** Parameters:
2393  ** Buckets histogram data structure to init
2394  ** Operation:
2395  ** This routine sets the bucket counts in the specified histogram
2396  ** to zero.
2397  ** Return: none
2398  ** Exceptions: none
2399  ** History: Thu Aug 3 14:31:14 1989, DSJ, Created.
2400  */
2401  int i;
2402 
2403  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2404  Buckets->Count[i] = 0;
2405  }
2406 
2407 } // InitBuckets
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
uinT32 * Count
Definition: cluster.cpp:185
FLOAT64 Integral ( FLOAT64  f1,
FLOAT64  f2,
FLOAT64  Dx 
)

Definition at line 1998 of file cluster.cpp.

1998  {
1999 /*
2000  ** Parameters:
2001  ** f1 value of function at x1
2002  ** f2 value of function at x2
2003  ** Dx x2 - x1 (should always be positive)
2004  ** Operation:
2005  ** This routine computes a trapezoidal approximation to the
2006  ** integral of a function over a small delta in x.
2007  ** Return:
2008  ** Approximation of the integral of the function from x1 to x2.
2009  ** Exceptions:
2010  ** None
2011  ** History:
2012  ** 6/5/89, DSJ, Created.
2013  */
2014  return (f1 + f2) * Dx / 2.0;
2015 } // Integral
double InvertMatrix ( const float *  input,
int  size,
float *  inv 
)

Definition at line 2652 of file cluster.cpp.

2652  {
2653  // Allocate memory for the 2D arrays.
2654  GENERIC_2D_ARRAY<double> U(size, size, 0.0);
2655  GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
2656  GENERIC_2D_ARRAY<double> L(size, size, 0.0);
2657 
2658  // Initialize the working matrices. U starts as input, L as I and U_inv as O.
2659  int row;
2660  int col;
2661  for (row = 0; row < size; row++) {
2662  for (col = 0; col < size; col++) {
2663  U[row][col] = input[row*size + col];
2664  L[row][col] = row == col ? 1.0 : 0.0;
2665  U_inv[row][col] = 0.0;
2666  }
2667  }
2668 
2669  // Compute forward matrix by inversion by LU decomposition of input.
2670  for (col = 0; col < size; ++col) {
2671  // Find best pivot
2672  int best_row = 0;
2673  double best_pivot = -1.0;
2674  for (row = col; row < size; ++row) {
2675  if (Abs(U[row][col]) > best_pivot) {
2676  best_pivot = Abs(U[row][col]);
2677  best_row = row;
2678  }
2679  }
2680  // Exchange pivot rows.
2681  if (best_row != col) {
2682  for (int k = 0; k < size; ++k) {
2683  double tmp = U[best_row][k];
2684  U[best_row][k] = U[col][k];
2685  U[col][k] = tmp;
2686  tmp = L[best_row][k];
2687  L[best_row][k] = L[col][k];
2688  L[col][k] = tmp;
2689  }
2690  }
2691  // Now do the pivot itself.
2692  for (row = col + 1; row < size; ++row) {
2693  double ratio = -U[row][col] / U[col][col];
2694  for (int j = col; j < size; ++j) {
2695  U[row][j] += U[col][j] * ratio;
2696  }
2697  for (int k = 0; k < size; ++k) {
2698  L[row][k] += L[col][k] * ratio;
2699  }
2700  }
2701  }
2702  // Next invert U.
2703  for (col = 0; col < size; ++col) {
2704  U_inv[col][col] = 1.0 / U[col][col];
2705  for (row = col - 1; row >= 0; --row) {
2706  double total = 0.0;
2707  for (int k = col; k > row; --k) {
2708  total += U[row][k] * U_inv[k][col];
2709  }
2710  U_inv[row][col] = -total / U[row][row];
2711  }
2712  }
2713  // Now the answer is U_inv.L.
2714  for (row = 0; row < size; row++) {
2715  for (col = 0; col < size; col++) {
2716  double sum = 0.0;
2717  for (int k = row; k < size; ++k) {
2718  sum += U_inv[row][k] * L[k][col];
2719  }
2720  inv[row*size + col] = sum;
2721  }
2722  }
2723  // Check matrix product.
2724  double error_sum = 0.0;
2725  for (row = 0; row < size; row++) {
2726  for (col = 0; col < size; col++) {
2727  double sum = 0.0;
2728  for (int k = 0; k < size; ++k) {
2729  sum += input[row*size + k] * inv[k *size + col];
2730  }
2731  if (row != col) {
2732  error_sum += Abs(sum);
2733  }
2734  }
2735  }
2736  return error_sum;
2737 }
#define Abs(N)
Definition: cluster.cpp:208
int ListEntryMatch ( void *  arg1,
void *  arg2 
)

Definition at line 2343 of file cluster.cpp.

2344  { //Key
2345 /*
2346  ** Parameters: none
2347  ** Operation:
2348  ** This routine is used to search a list for a list node
2349  ** whose contents match Key. It is called by the list
2350  ** delete_d routine.
2351  ** Return: TRUE if ListNode matches Key
2352  ** Exceptions: none
2353  ** History: Thu Aug 3 14:23:58 1989, DSJ, Created.
2354  */
2355  return (arg1 == arg2);
2356 
2357 } // ListEntryMatch
BUCKETS * MakeBuckets ( DISTRIBUTION  Distribution,
uinT32  SampleCount,
FLOAT64  Confidence 
)

Makebuckets ************************************************************* Parameters: Distribution type of probability distribution to test for SampleCount number of samples that are available Confidence probability of a Type I error Operation: This routine creates a histogram data structure which can be used by other routines to place samples into histogram buckets, and then apply a goodness of fit test to the histogram data to determine if the samples belong to the specified probability distribution. The buckets are allocated in such a way that the expected frequency of samples in each bucket is approximately the same. In order to make this possible, a mapping table is computed which maps "normalized" samples into the appropriate bucket. Return: Pointer to new histogram data structure Exceptions: None History: 6/4/89, DSJ, Created.

Definition at line 1759 of file cluster.cpp.

1761  {
1762  const DENSITYFUNC DensityFunction[] =
1763  { NormalDensity, UniformDensity, UniformDensity };
1764  int i, j;
1765  BUCKETS *Buckets;
1766  FLOAT64 BucketProbability;
1767  FLOAT64 NextBucketBoundary;
1768  FLOAT64 Probability;
1769  FLOAT64 ProbabilityDelta;
1770  FLOAT64 LastProbDensity;
1771  FLOAT64 ProbDensity;
1772  uinT16 CurrentBucket;
1773  BOOL8 Symmetrical;
1774 
1775  // allocate memory needed for data structure
1776  Buckets = reinterpret_cast<BUCKETS*>(Emalloc(sizeof(BUCKETS)));
1777  Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
1778  Buckets->SampleCount = SampleCount;
1779  Buckets->Confidence = Confidence;
1780  Buckets->Count = reinterpret_cast<uinT32*>(
1781  Emalloc(Buckets->NumberOfBuckets * sizeof(uinT32)));
1782  Buckets->ExpectedCount = reinterpret_cast<FLOAT32*>(
1783  Emalloc(Buckets->NumberOfBuckets * sizeof(FLOAT32)));
1784 
1785  // initialize simple fields
1786  Buckets->Distribution = Distribution;
1787  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
1788  Buckets->Count[i] = 0;
1789  Buckets->ExpectedCount[i] = 0.0;
1790  }
1791 
1792  // all currently defined distributions are symmetrical
1793  Symmetrical = TRUE;
1794  Buckets->ChiSquared = ComputeChiSquared(
1795  DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
1796 
1797  if (Symmetrical) {
1798  // allocate buckets so that all have approx. equal probability
1799  BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets);
1800 
1801  // distribution is symmetric so fill in upper half then copy
1802  CurrentBucket = Buckets->NumberOfBuckets / 2;
1803  if (Odd (Buckets->NumberOfBuckets))
1804  NextBucketBoundary = BucketProbability / 2;
1805  else
1806  NextBucketBoundary = BucketProbability;
1807 
1808  Probability = 0.0;
1809  LastProbDensity =
1810  (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2);
1811  for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
1812  ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1);
1813  ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
1814  Probability += ProbabilityDelta;
1815  if (Probability > NextBucketBoundary) {
1816  if (CurrentBucket < Buckets->NumberOfBuckets - 1)
1817  CurrentBucket++;
1818  NextBucketBoundary += BucketProbability;
1819  }
1820  Buckets->Bucket[i] = CurrentBucket;
1821  Buckets->ExpectedCount[CurrentBucket] +=
1822  (FLOAT32) (ProbabilityDelta * SampleCount);
1823  LastProbDensity = ProbDensity;
1824  }
1825  // place any leftover probability into the last bucket
1826  Buckets->ExpectedCount[CurrentBucket] +=
1827  (FLOAT32) ((0.5 - Probability) * SampleCount);
1828 
1829  // copy upper half of distribution to lower half
1830  for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)
1831  Buckets->Bucket[i] =
1832  Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
1833 
1834  // copy upper half of expected counts to lower half
1835  for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)
1836  Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
1837  }
1838  return Buckets;
1839 } // MakeBuckets
uinT16 OptimumNumberOfBuckets(uinT32 SampleCount)
Definition: cluster.cpp:1843
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
unsigned int uinT32
Definition: host.h:103
FLOAT64 Confidence
Definition: cluster.cpp:181
uinT16 Bucket[BUCKETTABLESIZE]
Definition: cluster.cpp:184
FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx)
Definition: cluster.cpp:1998
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2290
#define BUCKETTABLESIZE
Definition: cluster.cpp:160
FLOAT64 NormalDensity(inT32 x)
Definition: cluster.cpp:1944
FLOAT64(* DENSITYFUNC)(inT32)
Definition: cluster.cpp:203
FLOAT32 * ExpectedCount
Definition: cluster.cpp:186
FLOAT64 ChiSquared
Definition: cluster.cpp:182
uinT32 SampleCount
Definition: cluster.cpp:180
FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:1884
double FLOAT64
Definition: host.h:112
unsigned short uinT16
Definition: host.h:101
unsigned char BOOL8
Definition: host.h:113
#define TRUE
Definition: capi.h:28
void * Emalloc(int Size)
Definition: emalloc.cpp:35
#define Odd(N)
Definition: cluster.cpp:206
DISTRIBUTION Distribution
Definition: cluster.cpp:179
FLOAT64 UniformDensity(inT32 x)
Definition: cluster.cpp:1973
float FLOAT32
Definition: host.h:111
#define Mirror(N, R)
Definition: cluster.cpp:207
uinT32 * Count
Definition: cluster.cpp:185
CLUSTERER* MakeClusterer ( inT16  SampleSize,
const PARAM_DESC  ParamDesc[] 
)

MakeClusterer ********************************************************** Parameters: SampleSize number of dimensions in feature space ParamDesc description of each dimension Operation: This routine creates a new clusterer data structure, initializes it, and returns a pointer to it. Return: pointer to the new clusterer data structure Exceptions: None History: 5/29/89, DSJ, Created.

Definition at line 399 of file cluster.cpp.

399  {
400  CLUSTERER *Clusterer;
401  int i;
402 
403  // allocate main clusterer data structure and init simple fields
404  Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
405  Clusterer->SampleSize = SampleSize;
406  Clusterer->NumberOfSamples = 0;
407  Clusterer->NumChar = 0;
408 
409  // init fields which will not be used initially
410  Clusterer->Root = NULL;
411  Clusterer->ProtoList = NIL_LIST;
412 
413  // maintain a copy of param descriptors in the clusterer data structure
414  Clusterer->ParamDesc =
415  (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
416  for (i = 0; i < SampleSize; i++) {
417  Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
418  Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
419  Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
420  Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
421  Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
422  Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
423  Clusterer->ParamDesc[i].MidRange =
424  (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
425  }
426 
427  // allocate a kd tree to hold the samples
428  Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
429 
430  // Initialize cache of histogram buckets to minimize recomputing them.
431  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
432  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
433  Clusterer->bucket_cache[d][c] = NULL;
434  }
435 
436  return Clusterer;
437 } // MakeClusterer
KDTREE * KDTree
Definition: cluster.h:90
inT16 SampleSize
Definition: cluster.h:87
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1-MINBUCKETS]
Definition: cluster.h:95
#define NULL
Definition: host.h:144
FLOAT32 HalfRange
Definition: ocrfeatures.h:52
FLOAT32 MidRange
Definition: ocrfeatures.h:53
inT32 NumberOfSamples
Definition: cluster.h:89
#define MAXBUCKETS
Definition: cluster.h:27
PARAM_DESC * ParamDesc
Definition: cluster.h:88
CLUSTER * Root
Definition: cluster.h:91
inT8 NonEssential
Definition: ocrfeatures.h:48
#define MINBUCKETS
Definition: cluster.h:26
#define NIL_LIST
Definition: oldlist.h:126
inT32 NumChar
Definition: cluster.h:93
inT8 Circular
Definition: ocrfeatures.h:47
void * Emalloc(int Size)
Definition: emalloc.cpp:35
KDTREE * MakeKDTree(inT16 KeySize, const PARAM_DESC KeyDesc[])
Definition: kdtree.cpp:184
FLOAT32 Range
Definition: ocrfeatures.h:51
FLOAT32 Min
Definition: ocrfeatures.h:49
FLOAT32 Max
Definition: ocrfeatures.h:50
LIST ProtoList
Definition: cluster.h:92
PROTOTYPE * MakeDegenerateProto ( uinT16  N,
CLUSTER Cluster,
STATISTICS Statistics,
PROTOSTYLE  Style,
inT32  MinSamples 
)

MakeDegenerateProto ****************************************************** Parameters: N number of dimensions Cluster cluster being analyzed Statistics statistical info about cluster Style type of prototype to be generated MinSamples minimum number of samples in a cluster Operation: This routine checks for clusters which are degenerate and therefore cannot be analyzed in a statistically valid way. A cluster is defined as degenerate if it does not have at least MINSAMPLESNEEDED samples in it. If the cluster is found to be degenerate, a prototype of the specified style is generated and marked as insignificant. A cluster is also degenerate if it does not have at least MinSamples samples in it. If the cluster is not degenerate, NULL is returned. Return: Pointer to degenerate prototype or NULL. Exceptions: None History: 6/20/89, DSJ, Created. 7/12/89, DSJ, Changed name and added check for 0 stddev. 8/8/89, DSJ, Removed check for 0 stddev (handled elsewhere).

Definition at line 1071 of file cluster.cpp.

1076  {
1077  PROTOTYPE *Proto = NULL;
1078 
1079  if (MinSamples < MINSAMPLESNEEDED)
1080  MinSamples = MINSAMPLESNEEDED;
1081 
1082  if (Cluster->SampleCount < MinSamples) {
1083  switch (Style) {
1084  case spherical:
1085  Proto = NewSphericalProto (N, Cluster, Statistics);
1086  break;
1087  case elliptical:
1088  case automatic:
1089  Proto = NewEllipticalProto (N, Cluster, Statistics);
1090  break;
1091  case mixed:
1092  Proto = NewMixedProto (N, Cluster, Statistics);
1093  break;
1094  }
1095  Proto->Significant = FALSE;
1096  }
1097  return (Proto);
1098 } // MakeDegenerateProto
PROTOTYPE * NewSphericalProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1514
#define NULL
Definition: host.h:144
#define FALSE
Definition: capi.h:29
PROTOTYPE * NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1593
unsigned Significant
Definition: cluster.h:68
#define MINSAMPLESNEEDED
Definition: cluster.cpp:152
Definition: cluster.h:45
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1548
unsigned SampleCount
Definition: cluster.h:35
void MakeDimRandom ( uinT16  i,
PROTOTYPE Proto,
PARAM_DESC ParamDesc 
)

Definition at line 1364 of file cluster.cpp.

1364  {
1365  Proto->Distrib[i] = D_random;
1366  Proto->Mean[i] = ParamDesc->MidRange;
1367  Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
1368 
1369  // subtract out the previous magnitude of this dimension from the total
1370  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1371  Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
1372  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1373  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1374 
1375  // note that the proto Weight is irrelevant for D_random protos
1376 } // MakeDimRandom
FLOAT32 * Mean
Definition: cluster.h:78
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 TotalMagnitude
Definition: cluster.h:79
FLOAT32 HalfRange
Definition: ocrfeatures.h:52
FLOAT32 MidRange
Definition: ocrfeatures.h:53
FLOAT32 * Elliptical
Definition: cluster.h:64
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 Range
Definition: ocrfeatures.h:51
FLOAT32 LogMagnitude
Definition: cluster.h:80
void MakeDimUniform ( uinT16  i,
PROTOTYPE Proto,
STATISTICS Statistics 
)

MakeDimUniform *********************************************************** Parameters: i index of dimension to be changed Proto prototype whose dimension is to be altered Statistics statistical info about prototype Operation: This routine alters the ith dimension of the specified mixed prototype to be uniform. Return: None Exceptions: None History: 6/20/89, DSJ, Created.

Definition at line 1389 of file cluster.cpp.

1389  {
1390  Proto->Distrib[i] = uniform;
1391  Proto->Mean[i] = Proto->Cluster->Mean[i] +
1392  (Statistics->Min[i] + Statistics->Max[i]) / 2;
1393  Proto->Variance.Elliptical[i] =
1394  (Statistics->Max[i] - Statistics->Min[i]) / 2;
1395  if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1396  Proto->Variance.Elliptical[i] = MINVARIANCE;
1397 
1398  // subtract out the previous magnitude of this dimension from the total
1399  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1400  Proto->Magnitude.Elliptical[i] =
1401  1.0 / (2.0 * Proto->Variance.Elliptical[i]);
1402  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1403  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1404 
1405  // note that the proto Weight is irrelevant for uniform protos
1406 } // MakeDimUniform
FLOAT32 * Mean
Definition: cluster.h:78
CLUSTER * Cluster
Definition: cluster.h:76
FLOAT32 Mean[1]
Definition: cluster.h:39
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 TotalMagnitude
Definition: cluster.h:79
FLOAT32 * Elliptical
Definition: cluster.h:64
#define MINVARIANCE
Definition: cluster.cpp:142
FLOAT32 * Min
Definition: cluster.cpp:174
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 * Max
Definition: cluster.cpp:175
FLOAT32 LogMagnitude
Definition: cluster.h:80
PROTOTYPE * MakeEllipticalProto ( CLUSTERER Clusterer,
CLUSTER Cluster,
STATISTICS Statistics,
BUCKETS Buckets 
)

MakeEllipticalProto **************************************************** Parameters: Clusterer data struct containing samples being clustered Cluster cluster to be made into an elliptical prototype Statistics statistical info about cluster Buckets histogram struct used to analyze distribution Operation: This routine tests the specified cluster to see if it can be approximated by an elliptical normal distribution. If it can be, then a new prototype is formed and returned to the caller. If it can't be, then NULL is returned to the caller. Return: Pointer to new elliptical prototype or NULL. Exceptions: None History: 6/12/89, DSJ, Created.

Definition at line 1258 of file cluster.cpp.

1261  {
1262  PROTOTYPE *Proto = NULL;
1263  int i;
1264 
1265  // check that each dimension is a normal distribution
1266  for (i = 0; i < Clusterer->SampleSize; i++) {
1267  if (Clusterer->ParamDesc[i].NonEssential)
1268  continue;
1269 
1270  FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1271  Cluster->Mean[i],
1272  sqrt ((FLOAT64) Statistics->
1273  CoVariance[i * (Clusterer->SampleSize + 1)]));
1274  if (!DistributionOK (Buckets))
1275  break;
1276  }
1277  // if all dimensions matched a normal distribution, make a proto
1278  if (i >= Clusterer->SampleSize)
1279  Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1280  return (Proto);
1281 } // MakeEllipticalProto
FLOAT32 Mean[1]
Definition: cluster.h:39
inT16 SampleSize
Definition: cluster.h:87
#define NULL
Definition: host.h:144
BOOL8 DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2191
PARAM_DESC * ParamDesc
Definition: cluster.h:88
inT8 NonEssential
Definition: ocrfeatures.h:48
double FLOAT64
Definition: host.h:112
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1548
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2019
PROTOTYPE * MakeMixedProto ( CLUSTERER Clusterer,
CLUSTER Cluster,
STATISTICS Statistics,
BUCKETS NormalBuckets,
FLOAT64  Confidence 
)

MakeMixedProto *********************************************************** Parameters: Clusterer data struct containing samples being clustered Cluster cluster to be made into a prototype Statistics statistical info about cluster NormalBuckets histogram struct used to analyze distribution Confidence confidence level for alternate distributions Operation: This routine tests each dimension of the specified cluster to see what distribution would best approximate that dimension. Each dimension is compared to the following distributions in order: normal, random, uniform. If each dimension can be represented by one of these distributions, then a new prototype is formed and returned to the caller. If it can't be, then NULL is returned to the caller. Return: Pointer to new mixed prototype or NULL. Exceptions: None History: 6/12/89, DSJ, Created.

Definition at line 1302 of file cluster.cpp.

1306  {
1307  PROTOTYPE *Proto;
1308  int i;
1309  BUCKETS *UniformBuckets = NULL;
1310  BUCKETS *RandomBuckets = NULL;
1311 
1312  // create a mixed proto to work on - initially assume all dimensions normal*/
1313  Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);
1314 
1315  // find the proper distribution for each dimension
1316  for (i = 0; i < Clusterer->SampleSize; i++) {
1317  if (Clusterer->ParamDesc[i].NonEssential)
1318  continue;
1319 
1320  FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1321  Proto->Mean[i],
1322  sqrt ((FLOAT64) Proto->Variance.Elliptical[i]));
1323  if (DistributionOK (NormalBuckets))
1324  continue;
1325 
1326  if (RandomBuckets == NULL)
1327  RandomBuckets =
1328  GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
1329  MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));
1330  FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1331  Proto->Mean[i], Proto->Variance.Elliptical[i]);
1332  if (DistributionOK (RandomBuckets))
1333  continue;
1334 
1335  if (UniformBuckets == NULL)
1336  UniformBuckets =
1337  GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
1338  MakeDimUniform(i, Proto, Statistics);
1339  FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1340  Proto->Mean[i], Proto->Variance.Elliptical[i]);
1341  if (DistributionOK (UniformBuckets))
1342  continue;
1343  break;
1344  }
1345  // if any dimension failed to match a distribution, discard the proto
1346  if (i < Clusterer->SampleSize) {
1347  FreePrototype(Proto);
1348  Proto = NULL;
1349  }
1350  return (Proto);
1351 } // MakeMixedProto
FLOAT32 * Mean
Definition: cluster.h:78
void FreePrototype(void *arg)
Definition: cluster.cpp:579
FLOATUNION Variance
Definition: cluster.h:81
inT16 SampleSize
Definition: cluster.h:87
#define NULL
Definition: host.h:144
PROTOTYPE * NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1593
BOOL8 DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2191
BUCKETS * GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1709
FLOAT32 * Elliptical
Definition: cluster.h:64
PARAM_DESC * ParamDesc
Definition: cluster.h:88
inT8 NonEssential
Definition: ocrfeatures.h:48
double FLOAT64
Definition: host.h:112
void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics)
Definition: cluster.cpp:1389
void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc)
Definition: cluster.cpp:1364
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2019
unsigned SampleCount
Definition: cluster.h:35
CLUSTER * MakeNewCluster ( CLUSTERER Clusterer,
TEMPCLUSTER TempCluster 
)

MakeNewCluster ************************************************************* Parameters: Clusterer current clustering environment TempCluster potential cluster to make permanent Operation: This routine creates a new permanent cluster from the clusters specified in TempCluster. The 2 clusters in TempCluster are marked as "clustered" and deleted from the kd-tree. The new cluster is then added to the kd-tree. Return: Pointer to the new permanent cluster Exceptions: none History: 5/29/89, DSJ, Created. 7/13/89, DSJ, Removed visibility of kd-tree node data struct

Definition at line 841 of file cluster.cpp.

841  {
842  CLUSTER *Cluster;
843 
844  // allocate the new cluster and initialize it
845  Cluster = (CLUSTER *) Emalloc(
846  sizeof(CLUSTER) + (Clusterer->SampleSize - 1) * sizeof(FLOAT32));
847  Cluster->Clustered = FALSE;
848  Cluster->Prototype = FALSE;
849  Cluster->Left = TempCluster->Cluster;
850  Cluster->Right = TempCluster->Neighbor;
851  Cluster->CharID = -1;
852 
853  // mark the old clusters as "clustered" and delete them from the kd-tree
854  Cluster->Left->Clustered = TRUE;
855  Cluster->Right->Clustered = TRUE;
856  KDDelete(Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
857  KDDelete(Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);
858 
859  // compute the mean and sample count for the new cluster
860  Cluster->SampleCount =
861  MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
862  Cluster->Left->SampleCount, Cluster->Right->SampleCount,
863  Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);
864 
865  // add the new cluster to the KD tree
866  KDStore(Clusterer->KDTree, Cluster->Mean, Cluster);
867  return Cluster;
868 } // MakeNewCluster
CLUSTER * Cluster
Definition: cluster.cpp:164
KDTREE * KDTree
Definition: cluster.h:90
FLOAT32 Mean[1]
Definition: cluster.h:39
inT16 SampleSize
Definition: cluster.h:87
unsigned Prototype
Definition: cluster.h:34
#define FALSE
Definition: capi.h:29
inT32 MergeClusters(inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
PARAM_DESC * ParamDesc
Definition: cluster.h:88
Definition: cluster.h:32
inT32 CharID
Definition: cluster.h:38
struct sample * Right
Definition: cluster.h:37
struct sample * Left
Definition: cluster.h:36
unsigned Clustered
Definition: cluster.h:33
#define TRUE
Definition: capi.h:28
CLUSTER * Neighbor
Definition: cluster.cpp:165
void * Emalloc(int Size)
Definition: emalloc.cpp:35
void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data)
Definition: kdtree.cpp:209
void KDDelete(KDTREE *Tree, FLOAT32 Key[], void *Data)
Definition: kdtree.cpp:269
float FLOAT32
Definition: host.h:111
unsigned SampleCount
Definition: cluster.h:35
void MakePotentialClusters ( ClusteringContext context,
CLUSTER Cluster,
inT32  Level 
)

MakePotentialClusters ************************************************** Parameters: context ClusteringContext (see definition above) Cluster current cluster being visited in kd-tree walk Level level of this cluster in the kd-tree Operation: This routine is designed to be used in concert with the KDWalk routine. It will create a potential cluster for each sample in the kd-tree that is being walked. This potential cluster will then be pushed on the heap.

Definition at line 768 of file cluster.cpp.

769  {
770  ClusterPair HeapEntry;
771  int next = context->next;
772  context->candidates[next].Cluster = Cluster;
773  HeapEntry.data = &(context->candidates[next]);
774  context->candidates[next].Neighbor =
775  FindNearestNeighbor(context->tree,
776  context->candidates[next].Cluster,
777  &HeapEntry.key);
778  if (context->candidates[next].Neighbor != NULL) {
779  context->heap->Push(&HeapEntry);
780  context->next++;
781  }
782 } // MakePotentialClusters
CLUSTER * FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, FLOAT32 *Distance)
Definition: cluster.cpp:802
CLUSTER * Cluster
Definition: cluster.cpp:164
#define NULL
Definition: host.h:144
TEMPCLUSTER * candidates
Definition: cluster.cpp:198
ClusterHeap * heap
Definition: cluster.cpp:197
CLUSTER * Neighbor
Definition: cluster.cpp:165
void Push(Pair *entry)
Definition: genericheap.h:95
PROTOTYPE * MakePrototype ( CLUSTERER Clusterer,
CLUSTERCONFIG Config,
CLUSTER Cluster 
)

MakePrototype *********************************************************** Parameters: Clusterer data structure holding cluster tree Config parameters used to control prototype generation Cluster cluster to be made into a prototype Operation: This routine attempts to create a prototype from the specified cluster that conforms to the distribution specified in Config. If there are too few samples in the cluster to perform a statistical analysis, then a prototype is generated but labelled as insignificant. If the dimensions of the cluster are not independent, no prototype is generated and NULL is returned. If a prototype can be found that matches the desired distribution then a pointer to it is returned, otherwise NULL is returned. Return: Pointer to new prototype or NULL Exceptions: None History: 6/19/89, DSJ, Created.

Definition at line 978 of file cluster.cpp.

980  {
981  STATISTICS *Statistics;
982  PROTOTYPE *Proto;
983  BUCKETS *Buckets;
984 
985  // filter out clusters which contain samples from the same character
986  if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
987  return NULL;
988 
989  // compute the covariance matrix and ranges for the cluster
990  Statistics =
991  ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
992 
993  // check for degenerate clusters which need not be analyzed further
994  // note that the MinSamples test assumes that all clusters with multiple
995  // character samples have been removed (as above)
996  Proto = MakeDegenerateProto(
997  Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
998  (inT32) (Config->MinSamples * Clusterer->NumChar));
999  if (Proto != NULL) {
1000  FreeStatistics(Statistics);
1001  return Proto;
1002  }
1003  // check to ensure that all dimensions are independent
1004  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize,
1005  Statistics->CoVariance, Config->Independence)) {
1006  FreeStatistics(Statistics);
1007  return NULL;
1008  }
1009 
1010  if (HOTELLING && Config->ProtoStyle == elliptical) {
1011  Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
1012  if (Proto != NULL) {
1013  FreeStatistics(Statistics);
1014  return Proto;
1015  }
1016  }
1017 
1018  // create a histogram data structure used to evaluate distributions
1019  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount,
1020  Config->Confidence);
1021 
1022  // create a prototype based on the statistics and test it
1023  switch (Config->ProtoStyle) {
1024  case spherical:
1025  Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1026  break;
1027  case elliptical:
1028  Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1029  break;
1030  case mixed:
1031  Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
1032  Config->Confidence);
1033  break;
1034  case automatic:
1035  Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1036  if (Proto != NULL)
1037  break;
1038  Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1039  if (Proto != NULL)
1040  break;
1041  Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
1042  Config->Confidence);
1043  break;
1044  }
1045  FreeStatistics(Statistics);
1046  return Proto;
1047 } // MakePrototype
int inT32
Definition: host.h:102
FLOAT32 MinSamples
Definition: cluster.h:50
inT16 SampleSize
Definition: cluster.h:87
FLOAT64 Confidence
Definition: cluster.h:54
PROTOTYPE * MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
Definition: cluster.cpp:1220
FLOAT32 MaxIllegal
Definition: cluster.h:51
#define NULL
Definition: host.h:144
PROTOTYPE * MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
Definition: cluster.cpp:1258
PROTOTYPE * MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *NormalBuckets, FLOAT64 Confidence)
Definition: cluster.cpp:1302
BOOL8 Independent(PARAM_DESC ParamDesc[], inT16 N, FLOAT32 *CoVariance, FLOAT32 Independence)
Definition: cluster.cpp:1660
STATISTICS * ComputeStatistics(inT16 N, PARAM_DESC ParamDesc[], CLUSTER *Cluster)
Definition: cluster.cpp:1426
BOOL8 MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, FLOAT32 MaxIllegal)
Definition: cluster.cpp:2569
PROTOSTYLE ProtoStyle
Definition: cluster.h:49
BUCKETS * GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1709
void FreeStatistics(STATISTICS *Statistics)
Definition: cluster.cpp:2230
PARAM_DESC * ParamDesc
Definition: cluster.h:88
PROTOTYPE * MakeDegenerateProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, inT32 MinSamples)
Definition: cluster.cpp:1071
inT32 NumChar
Definition: cluster.h:93
FLOAT32 * CoVariance
Definition: cluster.cpp:173
Definition: cluster.h:45
PROTOTYPE * TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1113
#define HOTELLING
Definition: cluster.cpp:30
unsigned SampleCount
Definition: cluster.h:35
Definition: cluster.h:59
FLOAT32 Independence
Definition: cluster.h:53
SAMPLE* MakeSample ( CLUSTERER Clusterer,
const FLOAT32 Feature,
inT32  CharID 
)

MakeSample *********************************************************** Parameters: Clusterer clusterer data structure to add sample to Feature feature to be added to clusterer CharID unique ident. of char that sample came from Operation: This routine creates a new sample data structure to hold the specified feature. This sample is added to the clusterer data structure (so that it knows which samples are to be clustered later), and a pointer to the sample is returned to the caller. Return: Pointer to the new sample data structure Exceptions: ALREADYCLUSTERED MakeSample can't be called after ClusterSamples has been called History: 5/29/89, DSJ, Created.

Definition at line 454 of file cluster.cpp.

455  {
456  SAMPLE *Sample;
457  int i;
458 
459  // see if the samples have already been clustered - if so trap an error
460  if (Clusterer->Root != NULL)
462  "Can't add samples after they have been clustered");
463 
464  // allocate the new sample and initialize it
465  Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
466  (Clusterer->SampleSize -
467  1) * sizeof (FLOAT32));
468  Sample->Clustered = FALSE;
469  Sample->Prototype = FALSE;
470  Sample->SampleCount = 1;
471  Sample->Left = NULL;
472  Sample->Right = NULL;
473  Sample->CharID = CharID;
474 
475  for (i = 0; i < Clusterer->SampleSize; i++)
476  Sample->Mean[i] = Feature[i];
477 
478  // add the sample to the KD tree - keep track of the total # of samples
479  Clusterer->NumberOfSamples++;
480  KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample);
481  if (CharID >= Clusterer->NumChar)
482  Clusterer->NumChar = CharID + 1;
483 
484  // execute hook for monitoring clustering operation
485  // (*SampleCreationHook)( Sample );
486 
487  return (Sample);
488 } // MakeSample
KDTREE * KDTree
Definition: cluster.h:90
FLOAT32 Mean[1]
Definition: cluster.h:39
inT16 SampleSize
Definition: cluster.h:87
#define NULL
Definition: host.h:144
unsigned Prototype
Definition: cluster.h:34
#define FALSE
Definition: capi.h:29
inT32 NumberOfSamples
Definition: cluster.h:89
Definition: cluster.h:32
CLUSTER * Root
Definition: cluster.h:91
inT32 CharID
Definition: cluster.h:38
struct sample * Right
Definition: cluster.h:37
void DoError(int Error, const char *Message)
Definition: danerror.cpp:32
struct sample * Left
Definition: cluster.h:36
inT32 NumChar
Definition: cluster.h:93
unsigned Clustered
Definition: cluster.h:33
void * Emalloc(int Size)
Definition: emalloc.cpp:35
#define ALREADYCLUSTERED
Definition: cluster.h:133
void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data)
Definition: kdtree.cpp:209
float FLOAT32
Definition: host.h:111
unsigned SampleCount
Definition: cluster.h:35
PROTOTYPE * MakeSphericalProto ( CLUSTERER Clusterer,
CLUSTER Cluster,
STATISTICS Statistics,
BUCKETS Buckets 
)

Definition at line 1220 of file cluster.cpp.

1223  {
1224  PROTOTYPE *Proto = NULL;
1225  int i;
1226 
1227  // check that each dimension is a normal distribution
1228  for (i = 0; i < Clusterer->SampleSize; i++) {
1229  if (Clusterer->ParamDesc[i].NonEssential)
1230  continue;
1231 
1232  FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1233  Cluster->Mean[i],
1234  sqrt ((FLOAT64) (Statistics->AvgVariance)));
1235  if (!DistributionOK (Buckets))
1236  break;
1237  }
1238  // if all dimensions matched a normal distribution, make a proto
1239  if (i >= Clusterer->SampleSize)
1240  Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);
1241  return (Proto);
1242 } // MakeSphericalProto
FLOAT32 Mean[1]
Definition: cluster.h:39
inT16 SampleSize
Definition: cluster.h:87
PROTOTYPE * NewSphericalProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1514
#define NULL
Definition: host.h:144
BOOL8 DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2191
PARAM_DESC * ParamDesc
Definition: cluster.h:88
inT8 NonEssential
Definition: ocrfeatures.h:48
FLOAT32 AvgVariance
Definition: cluster.cpp:172
double FLOAT64
Definition: host.h:112
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2019
FLOAT32 Mean ( PROTOTYPE Proto,
uinT16  Dimension 
)

Mean *********************************************************** Parameters: Proto prototype to return mean of Dimension dimension whose mean is to be returned Operation: This routine returns the mean of the specified prototype in the indicated dimension. Return: Mean of Prototype in Dimension Exceptions: none History: 7/6/89, DSJ, Created.

Definition at line 643 of file cluster.cpp.

643  {
644  return (Proto->Mean[Dimension]);
645 } // Mean
FLOAT32 * Mean
Definition: cluster.h:78
inT32 MergeClusters ( inT16  N,
register PARAM_DESC  ParamDesc[],
register inT32  n1,
register inT32  n2,
register FLOAT32  m[],
register FLOAT32  m1[],
register FLOAT32  m2[] 
)
inT32 MergeClusters ( inT16  N,
PARAM_DESC  ParamDesc[],
inT32  n1,
inT32  n2,
FLOAT32  m[],
FLOAT32  m1[],
FLOAT32  m2[] 
)

MergeClusters ************************************************************ Parameters: N # of dimensions (size of arrays) ParamDesc array of dimension descriptions n1, n2 number of samples in each old cluster m array to hold mean of new cluster m1, m2 arrays containing means of old clusters Operation: This routine merges two clusters into one larger cluster. To do this it computes the number of samples in the new cluster and the mean of the new cluster. The ParamDesc information is used to ensure that circular dimensions are handled correctly. Return: The number of samples in the new cluster. Exceptions: None History: 5/31/89, DSJ, Created.

Definition at line 886 of file cluster.cpp.

891  {
892  inT32 i, n;
893 
894  n = n1 + n2;
895  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
896  if (ParamDesc->Circular) {
897  // if distance between means is greater than allowed
898  // reduce upper point by one "rotation" to compute mean
899  // then normalize the mean back into the accepted range
900  if ((*m2 - *m1) > ParamDesc->HalfRange) {
901  *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
902  if (*m < ParamDesc->Min)
903  *m += ParamDesc->Range;
904  }
905  else if ((*m1 - *m2) > ParamDesc->HalfRange) {
906  *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
907  if (*m < ParamDesc->Min)
908  *m += ParamDesc->Range;
909  }
910  else
911  *m = (n1 * *m1 + n2 * *m2) / n;
912  }
913  else
914  *m = (n1 * *m1 + n2 * *m2) / n;
915  }
916  return n;
917 } // MergeClusters
int inT32
Definition: host.h:102
BOOL8 MultipleCharSamples ( CLUSTERER Clusterer,
CLUSTER Cluster,
FLOAT32  MaxIllegal 
)

Definition at line 2569 of file cluster.cpp.

2600 {
2601  static BOOL8 *CharFlags = NULL;
2602  static inT32 NumFlags = 0;
2603  int i;
2604  LIST SearchState;
2605  SAMPLE *Sample;
2606  inT32 CharID;
2607  inT32 NumCharInCluster;
2608  inT32 NumIllegalInCluster;
2609  FLOAT32 PercentIllegal;
2610 
2611  // initial estimate assumes that no illegal chars exist in the cluster
2612  NumCharInCluster = Cluster->SampleCount;
2613  NumIllegalInCluster = 0;
2614 
2615  if (Clusterer->NumChar > NumFlags) {
2616  if (CharFlags != NULL)
2617  memfree(CharFlags);
2618  NumFlags = Clusterer->NumChar;
2619  CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8));
2620  }
2621 
2622  for (i = 0; i < NumFlags; i++)
2623  CharFlags[i] = FALSE;
2624 
2625  // find each sample in the cluster and check if we have seen it before
2626  InitSampleSearch(SearchState, Cluster);
2627  while ((Sample = NextSample (&SearchState)) != NULL) {
2628  CharID = Sample->CharID;
2629  if (CharFlags[CharID] == FALSE) {
2630  CharFlags[CharID] = TRUE;
2631  }
2632  else {
2633  if (CharFlags[CharID] == TRUE) {
2634  NumIllegalInCluster++;
2635  CharFlags[CharID] = ILLEGAL_CHAR;
2636  }
2637  NumCharInCluster--;
2638  PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster;
2639  if (PercentIllegal > MaxIllegal) {
2640  destroy(SearchState);
2641  return (TRUE);
2642  }
2643  }
2644  }
2645  return (FALSE);
2646 
2647 } // MultipleCharSamples
int inT32
Definition: host.h:102
#define NULL
Definition: host.h:144
#define ILLEGAL_CHAR
#define FALSE
Definition: capi.h:29
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:618
Definition: cluster.h:32
inT32 CharID
Definition: cluster.h:38
inT32 NumChar
Definition: cluster.h:93
unsigned char BOOL8
Definition: host.h:113
#define TRUE
Definition: capi.h:28
void * Emalloc(int Size)
Definition: emalloc.cpp:35
void memfree(void *element)
Definition: freelist.cpp:30
#define InitSampleSearch(S, C)
Definition: cluster.h:105
LIST destroy(LIST list)
Definition: oldlist.cpp:187
float FLOAT32
Definition: host.h:111
unsigned SampleCount
Definition: cluster.h:35
CHISTRUCT * NewChiStruct ( uinT16  DegreesOfFreedom,
FLOAT64  Alpha 
)

Definition at line 2436 of file cluster.cpp.

2436  {
2437 /*
2438  ** Parameters:
2439  ** DegreesOfFreedom degrees of freedom for new chi value
2440  ** Alpha confidence level for new chi value
2441  ** Operation:
2442  ** This routine allocates a new data structure which is used
2443  ** to hold a chi-squared value along with its associated
2444  ** number of degrees of freedom and alpha value.
2445  ** Return: none
2446  ** Exceptions: none
2447  ** History: Fri Aug 4 11:04:59 1989, DSJ, Created.
2448  */
2450 
2451  NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT));
2452  NewChiStruct->DegreesOfFreedom = DegreesOfFreedom;
2453  NewChiStruct->Alpha = Alpha;
2454  return (NewChiStruct);
2455 
2456 } // NewChiStruct
uinT16 DegreesOfFreedom
Definition: cluster.cpp:190
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2290
CHISTRUCT * NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:2436
FLOAT64 Alpha
Definition: cluster.cpp:191
void * Emalloc(int Size)
Definition: emalloc.cpp:35
PROTOTYPE * NewEllipticalProto ( inT16  N,
CLUSTER Cluster,
STATISTICS Statistics 
)

NewEllipticalProto ******************************************************* Parameters: N number of dimensions Cluster cluster to be made into an elliptical prototype Statistics statistical info about samples in cluster Operation: This routine creates an elliptical prototype data structure to approximate the samples in the specified cluster. Elliptical prototypes have a variance for each dimension. All dimensions are normally distributed and independent. Return: Pointer to a new elliptical prototype data structure Exceptions: None History: 6/19/89, DSJ, Created.

Definition at line 1548 of file cluster.cpp.

1550  {
1551  PROTOTYPE *Proto;
1552  FLOAT32 *CoVariance;
1553  int i;
1554 
1555  Proto = NewSimpleProto (N, Cluster);
1556  Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1557  Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1558  Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1559 
1560  CoVariance = Statistics->CoVariance;
1561  Proto->TotalMagnitude = 1.0;
1562  for (i = 0; i < N; i++, CoVariance += N + 1) {
1563  Proto->Variance.Elliptical[i] = *CoVariance;
1564  if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1565  Proto->Variance.Elliptical[i] = MINVARIANCE;
1566 
1567  Proto->Magnitude.Elliptical[i] =
1568  1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
1569  Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
1570  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1571  }
1572  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1573  Proto->Style = elliptical;
1574  return (Proto);
1575 } // NewEllipticalProto
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 TotalMagnitude
Definition: cluster.h:79
PROTOTYPE * NewSimpleProto(inT16 N, CLUSTER *Cluster)
Definition: cluster.cpp:1618
#define PI
Definition: const.h:19
FLOAT32 * Elliptical
Definition: cluster.h:64
#define MINVARIANCE
Definition: cluster.cpp:142
FLOATUNION Weight
Definition: cluster.h:83
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 * CoVariance
Definition: cluster.cpp:173
void * Emalloc(int Size)
Definition: emalloc.cpp:35
unsigned Style
Definition: cluster.h:74
FLOAT32 LogMagnitude
Definition: cluster.h:80
float FLOAT32
Definition: host.h:111
PROTOTYPE * NewMixedProto ( inT16  N,
CLUSTER Cluster,
STATISTICS Statistics 
)

MewMixedProto ************************************************************ Parameters: N number of dimensions Cluster cluster to be made into a mixed prototype Statistics statistical info about samples in cluster Operation: This routine creates a mixed prototype data structure to approximate the samples in the specified cluster. Mixed prototypes can have different distributions for each dimension. All dimensions are independent. The structure is initially filled in as though it were an elliptical prototype. The actual distributions of the dimensions can be altered by other routines. Return: Pointer to a new mixed prototype data structure Exceptions: None History: 6/19/89, DSJ, Created.

Definition at line 1593 of file cluster.cpp.

1593  {
1594  PROTOTYPE *Proto;
1595  int i;
1596 
1597  Proto = NewEllipticalProto (N, Cluster, Statistics);
1598  Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
1599 
1600  for (i = 0; i < N; i++) {
1601  Proto->Distrib[i] = normal;
1602  }
1603  Proto->Style = mixed;
1604  return (Proto);
1605 } // NewMixedProto
DISTRIBUTION * Distrib
Definition: cluster.h:77
DISTRIBUTION
Definition: cluster.h:58
Definition: cluster.h:45
void * Emalloc(int Size)
Definition: emalloc.cpp:35
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1548
unsigned Style
Definition: cluster.h:74
Definition: cluster.h:59
PROTOTYPE * NewSimpleProto ( inT16  N,
CLUSTER Cluster 
)

NewSimpleProto *********************************************************** Parameters: N number of dimensions Cluster cluster to be made into a prototype Operation: This routine allocates memory to hold a simple prototype data structure, i.e. one without independent distributions and variances for each dimension. Return: Pointer to new simple prototype Exceptions: None History: 6/19/89, DSJ, Created.

Definition at line 1618 of file cluster.cpp.

1618  {
1619  PROTOTYPE *Proto;
1620  int i;
1621 
1622  Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
1623  Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1624 
1625  for (i = 0; i < N; i++)
1626  Proto->Mean[i] = Cluster->Mean[i];
1627  Proto->Distrib = NULL;
1628 
1629  Proto->Significant = TRUE;
1630  Proto->Merged = FALSE;
1631  Proto->Style = spherical;
1632  Proto->NumSamples = Cluster->SampleCount;
1633  Proto->Cluster = Cluster;
1634  Proto->Cluster->Prototype = TRUE;
1635  return (Proto);
1636 } // NewSimpleProto
FLOAT32 * Mean
Definition: cluster.h:78
CLUSTER * Cluster
Definition: cluster.h:76
FLOAT32 Mean[1]
Definition: cluster.h:39
DISTRIBUTION * Distrib
Definition: cluster.h:77
#define NULL
Definition: host.h:144
unsigned Prototype
Definition: cluster.h:34
#define FALSE
Definition: capi.h:29
unsigned Significant
Definition: cluster.h:68
unsigned Merged
Definition: cluster.h:69
unsigned NumSamples
Definition: cluster.h:75
#define TRUE
Definition: capi.h:28
void * Emalloc(int Size)
Definition: emalloc.cpp:35
unsigned Style
Definition: cluster.h:74
float FLOAT32
Definition: host.h:111
unsigned SampleCount
Definition: cluster.h:35
PROTOTYPE * NewSphericalProto ( uinT16  N,
CLUSTER Cluster,
STATISTICS Statistics 
)

NewSpericalProto ********************************************************* Parameters: N number of dimensions Cluster cluster to be made into a spherical prototype Statistics statistical info about samples in cluster Operation: This routine creates a spherical prototype data structure to approximate the samples in the specified cluster. Spherical prototypes have a single variance which is common across all dimensions. All dimensions are normally distributed and independent. Return: Pointer to a new spherical prototype data structure Exceptions: None History: 6/19/89, DSJ, Created.

Definition at line 1514 of file cluster.cpp.

1516  {
1517  PROTOTYPE *Proto;
1518 
1519  Proto = NewSimpleProto (N, Cluster);
1520 
1521  Proto->Variance.Spherical = Statistics->AvgVariance;
1522  if (Proto->Variance.Spherical < MINVARIANCE)
1523  Proto->Variance.Spherical = MINVARIANCE;
1524 
1525  Proto->Magnitude.Spherical =
1526  1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
1527  Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical,
1528  (double) N);
1529  Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
1530  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1531 
1532  return (Proto);
1533 } // NewSphericalProto
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 TotalMagnitude
Definition: cluster.h:79
PROTOTYPE * NewSimpleProto(inT16 N, CLUSTER *Cluster)
Definition: cluster.cpp:1618
#define PI
Definition: const.h:19
#define MINVARIANCE
Definition: cluster.cpp:142
FLOATUNION Weight
Definition: cluster.h:83
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 AvgVariance
Definition: cluster.cpp:172
FLOAT32 Spherical
Definition: cluster.h:63
FLOAT32 LogMagnitude
Definition: cluster.h:80
CLUSTER* NextSample ( LIST SearchState)

NextSample ************************************************************ Parameters: SearchState ptr to list containing clusters to be searched Operation: This routine is used to find all of the samples which belong to a cluster. It starts by removing the top cluster on the cluster list (SearchState). If this cluster is a leaf it is returned. Otherwise, the right subcluster is pushed on the list and we continue the search in the left subcluster. This continues until a leaf is found. If all samples have been found, NULL is returned. InitSampleSearch() must be called before NextSample() to initialize the search. Return: Pointer to the next leaf cluster (sample) or NULL. Exceptions: None History: 6/16/89, DSJ, Created.

Definition at line 618 of file cluster.cpp.

618  {
619  CLUSTER *Cluster;
620 
621  if (*SearchState == NIL_LIST)
622  return (NULL);
623  Cluster = (CLUSTER *) first_node (*SearchState);
624  *SearchState = pop (*SearchState);
625  while (TRUE) {
626  if (Cluster->Left == NULL)
627  return (Cluster);
628  *SearchState = push (*SearchState, Cluster->Right);
629  Cluster = Cluster->Left;
630  }
631 } // NextSample
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
#define NULL
Definition: host.h:144
LIST pop(LIST list)
Definition: oldlist.cpp:305
Definition: cluster.h:32
struct sample * Right
Definition: cluster.h:37
struct sample * Left
Definition: cluster.h:36
#define NIL_LIST
Definition: oldlist.h:126
#define TRUE
Definition: capi.h:28
#define first_node(l)
Definition: oldlist.h:139
uinT16 NormalBucket ( PARAM_DESC ParamDesc,
FLOAT32  x,
FLOAT32  Mean,
FLOAT32  StdDev 
)

Definition at line 2107 of file cluster.cpp.

2110  {
2111 /*
2112  ** Parameters:
2113  ** ParamDesc used to identify circular dimensions
2114  ** x value to be normalized
2115  ** Mean mean of normal distribution
2116  ** StdDev standard deviation of normal distribution
2117  ** Operation:
2118  ** This routine determines which bucket x falls into in the
2119  ** discrete normal distribution defined by kNormalMean
2120  ** and kNormalStdDev. x values which exceed the range of
2121  ** the discrete distribution are clipped.
2122  ** Return:
2123  ** Bucket number into which x falls
2124  ** Exceptions:
2125  ** None
2126  ** History:
2127  ** 6/5/89, DSJ, Created.
2128  */
2129  FLOAT32 X;
2130 
2131  // wraparound circular parameters if necessary
2132  if (ParamDesc->Circular) {
2133  if (x - Mean > ParamDesc->HalfRange)
2134  x -= ParamDesc->Range;
2135  else if (x - Mean < -ParamDesc->HalfRange)
2136  x += ParamDesc->Range;
2137  }
2138 
2139  X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;
2140  if (X < 0)
2141  return 0;
2142  if (X > BUCKETTABLESIZE - 1)
2143  return ((uinT16) (BUCKETTABLESIZE - 1));
2144  return (uinT16) floor((FLOAT64) X);
2145 } // NormalBucket
FLOAT32 HalfRange
Definition: ocrfeatures.h:52
#define BUCKETTABLESIZE
Definition: cluster.cpp:160
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:643
double FLOAT64
Definition: host.h:112
unsigned short uinT16
Definition: host.h:101
inT8 Circular
Definition: ocrfeatures.h:47
FLOAT32 Range
Definition: ocrfeatures.h:51
float FLOAT32
Definition: host.h:111
FLOAT64 NormalDensity ( inT32  x)

Definition at line 1944 of file cluster.cpp.

1944  {
1945 /*
1946  ** Parameters:
1947  ** x number to compute the normal probability density for
1948  ** Globals:
1949  ** kNormalMean mean of a discrete normal distribution
1950  ** kNormalVariance variance of a discrete normal distribution
1951  ** kNormalMagnitude magnitude of a discrete normal distribution
1952  ** Operation:
1953  ** This routine computes the probability density function
1954  ** of a discrete normal distribution defined by the global
1955  ** variables kNormalMean, kNormalVariance, and kNormalMagnitude.
1956  ** Normal magnitude could, of course, be computed in terms of
1957  ** the normal variance but it is precomputed for efficiency.
1958  ** Return:
1959  ** The value of the normal distribution at x.
1960  ** Exceptions:
1961  ** None
1962  ** History:
1963  ** 6/4/89, DSJ, Created.
1964  */
1965  FLOAT64 Distance;
1966 
1967  Distance = x - kNormalMean;
1968  return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
1969 } // NormalDensity
double FLOAT64
Definition: host.h:112
int NumBucketsMatch ( void *  arg1,
void *  arg2 
)

Definition at line 2320 of file cluster.cpp.

2321  { // uinT16 *DesiredNumberOfBuckets)
2322 /*
2323  ** Parameters:
2324  ** Histogram current histogram being tested for a match
2325  ** DesiredNumberOfBuckets match key
2326  ** Operation:
2327  ** This routine is used to search a list of histogram data
2328  ** structures to find one with the specified number of
2329  ** buckets. It is called by the list search routines.
2330  ** Return: TRUE if Histogram matches DesiredNumberOfBuckets
2331  ** Exceptions: none
2332  ** History: Thu Aug 3 14:17:33 1989, DSJ, Created.
2333  */
2334  BUCKETS *Histogram = (BUCKETS *) arg1;
2335  uinT16 *DesiredNumberOfBuckets = (uinT16 *) arg2;
2336 
2337  return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets);
2338 
2339 } // NumBucketsMatch
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
unsigned short uinT16
Definition: host.h:101
uinT16 OptimumNumberOfBuckets ( uinT32  SampleCount)

Definition at line 1843 of file cluster.cpp.

1843  {
1844 /*
1845  ** Parameters:
1846  ** SampleCount number of samples to be tested
1847  ** Operation:
1848  ** This routine computes the optimum number of histogram
1849  ** buckets that should be used in a chi-squared goodness of
1850  ** fit test for the specified number of samples. The optimum
1851  ** number is computed based on Table 4.1 on pg. 147 of
1852  ** "Measurement and Analysis of Random Data" by Bendat & Piersol.
1853  ** Linear interpolation is used to interpolate between table
1854  ** values. The table is intended for a 0.05 level of
1855  ** significance (alpha). This routine assumes that it is
1856  ** equally valid for other alpha's, which may not be true.
1857  ** Return:
1858  ** Optimum number of histogram buckets
1859  ** Exceptions:
1860  ** None
1861  ** History:
1862  ** 6/5/89, DSJ, Created.
1863  */
1864  uinT8 Last, Next;
1865  FLOAT32 Slope;
1866 
1867  if (SampleCount < kCountTable[0])
1868  return kBucketsTable[0];
1869 
1870  for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
1871  if (SampleCount <= kCountTable[Next]) {
1872  Slope = (FLOAT32) (kBucketsTable[Next] - kBucketsTable[Last]) /
1873  (FLOAT32) (kCountTable[Next] - kCountTable[Last]);
1874  return ((uinT16) (kBucketsTable[Last] +
1875  Slope * (SampleCount - kCountTable[Last])));
1876  }
1877  }
1878  return kBucketsTable[Last];
1879 } // OptimumNumberOfBuckets
#define LOOKUPTABLESIZE
Definition: cluster.cpp:228
unsigned short uinT16
Definition: host.h:101
unsigned char uinT8
Definition: host.h:99
float FLOAT32
Definition: host.h:111
FLOAT64 Solve ( SOLVEFUNC  Function,
void *  FunctionParams,
FLOAT64  InitialGuess,
FLOAT64  Accuracy 
)

Definition at line 2461 of file cluster.cpp.

2481 {
2482  FLOAT64 x;
2483  FLOAT64 f;
2484  FLOAT64 Slope;
2485  FLOAT64 Delta;
2486  FLOAT64 NewDelta;
2487  FLOAT64 xDelta;
2488  FLOAT64 LastPosX, LastNegX;
2489 
2490  x = InitialGuess;
2491  Delta = INITIALDELTA;
2492  LastPosX = MAX_FLOAT32;
2493  LastNegX = -MAX_FLOAT32;
2494  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2495  while (Abs (LastPosX - LastNegX) > Accuracy) {
2496  // keep track of outer bounds of current estimate
2497  if (f < 0)
2498  LastNegX = x;
2499  else
2500  LastPosX = x;
2501 
2502  // compute the approx. slope of f(x) at the current point
2503  Slope =
2504  ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta;
2505 
2506  // compute the next solution guess */
2507  xDelta = f / Slope;
2508  x -= xDelta;
2509 
2510  // reduce the delta used for computing slope to be a fraction of
2511  //the amount moved to get to the new guess
2512  NewDelta = Abs (xDelta) * DELTARATIO;
2513  if (NewDelta < Delta)
2514  Delta = NewDelta;
2515 
2516  // compute the value of the function at the new guess
2517  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2518  }
2519  return (x);
2520 
2521 } // Solve
#define MAX_FLOAT32
Definition: host.h:124
#define DELTARATIO
#define Abs(N)
Definition: cluster.cpp:208
#define INITIALDELTA
double FLOAT64
Definition: host.h:112
FLOAT32 StandardDeviation ( PROTOTYPE Proto,
uinT16  Dimension 
)

StandardDeviation ************************************************* Parameters: Proto prototype to return standard deviation of Dimension dimension whose stddev is to be returned Operation: This routine returns the standard deviation of the prototype in the indicated dimension. Return: Standard deviation of Prototype in Dimension Exceptions: none History: 7/6/89, DSJ, Created.

Definition at line 657 of file cluster.cpp.

657  {
658  switch (Proto->Style) {
659  case spherical:
660  return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
661  case elliptical:
662  return ((FLOAT32)
663  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
664  case mixed:
665  switch (Proto->Distrib[Dimension]) {
666  case normal:
667  return ((FLOAT32)
668  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
669  case uniform:
670  case D_random:
671  return (Proto->Variance.Elliptical[Dimension]);
672  case DISTRIBUTION_COUNT:
673  ASSERT_HOST(!"Distribution count not allowed!");
674  }
675  }
676  return 0.0f;
677 } // StandardDeviation
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 * Elliptical
Definition: cluster.h:64
FLOAT32 Spherical
Definition: cluster.h:63
Definition: cluster.h:45
unsigned Style
Definition: cluster.h:74
float FLOAT32
Definition: host.h:111
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: cluster.h:59
PROTOTYPE * TestEllipticalProto ( CLUSTERER Clusterer,
CLUSTERCONFIG Config,
CLUSTER Cluster,
STATISTICS Statistics 
)

TestEllipticalProto **************************************************** Parameters: Clusterer data struct containing samples being clustered Config provides the magic number of samples that make a good cluster Cluster cluster to be made into an elliptical prototype Statistics statistical info about cluster Operation: This routine tests the specified cluster to see if ** there is a statistically significant difference between the sub-clusters that would be made if the cluster were to be split. If not, then a new prototype is formed and returned to the caller. If there is, then NULL is returned to the caller. Return: Pointer to new elliptical prototype or NULL.

Definition at line 1113 of file cluster.cpp.

1116  {
1117  // Fraction of the number of samples used as a range around 1 within
1118  // which a cluster has the magic size that allows a boost to the
1119  // FTable by kFTableBoostMargin, thus allowing clusters near the
1120  // magic size (equal to the number of sample characters) to be more
1121  // likely to stay together.
1122  const double kMagicSampleMargin = 0.0625;
1123  const double kFTableBoostMargin = 2.0;
1124 
1125  int N = Clusterer->SampleSize;
1126  CLUSTER* Left = Cluster->Left;
1127  CLUSTER* Right = Cluster->Right;
1128  if (Left == NULL || Right == NULL)
1129  return NULL;
1130  int TotalDims = Left->SampleCount + Right->SampleCount;
1131  if (TotalDims < N + 1 || TotalDims < 2)
1132  return NULL;
1133  const int kMatrixSize = N * N * sizeof(FLOAT32);
1134  FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1135  FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1136  FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
1137  // Compute a new covariance matrix that only uses essential features.
1138  for (int i = 0; i < N; ++i) {
1139  int row_offset = i * N;
1140  if (!Clusterer->ParamDesc[i].NonEssential) {
1141  for (int j = 0; j < N; ++j) {
1142  if (!Clusterer->ParamDesc[j].NonEssential)
1143  Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
1144  else
1145  Covariance[j + row_offset] = 0.0f;
1146  }
1147  } else {
1148  for (int j = 0; j < N; ++j) {
1149  if (i == j)
1150  Covariance[j + row_offset] = 1.0f;
1151  else
1152  Covariance[j + row_offset] = 0.0f;
1153  }
1154  }
1155  }
1156  double err = InvertMatrix(Covariance, N, Inverse);
1157  if (err > 1) {
1158  tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
1159  }
1160  int EssentialN = 0;
1161  for (int dim = 0; dim < N; ++dim) {
1162  if (!Clusterer->ParamDesc[dim].NonEssential) {
1163  Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
1164  ++EssentialN;
1165  } else {
1166  Delta[dim] = 0.0f;
1167  }
1168  }
1169  // Compute Hotelling's T-squared.
1170  double Tsq = 0.0;
1171  for (int x = 0; x < N; ++x) {
1172  double temp = 0.0;
1173  for (int y = 0; y < N; ++y) {
1174  temp += Inverse[y + N*x] * Delta[y];
1175  }
1176  Tsq += Delta[x] * temp;
1177  }
1178  memfree(Covariance);
1179  memfree(Inverse);
1180  memfree(Delta);
1181  // Changed this function to match the formula in
1182  // Statistical Methods in Medical Research p 473
1183  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
1184  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
1185  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
1186  int Fx = EssentialN;
1187  if (Fx > FTABLE_X)
1188  Fx = FTABLE_X;
1189  --Fx;
1190  int Fy = TotalDims - EssentialN - 1;
1191  if (Fy > FTABLE_Y)
1192  Fy = FTABLE_Y;
1193  --Fy;
1194  double FTarget = FTable[Fy][Fx];
1195  if (Config->MagicSamples > 0 &&
1196  TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
1197  TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
1198  // Give magic-sized clusters a magic FTable boost.
1199  FTarget += kFTableBoostMargin;
1200  }
1201  if (F < FTarget) {
1202  return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1203  }
1204  return NULL;
1205 }
FLOAT32 Mean[1]
Definition: cluster.h:39
#define tprintf(...)
Definition: tprintf.h:31
inT16 SampleSize
Definition: cluster.h:87
#define NULL
Definition: host.h:144
#define FTABLE_Y
Definition: cluster.cpp:32
const double FTable[FTABLE_Y][FTABLE_X]
Definition: cluster.cpp:35
PARAM_DESC * ParamDesc
Definition: cluster.h:88
Definition: cluster.h:32
inT8 NonEssential
Definition: ocrfeatures.h:48
struct sample * Right
Definition: cluster.h:37
struct sample * Left
Definition: cluster.h:36
FLOAT32 * CoVariance
Definition: cluster.cpp:173
void * Emalloc(int Size)
Definition: emalloc.cpp:35
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1548
void memfree(void *element)
Definition: freelist.cpp:30
int MagicSamples
Definition: cluster.h:55
float FLOAT32
Definition: host.h:111
#define FTABLE_X
Definition: cluster.cpp:31
unsigned SampleCount
Definition: cluster.h:35
double InvertMatrix(const float *input, int size, float *inv)
Definition: cluster.cpp:2652
uinT16 UniformBucket ( PARAM_DESC ParamDesc,
FLOAT32  x,
FLOAT32  Mean,
FLOAT32  StdDev 
)

Definition at line 2149 of file cluster.cpp.

2152  {
2153 /*
2154  ** Parameters:
2155  ** ParamDesc used to identify circular dimensions
2156  ** x value to be normalized
2157  ** Mean center of range of uniform distribution
2158  ** StdDev 1/2 the range of the uniform distribution
2159  ** Operation:
2160  ** This routine determines which bucket x falls into in the
2161  ** discrete uniform distribution defined by
2162  ** BUCKETTABLESIZE. x values which exceed the range of
2163  ** the discrete distribution are clipped.
2164  ** Return:
2165  ** Bucket number into which x falls
2166  ** Exceptions:
2167  ** None
2168  ** History:
2169  ** 6/5/89, DSJ, Created.
2170  */
2171  FLOAT32 X;
2172 
2173  // wraparound circular parameters if necessary
2174  if (ParamDesc->Circular) {
2175  if (x - Mean > ParamDesc->HalfRange)
2176  x -= ParamDesc->Range;
2177  else if (x - Mean < -ParamDesc->HalfRange)
2178  x += ParamDesc->Range;
2179  }
2180 
2181  X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
2182  if (X < 0)
2183  return 0;
2184  if (X > BUCKETTABLESIZE - 1)
2185  return (uinT16) (BUCKETTABLESIZE - 1);
2186  return (uinT16) floor((FLOAT64) X);
2187 } // UniformBucket
FLOAT32 HalfRange
Definition: ocrfeatures.h:52
#define BUCKETTABLESIZE
Definition: cluster.cpp:160
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:643
double FLOAT64
Definition: host.h:112
unsigned short uinT16
Definition: host.h:101
inT8 Circular
Definition: ocrfeatures.h:47
FLOAT32 Range
Definition: ocrfeatures.h:51
float FLOAT32
Definition: host.h:111
FLOAT64 UniformDensity ( inT32  x)

Definition at line 1973 of file cluster.cpp.

1973  {
1974 /*
1975  ** Parameters:
1976  ** x number to compute the uniform probability density for
1977  ** Operation:
1978  ** This routine computes the probability density function
1979  ** of a uniform distribution at the specified point. The
1980  ** range of the distribution is from 0 to BUCKETTABLESIZE.
1981  ** Return:
1982  ** The value of the uniform distribution at x.
1983  ** Exceptions:
1984  ** None
1985  ** History:
1986  ** 6/5/89, DSJ, Created.
1987  */
1988  static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE;
1989 
1990  if ((x >= 0.0) && (x <= BUCKETTABLESIZE))
1991  return UniformDistributionDensity;
1992  else
1993  return (FLOAT64) 0.0;
1994 } // UniformDensity
#define BUCKETTABLESIZE
Definition: cluster.cpp:160
double FLOAT64
Definition: host.h:112

Variable Documentation

const double FTable[FTABLE_Y][FTABLE_X]

Definition at line 35 of file cluster.cpp.