|
tesseract
3.03
|
#include "cluster.h"#include "commandlineflags.h"#include "featdefs.h"#include "intproto.h"#include "oldlist.h"Go to the source code of this file.
Classes | |
| struct | LABELEDLISTNODE |
| struct | MERGE_CLASS_NODE |
Namespaces | |
| namespace | tesseract |
Typedefs | |
| typedef struct LABELEDLISTNODE * | LABELEDLIST |
| typedef MERGE_CLASS_NODE * | MERGE_CLASS |
Functions | |
| void | ParseArguments (int *argc, char ***argv) |
| ShapeTable * | tesseract::LoadShapeTable (const STRING &file_prefix) |
| void | tesseract::WriteShapeTable (const STRING &file_prefix, const ShapeTable &shape_table) |
| MasterTrainer * | tesseract::LoadTrainingData (int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix) |
| const char * | GetNextFilename (int argc, const char *const *argv) |
| LABELEDLIST | FindList (LIST List, char *Label) |
| LABELEDLIST | NewLabeledList (const char *Label) |
| void | ReadTrainingSamples (const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples) |
| void | WriteTrainingSamples (const FEATURE_DEFS_STRUCT &FeatureDefs, char *Directory, LIST CharList, const char *program_feature_type) |
| void | FreeTrainingSamples (LIST CharList) |
| void | FreeLabeledList (LABELEDLIST LabeledList) |
| void | FreeLabeledClassList (LIST ClassListList) |
| CLUSTERER * | SetUpForClustering (const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST CharSample, const char *program_feature_type) |
| LIST | RemoveInsignificantProtos (LIST ProtoList, BOOL8 KeepSigProtos, BOOL8 KeepInsigProtos, int N) |
| void | CleanUpUnusedData (LIST ProtoList) |
| void | MergeInsignificantProtos (LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *Config) |
| MERGE_CLASS | FindClass (LIST List, const char *Label) |
| MERGE_CLASS | NewLabeledClass (const char *Label) |
| CLASS_STRUCT * | SetUpForFloat2Int (const UNICHARSET &unicharset, LIST LabeledClassList) |
| void | Normalize (float *Values) |
| void | FreeNormProtoList (LIST CharList) |
| void | AddToNormProtosList (LIST *NormProtoList, LIST ProtoList, char *CharName) |
| int | NumberOfProtos (LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos) |
| void | allocNormProtos () |
Variables | |
| FEATURE_DEFS_STRUCT | feature_defs |
| CLUSTERCONFIG | Config |
| typedef struct LABELEDLISTNODE * LABELEDLIST |
| typedef MERGE_CLASS_NODE* MERGE_CLASS |
Definition at line 56 of file commontraining.h.
| void AddToNormProtosList | ( | LIST * | NormProtoList, |
| LIST | ProtoList, | ||
| char * | CharName | ||
| ) |
Definition at line 870 of file commontraining.cpp.
{
PROTOTYPE* Proto;
LABELEDLIST LabeledProtoList;
LabeledProtoList = NewLabeledList(CharName);
iterate(ProtoList)
{
Proto = (PROTOTYPE *) first_node (ProtoList);
LabeledProtoList->List = push(LabeledProtoList->List, Proto);
}
*NormProtoList = push(*NormProtoList, LabeledProtoList);
}
| void allocNormProtos | ( | ) |
| void CleanUpUnusedData | ( | LIST | ProtoList | ) |
Definition at line 618 of file commontraining.cpp.
{
PROTOTYPE* Prototype;
iterate(ProtoList)
{
Prototype = (PROTOTYPE *) first_node (ProtoList);
if(Prototype->Variance.Elliptical != NULL)
{
memfree(Prototype->Variance.Elliptical);
Prototype->Variance.Elliptical = NULL;
}
if(Prototype->Magnitude.Elliptical != NULL)
{
memfree(Prototype->Magnitude.Elliptical);
Prototype->Magnitude.Elliptical = NULL;
}
if(Prototype->Weight.Elliptical != NULL)
{
memfree(Prototype->Weight.Elliptical);
Prototype->Weight.Elliptical = NULL;
}
}
}
| MERGE_CLASS FindClass | ( | LIST | List, |
| const char * | Label | ||
| ) |
Definition at line 713 of file commontraining.cpp.
{
MERGE_CLASS MergeClass;
iterate (List)
{
MergeClass = (MERGE_CLASS) first_node (List);
if (strcmp (MergeClass->Label, Label) == 0)
return (MergeClass);
}
return (NULL);
} /* FindClass */
| LABELEDLIST FindList | ( | LIST | List, |
| char * | Label | ||
| ) |
Definition at line 320 of file commontraining.cpp.
{
LABELEDLIST LabeledList;
iterate (List)
{
LabeledList = (LABELEDLIST) first_node (List);
if (strcmp (LabeledList->Label, Label) == 0)
return (LabeledList);
}
return (NULL);
} /* FindList */
| void FreeLabeledClassList | ( | LIST | ClassListList | ) |
Definition at line 744 of file commontraining.cpp.
{
MERGE_CLASS MergeClass;
iterate (ClassList) /* iterate thru all of the fonts */
{
MergeClass = (MERGE_CLASS) first_node (ClassList);
free (MergeClass->Label);
FreeClass(MergeClass->Class);
delete MergeClass;
}
destroy (ClassList);
} /* FreeLabeledClassList */
| void FreeLabeledList | ( | LABELEDLIST | LabeledList | ) |
Definition at line 483 of file commontraining.cpp.
{
/*
** Parameters:
** LabeledList labeled list to be freed
** Globals: none
** Operation:
** This routine deallocates all of the memory consumed by
** a labeled list. It does not free any memory which may be
** consumed by the items in the list.
** Return: none
** Exceptions: none
** History: Fri Aug 18 17:52:45 1989, DSJ, Created.
*/
destroy(LabeledList->List);
free(LabeledList->Label);
free(LabeledList);
} /* FreeLabeledList */
| void FreeNormProtoList | ( | LIST | CharList | ) |
Definition at line 854 of file commontraining.cpp.
{
LABELEDLIST char_sample;
iterate (CharList) /* iterate thru all of the fonts */
{
char_sample = (LABELEDLIST) first_node (CharList);
FreeLabeledList (char_sample);
}
destroy (CharList);
} // FreeNormProtoList
| void FreeTrainingSamples | ( | LIST | CharList | ) |
Definition at line 453 of file commontraining.cpp.
{
/*
** Parameters:
** FontList list of all fonts in document
** Globals: none
** Operation:
** This routine deallocates all of the space allocated to
** the specified list of training samples.
** Return: none
** Exceptions: none
** History: Fri Aug 18 17:44:27 1989, DSJ, Created.
*/
LABELEDLIST char_sample;
FEATURE_SET FeatureSet;
LIST FeatureList;
iterate(CharList) { /* iterate thru all of the fonts */
char_sample = (LABELEDLIST) first_node(CharList);
FeatureList = char_sample->List;
iterate(FeatureList) { /* iterate thru all of the classes */
FeatureSet = (FEATURE_SET) first_node(FeatureList);
FreeFeatureSet(FeatureSet);
}
FreeLabeledList(char_sample);
}
destroy(CharList);
} /* FreeTrainingSamples */
| const char* GetNextFilename | ( | int | argc, |
| const char *const * | argv | ||
| ) |
Definition at line 297 of file commontraining.cpp.
{
/*
** Parameters: none
** Globals:
** tessoptind defined by tessopt sys call
** Operation:
** This routine returns the next command line argument. If
** there are no remaining command line arguments, it returns
** NULL. This routine should only be called after all option
** arguments have been parsed and removed with ParseArguments.
** Return: Next command line argument or NULL.
** Exceptions: none
** History: Fri Aug 18 09:34:12 1989, DSJ, Created.
*/
if (tessoptind < argc)
return argv[tessoptind++];
else
return NULL;
} /* GetNextFilename */
| void MergeInsignificantProtos | ( | LIST | ProtoList, |
| const char * | label, | ||
| CLUSTERER * | Clusterer, | ||
| CLUSTERCONFIG * | Config | ||
| ) |
Definition at line 553 of file commontraining.cpp.
{
PROTOTYPE *Prototype;
bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
LIST pProtoList = ProtoList;
iterate(pProtoList) {
Prototype = (PROTOTYPE *) first_node (pProtoList);
if (Prototype->Significant || Prototype->Merged)
continue;
FLOAT32 best_dist = 0.125;
PROTOTYPE* best_match = NULL;
// Find the nearest alive prototype.
LIST list_it = ProtoList;
iterate(list_it) {
PROTOTYPE* test_p = (PROTOTYPE *) first_node (list_it);
if (test_p != Prototype && !test_p->Merged) {
FLOAT32 dist = ComputeDistance(Clusterer->SampleSize,
Clusterer->ParamDesc,
Prototype->Mean, test_p->Mean);
if (dist < best_dist) {
best_match = test_p;
best_dist = dist;
}
}
}
if (best_match != NULL && !best_match->Significant) {
if (debug)
tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
best_match->NumSamples, Prototype->NumSamples,
best_match->Mean[0], best_match->Mean[1],
Prototype->Mean[0], Prototype->Mean[1]);
best_match->NumSamples = MergeClusters(Clusterer->SampleSize,
Clusterer->ParamDesc,
best_match->NumSamples,
Prototype->NumSamples,
best_match->Mean,
best_match->Mean, Prototype->Mean);
Prototype->NumSamples = 0;
Prototype->Merged = 1;
} else if (best_match != NULL) {
if (debug)
tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
Prototype->Mean[0], Prototype->Mean[1],
best_match->Mean[0], best_match->Mean[1]);
Prototype->Merged = 1;
}
}
// Mark significant those that now have enough samples.
int min_samples = (inT32) (Config->MinSamples * Clusterer->NumChar);
pProtoList = ProtoList;
iterate(pProtoList) {
Prototype = (PROTOTYPE *) first_node (pProtoList);
// Process insignificant protos that do not match a green one
if (!Prototype->Significant && Prototype->NumSamples >= min_samples &&
!Prototype->Merged) {
if (debug)
tprintf("Red proto at %g,%g becoming green\n",
Prototype->Mean[0], Prototype->Mean[1]);
Prototype->Significant = true;
}
}
} /* MergeInsignificantProtos */
| MERGE_CLASS NewLabeledClass | ( | const char * | Label | ) |
Definition at line 730 of file commontraining.cpp.
{
MERGE_CLASS MergeClass;
MergeClass = new MERGE_CLASS_NODE;
MergeClass->Label = (char*)Emalloc (strlen (Label)+1);
strcpy (MergeClass->Label, Label);
MergeClass->Class = NewClass (MAX_NUM_PROTOS, MAX_NUM_CONFIGS);
return (MergeClass);
} /* NewLabeledClass */
| LABELEDLIST NewLabeledList | ( | const char * | Label | ) |
Definition at line 352 of file commontraining.cpp.
{
LABELEDLIST LabeledList;
LabeledList = (LABELEDLIST) Emalloc (sizeof (LABELEDLISTNODE));
LabeledList->Label = (char*)Emalloc (strlen (Label)+1);
strcpy (LabeledList->Label, Label);
LabeledList->List = NIL_LIST;
LabeledList->SampleCount = 0;
LabeledList->font_sample_count = 0;
return (LabeledList);
} /* NewLabeledList */
| void Normalize | ( | float * | Values | ) |
Definition at line 837 of file commontraining.cpp.
{
register float Slope;
register float Intercept;
register float Normalizer;
Slope = tan (Values [2] * 2 * PI);
Intercept = Values [1] - Slope * Values [0];
Normalizer = 1 / sqrt (Slope * Slope + 1.0);
Values [0] = Slope * Normalizer;
Values [1] = - Normalizer;
Values [2] = Intercept * Normalizer;
} // Normalize
| int NumberOfProtos | ( | LIST | ProtoList, |
| BOOL8 | CountSigProtos, | ||
| BOOL8 | CountInsigProtos | ||
| ) |
Definition at line 888 of file commontraining.cpp.
{
int N = 0;
PROTOTYPE *Proto;
iterate(ProtoList)
{
Proto = (PROTOTYPE *) first_node ( ProtoList );
if (( Proto->Significant && CountSigProtos ) ||
( ! Proto->Significant && CountInsigProtos ) )
N++;
}
return(N);
}
| void ParseArguments | ( | int * | argc, |
| char *** | argv | ||
| ) |
Definition at line 89 of file commontraining.cpp.
{
STRING usage;
if (*argc) {
usage += (*argv)[0];
}
usage += " [.tr files ...]";
tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
// Record the index of the first non-flag argument to 1, since we set
// remove_flags to true when parsing the flags.
tessoptind = 1;
// Set some global values based on the flags.
Config.MinSamples =
MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_min_samples_fraction)));
Config.MaxIllegal =
MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_max_illegal)));
Config.Independence =
MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_independence)));
Config.Confidence =
MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_confidence)));
// Set additional parameters from config file if specified.
if (!FLAGS_configfile.empty()) {
tesseract::ParamUtils::ReadParamsFile(
FLAGS_configfile.c_str(),
tesseract::SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
ccutil.params());
}
}
| void ReadTrainingSamples | ( | const FEATURE_DEFS_STRUCT & | feature_defs, |
| const char * | feature_name, | ||
| int | max_samples, | ||
| UNICHARSET * | unicharset, | ||
| FILE * | file, | ||
| LIST * | training_samples | ||
| ) |
Definition at line 383 of file commontraining.cpp.
{
/*
** Parameters:
** file open text file to read samples from
** Globals: none
** Operation:
** This routine reads training samples from a file and
** places them into a data structure which organizes the
** samples by FontName and CharName. It then returns this
** data structure.
** Return: none
** Exceptions: none
** History: Fri Aug 18 13:11:39 1989, DSJ, Created.
** Tue May 17 1998 simplifications to structure, illiminated
** font, and feature specification levels of structure.
*/
char buffer[2048];
char unichar[UNICHAR_LEN + 1];
LABELEDLIST char_sample;
FEATURE_SET feature_samples;
CHAR_DESC char_desc;
int i;
int feature_type = ShortNameToFeatureType(feature_defs, feature_name);
// Zero out the font_sample_count for all the classes.
LIST it = *training_samples;
iterate(it) {
char_sample = reinterpret_cast<LABELEDLIST>(first_node(it));
char_sample->font_sample_count = 0;
}
while (fgets(buffer, 2048, file) != NULL) {
if (buffer[0] == '\n')
continue;
sscanf(buffer, "%*s %s", unichar);
if (unicharset != NULL && !unicharset->contains_unichar(unichar)) {
unicharset->unichar_insert(unichar);
if (unicharset->size() > MAX_NUM_CLASSES) {
tprintf("Error: Size of unicharset in training is "
"greater than MAX_NUM_CLASSES\n");
exit(1);
}
}
char_sample = FindList(*training_samples, unichar);
if (char_sample == NULL) {
char_sample = NewLabeledList(unichar);
*training_samples = push(*training_samples, char_sample);
}
char_desc = ReadCharDescription(feature_defs, file);
feature_samples = char_desc->FeatureSets[feature_type];
if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
char_sample->List = push(char_sample->List, feature_samples);
char_sample->SampleCount++;
char_sample->font_sample_count++;
} else {
FreeFeatureSet(feature_samples);
}
for (i = 0; i < char_desc->NumFeatureSets; i++) {
if (feature_type != i)
FreeFeatureSet(char_desc->FeatureSets[i]);
}
free(char_desc);
}
} // ReadTrainingSamples
| LIST RemoveInsignificantProtos | ( | LIST | ProtoList, |
| BOOL8 | KeepSigProtos, | ||
| BOOL8 | KeepInsigProtos, | ||
| int | N | ||
| ) |
Definition at line 645 of file commontraining.cpp.
{
LIST NewProtoList = NIL_LIST;
LIST pProtoList;
PROTOTYPE* Proto;
PROTOTYPE* NewProto;
int i;
pProtoList = ProtoList;
iterate(pProtoList)
{
Proto = (PROTOTYPE *) first_node (pProtoList);
if ((Proto->Significant && KeepSigProtos) ||
(!Proto->Significant && KeepInsigProtos))
{
NewProto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE));
NewProto->Mean = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
NewProto->Significant = Proto->Significant;
NewProto->Style = Proto->Style;
NewProto->NumSamples = Proto->NumSamples;
NewProto->Cluster = NULL;
NewProto->Distrib = NULL;
for (i=0; i < N; i++)
NewProto->Mean[i] = Proto->Mean[i];
if (Proto->Variance.Elliptical != NULL)
{
NewProto->Variance.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
for (i=0; i < N; i++)
NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
}
else
NewProto->Variance.Elliptical = NULL;
//---------------------------------------------
if (Proto->Magnitude.Elliptical != NULL)
{
NewProto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
for (i=0; i < N; i++)
NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
}
else
NewProto->Magnitude.Elliptical = NULL;
//------------------------------------------------
if (Proto->Weight.Elliptical != NULL)
{
NewProto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
for (i=0; i < N; i++)
NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
}
else
NewProto->Weight.Elliptical = NULL;
NewProto->TotalMagnitude = Proto->TotalMagnitude;
NewProto->LogMagnitude = Proto->LogMagnitude;
NewProtoList = push_last(NewProtoList, NewProto);
}
}
FreeProtoList(&ProtoList);
return (NewProtoList);
} /* RemoveInsignificantProtos */
| CLUSTERER* SetUpForClustering | ( | const FEATURE_DEFS_STRUCT & | FeatureDefs, |
| LABELEDLIST | CharSample, | ||
| const char * | program_feature_type | ||
| ) |
Definition at line 502 of file commontraining.cpp.
{
/*
** Parameters:
** char_sample: LABELEDLIST that holds all the feature information for a
** given character.
** Globals:
** None
** Operation:
** This routine reads samples from a LABELEDLIST and enters
** those samples into a clusterer data structure. This
** data structure is then returned to the caller.
** Return:
** Pointer to new clusterer data structure.
** Exceptions:
** None
** History:
** 8/16/89, DSJ, Created.
*/
uinT16 N;
int i, j;
FLOAT32 *Sample = NULL;
CLUSTERER *Clusterer;
inT32 CharID;
LIST FeatureList = NULL;
FEATURE_SET FeatureSet = NULL;
int desc_index = ShortNameToFeatureType(FeatureDefs, program_feature_type);
N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
FeatureList = char_sample->List;
CharID = 0;
iterate(FeatureList) {
FeatureSet = (FEATURE_SET) first_node(FeatureList);
for (i = 0; i < FeatureSet->MaxNumFeatures; i++) {
if (Sample == NULL)
Sample = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
for (j = 0; j < N; j++)
Sample[j] = FeatureSet->Features[i]->Params[j];
MakeSample (Clusterer, Sample, CharID);
}
CharID++;
}
if ( Sample != NULL ) free( Sample );
return( Clusterer );
} /* SetUpForClustering */
| CLASS_STRUCT* SetUpForFloat2Int | ( | const UNICHARSET & | unicharset, |
| LIST | LabeledClassList | ||
| ) |
SetUpForFloat2Int
Definition at line 774 of file commontraining.cpp.
{
MERGE_CLASS MergeClass;
CLASS_TYPE Class;
int NumProtos;
int NumConfigs;
int NumWords;
int i, j;
float Values[3];
PROTO NewProto;
PROTO OldProto;
BIT_VECTOR NewConfig;
BIT_VECTOR OldConfig;
// printf("Float2Int ...\n");
CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()];
iterate(LabeledClassList)
{
UnicityTableEqEq<int> font_set;
MergeClass = (MERGE_CLASS) first_node (LabeledClassList);
Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label)];
NumProtos = MergeClass->Class->NumProtos;
NumConfigs = MergeClass->Class->NumConfigs;
font_set.move(&MergeClass->Class->font_set);
Class->NumProtos = NumProtos;
Class->MaxNumProtos = NumProtos;
Class->Prototypes = (PROTO) Emalloc (sizeof(PROTO_STRUCT) * NumProtos);
for(i=0; i < NumProtos; i++)
{
NewProto = ProtoIn(Class, i);
OldProto = ProtoIn(MergeClass->Class, i);
Values[0] = OldProto->X;
Values[1] = OldProto->Y;
Values[2] = OldProto->Angle;
Normalize(Values);
NewProto->X = OldProto->X;
NewProto->Y = OldProto->Y;
NewProto->Length = OldProto->Length;
NewProto->Angle = OldProto->Angle;
NewProto->A = Values[0];
NewProto->B = Values[1];
NewProto->C = Values[2];
}
Class->NumConfigs = NumConfigs;
Class->MaxNumConfigs = NumConfigs;
Class->font_set.move(&font_set);
Class->Configurations = (BIT_VECTOR*) Emalloc (sizeof(BIT_VECTOR) * NumConfigs);
NumWords = WordsInVectorOfSize(NumProtos);
for(i=0; i < NumConfigs; i++)
{
NewConfig = NewBitVector(NumProtos);
OldConfig = MergeClass->Class->Configurations[i];
for(j=0; j < NumWords; j++)
NewConfig[j] = OldConfig[j];
Class->Configurations[i] = NewConfig;
}
}
return float_classes;
} // SetUpForFloat2Int
| void WriteTrainingSamples | ( | const FEATURE_DEFS_STRUCT & | FeatureDefs, |
| char * | Directory, | ||
| LIST | CharList, | ||
| const char * | program_feature_type | ||
| ) |
Definition at line 51 of file commontraining.cpp.
Definition at line 52 of file commontraining.cpp.