|
tesseract
3.03
|
#include <renderer.h>
Public Member Functions | |
| TessPDFRenderer (const char *datadir) | |
Protected Member Functions | |
| virtual bool | BeginDocumentHandler () |
| virtual bool | AddImageHandler (TessBaseAPI *api) |
| virtual bool | EndDocumentHandler () |
Renders Tesseract output into a searchable PDF.
Definition at line 186 of file renderer.h.
| tesseract::TessPDFRenderer::TessPDFRenderer | ( | const char * | datadir | ) |
Definition at line 32 of file pdfrenderer.cpp.
// Constructor initializer and body. Forwards "PDF" and "pdf" to the
// TessResultRenderer base constructor (presumably the renderer name and the
// output file extension -- confirm against TessResultRenderer). Resets the
// object counter obj_ to 0, stores datadir (BeginDocumentHandler reads
// "<datadir>/pdf.ttf" from it), and seeds offsets_ with one 0 entry for
// index 0, which the xref loop in EndDocumentHandler skips.
: TessResultRenderer("PDF", "pdf") { obj_ = 0; datadir_ = datadir; offsets_.push_back(0); }
| bool tesseract::TessPDFRenderer::AddImageHandler | ( | TessBaseAPI * | api | ) | [protected, virtual] |
Implements tesseract::TessResultRenderer.
Definition at line 583 of file pdfrenderer.cpp.
{
  // Writes the PDF objects for one page: the /Page dictionary, a
  // Flate-compressed /Contents stream built from GetPDFTextObjects(), and
  // the image object produced by fileToPDFObj() or pixToPDFObj().
  //
  // Returns false when the input image is missing, the resolution is not
  // positive, the text objects cannot be produced or compressed, or no
  // image object can be created. Returns true otherwise.
  char buf[kBasicBufSize];
  Pix *pix = api->GetInputImage();
  // NOTE(review): the cast presumably drops constness because fileToPDFObj()
  // takes a non-const filename -- confirm against its declaration.
  char *filename = (char *)api->GetInputName();
  int ppi = api->GetSourceYResolution();
  if (!pix || ppi <= 0)
    return false;

  // Page size in PDF units: pixels * 72 / pixels-per-inch.
  double width = pixGetWidth(pix) * 72.0 / ppi;
  double height = pixGetHeight(pix) * 72.0 / ppi;

  // PAGE
  snprintf(buf, sizeof(buf),
           "%ld 0 obj\n"
           "<<\n"
           " /Type /Page\n"
           " /Parent %ld 0 R\n"
           " /MediaBox [0 0 %.2f %.2f]\n"
           " /Contents %ld 0 R\n"
           " /Resources\n"
           " <<\n"
           " /XObject << /Im1 %ld 0 R >>\n"
           " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
           " /Font << /f-0-0 %ld 0 R >>\n"
           " >>\n"
           ">>\n"
           "endobj\n",
           obj_,
           2L,  // Pages object
           width,
           height,
           obj_ + 1,  // Contents object
           obj_ + 2,  // Image object
           3L);       // Type0 Font
  // Remember this page's object number; EndDocumentHandler lists pages_ in
  // the /Kids array.
  pages_.push_back(obj_);
  AppendPDFObject(buf);

  // CONTENTS
  char* pdftext = GetPDFTextObjects(api, width, height, imagenum());
  if (!pdftext)
    return false;
  long pdftext_len = strlen(pdftext);
  unsigned char *pdftext_casted = reinterpret_cast<unsigned char *>(pdftext);
  // Zero-initialized so a failed compression never leaves a garbage length.
  size_t len = 0;
  unsigned char *comp_pdftext =
      zlibCompress(pdftext_casted,
                   pdftext_len,
                   &len);
  // The uncompressed text is not needed once it has been compressed.
  delete[] pdftext;
  if (!comp_pdftext)
    return false;  // compression failed; nothing valid to write
  long comp_pdftext_len = static_cast<long>(len);

  snprintf(buf, sizeof(buf),
           "%ld 0 obj\n"
           "<<\n"
           " /Length %ld /Filter /FlateDecode\n"
           ">>\n"
           "stream\n", obj_, comp_pdftext_len);
  AppendString(buf);
  // The object is written in pieces, so its total size is accumulated by
  // hand and reported once through AppendPDFObjectDIY().
  long objsize = strlen(buf);
  AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
  objsize += comp_pdftext_len;
  lept_free(comp_pdftext);

  snprintf(buf, sizeof(buf),
           "endstream\n"
           "endobj\n");
  AppendString(buf);
  objsize += strlen(buf);
  AppendPDFObjectDIY(objsize);

  // IMAGE: try the original input file first and fall back to the in-memory
  // Pix if that fails.
  char *pdf_object;
  if (!fileToPDFObj(filename, obj_, &pdf_object, &objsize)) {
    if (!pixToPDFObj(pix, obj_, &pdf_object, &objsize)) {
      return false;
    }
  }
  AppendData(pdf_object, objsize);
  AppendPDFObjectDIY(objsize);
  delete[] pdf_object;
  return true;
}
| bool tesseract::TessPDFRenderer::BeginDocumentHandler | ( | ) | [protected, virtual] |
Reimplemented from tesseract::TessResultRenderer.
Definition at line 254 of file pdfrenderer.cpp.
{
  // Writes the fixed preamble of the PDF: the header, the catalog
  // (object 1), a placeholder for the /Pages object (object 2), and the
  // font objects 3-7, with the data of "<datadir_>/pdf.ttf" embedded as
  // object 7.
  //
  // Returns false if the font file cannot be opened, sized, or read
  // completely. Returns true otherwise.
  char buf[kBasicBufSize];

  // Header: the version line followed by a comment line made of '%' and
  // four bytes with the high bit set.
  snprintf(buf, sizeof(buf),
           "%%PDF-1.5\n"
           "%%%c%c%c%c\n",
           0xDE, 0xAD, 0xBE, 0xEB);
  AppendPDFObject(buf);

  // CATALOG
  snprintf(buf, sizeof(buf),
           "1 0 obj\n"
           "<<\n"
           " /Type /Catalog\n"
           " /Pages %ld 0 R\n"
           ">>\n"
           "endobj\n", 2L);
  AppendPDFObject(buf);

  // We are reserving object #2 for the /Pages
  // object, which I am going to create and write
  // at the end of the PDF file.
  AppendPDFObject("");

  // TYPE0 FONT
  snprintf(buf, sizeof(buf),
           "3 0 obj\n"
           "<<\n"
           " /BaseFont /GlyphLessFont\n"
           " /DescendantFonts [ %ld 0 R ]\n"
           " /Encoding /Identity-H\n"
           " /Subtype /Type0\n"
           " /ToUnicode %ld 0 R\n"
           " /Type /Font\n"
           ">>\n"
           "endobj\n",
           4L,  // CIDFontType2 font
           5L   // ToUnicode
           );
  AppendPDFObject(buf);

  // CIDFONTTYPE2
  snprintf(buf, sizeof(buf),
           "4 0 obj\n"
           "<<\n"
           " /BaseFont /GlyphLessFont\n"
           " /CIDToGIDMap /Identity\n"
           " /CIDSystemInfo\n"
           " <<\n"
           " /Ordering (Identity)\n"
           " /Registry (Adobe)\n"
           " /Supplement 0\n"
           " >>\n"
           " /FontDescriptor %ld 0 R\n"
           " /Subtype /CIDFontType2\n"
           " /Type /Font\n"
           " /DW 1000\n"
           ">>\n"
           "endobj\n",
           6L  // Font descriptor
           );
  AppendPDFObject(buf);

  const char *stream =
      "/CIDInit /ProcSet findresource begin\n"
      "12 dict begin\n"
      "begincmap\n"
      "/CIDSystemInfo\n"
      "<<\n"
      " /Registry (Adobe)\n"
      " /Ordering (UCS)\n"
      " /Supplement 0\n"
      ">> def\n"
      "/CMapName /Adobe-Identify-UCS def\n"
      "/CMapType 2 def\n"
      "1 begincodespacerange\n"
      "<0000> <FFFF>\n"
      "endcodespacerange\n"
      "1 beginbfrange\n"
      "<0000> <FFFF> <0000>\n"
      "endbfrange\n"
      "endcmap\n"
      "CMapName currentdict /CMap defineresource pop\n"
      "end\n"
      "end\n";

  // TOUNICODE
  snprintf(buf, sizeof(buf),
           "5 0 obj\n"
           "<< /Length %lu >>\n"
           "stream\n"
           "%s"
           "endstream\n"
           "endobj\n", (unsigned long) strlen(stream), stream);
  AppendPDFObject(buf);

  // TODO(jbreiden) Fix the FontBBox entry. And of course make
  // the font data match the descriptor.
  // FONT DESCRIPTOR
  snprintf(buf, sizeof(buf),
           "6 0 obj\n"
           "<<\n"
           " /Ascent 1000\n"
           " /CapHeight 1000\n"
           " /Descent 0\n"  // Nothing goes below baseline
           " /Flags 4\n"
           " /FontBBox [ 0 0 1000 1000 ]\n"
           " /FontFile2 %ld 0 R\n"
           " /FontName /GlyphLessFont\n"
           " /ItalicAngle 0\n"
           " /StemV 80\n"
           " /Type /FontDescriptor\n"
           ">>\n"
           "endobj\n",
           7L  // Font data
           );
  AppendPDFObject(buf);

  // Load the whole font file into memory so it can be embedded verbatim.
  snprintf(buf, sizeof(buf), "%s/pdf.ttf", datadir_);
  FILE *fp = fopen(buf, "rb");
  if (!fp)
    return false;
  fseek(fp, 0, SEEK_END);
  long int size = ftell(fp);
  fseek(fp, 0, SEEK_SET);
  if (size < 0) {
    // ftell() failed; a negative size must not reach new[] or /Length.
    fclose(fp);
    return false;
  }
  char *buffer = new char[size];
  if (fread(buffer, 1, size, fp) != static_cast<size_t>(size)) {
    // A short read would embed a truncated font under a wrong /Length.
    fclose(fp);
    delete[] buffer;
    return false;
  }
  fclose(fp);

  // FONTFILE2
  snprintf(buf, sizeof(buf),
           "7 0 obj\n"
           "<<\n"
           " /Length %ld\n"
           " /Length1 %ld\n"
           ">>\n"
           "stream\n", size, size);
  AppendString(buf);
  // The object is written in pieces, so its total size is accumulated by
  // hand and reported once through AppendPDFObjectDIY().
  size_t objsize = strlen(buf);
  AppendData(buffer, size);
  objsize += size;
  // AddImageHandler frees its buffers right after AppendData(), so the data
  // is not retained by pointer; release the font buffer the same way.
  delete[] buffer;

  snprintf(buf, sizeof(buf),
           "endstream\n"
           "endobj\n");
  AppendString(buf);
  objsize += strlen(buf);
  AppendPDFObjectDIY(objsize);
  return true;
}
| bool tesseract::TessPDFRenderer::EndDocumentHandler | ( | ) | [protected, virtual] |
Reimplemented from tesseract::TessResultRenderer.
Definition at line 662 of file pdfrenderer.cpp.
{
  // Finishes the PDF: fills in the reserved /Pages object, then writes the
  // Info dictionary, the cross-reference table, and the trailer.
  // Always returns true.
  char buf[kBasicBufSize];

  // We reserved the /Pages object number early, so that the /Page
  // objects could refer to their parent. We finally have enough
  // information to go fill it in. Using lower level calls to manipulate
  // the offset record in two spots, because we are placing objects
  // out of order in the file.
  // PAGES
  const long int kPagesObjectNumber = 2;
  offsets_[kPagesObjectNumber] = offsets_.back();  // manipulation #1
  snprintf(buf, sizeof(buf),
           "%ld 0 obj\n"
           "<<\n"
           " /Type /Pages\n"
           " /Kids [ ", kPagesObjectNumber);
  AppendString(buf);
  size_t pages_objsize = strlen(buf);

  // One indirect reference per page recorded by AddImageHandler.
  for (size_t i = 0; i < pages_.size(); i++) {
    snprintf(buf, sizeof(buf),
             "%ld 0 R ", pages_[i]);
    AppendString(buf);
    pages_objsize += strlen(buf);
  }

  // The cast pins the argument to the int that %d expects, whatever type
  // pages_.size() returns.
  snprintf(buf, sizeof(buf),
           "]\n"
           " /Count %d\n"
           ">>\n"
           "endobj\n", static_cast<int>(pages_.size()));
  AppendString(buf);
  pages_objsize += strlen(buf);
  offsets_.back() += pages_objsize;  // manipulation #2

  // INFO
  // NOTE(review): title() is inserted unescaped; a title containing ')' or
  // '\' would presumably produce a malformed string object -- confirm
  // whether callers can pass such titles.
  char* datestr = l_getFormattedDate();
  snprintf(buf, sizeof(buf),
           "%ld 0 obj\n"
           "<<\n"
           " /Producer (Tesseract %s)\n"
           " /CreationDate (D:%s)\n"
           " /Title (%s)"
           ">>\n"
           "endobj\n", obj_, VERSION, datestr, title());
  lept_free(datestr);
  AppendPDFObject(buf);

  // XREF: entry 0 is the fixed free entry; entries 1..obj_-1 come from the
  // recorded offsets.
  snprintf(buf, sizeof(buf),
           "xref\n"
           "0 %ld\n"
           "0000000000 65535 f \n", obj_);
  AppendString(buf);
  // The index is long to match obj_, which is formatted with %ld throughout.
  for (long i = 1; i < obj_; i++) {
    snprintf(buf, sizeof(buf), "%010ld 00000 n \n", offsets_[i]);
    AppendString(buf);
  }

  // TRAILER: the Info object was the last one appended, so it is obj_ - 1;
  // offsets_.back() is where the xref table starts.
  snprintf(buf, sizeof(buf),
           "trailer\n"
           "<<\n"
           " /Size %ld\n"
           " /Root %ld 0 R\n"
           " /Info %ld 0 R\n"
           ">>\n"
           "startxref\n"
           "%ld\n"
           "%%%%EOF\n",
           obj_,
           1L,        // catalog
           obj_ - 1,  // info
           offsets_.back());
  AppendString(buf);
  return true;
}