Creating a searchable Pdf - tvn-cosine/tesseract.net GitHub Wiki
How to create a searchable PDF from an image
-
Example 1
// Example: run tesseract OCR on a single image and export the recognized
// page through PdfRenderer. Native-backed objects are released in a
// finally block so they are disposed even when a step throws.
string dataPath = "./tessdata/";
string language = "eng";
string inputFile = "./input.png";
OcrEngineMode oem = OcrEngineMode.DEFAULT;
PageSegmentationMode psm = PageSegmentationMode.AUTO_OSD;

TessBaseAPI tessBaseAPI = new TessBaseAPI();
// Assigned by SetImage below; declared here so the finally block can dispose it.
Pix pix = null;
try
{
    // Initialize tesseract-ocr
    if (!tessBaseAPI.Init(dataPath, language, oem))
    {
        throw new InvalidOperationException("Could not initialize tesseract.");
    }

    // Set the Page Segmentation mode
    tessBaseAPI.SetPageSegMode(psm);

    // Set the input image; the returned Pix is owned by this code and must be disposed.
    pix = tessBaseAPI.SetImage(inputFile);

    // Recognize image
    tessBaseAPI.Recognize();

    // Ensure input name is set
    tessBaseAPI.SetInputName(inputFile);

    string tessDataPath = tessBaseAPI.GetDatapath();

    // Absolute input path with only its trailing extension removed.
    // Path.ChangeExtension(path, null) strips just the final extension, unlike
    // string.Replace, which would also rewrite matching text inside directory
    // names and throws when the file has no extension at all.
    // NOTE(review): presumably PdfRenderer appends ".pdf" to this base name - confirm.
    string outputName = System.IO.Path.ChangeExtension(
        System.IO.Path.GetFullPath(inputFile), null);

    // Call pdf renderer and export pdf
    using (var pdfRenderer = new PdfRenderer(outputName, tessDataPath, false))
    {
        pdfRenderer.BeginDocument("tesseract.net searchable Pdf generation");
        pdfRenderer.AddImage(tessBaseAPI);
        pdfRenderer.EndDocument();
    }
}
finally
{
    // Same disposal order as before: engine first, then the image.
    tessBaseAPI.Dispose();
    if (pix != null)
    {
        pix.Dispose();
    }
}