Creating a searchable Pdf - tvn-cosine/tesseract.net GitHub Wiki

How to create a searchable PDF from an image

  1. Example 1

// Example: run OCR on a single image and export the result through PdfRenderer.
//
// Inputs (edit to suit):
//   dataPath  - directory passed to TessBaseAPI.Init as the tessdata location
//   language  - language code passed to TessBaseAPI.Init
//   inputFile - image file to recognize
string dataPath = "./tessdata/";
string language = "eng";
string inputFile = "./input.png";
OcrEngineMode oem = OcrEngineMode.DEFAULT;
PageSegmentationMode psm = PageSegmentationMode.AUTO_OSD;

TessBaseAPI tessBaseAPI = new TessBaseAPI();
Pix pix = null;

// try/finally guarantees the API object and the image are released even when
// initialization, recognition, or rendering throws.
try
{
    // Initialize tesseract-ocr
    if (!tessBaseAPI.Init(dataPath, language, oem))
    {
        throw new InvalidOperationException("Could not initialize tesseract.");
    }

    // Set the Page Segmentation mode
    tessBaseAPI.SetPageSegMode(psm);

    // Set the input image; the returned Pix is owned by this code and is
    // disposed in the finally block below.
    pix = tessBaseAPI.SetImage(inputFile);

    // Recognize image
    tessBaseAPI.Recognize();

    // Ensure input name is set
    tessBaseAPI.SetInputName(inputFile);

    string tessDataPath = tessBaseAPI.GetDatapath();

    // Output base name = absolute input path with only its trailing extension
    // removed. Path.ChangeExtension touches just the final extension, so a
    // directory that happens to contain the same text (e.g. "scans.png/") is
    // left intact, and an extension-less input does not throw.
    // NOTE(review): the renderer presumably appends ".pdf" to this base name — confirm.
    string outputName = System.IO.Path.ChangeExtension(
        System.IO.Path.GetFullPath(inputFile), null);

    // Call pdf renderer and export pdf
    // NOTE(review): the third argument looks like a "text only" switch — confirm
    // against the PdfRenderer constructor.
    using (var pdfRenderer = new PdfRenderer(outputName, tessDataPath, false))
    {
        pdfRenderer.BeginDocument("tesseract.net searchable Pdf generation");
        pdfRenderer.AddImage(tessBaseAPI);
        pdfRenderer.EndDocument();
    }
}
finally
{
    // Same release order as before: API first, then the image.
    tessBaseAPI.Dispose();

    // pix stays null if an exception occurred before SetImage returned.
    if (pix != null)
    {
        pix.Dispose();
    }
}