Extracting text - tvn-cosine/tesseract.net GitHub Wiki

How to recognize an image and extract its text using TessBaseAPI

  1. Example 1

// Example: run OCR on an image file and collect the recognized text,
// one paragraph-level element at a time, into a StringBuilder.
string dataPath = "./tessdata/";
string language = "eng";
string inputFile = "./input.png";
OcrEngineMode oem = OcrEngineMode.DEFAULT;
PageSegmentationMode psm = PageSegmentationMode.AUTO_OSD;

// Holds the extracted text; declared outside the try block so it remains
// available after the native resources have been released.
StringBuilder stringBuilder = new StringBuilder();

TessBaseAPI tessBaseAPI = new TessBaseAPI();
Pix pix = null;

try
{
    // Initialize tesseract-ocr
    if (!tessBaseAPI.Init(dataPath, language, oem))
    {
        throw new InvalidOperationException("Could not initialize tesseract.");
    }

    // Set the Page Segmentation mode
    tessBaseAPI.SetPageSegMode(psm);

    // Set the input image; the returned Pix is kept so it can be disposed later
    pix = tessBaseAPI.SetImage(inputFile);

    // Recognize image
    tessBaseAPI.Recognize();

    ResultIterator resultIterator = tessBaseAPI.GetIterator();

    // Extract text from result iterator.
    // The null guard avoids a NullReferenceException should no iterator be
    // returned (presumably when recognition produced no results - confirm
    // against the wrapper).
    // NOTE(review): if ResultIterator exposes Dispose(), release it once the
    // loop is done - confirm against the wrapper.
    if (resultIterator != null)
    {
        PageIteratorLevel pageIteratorLevel = PageIteratorLevel.RIL_PARA;
        do
        {
            stringBuilder.Append(resultIterator.GetUTF8Text(pageIteratorLevel));
        } while (resultIterator.Next(pageIteratorLevel));
    }
}
finally
{
    // Always release the resources, even when initialization or recognition
    // throws. Same order as before: the API first, then the image.
    tessBaseAPI.Dispose();
    if (pix != null)
    {
        pix.Dispose();
    }
}