Extracting text - tvn-cosine/tesseract.net GitHub Wiki
How to recognize an image and extract the text using TessBaseAPI
-
Example 1
// Example: run OCR on a single image file and collect the recognized text
// into a StringBuilder, releasing the engine and the image even when a step fails.

// Inputs for the example: tessdata directory, language code and image path.
string dataPath = "./tessdata/";
string language = "eng";
string inputFile = "./input.png";
OcrEngineMode oem = OcrEngineMode.DEFAULT;
PageSegmentationMode psm = PageSegmentationMode.AUTO_OSD;

TessBaseAPI tessBaseAPI = new TessBaseAPI();
// Declared before the try block so the finally block can release it;
// stays null if SetImage is never reached or throws.
Pix pix = null;
// Declared outside the try block so the extracted text remains available afterwards.
StringBuilder stringBuilder = new StringBuilder();
try
{
    // Initialize tesseract-ocr; Init returning false is treated as fatal.
    if (!tessBaseAPI.Init(dataPath, language, oem))
    {
        throw new Exception("Could not initialize tesseract.");
    }

    // Set the Page Segmentation mode
    tessBaseAPI.SetPageSegMode(psm);

    // Set the input image; the returned Pix is disposed in the finally block.
    pix = tessBaseAPI.SetImage(inputFile);

    // Recognize image
    tessBaseAPI.Recognize();
    // NOTE(review): GetIterator may yield no iterator when nothing was recognized — confirm
    // against the wrapper and add a null check here if so.
    ResultIterator resultIterator = tessBaseAPI.GetIterator();

    // Extract text from result iterator, one element at the chosen level per pass,
    // until Next reports there is nothing further at that level.
    PageIteratorLevel pageIteratorLevel = PageIteratorLevel.RIL_PARA;
    do
    {
        stringBuilder.Append(resultIterator.GetUTF8Text(pageIteratorLevel));
    } while (resultIterator.Next(pageIteratorLevel));
}
finally
{
    // Same release order as the original example (engine first, then image),
    // but now executed on both the success and the failure path.
    tessBaseAPI.Dispose();
    if (pix != null)
    {
        pix.Dispose();
    }
}