HTML/OCR parser using Cython, lxml, Tesseract, ImageMagick and pandas
Tested against Windows 10 / Python 3.11 / Anaconda
pip install xmlhtml2pandas
Cython and a C compiler must be installed!
import os
# Cap the OpenMP and ImageMagick thread pools at one thread each. These are
# assigned before xmlhtml2pandas is imported below; keep that order.
# NOTE(review): presumably the limits must be in the environment before the
# native libraries / subprocesses start - confirm against the package docs.
os.environ["OMP_THREAD_LIMIT"] = "1"
os.environ["MAGICK_THREAD_LIMIT"] = "1"
from xmlhtml2pandas import parse_xmlhtml, preprocess_images_and_run_tesseract
from cythondfprint import add_printer
# NOTE(review): presumably installs cythondfprint's DataFrame printer used by
# the print() calls that follow; the meaning of the argument 1 is not visible
# here - verify.
add_printer(1)
# Saved web pages (.mhtml) that are parsed one after another.
mhtml_files = (
    r"C:\Users\hansc\Downloads\Apostas Futebol _ Sportingbet.mhtml",
    r"C:\Users\hansc\Downloads\bet365 - Apostas Desportivas Online.mhtml",
    r"C:\Users\hansc\Downloads\bet365 - Apostas Desportivas Online2.mhtml",
)
for file2parse in mhtml_files:
    # The file is opened in binary mode and the open handle itself is handed
    # to the parser.
    with open(file2parse, "rb") as f:
        # NOTE(review): "html" presumably selects the HTML parser and the
        # empty tuple means "no extra options" - confirm against the
        # xmlhtml2pandas documentation.
        df_html = parse_xmlhtml(f, "html", ())
        # Show the parsed result and the dtype of each of its columns.
        print(df_html)
        print(df_html.dtypes)
# Settings for the ImageMagick preprocessing + Tesseract OCR run on Windows.
# All values are passed through unchanged as keyword arguments.
windows_ocr_kwargs = {
    "density": 200,
    "resize_percentage": 100,
    "tesser_cpus": 1,
    "image_magick_cpus": 1,
    "path_in": r"C:\Users\hansc\Desktop\testimg",
    "path_out": r"C:\Users\hansc\Desktop\testimg_outfiles",
    "magick_options": """-colorspace LinearGray -normalize -auto-level -alpha deactivate -adaptive-blur 1 -adaptive-sharpen 1 -trim -fuzz 60 -antialias -auto-gamma -auto-level -black-point-compensation -normalize -enhance -white-balance -antialias -black-threshold 4 -mean-shift 1x5+17%""",
    "magick_path": r"C:\Program Files\ImageMagick-7.1.1-Q16-HDRI\magick.exe",
    "tesseractpath": r"C:\Program Files\Tesseract-OCR\tesseract.exe",
    "tessdata_dir": r"C:\Program Files\Tesseract-OCR\tessdata",
    "tesser_options_str": "-l por+eng --oem 3 --psm 6 -c tessedit_create_hocr=1 -c hocr_font_info=1 -c tessedit_pageseg_mode=6",
    "debug": False,
    "subprocess_kwargs_tesser": None,
    "subprocess_kwargs_magick": None,
    "include_screenshots": True,
}
# The call is iterated and every yielded item is printed as it arrives.
for picture in preprocess_images_and_run_tesseract(**windows_ocr_kwargs):
    print(picture)
import os
import subprocess

# Cap every OpenMP / ImageMagick thread pool at a single thread. The values
# are written before xmlhtml2pandas is imported below; keep that order.
# NOTE(review): the KMP_* names are settings of the Intel/LLVM OpenMP
# runtime - confirm they are honoured by the Termux builds in use.
for _thread_limit_var in (
    "OMP_THREAD_LIMIT",
    "MAGICK_THREAD_LIMIT",
    "KMP_ALL_THREADS",
    "KMP_TEAMS_THREAD_LIMIT",
    "KMP_DEVICE_THREAD_LIMIT",
):
    os.environ[_thread_limit_var] = "1"

from xmlhtml2pandas import parse_xmlhtml, preprocess_images_and_run_tesseract
# Grab the current screen as PNG and store it where the OCR step reads it.
# The command is passed as an argument list (no shell involved) and its
# stdout is redirected into the target file from Python. check=True makes a
# failed capture raise CalledProcessError instead of silently leaving a
# missing or stale image for the OCR run.
with open("/sdcard/shot.png", "wb") as screenshot_file:
    subprocess.run(["screencap", "-p"], stdout=screenshot_file, check=True)
# Settings for the ImageMagick preprocessing + Tesseract OCR run inside
# Termux. All values are passed through unchanged as keyword arguments.
termux_ocr_kwargs = {
    "density": 200,
    "resize_percentage": 100,
    "tesser_cpus": 1,
    "image_magick_cpus": 1,
    "path_in": r"/sdcard/shot.png",
    "path_out": r"/sdcard/Downloadsout",
    "magick_options": """-colorspace LinearGray -normalize -auto-level -alpha deactivate -adaptive-blur 1 -adaptive-sharpen 1 -trim -fuzz 60 -antialias -auto-gamma -auto-level -black-point-compensation -normalize -enhance -white-balance -antialias -black-threshold 4 -mean-shift 1x5+17%""",
    "magick_path": r"/data/data/com.termux/files/usr/bin/magick",
    "tesseractpath": r"/data/data/com.termux/files/usr/bin/tesseract",
    "tessdata_dir": r"/data/data/com.termux/files/usr/share/tessdata_fast",
    "tesser_options_str": "-l por+eng --oem 3 --psm 6 -c tessedit_create_hocr=1 -c hocr_font_info=1 -c tessedit_pageseg_mode=6",
    "debug": False,
    "subprocess_kwargs_tesser": None,
    "subprocess_kwargs_magick": None,
    "include_screenshots": False,
}
# The call is iterated and every yielded item is printed as it arrives.
for picture in preprocess_images_and_run_tesseract(**termux_ocr_kwargs):
    print(picture)