notes-on-optical-printer-te.../extract/ocr.py

21 lines
544 B
Python

# Requires: pip install pytesseract PyMuPDF Pillow opencv-python
# (plus a system Tesseract install; the binary path is configured below).
import io
import os

import cv2
import fitz
import pytesseract
from PIL import Image
# Render every page of the source PDF to a PNG under pages/ and print the
# text Tesseract recognizes on each page to stdout.

# Path to the Tesseract executable that pytesseract invokes.
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

# Input document; like the output directory, this is resolved relative to
# the current working directory.
file = "../original/NOTES_ON_OPTICAL_PRINTER_TECHNIQUE.pdf"

# pix.save() does not create missing parent directories, so make sure the
# output directory exists before the first page is written.
os.makedirs("pages", exist_ok=True)

# Open the document as a context manager so it is closed even when
# rendering or OCR raises part-way through the file.
with fitz.open(file) as pdf_file:
    for page in pdf_file:
        # Rasterize at 300 DPI and keep the image on disk. page.number is
        # PyMuPDF's 0-based page index, so the first file is page-0.png.
        pix = page.get_pixmap(dpi=300)
        filePath = "pages/page-%i.png" % page.number
        pix.save(filePath)

        # cv2.imread signals failure by returning None instead of raising;
        # stop here with a clear message rather than handing None to
        # pytesseract.
        image = cv2.imread(filePath)
        if image is None:
            raise RuntimeError("could not read rendered page image: %s" % filePath)

        # --psm 6: treat the page as a single uniform block of text.
        # --oem 3: let Tesseract pick its default OCR engine.
        text = pytesseract.image_to_string(image, lang='eng', config='--psm 6 --oem 3')
        print(text)