21 lines
544 B
Python
21 lines
544 B
Python
|
# Requirements: pip install pytesseract PyMuPDF Pillow opencv-python
# (the Tesseract OCR executable itself must also be installed on the system)
|
||
|
|
||
|
import io
import os

import cv2
import fitz
import pytesseract
from PIL import Image
|
||
|
|
||
|
# Tell pytesseract which Tesseract executable to invoke. The path is
# hard-coded (presumably for a Linux install — adjust it if tesseract lives
# elsewhere on this machine).
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'
|
||
|
# Source PDF to OCR and the directory that receives one rendered PNG per page.
file = "../original/NOTES_ON_OPTICAL_PRINTER_TECHNIQUE.pdf"
output_dir = "pages"

# Pixmap.save() does not create missing directories, so make sure the
# output directory exists before the first page is written.
os.makedirs(output_dir, exist_ok=True)

# Open the document as a context manager so it is closed even when
# rendering or OCR raises part-way through.
with fitz.open(file) as pdf_file:
    for page in pdf_file:
        # Rasterize the page at 300 DPI and keep the PNG on disk.
        pix = page.get_pixmap(dpi=300)
        filePath = os.path.join(output_dir, "page-%i.png" % page.number)
        pix.save(filePath)

        # cv2.imread() signals failure by returning None instead of raising;
        # fail loudly here rather than with an obscure error further down.
        image = cv2.imread(filePath)
        if image is None:
            raise OSError("could not read rendered page image: %s" % filePath)

        # OpenCV loads images as BGR while pytesseract assumes RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Run Tesseract with page segmentation mode 6 and OCR engine mode 3
        # on the English language model, then emit the recognized text.
        text = pytesseract.image_to_string(image, lang='eng', config='--psm 6 --oem 3')
        print(text)
|