Extract text from a scanned pdf with images?

10,214

OCR on PDF files using Python

import os
import io
from PIL import Image
import pytesseract
from wand.image import Image as wi
import gc

def Get_text_from_image(pdf_path):
    """OCR every page of a scanned PDF and return a list of page texts.

    Each page is rasterized at 300 dpi via ImageMagick (wand), encoded as
    JPEG, and fed to Tesseract. Pages are OCR'd one at a time instead of
    buffering every JPEG blob first, which keeps peak memory low.

    Args:
        pdf_path: Path to the PDF file to process.

    Returns:
        list[str]: One OCR'd string per page, with newlines removed
        (same contract as before).
    """
    extracted_text = []
    # 300 dpi is a reasonable trade-off between OCR accuracy and memory use.
    pdf = wi(filename=pdf_path, resolution=300)
    try:
        pdfImg = pdf.convert('jpeg')
        for img in pdfImg.sequence:
            # Re-wrap the single page so make_blob() emits just that page;
            # the context manager frees the page's pixel buffer promptly.
            with wi(image=img) as page:
                blob = page.make_blob('jpeg')
            im = Image.open(io.BytesIO(blob))
            extracted_text.append(pytesseract.image_to_string(im, lang='eng'))
    finally:
        # Release ImageMagick pixel caches; they are large at 300 dpi.
        # The original version never freed them at all.
        pdf.destroy()
    return [t.replace("\n", "") for t in extracted_text]

I made a minor modification.

The code below converts all the pages of the PDF to images in sequence; at the end of the code I destroy the image sequence, because it takes a huge amount of memory to process.

def Get_text_from_image(pdf_path):
    """Extract text from a scanned PDF with Tesseract OCR.

    Pages are rasterized at 300 dpi and OCR'd one at a time (rather than
    buffering every page's JPEG blob first) to keep peak memory low. The
    per-page texts are joined into a single string with newlines replaced
    by spaces, matching the original return contract.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        str: Concatenated OCR text of all pages, newline-free.
    """
    # Local imports kept so the function is copy-paste self-contained,
    # as in the original snippet (duplicate `import gc` removed).
    import io
    import pytesseract
    from PIL import Image
    from wand.image import Image as wi

    extracted_text = []
    pdf = wi(filename=pdf_path, resolution=300)
    try:
        pdfImg = pdf.convert('jpeg')
        for img in pdfImg.sequence:
            # The context manager releases each page's pixel buffer as
            # soon as the JPEG blob has been made.
            with wi(image=img) as page:
                blob = page.make_blob('jpeg')
            im = Image.open(io.BytesIO(blob))
            text = pytesseract.image_to_string(im, lang='eng')
            # NOTE: the original used text.replace(r"\n", " "), a raw
            # string (backslash + 'n') that never matches a real newline;
            # OCR output contains actual "\n" characters.
            extracted_text.append(text.replace("\n", " "))
    finally:
        # Destroying the source document frees ImageMagick's pixel caches
        # deterministically; the original's repeated gc.collect() loops
        # were no-ops for that memory, and its finally-block destroyed
        # only the *last* page image.
        pdf.destroy()
    return ''.join(extracted_text)
Share:
10,214
Revolucion for Monica
Author by

Revolucion for Monica

I am an MSc by research postgraduate in AI. Looking to join the police ! At the moment working on political marketing and crime mapping/prevention projects

Updated on July 26, 2022

Comments

  • Revolucion for Monica
    Revolucion for Monica almost 2 years

    I've tried to extract text from a pdf created from the computer and it worked but I wasn't able to extract text from a scanned pdf, which you can find here, with images and several pages such as this one :

    enter image description here

    Here is the code I used :

    # libraries
    ## split
    from PyPDF2 import PdfFileWriter, PdfFileReader
    ## read 
    import sys
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
    from pdfminer.layout import LAParams
    import io
    # remove files
    import os
    
    # split in case there is several pages
    def pdfspliter(filename):
        """Split *filename* into one-page PDFs, parse each, then delete it.

        Each page is written to "document-page<i>.pdf", handed to
        pdfparser() for text extraction, and removed afterwards.
        """
        # Close the source file when done — the original leaked the handle.
        with open(filename, "rb") as src:
            inputpdf = PdfFileReader(src)

            for i in range(inputpdf.numPages):
                output = PdfFileWriter()
                output.addPage(inputpdf.getPage(i))
                page_path = "document-page%s.pdf" % i
                with open(page_path, "wb") as outputStream:
                    output.write(outputStream)
                pdfparser(page_path)
                os.remove(page_path)
    
    # read a given page
    def pdfparser(data):
        """Extract and print the text content of the PDF at path *data*.

        Uses pdfminer's TextConverter to accumulate every page's text in
        an in-memory buffer, then prints the full result once.
        """
        rsrcmgr = PDFResourceManager()
        retstr = io.StringIO()
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            # The original never closed fp, device, or retstr.
            with open(data, 'rb') as fp:
                for page in PDFPage.get_pages(fp):
                    interpreter.process_page(page)
            # Read the buffer once, after all pages are processed; the
            # original rebound the *data* parameter inside the loop, so a
            # zero-page PDF would print the file path instead of text.
            text = retstr.getvalue()
        finally:
            device.close()
            retstr.close()

        print(text)
    
    if __name__ == '__main__':
        # Script entry point: split and parse the PDF named on the
        # command line (raises IndexError if no argument is given).
        filename = sys.argv[1]
        pdfspliter(filename)
    

    Can you help extract text from this kind of files ?

    Update with Tesseract OCR

    I made an attempt with Tesseract OCR in Python; it extracts text from some pages of a PDF, but it really takes time and seems to stall at a certain point:

    # import the necessary packages
    from PIL import Image
    import pytesseract
    import argparse
    import cv2
    import os
    ## split
    from PyPDF2 import PdfFileWriter, PdfFileReader
    # remove
    import sys
    # 
    from pdf2image import convert_from_path
    # import all files with a name
    import glob
    
    # functions
    def pdfspliterimager(filename):
        """Convert each page of *filename* to a JPEG named "out<i>.jpg".

        Works page-by-page through temporary one-page PDFs so pdf2image
        only rasterizes a single page at a time; each temporary PDF is
        deleted once its image has been written.
        """
        # Close the source file when done — the original leaked the handle.
        with open(filename, "rb") as src:
            inputpdf = PdfFileReader(src)
            for i in range(inputpdf.numPages):
                output = PdfFileWriter()
                output.addPage(inputpdf.getPage(i))
                page_path = "document-page%s.pdf" % i
                with open(page_path, "wb") as outputStream:
                    output.write(outputStream)
                # 500 dpi rasterization; a one-page PDF yields one image,
                # so the "out%s.jpg" name (consumed by glob("out*") below)
                # is written exactly once per page.
                pages = convert_from_path(page_path, 500)
                for page in pages:
                    page.save('out%s.jpg' % i, 'JPEG')

                os.remove(page_path)
    
    # construct the argument parse and parse the arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--image", required=True,
        help="path to input image to be OCR'd")
    ap.add_argument("-p", "--preprocess", type=str, default="thresh",
        help="type of preprocessing to be done")
    args = vars(ap.parse_args())
    
    # we test if it is a pdf
    image_path = args["image"]
    # if it is a pdf we convert it to an image
    # (pdfspliterimager writes one "out<i>.jpg" per page)
    if image_path.endswith('.pdf'):
        pdfspliterimager(image_path)
    
    # for all files with out in their name
    # NOTE(review): glob("out*") also matches leftover files from earlier
    # runs in the same directory — presumably intended, but verify.
    file_names = glob.glob("out*")
    for file_name in file_names:
        # load the image and convert it to grayscale
        image = cv2.imread(file_name)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    
        # check to see if we should apply thresholding to preprocess the
        # image (Otsu picks the binarization threshold automatically)
        if args["preprocess"] == "thresh":
            gray = cv2.threshold(gray, 0, 255,
                cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    
        # make a check to see if median blurring should be done to remove
        # noise
        elif args["preprocess"] == "blur":
            gray = cv2.medianBlur(gray, 3)
    
        # write the grayscale image to disk as a temporary file so we can
        # apply OCR to it; the PID-based name is reused (and removed)
        # on every loop iteration
        filename = "{}.png".format(os.getpid())
        cv2.imwrite(filename, gray)
    
        # load the image as a PIL/Pillow image, apply OCR, and then delete
        # the temporary file
        text = pytesseract.image_to_string(Image.open(filename))
        os.remove(filename)
        print(text)
    
        # show the output images; waitKey(0) blocks until a key is
        # pressed, so processing pauses after every page
        cv2.imshow("Image", image)
        cv2.imshow("Output", gray)
        cv2.waitKey(0)
    
  • Alex
    Alex about 4 years
    This is a nice minimal code snippet that does the job. Replacing newlines with spaces would be a better default behavior IMO, but the return statement is easily customized.
  • thrinadhn
    thrinadhn about 4 years
    I added garbage collection and exception handling to destroy the image sequence and release memory. It works to some extent, not 100%, and Tesseract OCR will not give 100% accuracy — it depends upon your scanned page quality.