Pdfminer python 3.5

36,824

Solution 1

There is a solution for Python 3.5: you need pdfminer.six. Under win10 I could easy install it with

pip install pdfminer.six

You can check the installed version with

pdfminer.__version__

I haven't tested it intensively yet. But I could run the following code for the conversion pdf→text and pdf→html

Solution 2

Improved solution (Dez 2016)

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter,TextConverter,XMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import io

def convert(case,fname, pages=None):
    if not pages: pagenums = set();
    else:         pagenums = set(pages);      
    manager = PDFResourceManager() 
    codec = 'utf-8'
    caching = True

    if case == 'text' :
        output = io.StringIO()
        converter = TextConverter(manager, output, codec=codec, laparams=LAParams())     
    if case == 'HTML' :
        output = io.BytesIO()
        converter = HTMLConverter(manager, output, codec=codec, laparams=LAParams())

    interpreter = PDFPageInterpreter(manager, converter)   
    infile = open(fname, 'rb')

    for page in PDFPage.get_pages(infile, pagenums,caching=caching, check_extractable=True):
        interpreter.process_page(page)

    convertedPDF = output.getvalue()  

    infile.close(); converter.close(); output.close()
    return convertedPDF

#//////////// main ///////////////////////
filePDF  = 'myDir//myPDF.pdf'     # input
fileHTML = 'myDir//myHTML.html'   # output
fileTXT  = 'myDir//myTXT.txt'     # output

case = "HTML"

if case == 'HTML' :
    convertedPDF = convert('HTML', filePDF, pages=[0,1])
    fileConverted = open(fileHTML, "wb", encoding="utf-8")
if case == 'text' :
    convertedPDF = convert('text', filePDF, pages=[0,1])
    fileConverted = open(fileTXT, "w", encoding="utf-8")

fileConverted.write(convertedPDF)
fileConverted.close()
#print(convertedPDF) 

Solution 3

In my case on Python 3.7 I tried using it and it worked like a charm for me!

here is the code I used:

def convert_pdf_to_txt(path_to_file):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path_to_file, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()

    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue()

    fp.close()
    device.close()
    retstr.close()
    return text
Share:
36,824
gary
Author by

gary

Updated on November 15, 2021

Comments

  • gary
    gary over 2 years

    I have followed a few tutorials around but I am not able to get this code block to run, I did the necessary switches from StringIO to BytesIO (I believe?)

    I am unsure why 'banana' is printing nothing, I think the errors might be red herrings? is it something to do with me following a python2.7 tutorial and trying to translate it to python3?

    errors: File "/Users/foo/PycharmProjects/Try/Pdfminer.py", line 28, in <module>
        banana = convert("A1.pdf")
      File "/Users/foo/PycharmProjects/Try/Pdfminer.py", line 19, in convert
        infile = file(fname, 'rb')
    NameError: name 'file' is not defined
    

    script

    from io import BytesIO
    
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    
    def convert(fname, pages=None):
        if not pages:
            pagenums = set()
        else:
            pagenums = set(pages)
    
        output = BytesIO()
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, converter)
    
        infile = file(fname, 'rb')
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)
        infile.close()
        converter.close()
        text = output.getvalue()
        output.close
        return text
    
    banana = convert("A1.pdf")
    print(banana)
    

    The same thing happens with this variant:

    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from io import BytesIO
    
    def convert_pdf_to_txt(path):
        rsrcmgr = PDFResourceManager()
        retstr = BytesIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        fp = file(path, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos=set()
    
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
            interpreter.process_page(page)
    
        text = retstr.getvalue()
    
        fp.close()
        device.close()
        retstr.close()
        return text
    
    Banana = convert_pdf_to_txt("A1.pdf")
    print(Banana)
    

    I have tried searching for this (most of the pdfminer code is from this or this) but having no luck.

    Any insight is appreciated.

    Cheers