Pdfminer python 3.5
Solution 1
There is a solution for Python 3.5: you need pdfminer.six. Under Windows 10 I could easily install it with
pip install pdfminer.six
You can check the installed version with
pdfminer.__version__
I haven't tested it intensively yet, but I could run the following code to convert PDF to text and PDF to HTML.
Solution 2
Improved solution (Dec 2016)
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter,TextConverter,XMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import io
def convert(case, fname, pages=None):
    """Convert a PDF file to plain text or HTML using pdfminer.

    Args:
        case: Output format selector, either ``'text'`` or ``'HTML'``.
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to convert. It is turned
            into a set and handed to ``PDFPage.get_pages``; an empty set is
            passed when ``pages`` is ``None`` or empty (presumably meaning
            "all pages", with zero-based numbering -- confirm against the
            pdfminer documentation).

    Returns:
        A ``str`` for ``'text'`` (collected in an ``io.StringIO``) or
        ``bytes`` for ``'HTML'`` (collected in an ``io.BytesIO``).

    Raises:
        ValueError: If ``case`` is neither ``'text'`` nor ``'HTML'``.
    """
    pagenums = set(pages) if pages else set()

    # Reject unknown formats up front; otherwise no converter would be
    # created and the failure would surface as an unbound-variable error.
    if case not in ('text', 'HTML'):
        raise ValueError(
            "unsupported case %r: expected 'text' or 'HTML'" % (case,)
        )

    manager = PDFResourceManager()
    codec = 'utf-8'

    # Text is gathered as str, HTML as encoded bytes.
    output = io.StringIO() if case == 'text' else io.BytesIO()

    # The context managers guarantee the buffer and the input file are
    # released even if pdfminer raises while processing a page.
    with output:
        if case == 'text':
            converter = TextConverter(
                manager, output, codec=codec, laparams=LAParams()
            )
        else:
            converter = HTMLConverter(
                manager, output, codec=codec, laparams=LAParams()
            )
        interpreter = PDFPageInterpreter(manager, converter)

        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(
                infile, pagenums, caching=True, check_extractable=True
            ):
                interpreter.process_page(page)

        # Close the converter BEFORE reading the buffer: a converter may
        # still write trailing output (e.g. the HTML footer) on close, and
        # reading first would return a truncated document.
        converter.close()
        return output.getvalue()
#//////////// main ///////////////////////
# Example driver: convert the first pages of one PDF and write the result
# next to it, as HTML or plain text depending on `case`.
filePDF = 'myDir//myPDF.pdf'  # input
fileHTML = 'myDir//myHTML.html'  # output
fileTXT = 'myDir//myTXT.txt'  # output

case = "HTML"
if case == 'HTML':
    convertedPDF = convert('HTML', filePDF, pages=[0, 1])
    # The HTML result is bytes, so the file is opened in binary mode.
    # Binary mode must not be given an `encoding` argument (open() raises
    # ValueError if it is); the bytes were already encoded by convert().
    with open(fileHTML, "wb") as fileConverted:
        fileConverted.write(convertedPDF)
elif case == 'text':
    convertedPDF = convert('text', filePDF, pages=[0, 1])
    # The text result is str, so it is encoded while writing.
    with open(fileTXT, "w", encoding="utf-8") as fileConverted:
        fileConverted.write(convertedPDF)
#print(convertedPDF)
Solution 3
In my case on Python 3.7 I tried using it and it worked like a charm for me!
Here is the code I used:
def convert_pdf_to_txt(path_to_file):
    """Extract the text of a PDF file with pdfminer and return it as a string.

    Args:
        path_to_file: Path of the PDF file to read.

    Returns:
        The text collected from all processed pages, as ``str``.
    """
    # Imported locally so the snippet works on its own; the name was used
    # but never imported in the original listing.
    from io import StringIO

    rsrcmgr = PDFResourceManager()
    codec = 'utf-8'
    laparams = LAParams()
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    # The context managers release the buffer and the file handle even if
    # pdfminer raises while processing a page.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path_to_file, 'rb') as fp:
            for page in PDFPage.get_pages(
                fp,
                pagenos,
                maxpages=maxpages,
                password=password,
                caching=caching,
                check_extractable=True,
            ):
                interpreter.process_page(page)
        # Finish the converter before reading the buffer so nothing it
        # writes on close can be missed.
        device.close()
        return retstr.getvalue()
gary
Updated on November 15, 2021Comments
-
gary over 2 years
I have followed a few tutorials around but I am not able to get this code block to run, I did the necessary switches from StringIO to BytesIO (I believe?)
I am unsure why 'banana' is printing nothing; I think the errors might be red herrings. Is it something to do with me following a Python 2.7 tutorial and trying to translate it to Python 3?
errors: File "/Users/foo/PycharmProjects/Try/Pdfminer.py", line 28, in <module> banana = convert("A1.pdf") File "/Users/foo/PycharmProjects/Try/Pdfminer.py", line 19, in convert infile = file(fname, 'rb') NameError: name 'file' is not defined
script
from io import BytesIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage


def convert(fname, pages=None):
    """Run every selected page of the PDF at `fname` through a TextConverter.

    `pages` is an optional iterable of page numbers; it is converted to a
    set (empty when `pages` is None or empty) and passed to
    PDFPage.get_pages. The function returns whatever the BytesIO buffer
    holds after processing.
    """
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)
    output = BytesIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    # NOTE(review): this is the call the traceback reports
    # ("NameError: name 'file' is not defined"); execution stops here, so
    # nothing below runs and nothing is printed.
    infile = file(fname, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    # NOTE(review): `output.close` has no parentheses, so the method is only
    # referenced, not called.
    output.close
    return text


# Script entry: convert a sample file and print the result.
banana = convert("A1.pdf")
print(banana)
The same thing happens with this variant:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO


def convert_pdf_to_txt(path):
    """Run every page of the PDF at `path` through a TextConverter.

    Returns whatever the BytesIO buffer holds after all pages have been
    processed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # NOTE(review): same `file(...)` call as in the first variant; the
    # question states the same failure occurs here.
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)
    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    return text


# Script entry: convert a sample file and print the result.
Banana = convert_pdf_to_txt("A1.pdf")
print(Banana)
I have tried searching for this (most of the pdfminer code is from this or this) but having no luck.
Any insight is appreciated.
Cheers