有只小跳蛙
由于这些解决方案都不支持最新版本的PDFMiner,所以我编写了一个简单的解决方案,它将使用PDFMiner返回pdf文本。这将适用于那些获得导入错误的process_pdfimport sysfrom pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreterfrom pdfminer.pdfpage import PDFPagefrom pdfminer.converter
import XMLConverter, HTMLConverter, TextConverterfrom pdfminer.layout import LAParamsfrom cStringIO import StringIOdef pdfparser(data):
fp = file(data, 'rb')
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Process each page contained in the document.
for page in PDFPage.get_pages(fp):
interpreter.process_page(page)
data = retstr.getvalue()
print dataif __name__ == '__main__':
pdfparser(sys.argv[1])参见下面为Python 3工作的代码:import sysfrom pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreterfrom pdfminer.pdfpage import PDFPagefrom pdfminer.converter
import XMLConverter, HTMLConverter, TextConverterfrom pdfminer.layout import LAParamsimport iodef pdfparser(data):
fp = open(data, 'rb')
rsrcmgr = PDFResourceManager()
retstr = io.StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Process each page contained in the document.
for page in PDFPage.get_pages(fp):
interpreter.process_page(page)
data = retstr.getvalue()
print(data)if __name__ == '__main__':
pdfparser(sys.argv[1])