繁花如伊
# For Python 3: pip install pdfminer.six
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO


def convert_pdf_to_txt(path, codec='utf-8'):
    """Extract the text of every page of a PDF file and return it as a string.

    Args:
        path: Filesystem path of the PDF; it is opened in binary mode.
        codec: Passed through unchanged to ``TextConverter`` as ``codec``.

    Returns:
        The text accumulated by the converter for all processed pages.

    The input file, the converter device and the in-memory buffer are
    released even when opening or parsing the PDF raises.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    password = ""
    maxpages = 0      # 0: no upper bound on the number of pages (pdfminer convention)
    caching = True
    pagenos = set()   # empty set: no page filter, every page is visited

    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password, caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
            # Read the buffer before the device is closed, as the original did.
            return retstr.getvalue()
        finally:
            device.close()
慕田峪4524236
# For Python 3 there is also another option: pip install pdfminer3k
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
import time
from functools import wraps


def fn_timer(function):
    """Decorator that prints the wall-clock run time of each call to *function*.

    The wrapped function's return value is passed through unchanged. If the
    call raises, the exception propagates and nothing is printed.
    """
    @wraps(function)
    def function_timer(*args, **kwargs):
        # perf_counter is monotonic, so the difference cannot go negative
        # when the system clock is adjusted mid-call.
        t0 = time.perf_counter()
        result = function(*args, **kwargs)
        t1 = time.perf_counter()
        # Report the decorated function's real name instead of a fixed label.
        print("Total time running %s: %s seconds" % (function.__name__, str(t1 - t0)))
        return result
    return function_timer


@fn_timer
def convert_pdf(path, pages):
    """Extract text from selected pages of a PDF using pdfminer3k.

    Args:
        path: Filesystem path of the PDF; it is opened in binary mode.
        pages: Forwarded to ``process_pdf`` as its fourth positional
            argument (the page-number selection).

    Returns:
        The text written by the converter into the in-memory buffer.

    The input file, the converter device and the buffer are released even
    when parsing raises.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pages)
        finally:
            device.close()
        # Read the buffer after the device is closed, as the original did.
        return retstr.getvalue()


file = r'M:\a.pdf'
print(convert_pdf(file, [1, ]))