# Python: read a PDF, convert it to text, and extract its table of contents (python读取pdf转文字和提取目录代码)
import importlib
import os.path
import sys
import time
from io import StringIO

import pyocr

# Wall-clock start of the run; read again at the end of the script.
time1 = time.time()
# print("初始时间为:",time1)

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser
from PyPDF2 import PdfFileReader as pdf_read
# Accumulates one bookmark title per line; bookmark_listhandler() appends to it.
directory_str = ''


def bookmark_listhandler(list):
    """Append every bookmark title in a nested outline to ``directory_str``.

    Args:
        list: A possibly nested sequence. ``dict`` items must carry a
            ``'/Title'`` entry; any non-dict item is treated as a nested
            sequence of the same shape. (The parameter name shadows the
            builtin; it is kept so existing callers are unaffected.)

    Side effects:
        Extends the module-level ``directory_str`` with the title followed
        by a newline for each dict item, in depth-first order. Returns
        nothing.
    """
    global directory_str
    for message in list:
        if isinstance(message, dict):
            directory_str += message['/Title'] + '\n'
        else:
            # Non-dict entries are nested sub-outlines: recurse into them.
            bookmark_listhandler(message)
# text_path = r'photo-words.pdf'
def file_name(file_dir):
    """Return the names of the files located directly inside ``file_dir``.

    Each name is also printed, one per line.

    Args:
        file_dir: Directory to list.

    Returns:
        A list of bare file names (no directory part). Empty when the
        directory does not exist or cannot be walked.
    """
    # os.walk yields the top directory first. Returning on that first step
    # keeps the result limited to files that really live in file_dir, which
    # is what callers need when they join a name back onto file_dir.
    # Previously the list was overwritten on every step, so the files of the
    # last sub-directory walked were returned instead.
    for _root, _dirs, files in os.walk(file_dir):
        for file in files:
            print(file)
        return files
    return []
def _par_toc(doc):
"""With an open PDFDocument object, get the table of contents (toc) data
[this is a higher-order function to be pasd to with_pdf()]"""
toc = []
try:
outlines = _outlines()
for (level,title,dest,a,) in outlines:掉线英文
print(level, title)
toc.append((level, title))
except PDFNoOutlines:
pass
#print(toc)
return toc
def par(pathtxt, text_path):
    """Extract the text of a PDF and append it to a UTF-8 text file.

    Args:
        pathtxt: Path of the output text file. It is opened in append mode,
            so text is added after any existing content.
        text_path: Path of the PDF file to read.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable (``doc.is_extractable`` is false).
    """
    print(text_path)
    # The context manager closes the PDF even if parsing fails part-way.
    with open(text_path, 'rb') as fp:
        # Build a parser on the file object and bind it to a document.
        parr = PDFParser(fp)
        doc = PDFDocument(parr)
        parr.set_document(doc)

        # Refuse documents that forbid text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager shared by the layout device and the interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # Layout object holding everything parsed from this page.
            layout = device.get_result()
            # Only horizontal text boxes are exported; each one is followed
            # by an extra newline.
            chunks = [
                x.get_text() + "\n"
                for x in layout
                if isinstance(x, LTTextBoxHorizontal)
            ]
            # Open the output once per page instead of once per text box,
            # and only when there is text, so a page with none leaves the
            # output file untouched.
            if chunks:
                with open(pathtxt, 'a', encoding='utf-8') as f:
                    f.writelines(chunks)
if __name__ == '__main__':
    # Directory holding the PDFs to process.
    path = 'C:\\Users\\chenqi\\Desktop\\test\\QA-CivilAviationKG-master\\raw'
    files = file_name(path)

    # Pass 1: dump the text of every file to "<stem>text.txt" in the
    # current working directory.
    for file in files:
        # splitext keeps dots inside the name, so "a.v1.pdf" and "a.v2.pdf"
        # no longer collapse onto the same output file.
        stem = os.path.splitext(file)[0]
        print(stem + ".pdf")
        pathtxt = stem + 'text' + '.txt'
        print(pathtxt)
        par(pathtxt, os.path.join(path, file))

    # Pass 2: write the bookmark titles of every file to "<stem>title.txt".
    for i, file in enumerate(files):
        print(i, file)
        # bookmark_listhandler() appends to this module-level accumulator;
        # reset it so one file's titles do not leak into the next output.
        directory_str = ''
        with open(os.path.join(path, file), 'rb') as f:
            pdf = pdf_read(f)
            # The document outline is returned as a nested list.
            text_outline_list = pdf.getOutlines()
            bookmark_listhandler(text_outline_list)
        stem = os.path.splitext(file)[0]
        with open(stem + 'title' + '.txt', 'w', encoding='utf-8') as f:
            f.write(directory_str)

    time2 = time.time()
    print("总共消耗时间为:", time2 - time1)
# 效果 (result)