QQ:231469242 原创
单个PDF内容提取
# -*- coding: utf-8 -*-
"""
Extract drug metadata fields from the text of a single PDF.

Note kept from the original author about file opening:
io.open() is the preferred, higher-level interface to file I/O. It wraps the
OS-level file descriptor in an object that you can use to access the file in
a Pythonic manner. os.open() is just a wrapper for the lower-level POSIX
syscall. It takes less symbolic (and more POSIX-y) arguments, and returns the
file descriptor (a number) that represents the opened file. It does not
return a file object; the returned value will not have read() or write()
methods.
"""
import ast
import re
from io import StringIO
from io import open

# PDF parsing needs pdfminer:  pip3 install pdfminer3k
# It is imported inside readPDF() so that the pure text helpers below can be
# imported (and tested) on machines where pdfminer is not installed.

# PDF file name
pdfFilename = "avelumab.pdf"
# Prefix prepended to the PDF file name by Get_me_down()
frontName = "usan/2016/"
# Trademark keyword file
trademark_filename = "trademarks.txt"
# Sponsor keyword file
sponsor_filename = "sponsor.txt"

# Patterns are compiled once here instead of on every call.
_RE_FORMULA = re.compile(r'C(\d)+H(\d)+')
_RE_CAS = re.compile(r'(\d)+-(\d)+-(\d)+')
_RE_WHO = re.compile(r'(\d)+')
# {10} means exactly ten repetitions
_RE_UNII = re.compile(r'[A-Za-z0-9]{10}')


def _literal_type(text):
    """Return the type of the Python literal written in *text*, or None.

    ast.literal_eval replaces the original eval(): the text comes from a PDF
    and must never be executed as code.
    """
    try:
        return type(ast.literal_eval(text.strip()))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None


def _value_after(lines, marker, accept):
    """Return the line following the first *marker* line that *accept* approves.

    The last line is never treated as a marker because nothing follows it
    (the original indexed past the end of the list in that case). Returns ""
    when no candidate is accepted.
    """
    for index, line in enumerate(lines[:-1]):
        candidate = lines[index + 1]
        if marker in line and accept(candidate):
            return candidate
    return ""


def _first_containing(lines, needle):
    """Return the first line that contains *needle*, or "" if none does."""
    return next((line for line in lines if needle in line), "")


def _first_line_with_keyword(lines, keywords):
    """Return the first space-stripped line containing any keyword, else ""."""
    for line in lines:
        stripped = line.strip(" ")
        # Empty keywords are ignored: "" is a substring of every line.
        if any(keyword and keyword in stripped for keyword in keywords):
            return stripped
    return ""


# Read the PDF text
def readPDF(pdfFile):
    """Return the text of the binary PDF file object *pdfFile* as one string."""
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Close the converter before reading the buffer, as the original did.
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()


# Normalise the PDF text
def Format(str1):
    """Split *str1* into lines and drop blank (empty or whitespace-only) ones."""
    return [line for line in str1.split("\n") if line.strip()]


# Extract me_usan, the drug name
def Get_me_usan(the_list_data):
    """Return the first line of the data, or "" when there is none."""
    return the_list_data[0] if the_list_data else ""


# Extract me_therapeutic
def Get_me_therapeutic(the_list_data):
    """Return the first line containing "Treatment of", or ""."""
    return _first_containing(the_list_data, "Treatment of")


# Extract me_chemical1 (chemical name 1)
def Get_me_chemical1(the_list_data):
    """Return the first line containing "1. ", or ""."""
    return _first_containing(the_list_data, "1. ")


# Extract me_chemical2 (chemical name 2)
def Get_me_chemical2(the_list_data):
    """Return the first line containing "2. ", or ""."""
    # NOTE(review): the pasted source broke this string literal across two
    # lines; "2. " is the reconstruction that mirrors Get_me_chemical1.
    return _first_containing(the_list_data, "2. ")


# Match a molecular formula
def Re_formula(str1):
    """Return True if *str1* contains C<digits>H<digits>."""
    return _RE_FORMULA.search(str1) is not None


# Extract me_mo_formula; the signature is the carbon/hydrogen "C..H.." pair
def Get_me_mo_formula(the_list_data):
    """Return the first line (upper-cased) that looks like a formula, or ""."""
    for line in the_list_data:
        upper = line.upper()
        if Re_formula(upper):
            return upper
    return ""


# Extract me_mo_weight: the line after "MOLECULAR WEIGHT" when it is a number
def Get_me_mo_weight(the_list_data):
    """Return the line after a 'MOLECULAR WEIGHT' line if it holds a weight.

    A weight is an int/float literal or, as in the batch version of this
    script, a value expressed in kDa. Returns "" otherwise.
    """
    def is_weight(value):
        return "kDa" in value or _literal_type(value) in (int, float)

    return _value_after(the_list_data, 'MOLECULAR WEIGHT', is_weight)


# Load a keyword list (trademarks.txt / sponsor.txt)
def Get_txt_contents(filename):
    """Return the non-blank lines of *filename* with newlines removed.

    Blank lines are skipped because an empty keyword would match every line
    during the substring search.
    """
    with open(filename) as handle:
        return [line.replace("\n", "") for line in handle if line.strip()]


# Extract me_trademark using the keywords from trademarks.txt
def Get_me_trademark(the_list_data, trademarks=None):
    """Return the first line containing a trademark keyword, or "".

    *trademarks* defaults to the module-level ``list_trademarks``.
    """
    if trademarks is None:
        trademarks = list_trademarks
    return _first_line_with_keyword(the_list_data, trademarks)


# Extract me_sponsor using the keywords from sponsor.txt
def Get_me_sponsor(the_list_data, sponsors=None):
    """Return the first line containing a sponsor keyword, or "".

    *sponsors* defaults to the module-level ``list_sponsors``.
    """
    if sponsors is None:
        sponsors = list_sponsors
    return _first_line_with_keyword(the_list_data, sponsors)


# Match a CAS number
def Re_CAS(str1):
    """Return True if *str1* contains digits-digits-digits."""
    return _RE_CAS.search(str1) is not None


# Extract CAS
def Get_CAS(the_list_data):
    """Return the first line containing a CAS-like number, or ""."""
    return next((line for line in the_list_data if Re_CAS(line)), "")


# Match a WHO number
def Re_WHO(str1):
    """Return True if *str1* contains at least one digit."""
    return _RE_WHO.search(str1) is not None


# Extract WHO
def Get_WHO(the_list_data):
    """Return the line after a 'WHO NUMBER' line if it is an integer, else ""."""
    return _value_after(
        the_list_data, 'WHO NUMBER',
        lambda value: _literal_type(value) is int,
    )


# Match a UNII code
def Re_UNII(str1):
    """Return True if *str1* contains ten consecutive letters or digits."""
    return _RE_UNII.search(str1) is not None


# Extract UNII
def Get_UNII(the_list_data):
    """Return the line after a 'UNII' line if it passes Re_UNII, else ""."""
    return _value_after(the_list_data, 'UNII', Re_UNII)


# Build the me_down value
def Get_me_down(the_list_data):
    """Return frontName + pdfFilename; *the_list_data* is not used."""
    return frontName + pdfFilename


if __name__ == "__main__":
    # The names below stay module-level globals, exactly as in the original
    # flat script, so they can still be inspected after a run.
    with open(pdfFilename, 'rb') as pdfFile:
        outputString = readPDF(pdfFile)
    list_data = Format(outputString)

    me_source = 2016
    # Drug name
    me_usan = Get_me_usan(list_data)
    # Therapeutic indication
    me_therapeutic = Get_me_therapeutic(list_data)
    # Chemical name 1
    me_chemical1 = Get_me_chemical1(list_data)
    # Chemical name 2
    me_chemical2 = Get_me_chemical2(list_data)
    # Molecular formula
    me_mo_formula = Get_me_mo_formula(list_data)
    # Trademark keyword list, then the trademark itself
    list_trademarks = Get_txt_contents(trademark_filename)
    me_trademark = Get_me_trademark(list_data)
    # Sponsor keyword list, then the sponsor (a new company will not be found)
    list_sponsors = Get_txt_contents(sponsor_filename)
    me_sponsor = Get_me_sponsor(list_data)
    # CAS, WHO and UNII identifiers
    me_CAS = Get_CAS(list_data)
    me_WHO = Get_WHO(list_data)
    me_UNII = Get_UNII(list_data)
    # Download path
    me_down = Get_me_down(list_data)
    # me_bianma defaults to empty
    me_bianma = ""
    # me_ylbm defaults to empty
    me_ylbm = ""
    # Molecular-weight extraction is disabled in this script; left empty.
    me_mo_weight = ""
多个PDF内容提取
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 27 11:37:54 2016

Batch-extract drug metadata from every PDF in the current directory and write
the rows to output.csv.
"""
import ast
import csv
import os
import re
import sys
from io import StringIO
from io import open

import pandas

# PDF parsing needs pdfminer:  pip3 install pdfminer3k
# It is imported inside readPDF() so that the pure text helpers below can be
# imported (and tested) on machines where pdfminer is not installed.

# File names in the current directory
list_filenames = os.listdir()
# Only names that END in .pdf are processed (the original substring test also
# accepted names such as "x.pdf.txt"); sorted so me_uid is reproducible.
list_pdfFilename = sorted(i for i in list_filenames if i.lower().endswith(".pdf"))

# Settings
# Prefix prepended to the PDF file name by Get_me_down()
frontName = "usan/2016/"
# me_source field
me_source = 2016
# me_bianma defaults to empty
me_bianma = ""
# me_ylbm defaults to empty
me_ylbm = ""
me_code = ""
me_en = ""
# Therapeutic keyword file
therapeutic_filename = "therapeutic.txt"
# Trademark keyword file
trademark_filename = "trademarks.txt"
# Sponsor keyword file
sponsor_filename = "sponsor.txt"

# Header row of the CSV output
list_firstRow = ["me_uid", "me_source", "me_usan", "me_therapeutic", "me_chemical1",
                 "me_chemical2", "me_mo_formula", "me_mo_weight", "me_trademark",
                 "me_sponsor", "me_codename", "me_cas", "me_who", "me_unii",
                 "me_en", "me_down", "me_bianma", "me_ylbm"]

# Patterns are compiled once here instead of on every call.
_RE_BLANK = re.compile(r'\s')
_RE_FORMULA = re.compile(r'C(\d)+H(\d)+')
_RE_CAS = re.compile(r'(\d)+-(\d)+-(\d)+')
_RE_WHO = re.compile(r'(\d)+')
# {10} means exactly ten repetitions
_RE_UNII = re.compile(r'[A-Za-z0-9]{10}')


def _literal_type(text):
    """Return the type of the Python literal written in *text*, or None.

    ast.literal_eval replaces the original eval(): the text comes from a PDF
    and must never be executed as code.
    """
    try:
        return type(ast.literal_eval(text.strip()))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None


def _value_after(lines, marker, accept):
    """Return the line following the first *marker* line that *accept* approves.

    The last line is never treated as a marker because nothing follows it
    (the original indexed past the end of the list in that case). Returns ""
    when no candidate is accepted.
    """
    for index, line in enumerate(lines[:-1]):
        candidate = lines[index + 1]
        if marker in line and accept(candidate):
            return candidate
    return ""


def _first_containing(lines, needle):
    """Return the first line that contains *needle*, or "" if none does."""
    return next((line for line in lines if needle in line), "")


def _first_line_with_keyword(lines, keywords):
    """Return the first space-stripped line containing any keyword, else "".

    Matching is case-insensitive; the line is returned in its original case.
    """
    lowered_keywords = [keyword.lower() for keyword in keywords if keyword]
    for line in lines:
        stripped = line.strip(" ")
        lowered = stripped.lower()
        if any(keyword in lowered for keyword in lowered_keywords):
            return stripped
    return ""


# Read the PDF text
def readPDF(pdfFile):
    """Return the text of the binary PDF file object *pdfFile* as one string."""
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Close the converter before reading the buffer, as the original did.
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()


# Normalise the PDF text
def Format(str1):
    """Split *str1* into lines and keep those containing a whitespace character.

    NOTE(review): this keeps whitespace-only lines and drops single-token
    lines that have no whitespace at all. That is the original behaviour and
    is preserved here; confirm against real pdfminer output whether "drop
    blank lines" was the actual intent.
    """
    return [line for line in str1.split("\n") if _RE_BLANK.search(line)]


# Extract me_usan, the drug name (not 100% accurate: taken from the file name)
def Get_me_usan(pdfFilename):
    """Return *pdfFilename* without its .pdf extension."""
    root, extension = os.path.splitext(pdfFilename)
    if extension.lower() == ".pdf":
        return root
    return pdfFilename.replace(".pdf", "")


# Extract me_therapeutic
def Get_me_therapeutic(the_list_data, therapeutics=None):
    """Return the first line (lower-cased) containing a therapeutic keyword.

    *therapeutics* defaults to the module-level ``list_therapeutic``.
    Returns "" when nothing matches.
    """
    if therapeutics is None:
        therapeutics = list_therapeutic
    lowered_keywords = [keyword.lower() for keyword in therapeutics if keyword]
    for line in the_list_data:
        lowered = line.lower()
        if any(keyword in lowered for keyword in lowered_keywords):
            return lowered
    return ""


# Extract me_chemical1 (chemical name 1)
def Get_me_chemical1(the_list_data):
    """Return the first line containing "1. ", or ""."""
    return _first_containing(the_list_data, "1. ")


# Extract me_chemical2 (chemical name 2)
def Get_me_chemical2(the_list_data):
    """Return the first line containing "2. ", or ""."""
    # NOTE(review): the pasted source broke this string literal across two
    # lines; "2. " is the reconstruction that mirrors Get_me_chemical1.
    return _first_containing(the_list_data, "2. ")


# Match a molecular formula
def Re_formula(str1):
    """Return True if *str1* contains C<digits>H<digits>."""
    return _RE_FORMULA.search(str1) is not None


# Extract me_mo_formula; the signature is the carbon/hydrogen "C..H.." pair
def Get_me_mo_formula(the_list_data):
    """Return the first line (upper-cased) that looks like a formula, or ""."""
    for line in the_list_data:
        upper = line.upper()
        if Re_formula(upper):
            return upper
    return ""


# Extract me_mo_weight: the line after "MOLECULAR WEIGHT" when it is a weight
def Get_me_mo_weight(the_list_data):
    """Return the line after a 'MOLECULAR WEIGHT' line if it holds a weight.

    A weight is a value expressed in kDa or an int/float literal. A candidate
    that is neither no longer aborts the search; later markers are still tried.
    """
    def is_weight(value):
        return "kDa" in value or _literal_type(value) in (int, float)

    return _value_after(the_list_data, 'MOLECULAR WEIGHT', is_weight)


# Load a keyword list (therapeutic.txt / trademarks.txt / sponsor.txt)
def Get_txt_contents(filename):
    """Return the non-blank lines of *filename*, lower-cased, newlines removed.

    Blank lines are skipped because an empty keyword would match every line
    during the substring search.
    """
    with open(filename) as handle:
        return [line.replace("\n", "").lower() for line in handle if line.strip()]


# Extract me_trademark using the keywords from trademarks.txt
def Get_me_trademark(the_list_data, trademarks=None):
    """Return the first line containing a trademark keyword, or "".

    The keyword list is stored lower-cased, so the comparison lower-cases the
    line too; previously mixed-case PDF text could never match.
    *trademarks* defaults to the module-level ``list_trademarks``.
    """
    if trademarks is None:
        trademarks = list_trademarks
    return _first_line_with_keyword(the_list_data, trademarks)


# Extract me_sponsor using the keywords from sponsor.txt
def Get_me_sponsor(the_list_data, sponsors=None):
    """Return the first line containing a sponsor keyword, or "".

    Case-insensitive for the same reason as Get_me_trademark.
    *sponsors* defaults to the module-level ``list_sponsors``.
    """
    if sponsors is None:
        sponsors = list_sponsors
    return _first_line_with_keyword(the_list_data, sponsors)


# Match a CAS number
def Re_CAS(str1):
    """Return True if *str1* contains digits-digits-digits."""
    return _RE_CAS.search(str1) is not None


# Extract CAS
def Get_CAS(the_list_data):
    """Return the first line containing a CAS-like number, or ""."""
    return next((line for line in the_list_data if Re_CAS(line)), "")


# Match a WHO number
def Re_WHO(str1):
    """Return True if *str1* contains at least one digit."""
    return _RE_WHO.search(str1) is not None


# Extract WHO
def Get_WHO(the_list_data):
    """Return the line after a 'WHO NUMBER' line if it is an integer, else ""."""
    return _value_after(
        the_list_data, 'WHO NUMBER',
        lambda value: _literal_type(value) is int,
    )


# Match a UNII code
def Re_UNII(str1):
    """Return True if *str1* contains ten consecutive letters or digits."""
    return _RE_UNII.search(str1) is not None


# Extract UNII
def Get_UNII(the_list_data):
    """Return the line after a 'UNII' line if it passes Re_UNII, else ""."""
    return _value_after(the_list_data, 'UNII', Re_UNII)


# Build the me_down value
def Get_me_down(pdfFilename):
    """Return frontName + *pdfFilename*."""
    return frontName + pdfFilename


# Collect drug name, codes, WHO, UNII, formula, ... for one PDF
def Get_one_pdf_content(pdfFilename, count):
    """Return one output row (ordered like list_firstRow) for *pdfFilename*.

    *count* is stored as me_uid. The PDF is closed as soon as its text has
    been read.
    """
    with open(pdfFilename, 'rb') as pdfFile:
        outputString = readPDF(pdfFile)
    list_data = Format(outputString)
    return [
        count,                               # me_uid
        me_source,
        Get_me_usan(pdfFilename),
        Get_me_therapeutic(list_data),
        Get_me_chemical1(list_data),
        Get_me_chemical2(list_data),
        Get_me_mo_formula(list_data),
        Get_me_mo_weight(list_data),
        Get_me_trademark(list_data),
        Get_me_sponsor(list_data),           # a new company will not be found
        me_code,
        Get_CAS(list_data),
        Get_WHO(list_data),
        Get_UNII(list_data),
        me_en,
        Get_me_down(pdfFilename),
        me_bianma,
        me_ylbm,
    ]


# Collect the rows for all PDFs
def Get_all_pdf_content(list_pdfFilename):
    """Return the header row followed by one row per PDF file name.

    A PDF that cannot be processed still yields an empty placeholder row
    (as before), but the failure is now reported on stderr instead of being
    swallowed silently. The result is built locally, so repeated calls do not
    accumulate rows in a shared global list.
    """
    rows = [list_firstRow]
    for count, filename in enumerate(list_pdfFilename):
        try:
            row = Get_one_pdf_content(filename, count)
        except Exception as error:
            print("Skipping %s: %s" % (filename, error), file=sys.stderr)
            row = ""
        rows.append(row)
    return rows


if __name__ == "__main__":
    # Therapeutic, trademark and sponsor keyword lists
    list_therapeutic = Get_txt_contents(therapeutic_filename)
    list_trademarks = Get_txt_contents(trademark_filename)
    list_sponsors = Get_txt_contents(sponsor_filename)

    # Rows for every PDF, header first
    list_all_pdfContent = Get_all_pdf_content(list_pdfFilename)

    with open("output.csv", 'w', newline='') as csvObj:
        csvWriter = csv.writer(csvObj)
        for rowData in list_all_pdfContent:
            csvWriter.writerow(rowData)
需要数据库文件(放在当前工作目录,每行一个关键词):therapeutic.txt、trademarks.txt、sponsor.txt
自动输出结果:提取的数据写入当前目录下的 output.csv