对于文字版PDF,可使用fitz(PyMuPDF)轻松提取文档中的纯文本内容,再交给大模型进行问答(简化版RAG)。
示例Python代码如下:
# -*- coding: utf-8 -*-
import os
import openai
import fitz
# Configure the proxy: route HTTP(S) traffic through the local VPN on port 7890.
os.environ["http_proxy"] = "http://127.0.0.1:7890"
os.environ["https_proxy"] = "http://127.0.0.1:7890"
# Prefer the OPENAI_API_KEY environment variable so the secret does not have
# to be committed in the source; fall back to the original placeholder value
# when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "api key")
def get_pdf_content(pdf_path: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The text of all pages joined together with no separator.
    """
    # Use the document as a context manager so it is closed (and its file
    # handle released) even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # Read the text of every page and merge it into a single string.
        # The join runs inside the ``with`` block, while the document is open.
        return ''.join(
            doc.load_page(page_index).get_text()
            for page_index in range(doc.page_count)
        )
def get_answer(
    pdf_content: str,
    query: str,
    *,
    model: str = "gpt-3.5-turbo",
    max_tokens: int = 1000,
) -> str:
    """Ask a chat model a question about the text of a PDF.

    The whole PDF text is sent as one user message, followed by the question
    as a second user message.

    Args:
        pdf_content: Full text of the PDF to answer from.
        query: The question to ask.
        model: Chat model name passed to ``openai.ChatCompletion.create``.
        max_tokens: Value passed as ``max_tokens`` to the completion call.

    Returns:
        The message content of the first choice in the response.
    """
    # NOTE(review): pdf_content is sent untruncated; a very long document may
    # exceed the model's context window -- confirm for the PDFs in use.
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"The full text of PDF file is: {pdf_content}"},
        {"role": "user", "content": query},
    ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
    )
    return response['choices'][0]['message']['content']
if __name__ == '__main__':
    # Example usage: point this at your own PDF before running.
    document_text = get_pdf_content("../data/oppo_n3_flip.pdf")
    questions = [
        "OPPO Find N3 Flip的价格?",
        "蚂蚁集团发布的大模型叫什么?",
        "混元大模型是什么时候发布的?"
    ]
    # Print the answer to every question, one after another.
    for query in questions:
        answer = get_answer(pdf_content=document_text, query=query)
        print(f"query:{query},\n RAG answer:{answer}")
结果如下: