使用KNN及TF進行中文PDF搜索,類似于AutoGPT或ChatPDF的實現原理!
直接上代碼,結合該功能與GPT進行搞基:
PDF文本搜索
import os
import re
import shutil
import urllib.request
from pathlib import Path
from tempfile import NamedTemporaryFile

import fitz
import numpy as np
import openai
import tensorflow_hub as hub
from sklearn.neighbors import NearestNeighbors


def preprocess(text):
    """Collapse every run of whitespace (newlines included) in ``text`` to one space.

    Used on the raw text of each PDF page before it is split into chunks.
    """
    # A raw string avoids the invalid-escape warning that '\s+' triggers;
    # \s already matches '\n', so no separate newline replacement is needed.
    return re.sub(r'\s+', ' ', text)


def pdf_to_text(path, start_page=1, end_page=None):
    """Extract the preprocessed text of each page of a PDF.

    Args:
        path: Path of the PDF, passed to ``fitz.open``.
        start_page: 1-based number of the first page to read.
        end_page: 1-based number of the last page to read (inclusive);
            ``None`` means the document's last page.

    Returns:
        A list with one preprocessed string per page read.
    """
    # The context manager closes the document even if a page fails to load.
    with fitz.open(path) as doc:
        if end_page is None:
            end_page = doc.page_count
        return [
            preprocess(doc.load_page(page_index).get_text("text"))
            for page_index in range(start_page - 1, end_page)
        ]


def text_to_chunks(texts, word_length=150, start_page=1):
    """Split page texts into chunks of at most ``word_length`` tokens.

    Tokens are produced by splitting each page on single spaces. A short
    trailing remainder of a page is carried over to the front of the next
    page instead of being emitted on its own, except on the last page.

    Args:
        texts: Page strings, in page order.
        word_length: Maximum number of tokens per chunk.
        start_page: Page number assigned to ``texts[0]`` in the chunk labels.

    Returns:
        A list of strings of the form ``[Page no. N] "chunk text"``.
    """
    # NOTE(review): splitting on ' ' presumably yields very few tokens for
    # Chinese text, which has no spaces between words - confirm chunk sizes.
    text_toks = [t.split(' ') for t in texts]
    chunks = []
    for idx, words in enumerate(text_toks):
        is_last_page = len(text_toks) == idx + 1
        for i in range(0, len(words), word_length):
            chunk = words[i : i + word_length]
            if len(chunk) < word_length and not is_last_page:
                # Short remainder: prepend it to the next page's tokens. The
                # outer loop sees the updated entry when it reaches idx + 1.
                text_toks[idx + 1] = chunk + text_toks[idx + 1]
                continue
            chunk = ' '.join(chunk).strip()
            chunks.append(f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"')
    return chunks


class SemanticSearch:
    """Nearest-neighbour search over sentence embeddings of text chunks."""

    def __init__(self):
        # Chinese-language model per the original comment; the local path is
        # masked in the article. See https://www.intumu.com/article/203
        self.use = hub.load("F:/*******")
        self.fitted = False

    def fit(self, data, batch=100, n_neighbors=3):
        """Embed ``data`` and build the nearest-neighbour index.

        Args:
            data: Sequence of text chunks to index.
            batch: Number of chunks embedded per model call.
            n_neighbors: Number of neighbours returned per query; capped at
                the number of embedded chunks.

        Raises:
            ValueError: If ``data`` is empty.
        """
        if len(data) == 0:
            # Without this, np.vstack fails with an unrelated-looking message.
            raise ValueError('Cannot fit SemanticSearch on empty data.')
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the indexed chunks (or their indices) nearest to ``text``.

        Args:
            text: Query string.
            return_data: If true, return the chunk strings; otherwise return
                the neighbour indices from ``kneighbors``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if not self.fitted:
            raise RuntimeError('SemanticSearch must be fitted before it is queried.')
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
        if return_data:
            return [self.data[i] for i in neighbors]
        return neighbors

    def get_text_embedding(self, texts, batch=1000):
        """Embed ``texts`` in slices of ``batch`` and stack the results row-wise."""
        embeddings = [
            self.use(texts[i : (i + batch)])
            for i in range(0, len(texts), batch)
        ]
        return np.vstack(embeddings)


def load_recommender(path, start_page=1):
    """Read the PDF at ``path``, chunk it, and fit the module-level ``recommender``.

    Returns:
        The status string ``'Corpus Loaded.'``.
    """
    texts = pdf_to_text(path, start_page=start_page)
    chunks = text_to_chunks(texts, start_page=start_page)
    recommender.fit(chunks)
    return 'Corpus Loaded.'


# Build the corpus index.
# NOTE(review): the '?' in the file name looks like an extraction artifact
# (possibly a non-breaking space) - confirm the real file name.
pdf_path = '第3章 ?巖土工程勘察.pdf'
recommender = SemanticSearch()
load_recommender(pdf_path)  # fit() builds the corpus; see https://www.intumu.com/article/203

# NOTE(review): the query text appears to contain extraction residue
# (romanisation in parentheses) - confirm the intended question.
question = '鉆孔深度相關(guān)規(guī)定?'
topn_chunks = recommender(question)
print(topn_chunks)
GPT查詢代碼
def generate_answer(question, openAI_key):
    """Answer ``question`` from the indexed PDF using a completion model.

    Looks up the nearest chunks with the module-level ``recommender``, builds
    a prompt from them plus fixed instructions, and passes it to
    ``generate_text``.

    Args:
        question: The user's query string.
        openAI_key: API key forwarded to ``generate_text``.

    Returns:
        Whatever ``generate_text`` returns for the assembled prompt.
    """
    topn_chunks = recommender(question)
    search_results = ''.join(c + '\n\n' for c in topn_chunks)
    # The instructions are a plain string, not an f-string, so they must not
    # contain a "{question}" placeholder: it would be sent to the model
    # verbatim. The real query is appended once, below.
    instructions = (
        "Instructions: Compose a comprehensive reply to the query using the search results given. "
        "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "
        "Citation should be done at the end of each sentence. If the search results mention multiple subjects "
        "with the same name, create separate answers for each. Only include information found in the results and "
        "don't add any additional information. Make sure the answer is correct and don't output false content. "
        "If the text does not relate to the query, simply state 'Text Not Found in PDF'. Ignore outlier "
        "search results which has nothing to do with the question. Only answer what is asked. The "
        "answer should be short and concise. Answer step-by-step. \n\n"
    )
    prompt = (
        'search results:\n\n'
        + search_results
        + instructions
        + f"Query: {question}\nAnswer:"
    )
    return generate_text(openAI_key, prompt, "text-davinci-003")
def generate_text(openAI_key, prompt, engine="text-davinci-003"):
    """Request a single completion for ``prompt`` and return its text.

    Sets the module-level ``openai.api_key`` to ``openAI_key`` before the call.

    Args:
        openAI_key: API key assigned to ``openai.api_key``.
        prompt: Full prompt text sent to the completion endpoint.
        engine: Engine name passed to ``openai.Completion.create``.

    Returns:
        The ``text`` attribute of the first choice in the response.
    """
    # NOTE(review): openai.Completion is presumably the pre-1.0 client
    # interface - confirm the installed openai version still provides it.
    openai.api_key = openAI_key
    request = dict(
        engine=engine,
        prompt=prompt,
        max_tokens=512,
        n=1,
        stop=None,
        temperature=0.7,
    )
    response = openai.Completion.create(**request)
    first_choice = response.choices[0]
    return first_choice.text
import os

# Prefer the OPENAI_API_KEY environment variable so the secret does not have
# to live in source; the masked literal from the article is only a fallback
# placeholder. Source: https://www.intumu.com/article/203
openAI_key = os.environ.get('OPENAI_API_KEY', 'sk-zo59kJ9gV7yx8xgsn8jrT3BlbkFJT******')

# Print the answer so it is visible when run as a script, matching the
# print of the retrieved chunks earlier in the article.
answer = generate_answer(question, openAI_key)
print(answer)
結語
以上類似于AutoGPT或chatPDF的實現原理,感興趣的讀者可以試試。
civilpy:Python數據分析及可視化實例目錄 913 贊同 · 36 評論文章
