最美情侣中文字幕电影,在线麻豆精品传媒,在线网站高清黄,久久黄色视频

歡迎光臨散文網 會員登錄 & 注冊

使用KNN及TF進行中文PDF搜索,類似于AutoGPT或ChatPDF實現原理!

2023-05-05 10:26 作者:civilpy  | 我要投稿

直接上代碼,結合該功能與GPT進行搞基:

PDF文本搜索

import os
import re
import shutil
import urllib.request
from pathlib import Path
from tempfile import NamedTemporaryFile

import fitz
import numpy as np
import openai
import tensorflow_hub as hub
from sklearn.neighbors import NearestNeighbors


def preprocess(text):
    """Return *text* with newlines and whitespace runs collapsed to one space."""
    text = text.replace('\n', ' ')
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)
    return text


def pdf_to_text(path, start_page=1, end_page=None):
    """Extract and preprocess the text of each page of a PDF.

    Args:
        path: Location of the PDF, passed straight to ``fitz.open``.
        start_page: First page to read, 1-based.
        end_page: Last page to read (inclusive, 1-based); ``None`` means
            the document's last page.

    Returns:
        A list with one preprocessed string per page, in page order.
    """
    doc = fitz.open(path)
    try:
        if end_page is None:
            end_page = doc.page_count

        text_list = []
        # load_page takes a 0-based index, hence ``start_page - 1``.
        for i in range(start_page - 1, end_page):
            text = doc.load_page(i).get_text("text")
            text_list.append(preprocess(text))
    finally:
        # Release the document even if extraction of some page fails.
        doc.close()
    return text_list


def text_to_chunks(texts, word_length=150, start_page=1):
    """Split per-page texts into chunks of at most *word_length* tokens.

    Tokens are produced by splitting on single spaces. A trailing chunk
    that is shorter than *word_length* is not emitted on its own page
    (unless it is the last page); it is prepended to the next page's
    tokens, so those words end up labelled with the next page's number.

    Args:
        texts: One string per page.
        word_length: Maximum number of tokens per chunk.
        start_page: Page number assigned to ``texts[0]``.

    Returns:
        A list of strings of the form ``[Page no. N] "chunk text"``.
    """
    text_toks = [t.split(' ') for t in texts]
    chunks = []

    for idx, words in enumerate(text_toks):
        has_next_page = idx + 1 < len(text_toks)
        for i in range(0, len(words), word_length):
            chunk = words[i : i + word_length]
            # A short slice can only be the tail of the page; carry it
            # over so chunks stay close to word_length across page breaks.
            if len(chunk) < word_length and has_next_page:
                text_toks[idx + 1] = chunk + text_toks[idx + 1]
                continue
            chunk = ' '.join(chunk).strip()
            chunks.append(f'[Page no. {idx + start_page}] "{chunk}"')
    return chunks


class SemanticSearch:
    """Nearest-neighbour search over text embeddings.

    ``fit`` embeds a corpus and indexes it; calling the instance embeds a
    query and returns the closest corpus entries.
    """

    def __init__(self):
        # Chinese-capable model, see https://www.intumu.com/article/203
        # NOTE(review): the path is redacted in the source; presumably a
        # local sentence-encoder export -- point it at your own model.
        self.use = hub.load("F:/*******")
        self.fitted = False

    def fit(self, data, batch=100, n_neighbors=3):  # batch=1000, n_neighbors=5
        """Embed *data* and build the neighbour index.

        Args:
            data: Sequence of text chunks to index.
            batch: Number of chunks embedded per model call.
            n_neighbors: Results returned per query; capped at the number
                of embedded chunks.
        """
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the indexed chunks (or their indices) nearest to *text*.

        Args:
            text: Query string.
            return_data: When true return the chunk strings, otherwise the
                neighbour indices as returned by ``kneighbors``.
        """
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]

        if return_data:
            return [self.data[i] for i in neighbors]
        else:
            return neighbors

    def get_text_embedding(self, texts, batch=1000):
        """Embed *texts* in slices of *batch* and stack the results row-wise."""
        embeddings = []
        for i in range(0, len(texts), batch):
            text_batch = texts[i : (i + batch)]
            emb_batch = self.use(text_batch)
            embeddings.append(emb_batch)
        embeddings = np.vstack(embeddings)
        return embeddings


def load_recommender(path, start_page=1):
    """Chunk the PDF at *path* and fit the module-level ``recommender`` on it.

    Returns:
        The status string ``'Corpus Loaded.'``.
    """
    texts = pdf_to_text(path, start_page=start_page)
    chunks = text_to_chunks(texts, start_page=start_page)
    recommender.fit(chunks)
    return 'Corpus Loaded.'


# Build the corpus.
# NOTE(review): the '?' in the file name looks like a whitespace character
# mangled during extraction -- confirm against the real file name.
pdf_path = '第3章 ?巖土工程勘察.pdf'
recommender = SemanticSearch()
load_recommender(pdf_path)  # fit() builds the corpus, see https://www.intumu.com/article/203

question = '鉆孔深度相關規定?'
topn_chunks = recommender(question)
print(topn_chunks)

GPT查詢代碼

def generate_answer(question, openAI_key):
    """Answer *question* from the PDF chunks nearest to it.

    Retrieves the closest chunks from the module-level ``recommender``,
    wraps them in an instruction prompt and sends it to the completion
    model.

    Args:
        question: The user's query.
        openAI_key: OpenAI API key used for the request.

    Returns:
        The text of the first completion choice.
    """
    topn_chunks = recommender(question)

    parts = ['search results:\n\n']
    parts.extend(c + '\n\n' for c in topn_chunks)
    # The instruction text used to end with a literal
    # "Query: {question}\nAnswer: " (not an f-string), which sent an
    # unfilled placeholder to the model right before the real query.
    parts.append(
        "Instructions: Compose a comprehensive reply to the query using the search results given. "
        "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "
        "Citation should be done at the end of each sentence. If the search results mention multiple subjects "
        "with the same name, create separate answers for each. Only include information found in the results and "
        "don't add any additional information. Make sure the answer is correct and don't output false content. "
        "If the text does not relate to the query, simply state 'Text Not Found in PDF'. Ignore outlier "
        "search results which has nothing to do with the question. Only answer what is asked. The "
        "answer should be short and concise. Answer step-by-step. \n\n"
    )
    parts.append(f"Query: {question}\nAnswer:")
    prompt = "".join(parts)

    answer = generate_text(openAI_key, prompt, "text-davinci-003")
    return answer


def generate_text(openAI_key, prompt, engine="text-davinci-003"):
    """Send *prompt* to the OpenAI completion endpoint and return the reply.

    Args:
        openAI_key: API key, stored on the ``openai`` module before the call.
        prompt: Full prompt text.
        engine: Completion engine name.

    Returns:
        The ``text`` of the first returned choice.
    """
    # NOTE(review): ``openai.Completion`` is the pre-1.0 client interface
    # and "text-davinci-003" is a legacy model -- confirm both are still
    # available in the target environment.
    openai.api_key = openAI_key
    completions = openai.Completion.create(
        engine=engine,
        prompt=prompt,
        max_tokens=512,
        n=1,
        stop=None,
        temperature=0.7,
    )
    message = completions.choices[0].text
    return message


import os

# Read the key from the environment instead of embedding it in the source
# (see https://www.intumu.com/article/203 for the original walkthrough).
openAI_key = os.environ.get("OPENAI_API_KEY", "")
generate_answer(question, openAI_key)

結語

以上類似于AutoGPT或ChatPDF的實現原理,感興趣的讀者可以試試。

civilpy:Python數據分析及可視化實例目錄 913 贊同 · 36 評論文章


使用KNN及TF進行中文PDF搜索,類似于AutoGPT或ChatPDF實現原理!的評論 (共 條)

分享到微博請遵守國家法律
阿克苏市| 柯坪县| 静安区| 正定县| 安国市| 娄底市| 马关县| 德江县| 南江县| 中方县| 宝兴县| 台前县| 伊金霍洛旗| 东丰县| 长兴县| 卓资县| 凯里市| 嘉荫县| 莱阳市| 马关县| 三台县| 兴国县| 平江县| 得荣县| 白山市| 怀集县| 玛纳斯县| 加查县| 永丰县| 廊坊市| 华池县| 镇赉县| 鄂托克旗| 百色市| 巫山县| 古交市| 浙江省| 安岳县| 乌拉特中旗| 南雄市| 辽源市|