Obsidian 英語(yǔ)生詞自動(dòng)引用自單詞庫(kù)

- from nltk.tokenize import word_tokenize
- from nltk.stem import WordNetLemmatizer
- from nltk.corpus import wordnet
- from nltk import pos_tag
- from pathlib import Path
- import pandas as pd
- import re
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into a WordNet POS constant.

    Only the leading letter of the tag matters (J/V/N/R); tags with no
    WordNet counterpart yield ``None``.
    """
    tag_map = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # ``treebank_tag[:1]`` is '' for an empty tag, which maps to None,
    # matching the original if/elif fall-through behaviour.
    return tag_map.get(treebank_tag[:1])
def main():
    """Auto-link study words in an Obsidian article to their vault notes.

    Workflow:
      1. Scan every index note under '首字母索引' and collect the words that
         appear as *unchecked* checkboxes, i.e. lines matching
         ``- [ ] [[word]]``.
      2. Re-read the article, strip any existing ``[[lemma|word]]`` wiki-link
         markup (so links are never nested), lemmatize each token with its
         POS, and wrap every token whose lemma is in the study set as
         ``[[lemma|original token]]``.
      3. Write the rewritten article back in place.

    NOTE(review): joining tokens with ' ' alters punctuation spacing
    (e.g. "word ," instead of "word,") — this matches the original script.
    """
    obsidian_path = Path("C:/Users/insil/Desktop/英語(yǔ)")  # Obsidian vault root
    idx = obsidian_path / '首字母索引'          # per-letter index notes
    article = obsidian_path / '閱讀理解 1.md'   # article to rewrite
    lemmatizer = WordNetLemmatizer()

    # Build the set of study words from all index files.
    study_words = set()
    for index_file in idx.glob('*.md'):
        with open(index_file, encoding='utf-8') as f:
            lines = pd.Series(f.readlines())
        # Capture only unchecked checkboxes: the flag group is a single
        # whitespace character inside "[ ]".
        words_df = lines.str.extract(r' \[(?P<flag>\s)\] \[\[(?P<word>\w+)\]\]')
        words_df.dropna(inplace=True)
        study_words |= set(words_df['word'])

    with open(article, 'r', encoding='utf-8') as f:
        txt = f.readlines()

    for p, line in enumerate(txt):
        if line == '\n':
            continue
        # Remove existing wiki-link markup so re-running is idempotent.
        line = re.sub(r'\]\]', '', line)
        line = re.sub(r'\[\[(\w+)\|', '', line)
        # Keep the original-case tokens for display; tag the lowercased ones.
        display_tokens = word_tokenize(line)
        tagged = pos_tag(word_tokenize(line.lower()))
        for i, (token, tag) in enumerate(tagged):
            pos = get_wordnet_pos(tag)
            # BUG FIX: the original fell back to the POS *tag*
            # (txt_words[word][1]) instead of the token itself, so words with
            # no WordNet POS were matched against strings like 'IN'.
            lemma = lemmatizer.lemmatize(token, pos) if pos else token
            if lemma in study_words:
                display_tokens[i] = f'[[{lemma}|{display_tokens[i]}]]'
        txt[p] = ' '.join(display_tokens) + '\n'

    with open(article, 'w', encoding='utf-8') as f:
        f.write(''.join(txt))
# Entry point: run only when executed as a script, not on import.
if __name__ == '__main__':
    main()
標(biāo)簽: