# 百度文库提取（仅作学习交流） — Baidu Wenku text extraction (for study and exchange only)
import requests,re
from lxml import etree
from docx import Document
def get_detail(url, timeout=10):
    """Download a document page and extract its title and text fragments.

    The page is requested with a ``Googlebot`` User-agent header and decoded
    as GBK. The title and the reader section are located with regular
    expressions, then the reader section's text nodes are collected via
    XPath.

    Args:
        url: Address of the document page to fetch.
        timeout: Seconds to wait on the server before ``requests`` gives up;
            without a timeout a stalled connection would block indefinitely.

    Returns:
        A ``(title, details)`` tuple: the text captured from ``<title>`` and
        the list of text nodes found in the reader section, with nodes that
        are exactly a single space removed.

    Raises:
        IndexError: If the title or the reader-section pattern is not found
            in the downloaded page.
    """
    header = {'User-agent': 'Googlebot'}
    response = requests.get(url, headers=header, timeout=timeout).content.decode('gbk')

    # The title tag ends with the literal site-name suffix; only the part in
    # front of it is captured. Keep this suffix free of parentheses: an extra
    # group would change what the match returns.
    # NOTE(review): suffix restored from a corrupted literal - confirm it
    # matches the live page's <title>.
    title_ze = r'<title>(.+?)_百度文库</title>'
    div_ze = r'<div class="bd doc-reader">(.+?)<div class="aside">'

    title_match = re.search(title_ze, response, re.S)
    if title_match is None:
        raise IndexError('title pattern not found in page')
    div_match = re.search(div_ze, response, re.S)
    if div_match is None:
        raise IndexError('document body pattern not found in page')

    title = title_match.group(1)
    div = etree.HTML(div_match.group(1))

    # Drop text nodes that are exactly one space, in a single pass rather
    # than popping from the list while walking it.
    details = [text for text in div.xpath('//div//text()') if text != ' ']

    data = title, details
    print(data)
    return data
def get_word(data, path='C:/Users/Desktop/******.docx'):
    """Write an extracted title and its text fragments to a Word document.

    Args:
        data: ``(title, details)`` pair; ``data[0]`` becomes the document
            heading and every entry of ``data[1]`` becomes its own paragraph.
        path: Destination of the ``.docx`` file. The default is the location
            this function has always written to.
            NOTE(review): the default looks like a redacted placeholder
            ('*' is not accepted in Windows file names) - callers should
            probably pass a real path.
    """
    title, details = data[0], data[1]

    document = Document()
    document.add_heading(title)

    for detail in details:
        document.add_paragraph(detail)  # one paragraph per text fragment

    # Save once, after every paragraph has been added: this avoids rewriting
    # the file per paragraph and still produces a file when there are no
    # paragraphs at all.
    document.save(path)
if __name__ == '__main__':
    # Script entry point: scrape one sample document and export it to Word.
    sample_url = "https://wenku.baidu.com/view/779f8c48cd22bcd126fff705cc17552706225e6d.html"
    get_word(get_detail(sample_url))