# 爬虫 — web crawler examples
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup

# Goal: scrape every chapter title and chapter body of "Romance of the
# Three Kingdoms" from http://www.shicimingju.com/book/sanguoyanyi.html
# and append them to ./sanguo.txt as "title:content" lines.
if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
    # Fetch the table-of-contents page.
    page_text = requests.get(url=url, headers=headers).text
    # Parse chapter titles and detail-page URLs out of the TOC.
    soup = BeautifulSoup(page_text, 'lxml')
    li_list = soup.select('.book-mulu > ul > li')
    # 'with' guarantees the output file is closed even if a request raises
    # mid-loop (the original opened fp and never called fp.close()).
    with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
        for li in li_list:
            title = li.a.string
            detail_url = 'http://www.shicimingju.com' + li.a['href']
            # Request the detail page and parse out the chapter body.
            detail_page_text = requests.get(url=detail_url, headers=headers).text
            detail_soup = BeautifulSoup(detail_page_text, 'lxml')
            div_tag = detail_soup.find('div', class_='chapter_content')
            if div_tag is None:
                # Layout changed or the request was blocked: skip this
                # chapter instead of crashing on AttributeError.
                print(title, 'content not found, skipped')
                continue
            content = div_tag.text
            fp.write(title + ':' + content + '\n')
            print(title, '爬取成功!!!')
import requests
from lxml import etree

# Goal: scrape the hot-news ranking list from https://news.163.com/ and
# print the view-count of each ranked item.
if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    url = 'https://news.163.com/'
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    # NOTE(review): this absolute positional XPath is brittle — any layout
    # change on the page breaks it; a class/id-based selector would be
    # sturdier, but the path is kept to preserve the original target.
    li_list = tree.xpath('/html/body/div[1]/div[3]/div[2]/div[3]/div[3]/div[10]//li')
    for li in li_list:
        rank = li.xpath('./em/text()')      # e.g. ['1'] — ranking position
        title = li.xpath('./a/@title')      # e.g. ['headline text']
        # xpath() returns a (possibly empty) list; guard the [0] index so a
        # ranking item without a <span> no longer raises IndexError.
        nums = li.xpath('./span/text()')
        if not nums:
            continue
        print(nums[0])
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from lxml import etree

if __name__ == "__main__":
    # Load the local page source into an etree object for XPath querying.
    # Use an explicit HTML parser: etree.parse() defaults to a strict XML
    # parser, which fails on typical real-world HTML (unclosed tags,
    # unescaped '&', missing quotes, ...).
    tree = etree.parse('test.html', etree.HTMLParser())
    # Selector variants explored against test.html:
    # r = tree.xpath('/html/body/div')                              # absolute path
    # r = tree.xpath('/html//div')                                  # descendant shortcut
    # r = tree.xpath('//div')                                       # anywhere in the doc
    # r = tree.xpath('//div[@class="song"]')                        # attribute filter
    # r = tree.xpath('//div[@class="tang"]//li[5]/a/text()')[0]     # indexed + text()
    # r = tree.xpath('//li[7]//text()')                             # all nested text
    # r = tree.xpath('//div[@class="tang"]//text()')                # subtree text
    r = tree.xpath('//div[@class="song"]/img/@src')  # attribute value extraction
    print(r)
import re  # regular-expression module

# Match words beginning with 'he'; the r-prefix keeps '\w' from being
# treated as a string escape.
pattern = r'he\w+'
string = 'hello world HELLO WORLD'
# re.I makes the search case-insensitive -> ['hello', 'HELLO']
match = re.findall(pattern, string, re.I)
print(match)
string = '你好世界hello world HELLO WORLD'
# Without re.I the search is case-sensitive -> only ['hello']
match = re.findall(pattern, string)
print(match)
# A non-greedy group '(.*?)' paired with a required digit group.
pattern = r'https://(.*?)(\d+).com/'
import re  # regular-expression module

# A non-greedy '(.*?)' at the very end of the pattern matches as little
# as possible — here the empty string — so findall returns [''].
pattern = r'https://(.*?)'
string = 'https://www.hao123.com/'
match = re.findall(pattern, string)
print(match)
# The greedy '(.*)' consumes the rest of the string instead.
pattern = r'https://(.*)'
match = re.findall(pattern, string)
print(match)