最美情侣中文字幕电影,在线麻豆精品传媒,在线网站高清黄,久久黄色视频

歡迎光臨散文網 會員登陸 & 注冊

爬蟲

2023-06-09 14:54 作者:alpha-H111  | 我要投稿

#!/usr/bin/env python

# -*- coding:utf-8 -*-

import requests

from bs4 import BeautifulSoup

# Goal: scrape every chapter title and chapter body of the novel "Romance of the Three Kingdoms" from http://www.shicimingju.com/book/sanguoyanyi.html

if __name__ == "__main__":
    # Script entry point: download the table of contents of the novel,
    # follow every chapter link, and append "<title>:<content>" for each
    # chapter to ./sanguo.txt.

    # Imported locally so this snippet does not depend on edits elsewhere.
    from urllib.parse import urljoin

    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
    # Without a timeout, requests can block forever on a stalled connection.
    timeout = 10

    index_response = requests.get(url=url, headers=headers, timeout=timeout)
    index_response.raise_for_status()

    # Hand BeautifulSoup the raw bytes so it detects the page encoding itself;
    # `.text` falls back to ISO-8859-1 when the server sends no charset, which
    # corrupts Chinese text.
    soup = BeautifulSoup(index_response.content, 'lxml')

    # Each <li> under the .book-mulu list is expected to hold one chapter link.
    li_list = soup.select('.book-mulu > ul > li')

    # The context manager guarantees the file is flushed and closed, even if
    # a request or parse step raises part-way through.
    with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
        for li in li_list:
            anchor = li.a
            if anchor is None or not anchor.get('href'):
                # List item without a usable link: nothing to fetch.
                continue

            # `.string` is None when the anchor contains nested tags.
            title = anchor.string or anchor.get_text()
            # urljoin handles both relative and absolute hrefs.
            detail_url = urljoin(url, anchor['href'])

            # Request the chapter page and extract the chapter body.
            detail_response = requests.get(url=detail_url, headers=headers, timeout=timeout)
            if not detail_response.ok:
                print(title, 'skipped: HTTP', detail_response.status_code)
                continue

            detail_soup = BeautifulSoup(detail_response.content, 'lxml')
            div_tag = detail_soup.find('div', class_='chapter_content')
            if div_tag is None:
                print(title, 'skipped: no chapter_content div found')
                continue

            content = div_tag.text
            fp.write(title + ':' + content + '\n')
            print(title, '爬取成功?。?!')


import requests

from lxml import etree

if __name__ == '__main__':
    # Script entry point: fetch the news.163.com front page and print the
    # text of the <span> inside every <li> of one list on the page.

    # Browser-like User-Agent sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    url = 'https://news.163.com/'

    # Without a timeout, requests can block forever on a stalled connection.
    page_text = requests.get(url=url, headers=headers, timeout=10).text
    tree = etree.HTML(page_text)

    # NOTE(review): this absolute, position-based XPath is tied to the exact
    # page layout; it returns an empty list if the layout differs.
    li_list = tree.xpath('/html/body/div[1]/div[3]/div[2]/div[3]/div[3]/div[10]//li')

    for li in li_list:
        # Sibling fields, if needed: './em/text()' and './a/@title'
        # (presumably the rank and the headline - verify against the page).
        num_nodes = li.xpath('./span/text()')
        if not num_nodes:
            # No <span> text in this item; indexing [0] would raise IndexError.
            continue
        num = num_nodes[0]
        print(num)

#!/usr/bin/env python

# -*- coding:utf-8 -*-

from lxml import etree

if __name__ == "__main__":
    # Script entry point: load the local file test.html into an element tree
    # and print the result of one XPath query.

    # etree.parse() defaults to the strict XML parser, which raises on HTML
    # that is not well-formed XML; an explicit HTMLParser tolerates it.
    parser = etree.HTMLParser()
    tree = etree.parse('test.html', parser)

    # Alternative example queries (swap one in for the active query below):
    # r = tree.xpath('/html/body/div')
    # r = tree.xpath('/html//div')
    # r = tree.xpath('//div')
    # r = tree.xpath('//div[@class="song"]')
    # r = tree.xpath('//div[@class="tang"]//li[5]/a/text()')[0]
    # r = tree.xpath('//li[7]//text()')
    # r = tree.xpath('//div[@class="tang"]//text()')

    # `src` attribute of every <img> directly under <div class="song">.
    r = tree.xpath('//div[@class="song"]/img/@src')

    print(r)




import re  # regular-expression module

# "he" followed by one or more word characters; the raw-string prefix keeps
# '\w' from being treated as a string escape.
pattern = r'he\w+'
string = 'hello world HELLO WORLD'  # text to search

match = re.findall(pattern, string, re.I)  # case-insensitive search
print(match)  # ['hello', 'HELLO']

string = '你好世界hello world HELLO WORLD'  # text to search

match = re.findall(pattern, string)  # case-sensitive search
print(match)  # ['hello']

# Expression using the non-greedy operator '?'.
# NOTE(review): the '.' before "com" is unescaped, so it matches any character.
pattern = r'https://(.*?)(\d+).com/'


import re  # regular-expression module

# Lazy group with nothing after it: it is satisfied by matching zero
# characters, so the captured text is the empty string.
pattern = r'https://(.*?)'
string = 'https://www.hao123.com/'  # text to search

match = re.findall(pattern, string)
print(match)  # ['']

# Greedy group: captures everything after the scheme.
pattern = r'https://(.*)'

match = re.findall(pattern, string)
print(match)  # ['www.hao123.com/']



爬蟲的評論 (共 條)

分享到微博請遵守國家法律
大埔区| 德州市| 电白县| 开平市| 临湘市| 英超| 贵溪市| 宜州市| 和平区| 桐庐县| 海南省| 什邡市| 屯留县| 南京市| 鄄城县| 县级市| 潮州市| 海南省| 台山市| 南岸区| 宁远县| 门头沟区| 聂荣县| 葫芦岛市| 旬邑县| 淅川县| 新巴尔虎右旗| 闽侯县| 临安市| 沾化县| 久治县| 江北区| 嘉鱼县| 大埔县| 五大连池市| 南丹县| 固阳县| 九江县| 乳源| 高雄市| 沐川县|