# 爬虫 — web crawler examples
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup

# Goal: scrape every chapter title and chapter body of "Romance of the
# Three Kingdoms" from http://www.shicimingju.com/book/sanguoyanyi.html
# and append them to ./sanguo.txt as "title:content" lines.
if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
    # Fetch the table-of-contents page.
    page_text = requests.get(url=url, headers=headers).text
    # Parse chapter titles and detail-page URLs out of the TOC.
    soup = BeautifulSoup(page_text, 'lxml')
    li_list = soup.select('.book-mulu > ul > li')
    # 'with' guarantees the output file is closed even if a request raises
    # mid-loop (the original opened fp and never called fp.close()).
    with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
        for li in li_list:
            title = li.a.string
            detail_url = 'http://www.shicimingju.com' + li.a['href']
            # Request the detail page and parse out the chapter body.
            detail_page_text = requests.get(url=detail_url, headers=headers).text
            detail_soup = BeautifulSoup(detail_page_text, 'lxml')
            div_tag = detail_soup.find('div', class_='chapter_content')
            if div_tag is None:
                # Layout changed or the request was blocked: skip this
                # chapter instead of crashing on AttributeError.
                print(title, 'content not found, skipped')
                continue
            content = div_tag.text
            fp.write(title + ':' + content + '\n')
            print(title, '爬取成功!!!')
import requests
from lxml import etree

# Goal: scrape the hot-news ranking list from https://news.163.com/ and
# print the view-count of each ranked item.
if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    url = 'https://news.163.com/'
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    # NOTE(review): this absolute positional XPath is brittle — any layout
    # change on the page breaks it; a class/id-based selector would be
    # sturdier, but the path is kept to preserve the original target.
    li_list = tree.xpath('/html/body/div[1]/div[3]/div[2]/div[3]/div[3]/div[10]//li')
    for li in li_list:
        rank = li.xpath('./em/text()')      # e.g. ['1'] — ranking position
        title = li.xpath('./a/@title')      # e.g. ['headline text']
        # xpath() returns a (possibly empty) list; guard the [0] index so a
        # ranking item without a <span> no longer raises IndexError.
        nums = li.xpath('./span/text()')
        if not nums:
            continue
        print(nums[0])
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from lxml import etree

if __name__ == "__main__":
    # Load the local page source into an etree object for XPath querying.
    # Use an explicit HTML parser: etree.parse() defaults to a strict XML
    # parser, which fails on typical real-world HTML (unclosed tags,
    # unescaped '&', missing quotes, ...).
    tree = etree.parse('test.html', etree.HTMLParser())
    # Selector variants explored against test.html:
    # r = tree.xpath('/html/body/div')                              # absolute path
    # r = tree.xpath('/html//div')                                  # descendant shortcut
    # r = tree.xpath('//div')                                       # anywhere in the doc
    # r = tree.xpath('//div[@class="song"]')                        # attribute filter
    # r = tree.xpath('//div[@class="tang"]//li[5]/a/text()')[0]     # indexed + text()
    # r = tree.xpath('//li[7]//text()')                             # all nested text
    # r = tree.xpath('//div[@class="tang"]//text()')                # subtree text
    r = tree.xpath('//div[@class="song"]/img/@src')  # attribute value extraction
    print(r)
import re  # regular-expression module

# Match words beginning with 'he'; the r-prefix keeps '\w' from being
# treated as a string escape.
pattern = r'he\w+'
string = 'hello world HELLO WORLD'
# re.I makes the search case-insensitive -> ['hello', 'HELLO']
match = re.findall(pattern, string, re.I)
print(match)
string = '你好世界hello world HELLO WORLD'
# Without re.I the search is case-sensitive -> only ['hello']
match = re.findall(pattern, string)
print(match)
# A non-greedy group '(.*?)' paired with a required digit group.
pattern = r'https://(.*?)(\d+).com/'
import re  # regular-expression module

# A non-greedy '(.*?)' at the very end of the pattern matches as little
# as possible — here the empty string — so findall returns [''].
pattern = r'https://(.*?)'
string = 'https://www.hao123.com/'
match = re.findall(pattern, string)
print(match)
# The greedy '(.*)' consumes the rest of the string instead.
pattern = r'https://(.*)'
match = re.findall(pattern, string)
print(match)