【2023python爬蟲1000集】目前B站最完整的爬蟲教程,包含所有干貨內(nèi)容

屠戮電影天堂(結(jié)合老師視頻代碼,又通過GPT優(yōu)化了代碼,jupyter notebook內(nèi)可以正常運(yùn)行)
import requests
import re
import csv
import pandas as pd
# Scrape the "2023 must-watch" list from dytt89.com and write each movie's
# title and download link to movies.csv.

# Browser-like User-Agent so the site serves the normal HTML page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.62"
}

# Fetch the home page; the site serves GB2312-encoded HTML.
url = "https://www.dytt89.com"
response = requests.get(url, headers=headers)
response.encoding = 'gb2312'

# Regexes:
#   obj1 — isolate the <ul> block following the "2023必看熱片" section header
#   obj2 — pull each child-page href out of that <ul>
#   obj3 — on a detail page, capture the movie title and its download link
obj1 = re.compile(r'2023必看熱片.*?<ul>(?P<ul>.*?)</ul>', re.S)
obj2 = re.compile(r'<a href=(?P<href>.*?) ', re.S)
obj3 = re.compile(r'◎片 名(?P<movie>.*?)<br />.*?<td '
                  r'style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<download>.*?)">', re.S)

# Collect absolute URLs of the detail pages.
child_href_list = []
for it in obj1.finditer(response.text):
    ul = it.group('ul')
    for itt in obj2.finditer(ul):
        # Captured href is quoted, e.g. '/i/12345.html' — strip the wrapping
        # characters before joining onto the base URL.
        child_href = url + itt.group('href').strip("/").strip("'")
        child_href_list.append(child_href)

# Visit each detail page and extract title + download link.
movies = []
for href in child_href_list:
    # Reuse the same headers as the main request (original omitted them here).
    child_response = requests.get(href, headers=headers)
    child_response.encoding = 'gb2312'

    result3 = obj3.search(child_response.text)
    if result3 is None:
        # Some detail pages lack the expected layout — skip instead of
        # crashing with AttributeError on .group().
        continue
    movies.append({
        '電影': result3.group('movie').strip(),
        '下載鏈接': result3.group('download').strip(),
    })

# Persist as CSV; utf-8-sig adds a BOM so Excel renders Chinese correctly.
df = pd.DataFrame(movies)
filename = 'movies.csv'
df.to_csv(filename, index=False, encoding='utf-8-sig')
# Fixed: the f-string placeholder was corrupted to literal text; interpolate
# the actual output filename.
print(f"電影信息已成功保存到 {filename} 文件中。")