"""Scrape Douban movie short comments and save them as JSON.

The script downloads the comments page for ``movie_id``, extracts the user
name, rating title, timestamp and comment text from every
``div.comment-item`` element, keeps at most ``comment_num`` of them and writes
the result to ``comments.json``.
"""

import json
import time as t  # noqa: F401  (no longer used below; import kept on purpose)

import requests
from bs4 import BeautifulSoup

# from selenium import webdriver
# from selenium.webdriver import ActionChains

# NOTE: disabled Selenium variant that clicked the "All" button on the page.
# driver = webdriver.Chrome()
# driver.get("https://movie.douban.com/subject/30391186")
# driver.maximize_window()
#
# element = driver.find_element_by_xpath('/html/body/div[3]/div[1]/div[2]/div[1]/div[9]/div[1]/h2/span/a')
# ActionChains(driver).click(on_element=element).perform()

# Movie ID to scrape and the maximum number of comments to keep.
movie_id = '3011317'
comment_num = 10

# Request headers: send a browser-like User-Agent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# Request URL, built from movie_id so the two can never drift apart.
url = f'https://movie.douban.com/subject/{movie_id}/comments?status=P'

# Optional proxy. The original script defined this mapping but never passed it
# to requests, so it stays disabled unless use_proxy is switched on.
proxy = {'http': 'http://127.0.0.1:8888', 'https': 'https://127.0.0.1:8888'}
use_proxy = False

# Seconds to wait for the server before giving up instead of hanging forever.
request_timeout = 10


def _text_of(tag):
    """Return the stripped text of *tag*, or '' when the element is missing."""
    return tag.text.strip() if tag is not None else ''


def fetch_html(page_url):
    """Download *page_url* and return the response body as text.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status (via ``raise_for_status``), so a failed or
            blocked request is not silently turned into an empty result file.
    """
    response = requests.get(
        page_url,
        headers=headers,
        proxies=proxy if use_proxy else None,
        timeout=request_timeout,
    )
    response.raise_for_status()
    return response.text


def parse_comments(html, limit=None):
    """Extract comment records from the HTML of a comments page.

    Args:
        html: Page markup as a string.
        limit: Maximum number of comment items to process; ``None`` keeps all.

    Returns:
        A list of dicts with the keys ``user``, ``rating``, ``time`` and
        ``comment``. A field whose element is absent is an empty string.
    """
    soup = BeautifulSoup(html, 'html.parser')
    comment_items = soup.find_all('div', {'class': 'comment-item'})

    comments = []
    # Slicing with None keeps every item, so limit=None means "no cap".
    for item in comment_items[:limit]:
        comment = _text_of(item.find('span', {'class': 'short'}))

        info = item.find('span', {'class': 'comment-info'})
        user = _text_of(info.find('a')) if info is not None else ''

        time = _text_of(item.find('span', {'class': 'comment-time'}))

        # The rating value is read from the span's title attribute.
        rating_tag = item.find('span', {'class': 'rating'})
        rating = rating_tag.get('title', '') if rating_tag is not None else ''

        comments.append({'user': user, 'rating': rating, 'time': time, 'comment': comment})
    return comments


def save_comments(comments, path='comments.json'):
    """Write *comments* to *path* as indented UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(comments, f, ensure_ascii=False, indent=4)


def main():
    """Fetch the comments page, parse it and save the result."""
    html = fetch_html(url)
    comments = parse_comments(html, limit=comment_num)
    save_comments(comments)

    # Disabled alternative: plain-text output of the scraped comments.
    # with open('comments.txt', 'w', encoding='utf-8') as f:
    #     for comment in comments:
    #         f.write(f"{comment['user']}{comment['rating']}{comment['time']}: {comment['comment']}\n")


if __name__ == '__main__':
    main()