# 2023 最新爬取京东网址图书信息 Python 代码 (JD.com book-info scraper)
"""Scrape book listings from a JD.com (京东) search results page.

For each book on up to MAX_PAGES result pages, collects title, price and
total comment count from the listing, then opens the product page in a new
tab to collect publisher, publication date and the good/medium/bad rating
breakdown.  Results are written to ``book_info.csv``.

Requires: selenium (with a matching chromedriver on PATH) and beautifulsoup4.
"""

import csv
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# NOTE(review): the pasted source had a garbled placeholder URL
# ("- 商品搜索 - 京東"); substitute the JD search URL you actually want.
SEARCH_URL = "https://search.jd.com/Search?keyword=python&enc=utf-8"

# Crawling too many pages trips JD's anti-bot measures; keep the sleeps in.
MAX_PAGES = 10

OUTPUT_FILE = "book_info.csv"
CSV_FIELDS = ["书名", "价格", "评论数", "出版社", "出版年份", "好评", "中评", "差评"]


def _parse_count(text):
    """Normalize a JD count string like ``'2万+'`` or ``'(500+)'`` to an int.

    Returns the raw (cleaned) string unchanged when it is not numeric, so
    callers never crash on unexpected page content.
    """
    cleaned = text.replace("条评价", "").replace("+", "").strip("()").strip()
    if "万" in cleaned:  # '万' == 10,000
        return int(float(cleaned.replace("万", "")) * 10000)
    try:
        return int(cleaned)
    except ValueError:
        return cleaned


def _rating_count(driver, xpath):
    """Return the rating count displayed at *xpath*, or 0 when absent.

    The original code left the count variables undefined (NameError) when
    the element was missing; defaulting to 0 keeps every row complete.
    """
    count = 0
    # Selenium 4 removed find_elements_by_xpath; use find_elements(By.XPATH, ...).
    for elem in driver.find_elements(By.XPATH, xpath):
        count = _parse_count(elem.text)
    return count


def _scrape_detail(driver, link):
    """Open *link* in a new tab, scrape detail-page fields, close the tab.

    Returns (publisher, publish_date, good, medium, bad); the driver is
    switched back to the originating window before returning.
    """
    main_window = driver.current_window_handle
    driver.execute_script(f'window.open("{link}","_blank");')
    driver.switch_to.window(driver.window_handles[-1])
    time.sleep(5)  # let the product page finish loading

    soup = BeautifulSoup(driver.page_source, "html.parser")

    pub_li = soup.find(
        "li", title=True, clstag="shangpin|keycount|product|chubanshe_3"
    )
    publisher = pub_li["title"] if pub_li is not None else "未找到"

    # BeautifulSoup deprecated text=; string= is the supported keyword.
    date_li = soup.find("li", string=re.compile(r"出版时间:"))
    publish_date = (
        date_li.get_text().replace("出版时间:", "")
        if date_li is not None
        else "未找到"
    )

    # Click the "商品评价" tab so the rating breakdown gets rendered.
    # The original XPaths were mangled into "http://*[...]" by a markdown
    # paste; they must start with "//*".
    tabs = driver.find_elements(By.XPATH, "//*[@id='detail']/div[1]/ul/li[5]")
    if tabs:
        tabs[0].click()
        time.sleep(2)

    rating_xpath = "//*[@id='comment']/div[2]/div[2]/div[1]/ul/li[{}]/a/em"
    good = _rating_count(driver, rating_xpath.format(5))    # 好评
    medium = _rating_count(driver, rating_xpath.format(6))  # 中评
    bad = _rating_count(driver, rating_xpath.format(7))     # 差评

    driver.close()
    driver.switch_to.window(main_window)
    return publisher, publish_date, good, medium, bad


def main():
    """Drive the crawl and write the collected rows to OUTPUT_FILE."""
    chrome_options = Options()
    # The original comment promised headless mode but never enabled it.
    chrome_options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=chrome_options)

    data = []
    try:
        driver.get(SEARCH_URL)
        time.sleep(5)  # wait for the search results to load fully

        for page_number in range(1, MAX_PAGES + 1):
            print("正在爬取第", page_number, "页")
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Each result card lives in a div.gl-i-wrap.
            for card in soup.find_all("div", class_="gl-i-wrap"):
                name = card.find("div", class_="p-name").get_text()
                price = card.find("div", class_="p-price").get_text()
                commit = _parse_count(
                    card.find("div", class_="p-commit").get_text()
                )

                link = card.find("div", class_="p-name").find("a").get("href")
                if "http" not in link:  # listing hrefs are protocol-relative
                    link = "https:" + link

                publisher, publish_date, good, medium, bad = _scrape_detail(
                    driver, link
                )

                info = {
                    "书名": name,
                    "价格": price,
                    "评论数": commit,
                    "出版社": publisher,
                    "出版年份": publish_date,
                    "好评": good,
                    "中评": medium,
                    "差评": bad,
                }
                print(info)
                # BUG FIX: the original appended to `data` after driver.quit()
                # and after the CSV was written, so the file was always empty.
                data.append(info)
                time.sleep(6)  # throttle between detail pages

            # find_element_by_class_name raised on the last page, so the
            # original `else: break` was unreachable; find_elements returns
            # an empty list instead.
            next_buttons = driver.find_elements(By.CLASS_NAME, "pn-next")
            if not next_buttons:
                break
            next_buttons[0].click()
            time.sleep(3)  # wait for the next results page
    finally:
        driver.quit()

    with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=CSV_FIELDS)
        writer.writeheader()
        writer.writerows(data)
    print("数据已保存到", OUTPUT_FILE)


if __name__ == "__main__":
    main()