2023最新:使用 Python 編寫,以 requests 庫爬取當當網書籍信息(100頁)!
# The code is below; set the search keyword and run it.
import concurrent.futures
import logging
import re
from time import sleep
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# Seconds to wait on a single HTTP request before treating it as failed.
REQUEST_TIMEOUT = 10

# Pause (seconds) taken by a worker after each request it makes.
REQUEST_DELAY = 5

# Selects the <li> elements whose class contains "line" followed by digits.
BOOK_ITEM_CLASS = re.compile(r"line\d+")

# Column labels of the resulting table; also used as the printed field labels.
# NOTE(review): the parenthesised fragments look like transliteration residue
# picked up when this script was copied; they are kept byte-for-byte so the
# CSV header stays unchanged -- confirm the intended labels before editing.
COLUMNS = ["書(shū)名", "價(jià)格", "評(píng)論數(shù)", "作者", "出版年份", "出版社"]


def process_book(book):
    """Extract the fields of one book from a search-result list item.

    Args:
        book: A BeautifulSoup element for one result ``<li>``.

    Returns:
        ``[title, price, rating_count, author, publish_date, publisher]``,
        or ``None`` when a required element is missing or the price text is
        not a valid number.
    """
    try:
        title = book.find("a", class_="pic").img.get("alt", "")
        price_text = book.find("span", class_="search_now_price").get_text(strip=True)
        price = float(price_text.replace("¥", ""))
        rating_text = book.find("a", class_="search_comment_num").get_text(strip=True)
        author_info = (
            book.find("p", class_="search_book_author").get_text(strip=True).split("/")
        )
    except (AttributeError, ValueError, IndexError):
        # A missing tag makes ``find`` return None (AttributeError on use);
        # an unparsable price raises ValueError. Such entries are skipped.
        return None

    # First run of digits in the comment-count text; 0 when there is none.
    digits = re.search(r"\d+", rating_text)
    rating_count = int(digits.group()) if digits else 0

    # The author paragraph is "/"-separated: author / date / publisher.
    author = author_info[0]
    publish_date = author_info[1] if len(author_info) > 1 else ''
    # Keep only the text before the first "加" in the third segment.
    # NOTE(review): presumably this trims trailing link text that follows the
    # publisher name -- confirm against the page markup.
    publisher = author_info[2].split("加")[0] if len(author_info) > 2 else ''

    return [title, price, rating_count, author, publish_date, publisher]


def fetch_page(page, url_template, headers):
    """Download one search-result page and parse every book on it.

    Args:
        page: Page number substituted into ``url_template``.
        url_template: URL containing a ``{page}`` placeholder.
        headers: HTTP headers sent with the request.

    Returns:
        A list of rows as produced by ``process_book``; entries that could
        not be parsed are dropped. A page whose request fails is logged and
        yields an empty list, so one bad page does not abort the whole crawl.
    """
    url = url_template.format(page=page)
    try:
        # Without a timeout a stalled connection would block this worker forever.
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        logger.warning("Skipping page %s (%s): %s", page, url, exc)
        return []
    finally:
        # Throttle this worker whether or not the request succeeded.
        sleep(REQUEST_DELAY)

    soup = BeautifulSoup(response.text, "html.parser")
    books = soup.find_all("li", class_=BOOK_ITEM_CLASS)

    # Parse each book exactly once, then discard the failures.
    parsed = (process_book(book) for book in books)
    return [row for row in parsed if row is not None]


def fetch_data(keyword="人工智能", pages=100):
    """Crawl the search results, print every book and save them to CSV.

    Args:
        keyword: Search term placed in the ``key`` query parameter.
        pages: Number of result pages to fetch, starting from page 1.

    Returns:
        The ``pandas.DataFrame`` that was written to ``book_data.csv``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
    }
    # Percent-encode the keyword so arbitrary input cannot break the query string.
    url_template = (
        "http://search.dangdang.com/?key=" + quote(keyword) + "&page_index={page}"
    )

    # Change ``pages`` to adjust how many pages are crawled; the argument
    # lists are sized to match so every page number gets its own arguments.
    page_numbers = range(1, pages + 1)
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        pages_data = list(
            executor.map(
                fetch_page,
                page_numbers,
                [url_template] * pages,
                [headers] * pages,
            )
        )

    data = [item for sublist in pages_data for item in sublist]  # flatten the list
    df = pd.DataFrame(data, columns=COLUMNS)

    for _, row in df.iterrows():
        for column in COLUMNS:
            print(column + ":", row[column])
        print("---------------------------------")

    df.to_csv("book_data.csv", index=False)
    return df


if __name__ == "__main__":
    fetch_data()