爬取百度指数


缘由是想玩一下数据可视化,需要数据源,矛头对准了百度指数,用周末时间码了一下,在此记录。
init.py
# Date range to query, written as YYYY-MM-DD strings.
start_date = "2020-11-26"
end_date = "2022-11-25"
# Search terms to fetch.
# The published listing carried transcoding residue inside these literals
# (for example a pinyin fragment in parentheses in the middle of a name),
# which would be sent to the server verbatim; the intended keywords are
# restored here.
keyword_list = ["腾讯", "网易", "米哈游"]
header.py
# Value sent as the "Cookie" request header. It is empty in this listing;
# presumably a logged-in browser session cookie has to be pasted in before
# running -- TODO confirm.
cookie = ""

# Request headers passed along with every HTTP call made by this project.
# NOTE(review): "Cipher-Text" is a hard-coded captured value; it looks
# time-stamped and may need refreshing -- confirm.
# NOTE(review): "Accept-Encoding" advertises "br"; verify that the runtime
# environment can decode Brotli responses.
header = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cipher-Text": "1669446300853_1669526110570_Zps8JEXzq9SKrsYVS8CTdXkWyVq/utJah5EChxPJNob8Fk+q4oEOdHsqjPHziaQdXUo3Soeq9UND9NJ7KI5474rTUgh6apQWleSKesxhLrG38d4HYhm3Z13QnTdY8SkltqngGgRMk7HZDt4ChGgwZwsbNYsvL1I9ur3MyF2msajKplNNj5Y3LMuaMS5gxyruyeErcZUV5UW9r2lxFRwMX9EnXK2ihb15TaGFqa6ByNjSoD8ixNXwP0VWTMhaYTo/8NEAi1pyKQUOhZ8BGBh8XTnS6s7Bue/cZ7O65Ai6xvs9YY/UQb9XGxkVyZ9EndTOf+Affh+MG6dbEwqFyn3gUGJnPxpQ8AnZdrhkdfZZBacjpUn+PuqRweESGv2Goi9dXFNYRfLK0/ZPM7dd75dTY+YyIEu6hYAQt3rBv9b3QVM=",
    "Connection": "keep-alive",
    # The cookie string is used as-is; no formatting wrapper is needed.
    "Cookie": cookie,
    "Host": "index.baidu.com",
    "Referer": "https://index.baidu.com/v2/main/index.html",
    "sec-ch-ua": '"Google Chrome";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
}
get_data.py
import requests
from header import header
from url_pool import url_get_data, url_uniqid
import json
from init import keyword_list, start_date, end_date
def get_data(timeout=10):
    """Fetch the encrypted series and its decryption key for every keyword.

    For each entry of ``keyword_list`` two GET requests are issued: one to
    ``url_get_data`` for the encrypted series and its ``uniqid``, and one to
    ``url_uniqid + uniqid`` for the matching key.

    Args:
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list with one ``{"data": ..., "key": ...}`` dict per keyword, in
        ``keyword_list`` order.

    Raises:
        requests.RequestException: On a timeout, connection failure, or a
            non-success HTTP status.
    """
    data_list = []
    for keyword in keyword_list:
        # Serialise with json.dumps so quotes or backslashes inside a keyword
        # are escaped; for plain keywords the text is identical to the
        # compact hand-written form.
        word = json.dumps(
            [[{"name": keyword, "wordType": 1}], []],
            ensure_ascii=False,
            separators=(",", ":"),
        )
        params = {
            "area": "0",
            "word": word,
            "startDate": start_date,
            "endDate": end_date,
        }
        # Fetch the encrypted series.
        res = requests.get(
            url=url_get_data, params=params, headers=header, timeout=timeout
        )
        res.raise_for_status()
        print(res.text)
        print("-------------")
        data_json = json.loads(res.text)
        data = data_json["data"]["userIndexes"][0]["all"]["data"]
        print(data)
        # Resolve the key that belongs to this response.
        uniqid = data_json["data"]["uniqid"]
        print("密匙id是:" + uniqid)
        res = requests.get(url=url_uniqid + uniqid, headers=header, timeout=timeout)
        res.raise_for_status()
        key = json.loads(res.text)["data"]
        print("密鑰是:", key)
        data_list.append({"data": data, "key": key})
    return data_list
do_data.py
import datetime
from get_data import get_data
# Decryption helper
def decryption(keys, data):
    """Decode ``data`` with the substitution table described by ``keys``.

    The first half of ``keys`` lists the cipher symbols and the second half
    lists, position by position, the symbol each one stands for. When
    ``keys`` has an odd length the final element is unused.

    Args:
        keys: Sequence holding the cipher symbols followed by their
            replacements.
        data: Iterable of cipher symbols to decode.

    Returns:
        The decoded symbols concatenated into one string.

    Raises:
        KeyError: If ``data`` contains a symbol missing from the first half
            of ``keys``.
    """
    half = len(keys) // 2
    # zip stops at the shorter side, so a trailing odd element is ignored.
    dec_dict = dict(zip(keys[:half], keys[half:]))
    return ''.join(dec_dict[symbol] for symbol in data)
# Decrypt the fetched data and arrange it as [[...], [...], ...]
def do_data():
    """Decrypt every entry returned by ``get_data``.

    Returns:
        A list with one single-element list per fetched entry; that element
        is the string produced by ``decryption`` for the entry's ``"key"``
        and ``"data"`` values.
    """
    return [
        [decryption(keys=entry["key"], data=entry["data"])]
        for entry in get_data()
    ]
url_pool.py
import requests
from header import header
from urllib.parse import urlencode
from init import start_date, end_date, keyword_list
# Scheme and host shared by both endpoints below.
_BAIDU_INDEX_ROOT = "https://index.baidu.com"
# Endpoint for the index search request; the trailing "?" is part of the value.
url_get_data = _BAIDU_INDEX_ROOT + "/api/SearchApi/index?"
# Endpoint for the key lookup; the uniqid value is appended directly to it.
url_uniqid = _BAIDU_INDEX_ROOT + "/Interface/ptbk?uniqid="
main.py
from init import keyword_list, end_date, start_date
from do_data import do_data
import datetime
if __name__ == '__main__':
    # Parse the configured range; both end points appear in the header row.
    first_day = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    last_day = datetime.datetime.strptime(end_date, '%Y-%m-%d')

    # Fetch and decrypt before the output file is opened, so a failed
    # request does not leave a partially written table behind.
    dec_data_list = do_data()

    # One label per calendar day; the list is empty when the range is reversed.
    day_total = (last_day - first_day).days + 1
    day_labels = [
        (first_day + datetime.timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in range(day_total)
    ]

    # NOTE(review): "a+" appends, so running the script again adds a second
    # table to an existing merge.csv -- confirm whether overwrite is wanted.
    with open("merge.csv", "a+", encoding="utf-8") as fp:
        # Header row: an empty leading cell (above the keyword column)
        # followed by the dates. Every cell keeps its trailing comma.
        if day_labels:
            fp.write(",")
        for label in day_labels:
            print(label)
            fp.write(label + ",")
        fp.write("\n")

        # One row per keyword: the keyword, then its decrypted values.
        for key, series in zip(keyword_list, dec_data_list):
            fp.write(key + ",")
            for data in series:
                fp.write(data + ",")
            fp.write("\n")
更多内容敬请关注 https://github.com/faithererer,给颗小星星* v *
标签: