# 元氣桌面單頁面爬取
# 糖醋慕雨的快樂時光
import json
import os
from urllib.parse import parse_qs, urlparse

import pandas as pd
import requests
from openpyxl import Workbook
from openpyxl import load_workbook
# Absolute path of the directory that contains this script.
current_directory = os.path.dirname(os.path.abspath(__file__))
# Absolute path of an "元氣" folder located next to the script.
douyin_folder = os.path.join(current_directory, "元氣")
# Make sure the output folder exists.  It is created relative to the current
# working directory (not at douyin_folder).  exist_ok=True replaces the
# separate exists() check, which could race with another process creating it.
os.makedirs("元氣", exist_ok=True)
# Spreadsheet that receives one row per scraped work.
file_path_xlsm = "yuan.xlsm"
if not os.path.exists(file_path_xlsm):
    print('not exists')
    # First run: write an empty workbook so that it can be loaded below.
    # NOTE(review): a macro-free workbook is stored under an .xlsm name;
    # confirm that Excel accepts the resulting file.
    Workbook().save(file_path_xlsm)
else:
    print('exists')
wb = load_workbook(file_path_xlsm)
ws = wb.active
# Header row, rewritten on every run; column order matches the data rows.
header_titles = (
    "作者名",  # author name
    "作品名",  # work title
    "作品圖片鏈接-封面",  # cover image URL
    "作品視頻鏈接",  # video URL
    "作者主頁鏈接",  # author homepage URL
    "4k視頻鏈接",  # 4K video URL
    "作品上傳時間",  # upload time
)
for column_index, title in enumerate(header_titles, start=1):
    ws.cell(row=1, column=column_index, value=title)
def _extract_wid(page_url):
    """Return the integer wallpaper id contained in *page_url*.

    The id is taken from the ``dwid`` query parameter, so additional
    parameters or a fragment following it do not break the conversion.
    When the URL has no ``dwid`` parameter, the text after the first ``=``
    is used instead.

    Raises:
        IndexError: if *page_url* contains no ``=`` at all.
        ValueError: if the extracted value is not an integer.
    """
    dwid_values = parse_qs(urlparse(page_url.strip()).query).get('dwid')
    if dwid_values:
        return int(dwid_values[0])
    return int(page_url.split('=')[1])


# Ask for the wallpaper page URL and pull the numeric id out of it.
dwid = _extract_wid(input('請輸入url地址:'))
# JSON payload posted to the detail endpoint.
detail_params = {
    'wid': dwid,
    'common': {'player_version': 0},
}
# Browser-like request headers.
# NOTE(review): 'br' is advertised in accept-encoding; confirm the runtime can
# decode Brotli responses (requests needs a brotli package for that).
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'zh-CN,zh;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
}
# Endpoint that returns the details of a single live wallpaper.
detail_url = 'https://pcwallpaper.zhhainiao.com/wallpaper/live/detail'
def scrape_Main(url, headers, detail_params, timeout=30):
    """POST *detail_params* as JSON to *url* and return the response text.

    Args:
        url: Endpoint to post to.
        headers: HTTP headers sent with the request.
        detail_params: JSON-serialisable request payload.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with status 200,
        otherwise ``None``.  Network-level failures (connection errors,
        timeouts, ...) are reported on stdout and also yield ``None``.
    """
    # The request itself has to be inside the try block; otherwise the
    # RequestException handler can never be reached.
    try:
        response = requests.post(
            url, headers=headers, json=detail_params, timeout=timeout
        )
    except requests.RequestException:
        print('請求失敗')
        return None
    if response.status_code != 200:
        return None
    # Decode the body with the encoding detected from its content.
    response.encoding = response.apparent_encoding
    return response.text
html = scrape_Main(detail_url, headers, detail_params)
# scrape_Main returns None when it has no response body to hand back; stop
# with a readable message instead of letting json.loads(None) raise TypeError.
if html is None:
    raise SystemExit('請求失敗: 無法獲取作品詳情')
data = json.loads(html)['data']
preview_video = data['preview_video']  # video URL
update_time = data['update_time']  # update (upload) time
preview_jpg = data['preview_jpg'].split('?')[0]  # image URL without its query string
author = 'https://wp.cheetahfun.com/personal/author?author_uid=' + str(data['author_uid'])  # author homepage URL
video_4k = data['video_4k']  # presumably the 4K video URL; the original author could not open it
author_name = data['author_name']  # author name
wname = data['wname']  # work title
# Append the scraped record on the row after the last used one.
last_row = ws.max_row
record = (
    author_name,  # author name
    wname,  # work title
    preview_jpg,  # image URL
    preview_video,  # video URL
    author,  # author homepage URL
    video_4k,  # 4K video URL
    update_time,  # upload time
)
for column_index, value in enumerate(record, start=1):
    ws.cell(row=last_row + 1, column=column_index, value=value)
# Plain-text summary of this record; the file is overwritten on every run.
with open('yuan.txt', 'w', encoding='utf-8') as f:
    f.write(
        "作者名:{}\n作品名:{}\n作者主頁:{}\n作品視頻:{}\n圖片鏈接:{}\n作品上傳時間:{}\n".format(
            author_name, wname, author, preview_video, preview_jpg, update_time
        )
    )
print(' {}:  數據已保存在yuan.txt文件下'.format(wname))
# Keep the header row visible while scrolling.
ws.freeze_panes = 'A2'
wb.save(file_path_xlsm)
# wname comes from the server response, so it must not be trusted as a path
# component: replace path separators and other characters that are invalid in
# Windows file names, and drop trailing dots/spaces (this also neutralises
# "." and "..").
unsafe_chars = str.maketrans({char: '_' for char in '\\/:*?"<>|'})
safe_name = wname.translate(unsafe_chars).rstrip(' .') or 'untitled'
# Per-work output folder "元氣/<name>", built with os.path.join so the
# separator matches the platform.
douyin_folder = os.path.join("元氣", safe_name)
os.makedirs(douyin_folder, exist_ok=True)


def _download(url, target_path):
    """Save the body served at *url* to *target_path*.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            that an error page is never stored as media.
    """
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(target_path, mode='wb') as f:
        f.write(response.content)


_download(preview_video, os.path.join(douyin_folder, safe_name + '.mp4'))
print(wname + ': 視頻下載完成')
_download(preview_jpg, os.path.join(douyin_folder, safe_name + '.jpg'))
print(wname + ': 圖片下載完成')
print('作品: {} 保存成功'.format(wname))
# Remove duplicate rows: read the sheet into a DataFrame, drop rows that are
# identical in every column, and write the result back over the same file.
df = pd.read_excel(file_path_xlsm)
df_deduplicated = df.drop_duplicates()
df_deduplicated.to_excel(file_path_xlsm, index=False)
# The file was rewritten by pandas, so re-open it with openpyxl and freeze
# the header row again before saving.
wb = load_workbook(file_path_xlsm)
ws = wb.active
ws.freeze_panes = 'A2'
wb.save(file_path_xlsm)
print('數據去重成功')
# 測試: https://wp.cheetahfun.com/personal/wallpaper?dwid=180048