Python超強爬蟲8天速成(完整版)爬取各種網站數據實戰案例

import os
import re
from urllib.parse import urljoin

import requests
from lxml import etree

# Assignment: crawl the free resume templates from sc.chinaz.com page by page
# and save every template archive into SAVE_DIR.

HEADERS = {
    'User-Agent': '===='
}
# The first listing page has no page number in its URL ...
FIRST_PAGE_URL = 'https://sc.chinaz.com/jianli/free.html'
# ... later pages do, e.g. https://sc.chinaz.com/jianli/free_2.html
PAGE_URL_TEMPLATE = 'https://sc.chinaz.com/jianli/free_{}.html'
# Folder that receives the downloaded templates.
SAVE_DIR = './jianliLibs'
# Seconds to wait for the server; requests waits indefinitely without it.
REQUEST_TIMEOUT = 10
# Characters that cannot (or should not) appear in a file name; a page title
# containing one of them would otherwise break open() or escape SAVE_DIR.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def _page_url(page_num):
    """Return the listing URL for the 1-based page number ``page_num``."""
    if page_num == 1:
        return FIRST_PAGE_URL
    return PAGE_URL_TEMPLATE.format(page_num)


def _get(url):
    """GET ``url`` with the shared headers and timeout.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            4xx/5xx status (so an error page is never parsed or saved).
    """
    response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def _fetch_tree(url):
    """Download ``url`` and return it parsed as an lxml HTML tree.

    Returns None when the response body is empty or cannot be parsed into
    a tree.
    """
    response = _get(url)
    response.encoding = 'utf-8'  # force UTF-8 so Chinese text is not garbled
    page_text = response.text
    if not page_text.strip():
        return None
    return etree.HTML(page_text)


def _first(results):
    """Return the first XPath result, or None when nothing matched."""
    return results[0] if results else None


def _download_template(detail_url):
    """Download the template offered on one detail page into SAVE_DIR.

    Args:
        detail_url: absolute URL of the template's detail page.

    Returns:
        The file name the template was saved under, or None when the page
        does not contain a download link or a usable title.

    Raises:
        requests.RequestException: if a request fails.
        OSError: if the file cannot be written.
    """
    detail_tree = _fetch_tree(detail_url)
    if detail_tree is None:
        return None

    # Download link of the template (first entry of the download list).
    target_href = _first(
        detail_tree.xpath('//ul[@class="clearfix"]/li[1]/a/@href'))
    # Title of the template, used as the file name.
    title = _first(
        detail_tree.xpath('//div[@class="ppt_tit clearfix"]/h1/text()'))
    if target_href is None or title is None:
        return None

    safe_title = _UNSAFE_FILENAME_CHARS.sub('_', title).strip()
    if not safe_title:
        return None
    target_name = safe_title + '.rar'

    # urljoin resolves relative and protocol-relative ("//host/...") links.
    archive = _get(urljoin(detail_url, target_href)).content
    with open(os.path.join(SAVE_DIR, target_name), 'wb') as fp:
        fp.write(archive)
    return target_name


def main(first_page=1, last_page=2):
    """Crawl listing pages ``first_page``..``last_page`` (both inclusive).

    The defaults fetch pages 1 and 2. A template or page that cannot be
    fetched is reported and skipped instead of aborting the whole crawl.
    """
    # Create the output folder once, up front.
    os.makedirs(SAVE_DIR, exist_ok=True)

    for page_num in range(first_page, last_page + 1):
        list_url = _page_url(page_num)
        try:
            tree = _fetch_tree(list_url)
        except requests.RequestException as err:
            print('Skipping page {}: {}'.format(list_url, err))
            continue
        if tree is None:
            print('Skipping page {}: empty response'.format(list_url))
            continue

        # Every <a> under the container's <p> tags links to a detail page;
        # selecting @href directly ignores anchors without a link.
        for href in tree.xpath('//div[@id="container"]//p/a/@href'):
            detail_url = urljoin(list_url, href)
            try:
                saved_name = _download_template(detail_url)
            except (requests.RequestException, OSError) as err:
                print('Skipping {}: {}'.format(detail_url, err))
                continue
            if saved_name is None:
                print('Skipping {}: download link or title not found'
                      .format(detail_url))
                continue
            print(saved_name, '下載成功!')

        print('第{}頁(yè)簡(jiǎn)歷完成下載?。?!'.format(page_num))
        print('=========================================')
        print('\n')


if __name__ == '__main__':
    main()
標簽: