Heima Programmer (黑馬程序員) Python Crawler Basics: A Quick Start with the Scrapy Crawler Framework

# Goal: fetch the page source, then grab every <a> tag under the target <ul>
from lxml import etree
import re
import requests
import json
import os

domain = 'https://desk.zol.com.cn'
url = 'https://desk.zol.com.cn/'

resp = requests.get(url)
resp.encoding = 'gbk'   # the site is GBK-encoded
# print(resp.text)

et = etree.HTML(resp.text)
# hrefs of every wallpaper album on the list page
result = et.xpath('//ul[@class="pic-list2 clearfix"]/li/a/@href')
# print(result)

os.makedirs('img', exist_ok=True)   # make sure the output folder exists

for item in result:
    url = domain + item
    print(url)
    # url = 'https://desk.zol.com.cn/bizhi/10055_120350_2.html'
    resp = requests.get(url)
    # print(resp.text)

    # the picture list is embedded in a JS variable: var deskPicArr = {...};
    obj = re.compile(r'var deskPicArr.*?=(?P<deskPicArr>.*?);', re.S)
    result = obj.search(resp.text)
    deskPicArr = result.group('deskPicArr')
    # print(deskPicArr)

    dic = json.loads(deskPicArr)
    # print(dic)
    for item in dic['list']:
        oriSize = item.get('oriSize')
        imgsrc = item.get('imgsrc')
        # print(oriSize, imgsrc)
        # the URL template contains ##SIZE##; fill in the original size
        imgsrc = imgsrc.replace('##SIZE##', oriSize)
        print(imgsrc)

        # send the request for the image itself;
        # use resp.content here (bytes), not resp.text (decoded string)
        name = imgsrc.split('/')[-1]
        resp_img = requests.get(imgsrc)
        with open(f"img/{name}", mode="wb") as f:
            f.write(resp_img.content)
        # print(f)
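The title promises Scrapy, while the script above uses plain requests + lxml. As a minimal sketch only, here is roughly how the same zol.com.cn wallpaper scrape could be expressed as a Scrapy spider. The XPath, the deskPicArr regex, and the ##SIZE## substitution are taken from the script above; the class name, spider name, and yielded item fields are illustrative assumptions, not from the original post.

# Hypothetical Scrapy version of the same scrape (assumes Scrapy is installed).
import json
import re
import scrapy


class ZolDeskSpider(scrapy.Spider):
    name = "zol_desk"                       # illustrative spider name
    allowed_domains = ["desk.zol.com.cn"]
    start_urls = ["https://desk.zol.com.cn/"]

    def parse(self, response):
        # same XPath as the requests version: album links on the list page
        for href in response.xpath('//ul[@class="pic-list2 clearfix"]/li/a/@href').getall():
            yield response.follow(href, callback=self.parse_album)

    def parse_album(self, response):
        # pull the deskPicArr JS variable out of the album page
        m = re.search(r'var deskPicArr.*?=(?P<arr>.*?);', response.text, re.S)
        if not m:
            return
        for item in json.loads(m.group('arr'))['list']:
            img_url = item['imgsrc'].replace('##SIZE##', item['oriSize'])
            yield {"image_url": img_url, "name": img_url.split('/')[-1]}

You could run this with `scrapy runspider zol_desk.py -o urls.json` to collect the image URLs; to actually save the files, the yielded items would be fed to Scrapy's built-in ImagesPipeline instead of writing bytes by hand as in the requests version.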
標(biāo)簽: