pandas數(shù)據(jù)分析(合集)

我把pandas和老師講的爬蟲結(jié)合起來了。
爬蟲小技巧
如果要爬很多二級(jí)頁(yè)面,可以先用 break 將 for 循環(huán)提前退出,小規(guī)模測(cè)試成功后再抓取全部頁(yè)面。
import pandas as pd
import requests
from lxml import etree
def _first(nodes, default=''):
    """Return the first item of an xpath result list, or *default* when empty.

    Detail pages vary between listings; a missing node would otherwise raise
    IndexError on the bare ``[0]`` and abort the whole crawl.
    """
    return nodes[0] if nodes else default


url = 'http://cd.lianjia.com/ershoufang'
headers = {
    # Pretend to be a desktop browser; the site blocks the default UA.
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34"
}

# Fetch the listing page and collect one <li> per property.
res = requests.get(url=url, headers=headers).text
tree = etree.HTML(res)
li_list = tree.xpath(
    '//ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]')

detail_list = []
for li in li_list:
    # Follow the link to the second-level (detail) page.
    detail_url = _first(li.xpath('.//div[@class="title"]/a/@href'))
    if not detail_url:
        # Entry without a detail link (e.g. an ad slot) -- skip it.
        continue
    detail_res = requests.get(url=detail_url, headers=headers).text
    detail_tree = etree.HTML(detail_res)
    # NOTE(review): the absolute /html/body/... paths below are brittle and
    # break whenever the page layout changes -- confirm against a live page.
    item = {
        'title': _first(detail_tree.xpath('//div[@class="title"]/h1/@title')),
        'community': _first(detail_tree.xpath('/html/body/div[5]/div[2]/div[5]/div[1]/a[1]/text()')),
        'years': _first(detail_tree.xpath('/html/body/div[5]/div[2]/div[4]/div[3]/div[2]/text()')),
        'housetype': _first(detail_tree.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[1]/text()')),
        'square': _first(detail_tree.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[3]/text()')),
        'floor': _first(detail_tree.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li[2]/text()')),
        'taxtype': _first(detail_tree.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[5]/span[2]/text()')),
        'totalPrice': _first(detail_tree.xpath('//html/body/div[5]/div[2]/div[3]/div/span[1]/text()')),
        'unitPrice': _first(detail_tree.xpath('/html/body/div[5]/div[2]/div[3]/div/div[1]/div[1]/span/text()')),
        'followInfo': _first(detail_tree.xpath('//*[@id="favCount"]/text()')),
    }
    detail_list.append(item)

# One row per property; the default integer index is written as well.
df = pd.DataFrame(detail_list)
df.to_csv('./resource/data/house.csv')