商品——采集和分析
import requests,os
from lxml import etree
class Spider:
? ?def __init__(self):
? ? ? ?self.__site="http://39.106.48.176/"
? ? ? ?self.__file=f"{os.getcwd()}/goods.txt"
? ? ? ?self.__maxID=1277
? ? ? ?pass
? ?
? ?def __getData(self):
? ? ? ?header={
? ? ? ? ? ?"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
? ? ? ?}
? ? ? ?
? ? ? ?base=self.__site "/index.php?s=/index/goods/index/id/{}.html"
? ? ? ?
? ? ? ?file=open(self.__file,"a ",encoding="utf8")
? ? ? ?for num in range(1,self.__maxID 1):
? ? ? ? ? ?response=requests.get(base.format(num),headers=header)
? ? ? ? ? ?if response.status_code==200:
? ? ? ? ? ? ? ?if '資源不存在或已被刪除' not in response.text:
? ? ? ? ? ? ? ? ? ?html=etree.HTML(response.text)
? ? ? ? ? ? ? ? ? ?titles=html.xpath('/html/body/div[4]/div[2]/div[2]/div[1]/h1')
? ? ? ? ? ? ? ? ? ?title=titles[0].text.strip()
? ? ? ? ? ? ? ? ? ?prices=html.xpath('//b[@class="goods-price"]')
? ? ? ? ? ? ? ? ? ?price=prices[0].text
? ? ? ? ? ? ? ? ? ?counts=html.xpath('//span[@class="tm-count"]/text()')
? ? ? ? ? ? ? ? ? ?view=counts[1]
? ? ? ? ? ? ? ? ? ?sale=counts[0]
? ? ? ? ? ? ? ? ? ?stocks=html.xpath('//span[@class="stock"]/text()')
? ? ? ? ? ? ? ? ? ?stock=stocks[0]
? ? ? ? ? ? ? ? ? ?data=f"{num},{title},{price},{view},{sale},{stock}\n"
? ? ? ? ? ? ? ? ? ?file.write(data)
? ? ? ? ? ?else:
? ? ? ? ? ? ? ?continue
? ? ? ?file.close()
? ?def run(self):
? ? ? ?self.__getData()
if __name__=="__main__":
? ?spider=Spider()
? ?spider.run()
——————————————————————————————————————
create database if not exists shopxo;
use shopxo;
create table if not exists goods (
? ? id int,
? ? title string,
? ? price double,
? ? views int,
? ? sales int,
? ? stock int
) row format delimited
fields terminated by ','
;
load data local inpath '/root/college020/goods.txt' overwrite into table goods;
select * from goods limit 10;
insert overwrite local directory '/root/college023/'
row format delimited fields terminated by '\t'
select * from goods
where price is null
;
create table goods1 as
select * from goods
where title not like '%連衣裙%' or title not like '%女士%' and
price is not null
;
insert overwrite local directory '/root/college024/'
row format delimited fields terminated by '\t'
select count(*) from goods1;
insert overwrite local directory '/root/college025/'
row format delimited fields terminated by '\t'
select distinct title,price from goods1
order by price desc
limit 3
;
insert overwrite local directory '/root/college026/'
row format delimited fields terminated by '\t'
select split(title," ")[0] as brand,count(*) as brand_count from goods1
group by split(title," ")[0]
order by brand_count desc
limit 10
;
insert overwrite local directory '/root/college027/'
row format delimited fields terminated by '\t'
select perona,count(*) as p_count from goods1
lateral view explode(split(title," "))brand as perona
where title rlike ""
group by perona
order by p_count desc
limit 6
;