A Python Web Crawler Demo: Scraping a Movie Website
1. Project Overview
This project uses Python to scrape movie data from Douban and store it in a local database.
Database schema:
create table if not exists `categories` (
    `id` int(11) NOT NULL PRIMARY KEY,
    `type` varchar(255) NOT NULL DEFAULT ''
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

create table if not exists `movies` (
    `id` int(11) NOT NULL PRIMARY KEY AUTO_INCREMENT,
    `cover` varchar(255) NOT NULL DEFAULT '',
    `title` varchar(50) NOT NULL DEFAULT '',
    `date` varchar(10) NOT NULL DEFAULT '',
    `rate` float DEFAULT 0,
    `director` varchar(100) NOT NULL DEFAULT '',
    `scriptwriter` varchar(100) NOT NULL DEFAULT '',
    `actors` text,
    `district` varchar(255) DEFAULT '',
    `language` varchar(255) DEFAULT '',
    `duration` varchar(100) DEFAULT '',
    `abs` text,
    UNIQUE (`title`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

create table if not exists `movie-category` (
    `id` BIGINT NOT NULL PRIMARY KEY AUTO_INCREMENT,
    `mid` int(11) NOT NULL,
    `cid` int(11) NOT NULL,
    KEY `fk_on_movie_id` (`mid`),
    CONSTRAINT `fk_on_movie_id` FOREIGN KEY (`mid`) REFERENCES `movies` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
    KEY `fk_on_category_id` (`cid`),
    CONSTRAINT `fk_on_category_id` FOREIGN KEY (`cid`) REFERENCES `categories` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT INTO `categories` VALUES (1,'剧情');
INSERT INTO `categories` VALUES (2,'喜剧');
INSERT INTO `categories` VALUES (3,'动作');
INSERT INTO `categories` VALUES (4,'爱情');
INSERT INTO `categories` VALUES (5,'科幻');
INSERT INTO `categories` VALUES (6,'动画');
INSERT INTO `categories` VALUES (7,'悬疑');
INSERT INTO `categories` VALUES (8,'惊悚');
INSERT INTO `categories` VALUES (9,'恐怖');
INSERT INTO `categories` VALUES (10,'犯罪');
INSERT INTO `categories` VALUES (11,'同性');
INSERT INTO `categories` VALUES (12,'音乐');
INSERT INTO `categories` VALUES (13,'歌舞');
INSERT INTO `categories` VALUES (14,'传记');
INSERT INTO `categories` VALUES (15,'历史');
INSERT INTO `categories` VALUES (16,'战争');
INSERT INTO `categories` VALUES (17,'西部');
INSERT INTO `categories` VALUES (18,'奇幻');
INSERT INTO `categories` VALUES (19,'冒险');
INSERT INTO `categories` VALUES (20,'灾难');
INSERT INTO `categories` VALUES (21,'武侠');
INSERT INTO `categories` VALUES (22,'情色');
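As a quick sanity check, the snippet below (a minimal sketch, not part of the project code) connects to the local MySQL instance and lists the tables; it assumes the schema above has already been imported into a database named fivesix, the name used later in pipelines.py.

import pymysql

# Assumes MySQL 5.7 running locally with the fivesix database created from the schema above
conn = pymysql.connect(host='127.0.0.1', user='root', password='root',
                       db='fivesix', charset='utf8', port=3306)
with conn.cursor() as cur:
    cur.execute("SHOW TABLES")
    print(cur.fetchall())  # expected: categories, movie-category, movies
conn.close()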
2. Environment
Language: Python 3.7 + Scrapy (the code also relies on pymysql and fake_useragent)
Database: MySQL 5.7
Development tools: IDEA or Eclipse
3. Core Code
Data model: items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Movie title
    title = scrapy.Field()
    # Director
    director = scrapy.Field()
    # Scriptwriter
    scriptwriter = scrapy.Field()
    # Actors
    actors = scrapy.Field()
    # Release date
    date = scrapy.Field()
    # Rating
    rate = scrapy.Field()
    # Country/region
    district = scrapy.Field()
    # Language
    language = scrapy.Field()
    # Cover image
    cover = scrapy.Field()
    # Synopsis
    abs = scrapy.Field()
    # Categories (genres)
    categories = scrapy.Field()
    # Duration
    duration = scrapy.Field()
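For illustration only (this snippet is not part of the project code): a scrapy.Item behaves much like a dict, which is how the spider and pipelines below use it. It assumes it is run from the Scrapy project root so the douban package is importable.

from douban.items import DoubanItem

item = DoubanItem()
item['title'] = 'Example Movie'  # assigning a field that is not declared raises KeyError
item['rate'] = '8.5'
print(dict(item))                # an Item converts cleanly to a plain dict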
數(shù)據(jù)存儲(chǔ)工具定義:pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from scrapy.exceptions import DropItem
from scrapy.http import Request
from scrapy.pipelines.images import ImagesPipeline
import pymysql
import random


class DoubanPipeline:
    def process_item(self, item, spider):
        return item


# Re-request each scraped cover URL and download the image locally
class DownloadImagePipeline(ImagesPipeline):
    default_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
        #"Cookie":'_vwo_uuid_v2=D65EBF690D9454DE4C13354E37DC5B9AA|3bb7e6e65f20e31141b871b4fea88dc2; __yadk_uid=QBp8bLKHjCn5zS2J5r8xV7327R0wnqkU; douban-fav-remind=1; gr_user_id=0a41d8d1-fe39-4619-827a-17961cf31795; viewed="35013197_10769749_23008813_26282806_34912177_22139960_35003794_30249691_26616244_27035127"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.21320; bid=gplG4aEN4Xc; ll="108288"; ap_v=0,6.0; __utma=30149280.819011260.1572087992.1604448803.1604453561.105; __utmc=30149280; __utmz=30149280.1604453561.105.65.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __gads=ID=eddb65558a1da756-223ab4f88bc400c8:T=1604453562:RT=1604453562:S=ALNI_MZGB_I69qmiL2tt3lm57JVX1i4r2w; __utmb=30149280.4.10.1604453561; dbcl2="213202515:Ip9mjwUAab4"; ck=wxUS; __utma=223695111.897479705.1572088003.1604448803.1604455298.71; __utmb=223695111.0.10.1604455298; __utmc=223695111; __utmz=223695111.1604455298.71.42.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1604455298%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; _pk_id.100001.4cf6=e11874c5506d4ab1.1572088003.71.1604455342.1604450364.'
    }

    def get_media_requests(self, item, info):
        image_url = item['cover']
        yield Request(
            image_url,
            headers=self.default_headers)

    # Runs after the requests issued by get_media_requests have completed
    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        # The returned path has the form "full/<filename>". Because each item is downloaded
        # as it is crawled there is only ever one image here, even though the API returns a
        # list (it is designed for multiple images). "full" is replaced with the backend
        # file endpoint so the URL stored in the database can be served directly.
        image_paths = str(image_paths[0]).replace('full', 'http://localhost:8443/api/file')
        item['cover'] = image_paths
        return item


# Store the movie info in the database
class DBPipeline(object):
    def __init__(self):
        # Connect to the database: host, user, password, database name
        self.connect = pymysql.connect(host='127.0.0.1', user='root', password='root',
                                       db='fivesix', charset='utf8', port=3306)
        # Get cursors
        self.cursor_1 = self.connect.cursor()
        self.cursor_2 = self.connect.cursor()
        self.type_to_id = {
            '剧情': 1, '喜剧': 2, '动作': 3,
            '爱情': 4, '科幻': 5, '动画': 6,
            '悬疑': 7, '惊悚': 8, '恐怖': 9,
            '犯罪': 10, '同性': 11, '音乐': 12,
            '歌舞': 13, '传记': 14, '历史': 15,
            '战争': 16, '西部': 17, '奇幻': 18,
            '冒险': 19, '灾难': 20, '武侠': 21, '情色': 22
        }
        print("Database connection established")

    def process_item(self, item, spider):
        if item['title'] == '':
            raise DropItem("Item has no title")
        # SQL statements
        insert_movie_sql = """
        insert ignore into `movies`(cover, title, director, scriptwriter, actors, district, rate, date, language, duration, abs)
        values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        insert_mc_sql = """
        insert into `movie-category` (mid, cid) values (%s, %s)
        """
        # Insert the movie record
        self.cursor_1.execute(insert_movie_sql, (item['cover'], item['title'], item['director'], item['scriptwriter'],
                                                 item['actors'], item['district'], item['rate'],
                                                 item['date'], item['language'], item['duration'], item['abs']))
        if self.cursor_1.rowcount == 0:
            # insert ignore skipped a duplicate title, so there is no new row to link
            return item
        mid = self.cursor_1.lastrowid
        # Map genre names to category ids
        cids = []
        categories = item['categories'].split('/')
        for c in categories:
            if c not in self.type_to_id:
                continue
            cids.append(self.type_to_id.get(c))
        # Insert the movie-category relations
        print(cids)
        for cid in cids:
            self.cursor_2.execute(insert_mc_sql, (mid, cid))
        # Commit; without a commit nothing is persisted
        self.connect.commit()
        return item

    def close_spider(self, spider):
        # Close the cursors and the connection
        self.cursor_1.close()
        self.cursor_2.close()
        self.connect.close()
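For these pipelines to run they must be registered in settings.py (as the header comment above notes); ImagesPipeline additionally requires Pillow and an IMAGES_STORE directory. A minimal sketch, assuming the project package is named douban (as in the spider's imports below); the priority numbers are just one reasonable choice:

# settings.py (sketch)
ITEM_PIPELINES = {
    'douban.pipelines.DownloadImagePipeline': 300,  # download the cover first
    'douban.pipelines.DBPipeline': 400,             # then write the item to MySQL
}
IMAGES_STORE = './images'  # ImagesPipeline stores files under <IMAGES_STORE>/full/
ROBOTSTXT_OBEY = False     # Scrapy obeys robots.txt by default; adjust as appropriate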
Spider core code: movies.py
# -*- coding: utf-8 -*-
import scrapy
import json
import re
import time
from douban.items import DoubanItem
from fake_useragent import UserAgent
import random


class MovieHotSpider(scrapy.Spider):
    # Spider name, so it can be run conveniently from the command line
    name = "movie_hot"
    allowed_domains = ["movie.douban.com"]
    #pro = ['139.224.37.83','115.223.7.110','221.122.91.75']

    # Template for the Douban movie list API URL
    BASE_URL = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%s&sort=recommend&page_limit=%s&page_start=%s'
    MOVIE_TAG = '华语'
    PAGE_LIMIT = 20
    page_start = 0
    domains = BASE_URL % (MOVIE_TAG, PAGE_LIMIT, page_start)

    # Pretend to be a browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
        #,"Cookie":'_vwo_uuid_v2=D65EBF690D9454DE4C13354E37DC5B9AA|3bb7e6e65f20e31141b871b4fea88dc2; __yadk_uid=QBp8bLKHjCn5zS2J5r8xV7327R0wnqkU; douban-fav-remind=1; gr_user_id=0a41d8d1-fe39-4619-827a-17961cf31795; viewed="35013197_10769749_23008813_26282806_34912177_22139960_35003794_30249691_26616244_27035127"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.21320; bid=gplG4aEN4Xc; ll="108288"; ap_v=0,6.0; __utma=30149280.819011260.1572087992.1604448803.1604453561.105; __utmc=30149280; __utmz=30149280.1604453561.105.65.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __gads=ID=eddb65558a1da756-223ab4f88bc400c8:T=1604453562:RT=1604453562:S=ALNI_MZGB_I69qmiL2tt3lm57JVX1i4r2w; __utmb=30149280.4.10.1604453561; dbcl2="213202515:Ip9mjwUAab4"; ck=wxUS; __utma=223695111.897479705.1572088003.1604448803.1604455298.71; __utmb=223695111.0.10.1604455298; __utmc=223695111; __utmz=223695111.1604455298.71.42.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1604455298%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; _pk_id.100001.4cf6=e11874c5506d4ab1.1572088003.71.1604455342.1604450364.'
    }

    # Total number of list pages to crawl
    pages = 100

    # The spider starts here
    def start_requests(self):
        print('~~~~ crawling list: ' + self.domains)
        yield scrapy.Request(
            url=self.domains,
            headers=self.headers,
            callback=self.request_movies
        )

    # Parse a list page
    def request_movies(self, response):
        infos = response.text
        # Parse the JSON response
        infos = json.loads(infos)
        # Iterate over the movie entries
        for movie_info in infos['subjects']:
            print('~~~ crawling movie: ' + movie_info['title'] + '/' + movie_info['rate'])
            # Extract the detail page URL and request it; the detail page is handled by request_movie
            yield scrapy.Request(
                url=str(movie_info['url']),
                headers=self.headers,
                callback=self.request_movie,
                dont_filter=True
            )
        # Stop once the configured number of pages has been crawled or the tag has no more movies
        if self.pages > 0 and len(infos['subjects']) == self.PAGE_LIMIT:
            self.pages -= 1
            self.page_start += self.PAGE_LIMIT
            url = self.BASE_URL % (self.MOVIE_TAG, self.PAGE_LIMIT, self.page_start)
            time.sleep(5)
            print('----- crawling list: ' + url)
            yield scrapy.Request(
                url=url,
                headers=self.headers,
                callback=self.request_movies,
                dont_filter=True
            )

    # Parse a detail page
    def request_movie(self, response):
        # Assemble the item
        movie_item = DoubanItem()
        # Fields outside the #info block
        title = response.css('div#content>h1>span:nth-child(1)::text').extract_first()
        # Keep only Chinese characters, digits and common Chinese punctuation from the title
        t = re.findall('[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5_0-9]', title)
        movie_item['title'] = ''.join(t)
        movie_item['date'] = response.css('div#content>h1>span.year::text').extract_first()[1:-1]
        movie_item['rate'] = response.css('strong.rating_num::text').extract_first()
        #movie_item['commentCount'] = response.css('div.rating_sum>a.rating_people>span::text').extract_first()
        #movie_item['start'] = '/'.join(response.css('span.rating_per::text').extract())
        #movie_item['better'] = '/'.join(response.css('div.rating_betterthan>a::text').extract())
        movie_item['abs'] = response.css('#link-report>span::text').extract_first().strip()
        movie_item['cover'] = response.css('#mainpic>a>img::attr(src)').extract_first()
        # Get the whole #info block as one string
        info = response.css('div.subject div#info').xpath('string(.)').extract_first()
        # Extract all field labels
        fields = [s.strip().replace(':', '') for s in response.css('div#info span.pl::text').extract()]
        # Extract all field values
        values = [re.sub(r'\s+', '', s.strip()) for s in re.split(r'\s*(?:%s):\s*' % '|'.join(fields), info)][1:]
        # Map the Chinese field labels to item field names
        for i in range(len(fields)):
            if '导演' == fields[i]:
                fields[i] = 'director'
            if '编剧' == fields[i]:
                fields[i] = 'scriptwriter'
            if '主演' == fields[i]:
                fields[i] = 'actors'
            if '类型' == fields[i]:
                fields[i] = 'categories'
            if '制片国家/地区' == fields[i]:
                fields[i] = 'district'
            if '语言' == fields[i]:
                fields[i] = 'language'
            if '片长' == fields[i]:
                fields[i] = 'duration'
        # Fill the remaining fields into the item, skipping labels that are not stored
        other_info = list(zip(fields, values))
        for field, value in other_info:
            if field in ['IMDb链接', '上映日期', '官方网站', '又名']:
                other_info.remove((field, value))
        final_info = dict(other_info[:-1])
        movie_item.update(final_info)
        # Use a placeholder for any missing field
        if 'director' not in movie_item.keys():
            movie_item['director'] = '/'
        if 'scriptwriter' not in movie_item.keys():
            movie_item['scriptwriter'] = '/'
        if 'actors' not in movie_item.keys():
            movie_item['actors'] = '/'
        if 'categories' not in movie_item.keys():
            movie_item['categories'] = '/'
        if 'district' not in movie_item.keys():
            movie_item['district'] = '/'
        if 'language' not in movie_item.keys():
            movie_item['language'] = '/'
        if 'duration' not in movie_item.keys():
            movie_item['duration'] = '/'
        print('~ finished movie: ' + movie_item['title'] + '/' + movie_item['rate'])
        # Hand the item over to the pipelines
        yield movie_item
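To start the crawl, run scrapy crawl movie_hot from the project root, or use a small launcher script so it can be started from the IDE (a sketch; run.py is a hypothetical file name):

# run.py (sketch) -- start the movie_hot spider programmatically
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'movie_hot'])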
4. Summary
The scraped data ends up in the MySQL tables, and a separate program can be written to display it. Note that Douban rate-limits by IP: after roughly 200 requests the IP may be blocked, in which case switching to another IP lets the crawl continue. The project focuses on the basic conventions and syntax of web crawling; it is relatively simple and is offered for learning and reference.
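Besides the time.sleep(5) in the spider, Scrapy's own throttling settings can reduce the chance of an IP ban. A hedged settings.py sketch; the values are illustrative, not tuned for Douban:

# settings.py (sketch) -- crawl politely to reduce the risk of IP blocking
DOWNLOAD_DELAY = 3           # seconds between requests to the same domain
CONCURRENT_REQUESTS = 4      # keep concurrency low
AUTOTHROTTLE_ENABLED = True  # let Scrapy adapt the delay to server latency
RETRY_TIMES = 2              # back off quickly instead of hammering a blocked IP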