Python爬蟲(chóng)幾個(gè)步驟教你寫(xiě)入mysql數(shù)據(jù)庫(kù)
Python爬蟲(chóng)幾個(gè)步驟教你寫(xiě)入mysql數(shù)據(jù)庫(kù)
Python爬蟲(chóng)實(shí)現(xiàn)爬取網(wǎng)站中的數(shù)據(jù)并存入MySQL數(shù)據(jù)庫(kù)中,在爬取的時(shí)候總要涉及到數(shù)據(jù)持久化存儲(chǔ),當(dāng)然有很多中存儲(chǔ)的方式,簡(jiǎn)單點(diǎn)的有excel、txt、json、csv等等。存入mysql我覺(jué)的有好多操作空間,如果是開(kāi)發(fā)python后端也可以熟悉一下sql語(yǔ)句,存入數(shù)據(jù)庫(kù)的方法也是試了些許網(wǎng)上一些方法,現(xiàn)在把完整功能供大家參考。
直接搜索?phpStudy安裝即可,按照下圖配置數(shù)據(jù)庫(kù)。用戶(hù)名密碼自行設(shè)置,然后返回首頁(yè)啟動(dòng)即可。
pip?install?pymysql
打開(kāi)剛安裝的phpstudy安裝一個(gè)mysql客戶(hù)端連接,數(shù)據(jù)庫(kù)是本地的host可以填 127.0.0.1 或 localhost用戶(hù)名密碼是上面設(shè)置的
MySQL創(chuàng)建對(duì)應(yīng)的表
CREATE?TABLE?`text_archives`??( ??`id`?int(10)?UNSIGNED?NOT?NULL?AUTO_INCREMENT?COMMENT?'ID', ??`url`?varchar(255)?CHARACTER?SET?utf8mb4?COLLATE?utf8mb4_unicode_ci?NOT?NULL?DEFAULT?''?COMMENT?'鏈接', ??`title`?varchar(255)?CHARACTER?SET?utf8mb4?COLLATE?utf8mb4_unicode_ci?NOT?NULL?DEFAULT?''?COMMENT?'標(biāo)題', ??`image`?varchar(255)?CHARACTER?SET?utf8mb4?COLLATE?utf8mb4_unicode_ci?NOT?NULL?DEFAULT?''?COMMENT?'圖片', ??`keywords`?varchar(255)?CHARACTER?SET?utf8mb4?COLLATE?utf8mb4_unicode_ci?NOT?NULL?COMMENT?'關(guān)鍵描述', ??`description`?varchar(600)?CHARACTER?SET?utf8mb4?COLLATE?utf8mb4_unicode_ci?NOT?NULL?DEFAULT?''?COMMENT?'內(nèi)容描述', ??`content`?text?CHARACTER?SET?utf8mb4?COLLATE?utf8mb4_unicode_ci?NULL?COMMENT?'內(nèi)容', ??`weigh`?int(10)?NOT?NULL?DEFAULT?0?COMMENT?'權(quán)重', ??`createtime`?bigint(16)?NOT?NULL?DEFAULT?0?COMMENT?'創(chuàng)建時(shí)間', ??`updatetime`?bigint(16)?NOT?NULL?DEFAULT?0?COMMENT?'更新時(shí)間', ??`deletetime`?bigint(16)?NULL?DEFAULT?NULL?COMMENT?'刪除時(shí)間', ??PRIMARY?KEY?(`id`)?USING?BTREE )?ENGINE?=?InnoDB?AUTO_INCREMENT?=?2692?CHARACTER?SET?=?utf8mb4?COLLATE?=?utf8mb4_unicode_ci?COMMENT?=?'內(nèi)容表'?ROW_FORMAT?=?Dynamic; SET?FOREIGN_KEY_CHECKS?=?1;
構(gòu)造 SQL 語(yǔ)句的字符串 sql ,然后通過(guò) cursor.excute(sql) 執(zhí)行,下面簡(jiǎn)單的封裝,直接復(fù)制即可用。
import?pymysql class?Mysql(object): ????def?__init__(self): ????????self._connect?=?pymysql.connect( ????????????host='127.0.0.1', ????????????user='test', ????????????password='######', ????????????database='test', ????????????charset='utf8mb4', ????????????cursorclass=pymysql.cursors.DictCursor ????????) ????????self._cursor?=?self._connect.cursor() ????def?inset_db(self,?table_name,?insert_data): ????????try: ????????????data?=?self.get_mysql_data(data=insert_data) ????????????fields?=?data[0] ????????????values?=?data[1] ????????????sql?=?"INSERT?INTO?{table_name}({fields})?values?({values})".format(table_name=table_name,?fields=fields, ????????????????????????????????????????????????????????????????????????????????values=values) ????????????self._cursor.execute(sql) ????????????self._connect.commit() ????????except?Exception?as?e: ????????????self._connect.rollback()??#?如果這里是執(zhí)行的執(zhí)行存儲(chǔ)過(guò)程的sql命令,那么可能會(huì)存在rollback的情況,所以這里應(yīng)該考慮到 ????????????print("數(shù)據(jù)插入失敗,失敗原因:",?e) ????????????print(insert_data) ????????else: ????????????#?self.db_close() ????????????return?self._cursor.lastrowid ????def?update_db(self,?table_name,?update_data,?wheres=None): ????????try: ????????????if?wheres?is?not?None: ????????????????sql?=?"UPDATE?{table_name}?SET?{update_data}?WHERE?{wheres}".format( ????????????????????table_name=table_name, ????????????????????update_data=update_data, ????????????????????wheres=wheres ????????????????) ????????????else: ????????????????sql?=?"UPDATE?{table_name}?SET?{update_data}".format( ????????????????????table_name=table_name, ????????????????????update_data=update_data) ????????????self._cursor.execute(sql) ????????????self._connect.commit() ????????except?Exception?as?e: ????????????print("更新失敗:",?e) ????????????return?False ????????else: ????????????#?self.db_close() ????????????return?True ????def?delete_db(self,?table_name,?wheres): ????????try: ????????????#?構(gòu)建sql語(yǔ)句 ????????????sql?=?"DELETE?FROM?{table_name}?WHERE?{wheres}".format(table_name=table_name,?wheres=wheres) ????????????self._cursor.execute(sql) ????????????self._connect.commit() ????????except?Exception?as?e: ????????????print('刪除失?。?#39;,?e) ????????????return?False ????????else: ????????????#?self.db_close() ????????????return?True ????def?select_db(self,?table_name,?fields,?wheres=None,?get_one=False): ????????try: ????????????if?wheres?is?not?None: ????????????????sql?=?"SELECT?{fields}?FROM?{table_name}?WHERE?{wheres}".format( ????????????????????fields=fields, ????????????????????table_name=table_name, ????????????????????wheres=wheres ????????????????) ????????????else: ????????????????sql?=?"SELECT?{fields}?FROM?{table_name}".format(fields=fields,?table_name=table_name) ????????????self._cursor.execute(sql) ????????????self._connect.commit() ????????????if?get_one: ????????????????result?=?self._cursor.fetchone() ????????????else: ????????????????result?=?self._cursor.fetchall() ????????except?Exception?as?e: ????????????print("查詢(xún)失敗",?e) ????????????return?None ????????else: ????????????#?self.db_close() ????????????return?result ????def?get_mysql_data(self,?data): ????????fields?=?"" ????????insert_data?=?"" ????????for?k,?v?in?data.items(): ????????????fields?=?fields?+?k?+?',' ????????????insert_data?=?insert_data?+?"'"?+?str(v)?+?"'"?+?',' ????????fields?=?fields.strip(',') ????????insert_data?=?insert_data.strip(',') ????????return?[fields,?insert_data] ????def?db_close(self): ????????self._cursor.close() ????????self._connect.close()
這次簡(jiǎn)單點(diǎn)咱們用xpath就行,有一個(gè)小技巧咱們?cè)谂廊〉木W(wǎng)頁(yè)打開(kāi)開(kāi)發(fā)都模式F12.如下圖紅框復(fù)制第一個(gè)或都第二個(gè)就行。
下面代碼是實(shí)現(xiàn)爬取數(shù)據(jù)然后存入數(shù)據(jù)庫(kù)類(lèi),大家可參考
from?model.nav?import?Nav import?requests from?urllib?import?parse from?lxml?import?etree from?fake_useragent?import?UserAgent from?lib.reptile?import?Reptile import?json class?Common(object): ????def?__init__(self,?params): ????????self.url?=?params['url'] ????????self.params?=?params ????????self.blog?=?1 ?? ????def?get_header(self): ????????ua?=?UserAgent() ????????headers?=?{ ????????????'User-Agent':?ua.random ????????} ????????return?headers ????def?get_html(self,?url): ????????#?在超時(shí)間內(nèi),對(duì)于失敗頁(yè)面嘗試請(qǐng)求三次 ????????if?self.blog?<=?3: ????????????try: ????????????????res?=?requests.get(url=url,?headers=self.get_header(),?timeout=3) ????????????????res.encoding?=?res.apparent_encoding ????????????????html?=?res.text ????????????????return?html ????????????except?Exception?as?e: ????????????????print(e) ????????????????self.blog?+=?1 ????????????????self.get_html(url) ????def?json_insert_data(self,?params): ??????category_id?=?self.insert_category(cname=params['category_name'],?pid=params['pid'],?icon='') ??????print("分類(lèi)插入成功:{}".format(params['category_name'])) ??????if?category_id: ??????????url?=?params['url'] ??????????title?=?params['title'] ??????????image?=?params['image'] ??????????description?=?params['description'] ??????????keywords?=?params['keywords'] ??????????content?=?params['content'] ??????????self.insert_archives(category_id,?url,?title,?image,?description,?keywords,?content) ??????????print("內(nèi)容插入成功:{}".format(title)) ??????????print("------------------------------------------------------------") ????def?get_item(self,?xpath_html): ????????item_list?=?xpath_html.xpath(self.params['item_xpath']) ????????print(item_list) ????????for?row?in?item_list: ????????????url_list?=?row.xpath(self.params['url_xpath']) ????????????if?len(url_list)?>?0: ????????????????self.get_content(url_list[0]) ????def?get_content(self,?url): ????????print("正在抓取鏈接:{}".format(url)) ????????domain?=?parse.urlparse(url).netloc ????????d_domain?=?parse.urlparse(self.url).netloc ????????if?domain?==?d_domain: ????????????html?=?self.get_html(url) ????????????self.reptile.blog?=?1 ????????????if?html: ????????????????p?=?etree.HTML(html) ????????????????title?=?self.get_conmon_content(p,?self.params['title_xpath']) ????????????????print("標(biāo)題為:{}".format(title)) ????????????????category_name?=?self.get_conmon_content(p,?self.params['category_xpath']) ????????????????print("分類(lèi)為:{}".format(category_name)) ????????????????image?=?self.get_conmon_content(p,?self.params['image_xpath']) ????????????????print("圖片為:{}".format(image)) ????????????????link?=?self.get_conmon_content(p,?self.params['link_xpaht']) ????????????????print("鏈接為:{}".format(link)) ????????????????description?=?self.get_conmon_content(p,?self.params['description_xpath']) ????????????????print("描述為:{}".format(description)) ????????????????keywords?=?self.get_conmon_content(p,?self.params['keywords_xpath']) ????????????????print("關(guān)鍵描述:{}".format(keywords)) ????????????????content?=?self.get_conmon_content(p,?self.params['content_xpath']) ????????????????print("內(nèi)容為:{}".format(content)) ????????????????params?=?{ ????????????????????"pid":?158, ????????????????????"title":?title, ????????????????????"category_name":?category_name, ????????????????????"image":?image, ????????????????????'url':?link, ????????????????????'description':?description, ????????????????????'keywords':?keywords, ????????????????????'content':?content, ????????????????} ????????????????if?title?and?category_name?and?link: ????????????????????self.json_insert_data(params)#存入數(shù)據(jù)庫(kù) ????def?get_conmon_content(self,?xpath_html,?xpath): ????????content_list?=?xpath_html.xpath(xpath) ????????content?=?'' ????????if?len(content_list)?>?0: ????????????content?=?content_list[0].strip() ????????return?content ????def?run(self): ????????print("url:{}".format(self.url)) ????????html?=?self.get_html(self.url) ????????if?html: ????????????p?=?etree.HTML(html) ????????????self.get_item(p) #爬取的xpath params?=?{ ????"url":?"https://www.widiz.com/",?#爬取url ????"url_xpath":?'.//a[1]/@href', ????"title_xpath":?'/html/body/div[1]/div[2]/div[3]/div/div[3]/div/h1/text()', ????"category_xpath":?'/html/body/div[1]/div[2]/div[3]/div/div[3]/div/a[1]/text()', ????"image_xpath":?'/html/body/div[1]/div[2]/div[3]/div/div[2]/div/img/@src', ????"link_xpaht":?'/html/body/div[1]/div[2]/div[3]/div/div[3]/div/div/div[1]/span/a/@href', ????"description_xpath":?'/html/head/meta[10]/@content', ????"keywords_xpath":?'/html/head/meta[5]/@content', ????"content_xpath":?'/html/body/div[1]/div[2]/div[3]/main/div[1]/div/div[1]/div/div[2]/text()' } Common(params).run()
最終效果: