最美情侣中文字幕电影,在线麻豆精品传媒,在线网站高清黄,久久黄色视频

歡迎光臨散文網(wǎng) 會(huì)員登陸 & 注冊(cè)

Python爬蟲(chóng)幾個(gè)步驟教你寫(xiě)入mysql數(shù)據(jù)庫(kù)

2023-02-16 23:00 作者:IFELES  | 我要投稿

Python爬蟲(chóng)幾個(gè)步驟教你寫(xiě)入mysql數(shù)據(jù)庫(kù)

Python爬蟲(chóng)實(shí)現(xiàn)爬取網(wǎng)站中的數(shù)據(jù)并存入MySQL數(shù)據(jù)庫(kù)中,在爬取的時(shí)候總要涉及到數(shù)據(jù)持久化存儲(chǔ),當(dāng)然有很多中存儲(chǔ)的方式,簡(jiǎn)單點(diǎn)的有excel、txt、json、csv等等。存入mysql我覺(jué)的有好多操作空間,如果是開(kāi)發(fā)python后端也可以熟悉一下sql語(yǔ)句,存入數(shù)據(jù)庫(kù)的方法也是試了些許網(wǎng)上一些方法,現(xiàn)在把完整功能供大家參考。

直接搜索?phpStudy安裝即可,按照下圖配置數(shù)據(jù)庫(kù)。用戶(hù)名密碼自行設(shè)置,然后返回首頁(yè)啟動(dòng)即可。

pip?install?pymysql

打開(kāi)剛安裝的phpstudy安裝一個(gè)mysql客戶(hù)端連接,數(shù)據(jù)庫(kù)是本地的host可以填 127.0.0.1 或 localhost用戶(hù)名密碼是上面設(shè)置的

MySQL創(chuàng)建對(duì)應(yīng)的表

CREATE?TABLE?`text_archives`??(
??`id`?int(10)?UNSIGNED?NOT?NULL?AUTO_INCREMENT?COMMENT?'ID',
??`url`?varchar(255)?CHARACTER?SET?utf8mb4?COLLATE?utf8mb4_unicode_ci?NOT?NULL?DEFAULT?''?COMMENT?'鏈接',
??`title`?varchar(255)?CHARACTER?SET?utf8mb4?COLLATE?utf8mb4_unicode_ci?NOT?NULL?DEFAULT?''?COMMENT?'標(biāo)題',
??`image`?varchar(255)?CHARACTER?SET?utf8mb4?COLLATE?utf8mb4_unicode_ci?NOT?NULL?DEFAULT?''?COMMENT?'圖片',
??`keywords`?varchar(255)?CHARACTER?SET?utf8mb4?COLLATE?utf8mb4_unicode_ci?NOT?NULL?COMMENT?'關(guān)鍵描述',
??`description`?varchar(600)?CHARACTER?SET?utf8mb4?COLLATE?utf8mb4_unicode_ci?NOT?NULL?DEFAULT?''?COMMENT?'內(nèi)容描述',
??`content`?text?CHARACTER?SET?utf8mb4?COLLATE?utf8mb4_unicode_ci?NULL?COMMENT?'內(nèi)容',
??`weigh`?int(10)?NOT?NULL?DEFAULT?0?COMMENT?'權(quán)重',
??`createtime`?bigint(16)?NOT?NULL?DEFAULT?0?COMMENT?'創(chuàng)建時(shí)間',
??`updatetime`?bigint(16)?NOT?NULL?DEFAULT?0?COMMENT?'更新時(shí)間',
??`deletetime`?bigint(16)?NULL?DEFAULT?NULL?COMMENT?'刪除時(shí)間',
??PRIMARY?KEY?(`id`)?USING?BTREE
)?ENGINE?=?InnoDB?AUTO_INCREMENT?=?2692?CHARACTER?SET?=?utf8mb4?COLLATE?=?utf8mb4_unicode_ci?COMMENT?=?'內(nèi)容表'?ROW_FORMAT?=?Dynamic;

SET?FOREIGN_KEY_CHECKS?=?1;

構(gòu)造 SQL 語(yǔ)句的字符串 sql ,然后通過(guò) cursor.excute(sql) 執(zhí)行,下面簡(jiǎn)單的封裝,直接復(fù)制即可用。

import?pymysql


class?Mysql(object):
????def?__init__(self):
????????self._connect?=?pymysql.connect(
????????????host='127.0.0.1',
????????????user='test',
????????????password='######',
????????????database='test',
????????????charset='utf8mb4',
????????????cursorclass=pymysql.cursors.DictCursor
????????)
????????self._cursor?=?self._connect.cursor()

????def?inset_db(self,?table_name,?insert_data):
????????try:
????????????data?=?self.get_mysql_data(data=insert_data)
????????????fields?=?data[0]
????????????values?=?data[1]
????????????sql?=?"INSERT?INTO?{table_name}({fields})?values?({values})".format(table_name=table_name,?fields=fields,
????????????????????????????????????????????????????????????????????????????????values=values)
????????????self._cursor.execute(sql)
????????????self._connect.commit()
????????except?Exception?as?e:
????????????self._connect.rollback()??#?如果這里是執(zhí)行的執(zhí)行存儲(chǔ)過(guò)程的sql命令,那么可能會(huì)存在rollback的情況,所以這里應(yīng)該考慮到
????????????print("數(shù)據(jù)插入失敗,失敗原因:",?e)
????????????print(insert_data)
????????else:
????????????#?self.db_close()
????????????return?self._cursor.lastrowid

????def?update_db(self,?table_name,?update_data,?wheres=None):
????????try:
????????????if?wheres?is?not?None:
????????????????sql?=?"UPDATE?{table_name}?SET?{update_data}?WHERE?{wheres}".format(
????????????????????table_name=table_name,
????????????????????update_data=update_data,
????????????????????wheres=wheres
????????????????)
????????????else:
????????????????sql?=?"UPDATE?{table_name}?SET?{update_data}".format(
????????????????????table_name=table_name,
????????????????????update_data=update_data)
????????????self._cursor.execute(sql)
????????????self._connect.commit()
????????except?Exception?as?e:
????????????print("更新失敗:",?e)
????????????return?False
????????else:
????????????#?self.db_close()
????????????return?True

????def?delete_db(self,?table_name,?wheres):
????????try:
????????????#?構(gòu)建sql語(yǔ)句
????????????sql?=?"DELETE?FROM?{table_name}?WHERE?{wheres}".format(table_name=table_name,?wheres=wheres)
????????????self._cursor.execute(sql)
????????????self._connect.commit()
????????except?Exception?as?e:
????????????print('刪除失?。?#39;,?e)
????????????return?False
????????else:
????????????#?self.db_close()
????????????return?True

????def?select_db(self,?table_name,?fields,?wheres=None,?get_one=False):
????????try:
????????????if?wheres?is?not?None:
????????????????sql?=?"SELECT?{fields}?FROM?{table_name}?WHERE?{wheres}".format(
????????????????????fields=fields,
????????????????????table_name=table_name,
????????????????????wheres=wheres
????????????????)
????????????else:
????????????????sql?=?"SELECT?{fields}?FROM?{table_name}".format(fields=fields,?table_name=table_name)
????????????self._cursor.execute(sql)
????????????self._connect.commit()
????????????if?get_one:
????????????????result?=?self._cursor.fetchone()
????????????else:
????????????????result?=?self._cursor.fetchall()
????????except?Exception?as?e:
????????????print("查詢(xún)失敗",?e)
????????????return?None
????????else:
????????????#?self.db_close()
????????????return?result

????def?get_mysql_data(self,?data):
????????fields?=?""
????????insert_data?=?""
????????for?k,?v?in?data.items():
????????????fields?=?fields?+?k?+?','
????????????insert_data?=?insert_data?+?"'"?+?str(v)?+?"'"?+?','
????????fields?=?fields.strip(',')
????????insert_data?=?insert_data.strip(',')
????????return?[fields,?insert_data]

????def?db_close(self):
????????self._cursor.close()
????????self._connect.close()

這次簡(jiǎn)單點(diǎn)咱們用xpath就行,有一個(gè)小技巧咱們?cè)谂廊〉木W(wǎng)頁(yè)打開(kāi)開(kāi)發(fā)都模式F12.如下圖紅框復(fù)制第一個(gè)或都第二個(gè)就行。

下面代碼是實(shí)現(xiàn)爬取數(shù)據(jù)然后存入數(shù)據(jù)庫(kù)類(lèi),大家可參考

from?model.nav?import?Nav
import?requests
from?urllib?import?parse
from?lxml?import?etree
from?fake_useragent?import?UserAgent
from?lib.reptile?import?Reptile
import?json


class?Common(object):
????def?__init__(self,?params):
????????self.url?=?params['url']
????????self.params?=?params
????????self.blog?=?1
??
????def?get_header(self):
????????ua?=?UserAgent()
????????headers?=?{
????????????'User-Agent':?ua.random
????????}
????????return?headers

????def?get_html(self,?url):
????????#?在超時(shí)間內(nèi),對(duì)于失敗頁(yè)面嘗試請(qǐng)求三次
????????if?self.blog?<=?3:
????????????try:
????????????????res?=?requests.get(url=url,?headers=self.get_header(),?timeout=3)
????????????????res.encoding?=?res.apparent_encoding
????????????????html?=?res.text
????????????????return?html
????????????except?Exception?as?e:
????????????????print(e)
????????????????self.blog?+=?1
????????????????self.get_html(url)
????def?json_insert_data(self,?params):
??????category_id?=?self.insert_category(cname=params['category_name'],?pid=params['pid'],?icon='')
??????print("分類(lèi)插入成功:{}".format(params['category_name']))
??????if?category_id:
??????????url?=?params['url']
??????????title?=?params['title']
??????????image?=?params['image']
??????????description?=?params['description']
??????????keywords?=?params['keywords']
??????????content?=?params['content']
??????????self.insert_archives(category_id,?url,?title,?image,?description,?keywords,?content)
??????????print("內(nèi)容插入成功:{}".format(title))
??????????print("------------------------------------------------------------")
????def?get_item(self,?xpath_html):
????????item_list?=?xpath_html.xpath(self.params['item_xpath'])
????????print(item_list)
????????for?row?in?item_list:
????????????url_list?=?row.xpath(self.params['url_xpath'])
????????????if?len(url_list)?>?0:
????????????????self.get_content(url_list[0])

????def?get_content(self,?url):
????????print("正在抓取鏈接:{}".format(url))
????????domain?=?parse.urlparse(url).netloc
????????d_domain?=?parse.urlparse(self.url).netloc
????????if?domain?==?d_domain:
????????????html?=?self.get_html(url)
????????????self.reptile.blog?=?1
????????????if?html:
????????????????p?=?etree.HTML(html)
????????????????title?=?self.get_conmon_content(p,?self.params['title_xpath'])
????????????????print("標(biāo)題為:{}".format(title))
????????????????category_name?=?self.get_conmon_content(p,?self.params['category_xpath'])
????????????????print("分類(lèi)為:{}".format(category_name))
????????????????image?=?self.get_conmon_content(p,?self.params['image_xpath'])
????????????????print("圖片為:{}".format(image))
????????????????link?=?self.get_conmon_content(p,?self.params['link_xpaht'])
????????????????print("鏈接為:{}".format(link))
????????????????description?=?self.get_conmon_content(p,?self.params['description_xpath'])
????????????????print("描述為:{}".format(description))
????????????????keywords?=?self.get_conmon_content(p,?self.params['keywords_xpath'])
????????????????print("關(guān)鍵描述:{}".format(keywords))
????????????????content?=?self.get_conmon_content(p,?self.params['content_xpath'])
????????????????print("內(nèi)容為:{}".format(content))
????????????????params?=?{
????????????????????"pid":?158,
????????????????????"title":?title,
????????????????????"category_name":?category_name,
????????????????????"image":?image,
????????????????????'url':?link,
????????????????????'description':?description,
????????????????????'keywords':?keywords,
????????????????????'content':?content,
????????????????}
????????????????if?title?and?category_name?and?link:
????????????????????self.json_insert_data(params)#存入數(shù)據(jù)庫(kù)

????def?get_conmon_content(self,?xpath_html,?xpath):
????????content_list?=?xpath_html.xpath(xpath)
????????content?=?''
????????if?len(content_list)?>?0:
????????????content?=?content_list[0].strip()
????????return?content

????def?run(self):
????????print("url:{}".format(self.url))
????????html?=?self.get_html(self.url)
????????if?html:
????????????p?=?etree.HTML(html)
????????????self.get_item(p)

#爬取的xpath
params?=?{
????"url":?"https://www.widiz.com/",?#爬取url
????"url_xpath":?'.//a[1]/@href',
????"title_xpath":?'/html/body/div[1]/div[2]/div[3]/div/div[3]/div/h1/text()',
????"category_xpath":?'/html/body/div[1]/div[2]/div[3]/div/div[3]/div/a[1]/text()',
????"image_xpath":?'/html/body/div[1]/div[2]/div[3]/div/div[2]/div/img/@src',
????"link_xpaht":?'/html/body/div[1]/div[2]/div[3]/div/div[3]/div/div/div[1]/span/a/@href',
????"description_xpath":?'/html/head/meta[10]/@content',
????"keywords_xpath":?'/html/head/meta[5]/@content',
????"content_xpath":?'/html/body/div[1]/div[2]/div[3]/main/div[1]/div/div[1]/div/div[2]/text()'
}

Common(params).run()

最終效果:


Python爬蟲(chóng)幾個(gè)步驟教你寫(xiě)入mysql數(shù)據(jù)庫(kù)的評(píng)論 (共 條)

分享到微博請(qǐng)遵守國(guó)家法律
樟树市| 洞口县| 柞水县| 南昌市| 广昌县| 宽甸| 巫山县| 宁强县| 彭阳县| 盐城市| 仪陇县| 任丘市| 新密市| 石河子市| 浦城县| 彰化市| 承德市| 武山县| 鄂州市| 怀来县| 永德县| 海林市| 兰考县| 鹿邑县| 宁远县| 贺州市| 江华| 会同县| 利辛县| 吉水县| 滨州市| 隆回县| 满城县| 特克斯县| 定陶县| 鲜城| 巴林右旗| 江源县| 寿宁县| 清远市| 崇仁县|