e-hentai crawler (updated)
Uses the BeautifulSoup, re, and urllib modules.
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Orange'

import re
from urllib import request, parse

from bs4 import BeautifulSoup


class Item(object):
    """One gallery entry: upload time, cover image, title, torrent link."""

    def __init__(self, time, img_url, tittle, down) -> None:
        self.time = time
        self.img_url = img_url
        self.tittle = tittle
        self.down = down


def getPages(html):
    # Page numbers sit in the pager links: onclick="return false">N
    return re.findall(r'onclick="return false">(\d+)', html)


def get_url(pages, key):
    # Build a search URL with every category filter switched on
    data = {
        'page': pages,
        'f_doujinshi': 'on',
        'f_manga': 'on',
        'f_artistcg': 'on',
        'f_gamecg': 'on',
        'f_western': 'on',
        'f_non-h': 'on',
        'f_imageset': 'on',
        'f_cosplay': 'on',
        'f_asianporn': 'on',
        'f_misc': 'on',
        'f_search': key,
        'f_apply': 'Apply Filter'
    }
    url_parame = parse.urlencode(data)
    url = "https://e-hentai.org/?"
    url_all = url + url_parame
    return url_all


def gethtml(url):
    req = request.Request(url)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/59.0.3071.115 Safari/537.36')
    with request.urlopen(req) as f:
        html = f.read().decode('utf-8')
    return html


def getDown(url):
    # The torrent page links its .torrent files through /get/ URLs
    html = gethtml(url)
    soup = BeautifulSoup(html, 'html.parser')
    return soup.find_all(href=re.compile(r'/get/'))[0]['href']


def getItems(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = []
    # Result rows alternate between the classes gtr0 and gtr1
    for tr in soup.find_all('tr', class_=['gtr0', 'gtr1']):
        time = tr.find_all('td', {'style': "white-space:nowrap"})[0].string
        content = tr.find_all(class_='it2')[0].string
        if not content:
            # Cover shipped as a real <img> tag
            img_url = tr.find_all('img')[1]['src']
            tittle = tr.find_all('img')[1]['alt']
        else:
            # Cover encoded as "init~host/path~title" text
            mix = re.split(r'~', content)
            img_url = 'http://' + mix[2]
            tittle = mix[3]
        # Link to the gallery's torrent page, if one is offered
        torrent_links = tr.find_all(href=re.compile(r'gallerytorrents'))
        if torrent_links:
            down = getDown(torrent_links[0]['href'])
        else:
            down = ''  # no torrent for this gallery
        item = Item(time, img_url, tittle, down)
        print(time, img_url, tittle, down)
        items.append(item)
    return items


def init(key):
    url = get_url(0, key)
    html = gethtml(url)
    pages = getPages(html)[-2]  # second-to-last pager number = last page
    items = []
    for i in range(1, int(pages)):
        url = get_url(i, key)
        html = gethtml(url)
        items.extend(getItems(html))
    print(items)


if __name__ == "__main__":
    # key = input('Enter a search keyword (English): ')
    key = 'chinese'
    init(key)
```
The crawler grabs each gallery's cover image, title, upload time, and BT download link. No proxy pool or anti-anti-crawler measures are implemented.
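If the site ever starts blocking requests, urllib can at least be routed through a single proxy without touching the rest of the script. A minimal sketch (the proxy address below is a placeholder; a real proxy pool would rotate through many such entries):

```python
from urllib import request

# Placeholder proxy address -- a proxy pool would swap these in and out
proxy = request.ProxyHandler({'http': 'http://127.0.0.1:8080'})
opener = request.build_opener(proxy)
request.install_opener(opener)  # later urlopen() calls now go via the proxy
```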
**(Follow-up update)**
Added BT-to-magnet-link conversion, and the results can be written straight into MySQL for storage. I'll post that code later.
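As a rough sketch of how those two pieces can work: the magnet link comes from SHA-1 hashing the bencoded `info` dictionary of the downloaded .torrent file, and the row can be inserted with `pymysql`. The package choices (`bencodepy`, `pymysql`), connection settings, and table/column names below are placeholders, not the final code:

```python
import hashlib

import bencodepy  # pip install bencodepy
import pymysql    # pip install pymysql


def torrent_to_magnet(torrent_path):
    # The BT infohash is the SHA-1 digest of the bencoded "info" dict
    with open(torrent_path, 'rb') as f:
        meta = bencodepy.decode(f.read())
    info_hash = hashlib.sha1(bencodepy.encode(meta[b'info'])).hexdigest()
    return 'magnet:?xt=urn:btih:' + info_hash


def save_item(item, magnet):
    # Placeholder connection settings and schema
    conn = pymysql.connect(host='localhost', user='root',
                           password='secret', database='ehentai',
                           charset='utf8mb4')
    try:
        with conn.cursor() as cur:
            cur.execute(
                'INSERT INTO items (time, img_url, tittle, magnet) '
                'VALUES (%s, %s, %s, %s)',
                (item.time, item.img_url, item.tittle, magnet))
        conn.commit()
    finally:
        conn.close()
```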