#### Method 2

import requests
from fake_useragent import UserAgent

# Assumption: `agent` is a fake_useragent UserAgent instance
# (only its .random attribute is used below to rotate the User-Agent header)
agent = UserAgent()

session = requests.Session()

# 1. Fetch the page first so the session picks up the initial cookies
i1 = session.get(
    url="",
    headers={"User-Agent": agent.random}
)
# 2. Log in; the session automatically carries the cookies obtained above
i2 = session.post(
    url="/login",
    data={
        'phone': "8615057101356",
        'password': "199SulkyBuckets",
        'oneMonth': "1"
    },
    headers={"User-Agent": agent.random}
)
# 3. Upvote a post with the now-authenticated session
i3 = session.post(
    url="/link/vote?linksId=19444596",
    headers={"User-Agent": agent.random}
)
2. The Scrapy framework
Scrapy is an application framework written to crawl web sites and extract structured data. It can be used in a wide range of programs, such as data mining, information processing, and archiving historical data.
It was originally designed for page scraping (more precisely, web scraping), but it can also be used to fetch data returned by APIs (for example Amazon Associates Web Services) or as a general-purpose web crawler. Scrapy is versatile and can be used for data mining, monitoring, and automated testing.
Scrapy uses the Twisted asynchronous networking library to handle network communication. Its overall architecture is built from the components listed below.
Scrapy mainly consists of the following components:
Engine (Scrapy)
Handles the data flow of the whole system and triggers events (the core of the framework).
Scheduler
Accepts requests sent over by the engine, pushes them into a queue, and hands them back when the engine asks again. Think of it as a priority queue of URLs (the addresses, or links, of the pages to crawl): it decides which URL to crawl next and removes duplicate URLs.
Downloader
Downloads page content and returns it to the spiders (the downloader is built on top of Twisted, an efficient asynchronous model).
Spiders
The spiders do the real work: they extract the information you need, the so-called items, from specific pages. You can also extract links from a page and let Scrapy go on to crawl the next one.
Item Pipeline
Processes the items the spiders extract from pages. Its main jobs are persisting items, validating them, and stripping out unneeded data. After a spider parses a page, the resulting items are sent to the item pipeline and processed through several steps in a fixed order.
Downloader Middlewares
A layer between the Scrapy engine and the downloader; it mainly processes the requests and responses exchanged between the two.
Spider Middlewares
A layer between the Scrapy engine and the spiders; its main job is processing the spiders' response input and request output.
Scheduler Middlewares
Middleware between the Scrapy engine and the scheduler; it processes the requests and responses sent from the engine to the scheduler.
The Scrapy run flow is roughly as follows (a minimal spider illustrating it is sketched after this list):
1. The engine takes a link (URL) from the scheduler for the next crawl.
2. The engine wraps the URL in a Request and passes it to the downloader.
3. The downloader downloads the resource and wraps it in a Response.
4. The spider parses the Response.
5. If an item is parsed out, it is handed to the item pipeline for further processing.
6. If a link (URL) is parsed out, the URL is handed to the scheduler to wait for crawling.
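The sketch below walks through exactly this loop. It is illustrative only: the spider name, the quotes.toscrape.com demo site, and the XPath expressions are assumptions, not part of the original notes.

```python
import scrapy


class QuotesSpider(scrapy.Spider):
    # Hypothetical spider, used only to illustrate the run flow above
    name = "quotes"
    start_urls = ["http://quotes.toscrape.com/"]  # assumed public demo site

    def parse(self, response):
        # Steps 4-5: the spider parses the Response and yields items,
        # which are handed to the item pipeline
        for quote in response.xpath('//div[@class="quote"]'):
            yield {
                "text": quote.xpath('./span[@class="text"]/text()').extract_first(),
                "author": quote.xpath('.//small[@class="author"]/text()').extract_first(),
            }
        # Step 6: links parsed out of the page go back to the scheduler
        # as new Requests (steps 1-3 then repeat for each of them)
        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
```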
2.1 Basic commands
1. scrapy startproject <project_name>
- Creates a project directory in the current directory (similar to Django).
2. scrapy genspider [-t template] <name> <domain>
- Creates a spider application, e.g. (the skeleton it generates is sketched after this list):
scrapy genspider -t basic
scrapy genspider -t xmlfeed
PS:
List the available templates: scrapy genspider -l
Show a template's contents: scrapy genspider -d <template_name>
3. scrapy list
- Lists the spider applications in the project.
4. scrapy crawl <spider_name> --nolog (--nolog suppresses the run log)
- Runs a single spider application.
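For reference, `scrapy genspider -t basic <name> <domain>` generates a skeleton roughly like the one below; the name `example` and the domain `example.com` are placeholders, not from the original notes.

```python
import scrapy


class ExampleSpider(scrapy.Spider):
    # Skeleton produced by the "basic" template; fill in parse() yourself
    name = "example"
    allowed_domains = ["example.com"]
    start_urls = ["http://example.com/"]

    def parse(self, response):
        pass
```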
2.2 Selectors
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from scrapy.selector import Selector, HtmlXPathSelector
from scrapy.http import HtmlResponse

html = """<!DOCTYPE html>
<html>
<head lang="en">
    <meta charset="UTF-8">
    <title></title>
</head>
<body>
    <ul>
        <li class="item-"><a id='i1' href="link.html">first item</a></li>
        <li class="item-0"><a id='i2' href="llink.html">first item</a></li>
        <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
    </ul>
    <div><a href="llink2.html">second item</a></div>
</body>
</html>
"""

response = HtmlResponse(url='', body=html, encoding='utf-8')

# hxs = HtmlXPathSelector(response)
# print(hxs)
# hxs = Selector(response=response).xpath('//a')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[2]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
# print(hxs)

# ul_list = Selector(response=response).xpath('//body/ul/li')
# for item in ul_list:
#     v = item.xpath('./a/span')
#     # or
#     # v = item.xpath('a/span')
#     # or
#     # v = item.xpath('*/a/span')
#     print(v)
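The same lookups can also be written with CSS selectors, and newer Scrapy versions add `get()`/`getall()` as aliases for `extract_first()`/`extract()`. A small, illustrative sketch using the `response` object built above:

```python
# CSS equivalents of a few of the XPath queries above
print(Selector(response=response).css('a::attr(href)').extract())                # every href
print(Selector(response=response).css('a#i1::text').extract_first())             # text of the <a> with id="i1"
print(Selector(response=response).css('a[href^="link"]::attr(href)').extract())  # hrefs starting with "link"
```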
chouti: automatic login and upvoting

import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.http.cookies import CookieJar
from scrapy import FormRequest


class ChouTiSpider(scrapy.Spider):
    # Name of the spider application; the crawl is started with this name
    name = "chouti"
    # Allowed domains
    allowed_domains = [""]
    cookie_dict = {}
    has_request_set = {}

    # Override the start method
    def start_requests(self):
        url = '/'
        # return [Request(url=url, callback=self.login)]
        yield Request(url=url, callback=self.login)

    def login(self, response):
        # Collect the cookies set by the first response into cookie_dict
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value
        req = Request(
            url='/login',
            method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            body='phone=8615057101356&password=199SulkyBuckets&oneMonth=1',
            cookies=self.cookie_dict,
            callback=self.check_login
        )
        yield req
    def check_login(self, response):
        # print(response.text)
        req = Request(
            url='/',
            method='GET',
            callback=self.show,
            cookies=self.cookie_dict,
            dont_filter=True
        )
        yield req

    def show(self, response):
        # print(response.text)
        hxs = HtmlXPathSelector(response)
        news_list = hxs.select('//div[@id="content-list"]/div[@class="item"]')
        for new in news_list:
            # temp = new.xpath('div/div[@class="part2"]/@share-linkid').extract()
            link_id = new.xpath('*/div[@class="part2"]/@share-linkid').extract_first()
            yield Request(
                url='/link/vote?linksId=%s' % (link_id,),
                method='POST',
                cookies=self.cookie_dict,
                callback=self.do_favor
            )
        # page_list = hxs.select('//div[@id="dig_lcpage"]//a[re:test(@href, "/all/hot/recent/\d+")]/@href').extract()
        # for page in page_list:
        #     page_url = '%s' % page
        #     import hashlib
        #     hash = hashlib.md5()
        #     hash.update(bytes(page_url, encoding='utf-8'))
        #     key = hash.hexdigest()
        #     if key in self.has_request_set:
        #         pass
        #     else:
        #         self.has_request_set[key] = page_url
        #         yield Request(
        #             url=page_url,
        #             method='GET',
        #             callback=self.show
        #         )

    def do_favor(self, response):
        print(response.text)
Note: set DEPTH_LIMIT = 1 in settings.py to limit how many levels of "recursion" the crawl follows. When the same page needs to be crawled more than once, set dont_filter=True on the Request so the built-in de-duplication does not drop it (both are sketched below).
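A minimal sketch of those two knobs. Everything here beyond DEPTH_LIMIT and dont_filter themselves (the spider name, the example.com URL) is an assumption for illustration:

```python
# settings.py
# DEPTH_LIMIT = 1   # requests spawned from a response may only go one level deep

import scrapy
from scrapy.http import Request


class RefreshSpider(scrapy.Spider):
    # Hypothetical spider that re-requests a page it has already seen
    name = "refresh_demo"
    start_urls = ["http://example.com/"]

    def parse(self, response):
        # Without dont_filter=True, the duplicate filter (section 2.3)
        # would silently drop this second request for the same URL.
        # DEPTH_LIMIT above keeps the repetition from going on forever.
        yield Request(
            url=response.url,
            callback=self.parse,
            dont_filter=True,
        )
```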
2.3 Avoiding duplicate visits
By default Scrapy uses scrapy.dupefilter.RFPDupeFilter for request de-duplication. The related settings are:
DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
DUPEFILTER_DEBUG = False
JOBDIR = "path of the log that stores the seen-request records, e.g. /root/"  # the final path is /
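DUPEFILTER_CLASS can also point at a custom filter when the default fingerprint-based one is not enough. A minimal sketch, assuming a project module path of myproject.dupefilters (the class name and module path are placeholders; newer Scrapy exposes the base class as scrapy.dupefilters.BaseDupeFilter, older versions as scrapy.dupefilter):

```python
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint


class RepeatFilter(BaseDupeFilter):
    # Enable with: DUPEFILTER_CLASS = 'myproject.dupefilters.RepeatFilter'

    def __init__(self):
        self.visited_fingerprints = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        # Returning True tells the scheduler to drop the request as a duplicate
        fp = request_fingerprint(request)
        if fp in self.visited_fingerprints:
            return True
        self.visited_fingerprints.add(fp)
        return False

    def open(self):
        # Called when the spider starts
        pass

    def close(self, reason):
        # Called when the spider finishes
        pass
```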
2.4 Crawling mzitu images
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from scrapy.selector import Selector, XmlXPathSelector
from ..items import MzituItem


class MeizituSpider(scrapy.Spider):
    name = 'meizitu'
    allowed_domains = ['']
    # start_urls = ['/']

    def start_requests(self):
        url = '/all/'
        yield Request(url=url, method='GET', callback=self.main_page)

    def main_page(self, response):
        # Collect the URLs of all photo sets
        hxs = Selector(response=response).xpath('//p[contains(@class,"url")]/a/@href').extract()
        for url in hxs:
            req = Request(url=url, callback=self.fenye)
            yield req

    def fenye(self, response):
        # Extract the image URL and the title
        img_url = Selector(response=response).xpath('//div[@class="main-image"]//img/@src').extract_first().strip()
        title = Selector(response=response).xpath('//div[@class="main-image"]//img/@alt').extract_first().strip()
        yield MzituItem(img_url=img_url, title=title)
        # Collect the page links from the pagination bar at the bottom
        xhs = Selector(response=response).xpath('//div[@class="pagenavi"]/a/@href').extract()
        for url in xhs:
            req = Request(
                url=url,
                callback=self.fenye,
            )
            yield req
meizitu.py
import scrapy
class MzituItem(scrapy.Item):
# define the fields for your item here like:
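To actually persist what the spider yields, an item pipeline has to be registered. The sketch below is an assumption about how that could look: it presumes MzituItem declares img_url and title fields (as the spider's yield implies) and that the pipeline is enabled via ITEM_PIPELINES in settings.py; the class name, output directory, and use of requests for the download are illustrative only.

```python
import os
import requests


class MzituPipeline(object):
    # Hypothetical pipeline; enable it with, e.g.:
    # ITEM_PIPELINES = {'mzitu.pipelines.MzituPipeline': 300}

    def process_item(self, item, spider):
        os.makedirs('imgs', exist_ok=True)
        # Note: the image host may require a Referer header; omitted here.
        resp = requests.get(item['img_url'])
        path = os.path.join('imgs', '%s.jpg' % item['title'])
        with open(path, 'wb') as f:
            f.write(resp.content)
        return item
```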