Scrapy项目实战之爬取某社区用户详情

更新时间:2023-04-04 04:57:27 阅读：评论：0

本文介绍了 Scrapy 项目实战之爬取某社区用户详情，分享给大家，具体如下：

get_cookies.py

"""Log in to SegmentFault with Selenium and store the session cookies in MongoDB."""
import time

from pymongo import MongoClient
from scrapy.crawler import overridden_settings  # noqa: F401  (kept from the original import list)
from selenium import webdriver

# from segmentfault import settings
import settings


class GetCookies(object):
    """Collect login cookies for every account in ``settings.USER_LIST``."""

    def __init__(self):
        # WebDriver options shared by every browser session.
        self.opt = webdriver.ChromeOptions()
        # Uncomment to run Chrome without a visible window.
        # self.opt.add_argument("--headless")
        # Accounts to log in with: a list of (username, password) pairs.
        self.user_list = settings.USER_LIST
        # MongoDB handles; cookies are stored in the "cookies" collection.
        self.client = MongoClient(settings.MONGO_URI)
        self.db = self.client[settings.MONGO_DB]
        self.collection = self.db["cookies"]

    def get_cookies(self, username, password):
        """Log in through a real browser and return the session cookies.

        :param username: account name typed into the login form
        :param password: account password typed into the login form
        :return: the list of cookie dicts returned by ``driver.get_cookies()``
        """
        driver = webdriver.Chrome(
            executable_path="/Users/hank/scrapy/segmentfault/segmentfault/chromedriver",
            options=self.opt,
        )
        try:
            driver.get("https://segmentfault.com/user/login")
            driver.find_element_by_name("username").send_keys(username)
            driver.find_element_by_name("password").send_keys(password)
            driver.find_element_by_xpath("//button[@type='submit']").click()
            # Give the login round-trip time to finish before navigating on.
            time.sleep(2)
            driver.get("https://segmentfault.com/u/luwangmeilun/users/following")
            # Read the cookies of the logged-in session.
            return driver.get_cookies()
        finally:
            # Always close the browser, even when login raised.
            driver.quit()

    def format_cookies(self, cookies):
        """Reduce Selenium cookie dicts to a flat ``{name: value}`` mapping.

        ``driver.get_cookies()`` returns entries such as::

            [{'domain': 'segmentfault.com', 'httpOnly': False, 'name': 'PHPSESSID',
              'path': '/', 'secure': False, 'value': 'web2~5grmfa89j12eksub8hja3bvaq4'},
             {'domain': '.segmentfault.com', 'expiry': 1581602940, 'httpOnly': False,
              'name': 'Hm_lvt_e23800c454aa573c0ccb16b52665ac26', 'path': '/',
              'secure': False, 'value': '1550066940'},
             ...]

        Only the ``name`` and ``value`` of each entry are kept.

        :param cookies: list of cookie dicts as returned by Selenium
        :return: dict mapping cookie name to cookie value
        """
        return {item['name']: item['value'] for item in cookies}

    def save(self):
        """Log in with every configured account and insert its cookies into MongoDB."""
        print("开始获取cookies....")
        for username, password in self.user_list:
            cookies = self.get_cookies(username, password)
            f_cookies = self.format_cookies(cookies)
            print("insert cookie:{}".format(f_cookies))
            # Store the flattened cookies as one MongoDB document.
            self.collection.insert_one(f_cookies)


if __name__ == '__main__':
    cookies = GetCookies()
    # Repeat the login round so several cookie documents are stored per account.
    for i in range(20):
        cookies.save()

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class SegmentfaultItem(scrapy.Item):
    """One community user's profile, activity counters and top answer."""

    # --- Personal attributes ---
    # Display name
    name = scrapy.Field()
    # Reputation
    rank = scrapy.Field()
    # School
    school = scrapy.Field()
    # Major
    majors = scrapy.Field()
    # Company
    company = scrapy.Field()
    # Job title
    job = scrapy.Field()
    # Personal blog URL
    blog = scrapy.Field()

    # --- Social activity counters ---
    # Number of users followed
    following = scrapy.Field()
    # Number of followers
    fans = scrapy.Field()
    # Number of answers
    answers = scrapy.Field()
    # Number of questions
    questions = scrapy.Field()
    # Number of articles
    articles = scrapy.Field()
    # Number of lives (talks)
    lives = scrapy.Field()
    # Number of badges
    badges = scrapy.Field()

    # --- Skill attributes ---
    # Number of likes received
    like = scrapy.Field()
    # Skill tags
    skills = scrapy.Field()
    # Registration date
    register_date = scrapy.Field()

    # --- Q&A statistics ---
    # Highest score received by any answer
    answers_top_score = scrapy.Field()
    # Title of the question the top-scored answer belongs to
    answers_top_title = scrapy.Field()
    # Tags of the question the top-scored answer belongs to
    answers_top_tags = scrapy.Field()
    # Body of the question the top-scored answer belongs to
    answers_top_question = scrapy.Field()
    # Body of the top-scored answer itself
    answers_top_content = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo


class SegmentfaultPipeline(object):
    """Persist every scraped item as one document in MongoDB."""

    # Name of the MongoDB collection that receives the items.
    collection_name = 'userinfo'

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection parameters; the connection opens in ``open_spider``.

        :param mongo_uri: MongoDB connection URI
        :param mongo_db: name of the database to write to
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the MongoDB values configured in settings.py."""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB', 'segmentfault'),
        )

    def open_spider(self, spider):
        """Connect to MongoDB when the spider starts."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        """Disconnect from MongoDB when the spider closes."""
        self.client.close()

    def process_item(self, item, spider):
        """Insert the item into the collection and pass it on unchanged."""
        self.db[self.collection_name].insert_one(dict(item))
        return item

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for segmentfault project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'segmentfault'

SPIDER_MODULES = ['segmentfault.spiders']
NEWSPIDER_MODULE = 'segmentfault.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 100

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 32
# CONCURRENT_REQUESTS_PER_IP = 32

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Retries and redirects are switched off so that the cookies middleware sees
# the raw 301/302 responses; requests give up after 5 seconds.
RETRY_ENABLED = False
REDIRECT_ENABLED = False
DOWNLOAD_TIMEOUT = 5

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'segmentfault.middlewares.SegmentfaultSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'segmentfault.middlewares.SegmentfaultHttpProxyMiddleware': 543,
    'segmentfault.middlewares.SegmentfaultUserAgentMiddleware': 643,
    'segmentfault.middlewares.SegmentfaultCookiesMiddleware': 743,
    # Built-in middlewares replaced by the project's own versions above.
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'segmentfault.pipelines.SegmentfaultPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MongoDB connection
MONGO_URI = 'localhost:27017'
MONGO_DB = 'segmentfault'

# Accounts used by get_cookies.py, as (username, password) pairs.
# NOTE(review): credentials are hard-coded in source; prefer loading them
# from environment variables or an untracked local file.
USER_LIST = [
    ("798549150@qq.com", "guoqing1010"),
    ("learnscrapy@163.com", "guoqing1010"),
]

# Proxy pool read by SegmentfaultHttpProxyMiddleware
PROXY_LIST = [
    'http://115.182.212.169:8080',
    'http://121.61.25.149:9999',
    'http://180.118.247.189:9000',
    'http://115.151.3.12:9999',
    'http://183.154.213.160:9000',
    'http://113.128.9.106:9999',
    'http://124.42.68.152:90',
    'http://49.70.48.50:9999',
    'http://113.128.11.172:9999',
    'http://111.177.177.40:9999',
    'http://59.62.83.253:9999',
    'http://39.107.84.185:8123',
    'http://124.94.195.107:9999',
    'http://111.177.160.132:9999',
    'http://120.25.203.182:7777',
]

# User-Agent pool read by SegmentfaultUserAgentMiddleware
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    'Opera/8.0 (Windows NT 5.1; U; en)',
    'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
]

userinfo.py

# -*- coding: utf-8 -*-
import time

import scrapy
from pymongo import MongoClient
from scrapy import Request
from scrapy.http import FormRequest  # noqa: F401  (kept from the original import list)
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from segmentfault.items import SegmentfaultItem


class UserinfoSpider(CrawlSpider):
    """Crawl user profiles by walking each user's "following" list.

    NOTE(review): the published listing was lower-cased, so case-sensitive
    CSS selectors (for example ``#navanswer``, possibly ``#navAnswer`` on the
    live page) should be confirmed against the site's HTML.
    """

    name = 'userinfo'
    allowed_domains = ['segmentfault.com']
    start_urls = ['https://segmentfault.com/u/mybigbigcat/users/following']

    # Cookie holding the current time as a 10-digit epoch; refreshed per run.
    _TIMESTAMP_COOKIE = 'Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'

    rules = (
        # User home pages: follow them and parse the profile.
        Rule(LinkExtractor(allow=r'/u/\w+$'), callback='parse_item', follow=True),
        # "Following" list pages: follow them to discover more home pages.
        Rule(LinkExtractor(allow=r'/users/following$'), follow=True),
    )

    def start_requests(self):
        """Open the crawl with one stored login cookie attached.

        :return: a single-element list holding the first request
        :raises RuntimeError: when the ``cookies`` collection is empty
        """
        client = MongoClient(self.crawler.settings['MONGO_URI'])
        try:
            cookies = client[self.crawler.settings['MONGO_DB']].cookies.find_one()
        finally:
            client.close()
        if cookies is None:
            raise RuntimeError("no cookies stored in MongoDB; run get_cookies.py first")
        # "_id" is MongoDB bookkeeping, not a browser cookie.
        cookies.pop('_id', None)
        # This cookie carries the current time, so refill it on every run.
        cookies[self._TIMESTAMP_COOKIE] = str(int(time.time()))
        return [Request("https://segmentfault.com",
                        cookies=cookies,
                        meta={'cookiejar': 1},
                        callback=self.after_login)]

    def after_login(self, response):
        """Start the rule-driven crawl from every URL in ``start_urls``."""
        for url in self.start_urls:
            # No callback: CrawlSpider's default handler applies ``rules``.
            yield Request(url, dont_filter=True)

    @staticmethod
    def _split_pair(texts):
        """Return ``(first, second)`` from a list of text nodes, using '' for gaps."""
        first = texts[0] if texts else ''
        second = texts[1].strip() if len(texts) > 1 else ''
        return first, second

    def parse_item(self, response):
        """Extract profile data, then chain to the answer and badge pages.

        :param response: the user's home page
        :return: yields the follow-up request carrying the item in ``meta``,
            or the item itself when there is nothing left to fetch
        """
        item = SegmentfaultItem()

        # --- Profile header ---
        profile_head = response.css('.profile__heading')
        item['name'] = profile_head.css('h2[class*=name]::text').re_first(r'\w+')
        item['rank'] = profile_head.css('.profile__rank-btn > span::text').extract_first()
        # School / major and company / job come as up to two text nodes each.
        item['school'], item['majors'] = self._split_pair(
            profile_head.css('.profile__school::text').extract())
        item['company'], item['job'] = self._split_pair(
            profile_head.css('.profile__company::text').extract())
        item['blog'] = profile_head.css('a[class*=other-item-link]::attr(href)').extract_first()

        # --- Statistics panel ---
        profile_active = response.xpath("//div[@class='col-md-2']")
        # The first number is the following count, the second the fans count.
        follow_counts = profile_active.css('div[class*=info] a > .h5::text').re(r'\d+')
        item['following'] = follow_counts[0] if follow_counts else None
        item['fans'] = follow_counts[1] if len(follow_counts) > 1 else None
        item['answers'] = profile_active.css('a[href*=answer] .count::text').re_first(r'\d+')
        item['questions'] = profile_active.css('a[href*=questions] .count::text').re_first(r'\d+')
        item['articles'] = profile_active.css('a[href*=articles] .count::text').re_first(r'\d+')
        item['lives'] = profile_active.css('a[href*=lives] .count::text').re_first(r'\d+')
        item['badges'] = profile_active.css('a[href*=badges] .count::text').re_first(r'\d+')
        # Link to the badge detail page, fetched last.
        badge_url = profile_active.css('a[href*=badges]::attr(href)').extract_first()

        # --- Skill panel ---
        profile_skill = response.xpath("//div[@class='col-md-3']")
        item['skills'] = profile_skill.css('.tag::text').re(r'\w+')
        item['like'] = profile_skill.css('.authlist').re_first(r'获得 (\d+) 次点赞')
        # Raw text; normalised later by the spider middleware.
        item['register_date'] = profile_skill.css('.profile__skill--other p::text').extract_first()

        # --- Output panel ---
        profile_work = response.xpath("//div[@class='col-md-7']")
        item['answers_top_score'] = profile_work.css('#navanswer .label::text').re_first(r'\d+')
        item['answers_top_title'] = profile_work.css(
            '#navanswer div[class*=title-warp] > a::text').extract_first()
        answer_url = profile_work.css(
            '#navanswer div[class*=title-warp] > a::attr(href)').extract_first()

        badge_page = response.urljoin(badge_url) if badge_url else None
        if answer_url:
            # Carry the item and the badge URL along to the question page.
            yield scrapy.Request(
                url=response.urljoin(answer_url),
                meta={'item': item, 'badge_url': badge_page},
                callback=self.parse_answer)
        elif badge_page:
            # No answers: skip straight to the badge page.
            yield scrapy.Request(url=badge_page, meta={'item': item},
                                 callback=self.parse_badge)
        else:
            # Nothing left to fetch; emit what was collected.
            yield item

    def parse_answer(self, response):
        """Add the top answer's question tags, question body and answer body."""
        item = response.meta['item']
        badge_url = response.meta['badge_url']
        item['answers_top_tags'] = response.css('.question__title--tag .tag::text').re(r'\w+')
        # Collect the text fragments between tags and join them.
        question_content = response.css('.widget-question__item p').re(r'>(.*?)<')
        item['answers_top_question'] = ''.join(question_content)
        answer_content = response.css('.qa-answer > article .answer').re(r'>(.*?)<')
        item['answers_top_content'] = ''.join(answer_content)
        if badge_url:
            # Finish with the badge page, passing the updated item along.
            yield scrapy.Request(url=badge_url, meta={'item': item},
                                 callback=self.parse_badge)
        else:
            yield item

    def parse_badge(self, response):
        """Replace the badge count with a ``{badge name: count}`` mapping."""
        item = response.meta['item']
        badge_name = response.css('span.badge span::text').extract()
        badge_count = response.css('span[class*=badges-count]::text').re(r'\d+')
        # zip stops at the shorter list, so a mismatch cannot raise IndexError.
        item['badges'] = dict(zip(badge_name, badge_count))
        yield item

middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

import datetime
import logging
import random
import re
import time

import pymongo
import scrapy
from pymongo import MongoClient
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware  # noqa: F401  (kept from the original import list)
from scrapy.utils.project import get_project_settings

# Project settings; replaces the long-deprecated ``from scrapy.conf import settings``.
settings = get_project_settings()

logger = logging.getLogger(__name__)


class SegmentfaultSpiderMiddleware(object):
    """Normalise the ``register_date`` text of scraped items to ``YYYYMMDD``.

    Three input forms are handled:
      1. 注册于 2015年12月12日  (absolute date)
      2. 注册于 3 天前          (N days ago)
      3. 注册于 5 小时前        (N hours ago)
    """

    def process_spider_output(self, response, result, spider):
        """Rewrite ``register_date`` on every item; pass everything else through.

        :param response: the response being processed
        :param result: iterable of requests and items produced by the spider
        :param spider: the running spider
        :return: yields each result, items with a normalised register date
        """
        for item in result:
            # Only scraped items carry a register date; requests pass untouched.
            if isinstance(item, scrapy.Item):
                register_date = item.get('register_date')
                logger.info("获取注册日志格式为{}".format(register_date))
                item['register_date'] = self._normalize_register_date(register_date)
            yield item

    @staticmethod
    def _normalize_register_date(register_date, now=None):
        """Convert one raw register-date string to ``YYYYMMDD`` ('' if unknown)."""
        if not register_date:
            return ''
        numbers = re.findall(r'\d+', register_date)
        if not numbers:
            return ''
        if len(numbers) >= 3:
            # Absolute date: zero-pad month and day so the width is always 8.
            year, month, day = numbers[:3]
            return '{}{:0>2}{:0>2}'.format(year, month, day)
        digits = ''.join(numbers)
        if len(digits) > 4:
            # Long digit run that is not year/month/day: keep it as found.
            return digits
        now = now or datetime.datetime.now()
        if '时' in register_date:
            # "N hours ago"
            moment = now - datetime.timedelta(hours=int(digits))
        else:
            # Remaining form is "N days ago".
            moment = now - datetime.timedelta(days=int(digits))
        return moment.strftime("%Y%m%d")


class SegmentfaultHttpProxyMiddleware(object):
    """Route every request through a randomly chosen proxy from PROXY_LIST."""

    def __init__(self):
        self.proxy_list = settings['PROXY_LIST']

    def process_request(self, request, spider):
        """Pick a proxy at random and attach it to the request."""
        proxy = random.choice(self.proxy_list)
        logger.info('使用代理:{}'.format(proxy))
        request.meta['proxy'] = proxy


class SegmentfaultUserAgentMiddleware(object):
    """Send every request with a randomly chosen User-Agent header."""

    def __init__(self):
        self.useragent_list = settings['USER_AGENT_LIST']

    def process_request(self, request, spider):
        """Pick a User-Agent at random and set it on the request headers."""
        user_agent = random.choice(self.useragent_list)
        request.headers['User-Agent'] = user_agent


class SegmentfaultCookiesMiddleware(object):
    """Attach stored login cookies to requests and rotate them once they expire."""

    # Shared MongoDB handles, opened once when the module is imported.
    client = MongoClient(settings['MONGO_URI'])
    db = client[settings['MONGO_DB']]
    collection = db['cookies']

    # Cookie holding the current time as a 10-digit epoch; rewritten on each use.
    TIMESTAMP_COOKIE = 'Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'

    def get_cookies(self):
        """Return one randomly chosen stored cookie set, ready to send.

        :return: dict mapping cookie name to cookie value
        :raises IndexError: when no cookies are stored
        """
        stored = list(self.collection.find())
        if not stored:
            raise IndexError("no cookies stored in MongoDB; run get_cookies.py first")
        cookies = random.choice(stored)
        # Drop the entries that must not be sent: Mongo's "_id" and "_gat".
        cookies.pop('_id', None)
        cookies.pop('_gat', None)
        # Refill the timestamp cookie with the current time.
        cookies[self.TIMESTAMP_COOKIE] = str(int(time.time()))
        return cookies

    def remove_cookies(self, cookies):
        """Delete the stored document(s) matching an expired cookie set.

        :param cookies: the cookie dict that was sent with the failed request
        """
        # Match on every pair except the timestamp, which was rewritten after
        # loading and therefore no longer equals the stored value.
        query = {name: value for name, value in dict(cookies or {}).items()
                 if name != self.TIMESTAMP_COOKIE}
        if not query:
            return
        try:
            logger.info("删除cookies{}".format(cookies))
            self.collection.delete_many(query)
        except pymongo.errors.PyMongoError:
            logger.exception("no this cookies:{}".format(cookies))

    def process_request(self, request, spider):
        """Attach a randomly chosen cookie set to every outgoing request.

        NOTE(review): Scrapy's built-in CookiesMiddleware (priority 700) runs
        before this one (743); confirm these cookies reach the Cookie header.
        """
        request.cookies = self.get_cookies()

    def process_response(self, request, response, spider):
        """Retry with fresh cookies when the site redirects to the login page.

        :param request: the request that produced ``response``
        :param response: the downloaded response
        :param spider: the running spider
        :return: a new request to reschedule when the login expired,
            otherwise the response unchanged
        """
        if response.status in (301, 302):
            logger.info("redirect response:{}".format(response))
            redirect_url = response.headers.get('Location', b'')
            if b'/user/login' in redirect_url:
                logger.info("cookies失效")
                # Remove the expired set first so it cannot be picked again.
                self.remove_cookies(request.cookies)
                try:
                    new_cookie = self.get_cookies()
                except IndexError:
                    logger.warning("no valid cookies left; passing response through")
                    return response
                logger.info("获取新cookie:{}".format(new_cookie))
                # dont_filter lets the retry past the duplicate filter.
                return request.replace(cookies=new_cookie, dont_filter=True)
        # A downloader middleware must always return a Response or a Request.
        return response

run.py

"""Refresh the stored login cookies, then launch the ``userinfo`` spider."""
from scrapy import cmdline

# from segmentfault.get_cookies import GetCookies
from get_cookies import GetCookies

if __name__ == '__main__':
    # Log in and store fresh cookies before the crawl starts.
    cookies = GetCookies()
    cookies.save()
    name = 'userinfo'
    cmd = 'scrapy crawl {}'.format(name)
    cmdline.execute(cmd.split())

到此这篇关于 Scrapy 项目实战之爬取某社区用户详情的文章就介绍到这了，更多相关 Scrapy 爬取某社区用户内容请搜索www.887551.com以前的文章或继续浏览下面的相关文章，希望大家以后多多支持www.887551.com！

本文发布于:2023-04-04 04:57:13，感谢您对本站的认可！

本文链接：https://www.wtabcd.cn/fanwen/zuowen/de39a056379db7d34a5b41e860142bfb.html

本文word下载地址：Scrapy项目实战之爬取某社区用户详情.doc

本文 PDF 下载地址：Scrapy项目实战之爬取某社区用户详情.pdf

上一篇：使用.NET 6开发TodoList应用之引入数据存储的思路详解

下一篇：返回列表