Python Web Scraping in Practice + Data Analysis + Data Visualization (Maoyan Movies)
Part 1: The Crawler
Notes on the crawler:
1. The crawler is structured in an object-oriented way
2. The scraped data is stored in a MongoDB database (see the quick check after this list)
3. The crawler code is commented in detail
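As a quick way to verify point 2 after a crawl, the stored documents can be read back with pymongo. This is a minimal sketch, assuming a local MongoDB on the default port; the test/cat_movie database and collection names are the ones the analysis script in Part 2 reads from.

from pymongo import MongoClient

client = MongoClient()  # assumes a local MongoDB on the default port 27017
collection = client['test']['cat_movie']  # the collection analysis.py reads back
print(collection.count_documents({}))        # how many movies were stored
print(collection.find_one({}, {'_id': 0}))   # inspect one scraped item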
Code
import re
import time
from urllib import parse

import requests
from lxml import etree
from pymongo import MongoClient
class CatMovie():
    def __init__(self):
        # The original listing kept only the URL paths; maoyan.com is Maoyan's domain
        self.start_url = 'https://maoyan.com/films?showType=3&offset=0'
        self.url_temp = 'https://maoyan.com/films?showType=3&offset={}'
        self.detail_url = '/films/{}'
        # Request headers (the Cookie is session data captured by the author)
        self.headers = {
            "Cookie": "__mta=143397386.1607405956154.1608533524873.1608533569928.76; _lxsdk_cuid=174f6b873b49b-005ed8da7476a-3d634f03-144000-174f6b873b5c8; uuid_n_v=v1; __utma=17099173.1780976830.1607406113.1607406113.1607406113.1; __utmz=17099173.1607406113.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); recentCis=52%3D73%3D1; _lxsdk=92DE8D903FA311EB97145540D12763BA74A99EC69EF74E288E03A6373ED78378; _lx_utm=utm_source%3Dmeituanweb; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1608533269,1608533300,1608533544,1608533623; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1608533623; _lxsdk_s=176840dd994-603-17f-1df%7C%7C34; __mta=143397386.1607405956154.1608533569928.1608533622616.77; uuid=70554980435911EB91B6516866DCC34951FCE88748C84B79A0630808E1889048; _csrf=be6825573a1247a5dcf2ed5a6100bacaacccd21643c45ee79a3a7a28c1bb32e9; lt=3dN05zd6hwM_WEa3scYBnu5qcEoAAAAARgwAAMUreBNzDKR9eCuGuYWtOPWt5ULO65alj1dffuIQJisgN0lrWp0kJkyABp6Ly8cJ2A; lt.sig=y5Xz3WT9ooI2TpIM7pzKU9CROfo",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
        }
        # Mapping table for Maoyan's custom (obfuscated) digit font
        self.font_dict = {'unif4ef': '6', 'unif848': '3', 'unif88a': '7', 'unie7a1': '9',
                          'unie343': '1', 'unie137': '8', 'unif489': '0', 'unie5e2': '4',
                          'unif19b': '2', 'unie8cd': '5'}
        self.client = MongoClient()
    # Build the list-page URLs (the site pages by an offset of 30)
    def get_url_list(self):
        total_num = 2000
        page = total_num // 30 + 1
        return [self.url_temp.format(i * 30) for i in range(0, page + 1)]
    # Request a URL and return the decoded response body
    def parse_url(self, url):
        response = requests.get(url, headers=self.headers)
        return response.content.decode()
    # Parse the list page and collect its data
    def get_content_list(self, html_str):
        movie_ids = re.findall(r'href="/films/(.*?)" target', html_str)
        item_list = []
        # Each id appears several times on the list page, so take every third match
        for i in movie_ids[::3]:
            item = {}
            detail_url = self.detail_url.format(i)
            # Take each movie's unique id and build the detail-page URL with urljoin
            item['detail_url'] = parse.urljoin(self.start_url, detail_url)
            item = self.parse_detail(item['detail_url'], item)
            print(item)
            item_list.append(item)
        return item_list
    # Parse the detail page and extract its data
    def parse_detail(self, url, item):
        time.sleep(0.1)
        response = requests.get(url, headers=self.headers)
        # Replace the &#x...; entities of the obfuscated font with *uni...* markers
        # so they are easy to locate later
        html_str = re.sub(r'&#x(\w+)?;', r'*uni\1*', response.content.decode())
        html_str = etree.HTML(html_str)
        # Conditional expressions are used for most fields so that a missing
        # value does not raise an exception; this keeps the crawler stable
        movie_name = html_str.xpath('//div[@class="movie-brief-container"]/h1/text()')
        item['movie_name'] = movie_name[0] if len(movie_name) > 0 else None
        movie_type = html_str.xpath('//div[@class="movie-brief-container"]/ul/li[1]/a/text()')
        movie_type = movie_type if len(movie_type) > 0 else None
        if movie_type is not None:
            item['movie_type'] = '·'.join([i.strip() for i in movie_type])
        else:
            item['movie_type'] = '类型未知'
        area_time = html_str.xpath('//div[@class="movie-brief-container"]/ul/li[2]/text()')
        area_time = area_time[0] if len(area_time) > 0 else None
        if area_time is not None:
            area_time = area_time.split('/')
            item['movie_area'] = area_time[0].strip() if len(area_time) > 0 else '上映国家未知'
            item['movie_duration'] = area_time[1].strip() if len(area_time) > 1 else '电影时长未知'
        else:
            item['movie_area'] = '上映国家未知'
            item['movie_duration'] = '电影时长未知'
        movie_publish = html_str.xpath('//div[@class="movie-brief-container"]/ul/li[3]/text()')
        movie_publish = movie_publish[0] if len(movie_publish) > 0 else None
        if movie_publish is not None:
            item['movie_publish'] = re.findall(r'(\d+-\d+-\d+)', movie_publish)
            item['movie_publish'] = item['movie_publish'][0] if len(item['movie_publish']) > 0 else movie_publish
        else:
            item['movie_publish'] = '上映时间未知'
        movie_score = html_str.xpath('//div[@class="movie-index-content score normal-score"]/span/span/text()')
        movie_score = movie_score[0] if len(movie_score) > 0 else None
        if movie_score is not None:
            # Decode the obfuscated digits through the font mapping
            item['movie_score'] = re.sub(r'(\*[a-z0-9]+?\*)',
                                         lambda x: self.font_dict[x.group(1).strip('*')],
                                         movie_score)
        else:
            item['movie_score'] = '电影评分未知'
        movie_comments = html_str.xpath('//span[@class="score-num"]/span/text()')
        movie_comments = movie_comments[0] if len(movie_comments) > 0 else None
        if movie_comments is not None:
            item['movie_comments'] = re.sub(r'(\*[a-z0-9]+?\*)',
                                            lambda x: self.font_dict[x.group(1).strip('*')],
                                            movie_comments)
        else:
            item['movie_comments'] = '评论人数未知'
        movie_booking = html_str.xpath('//div[@class="movie-index-content box"]/span[1]/text()')
        movie_booking = movie_booking[0] if len(movie_booking) > 0 else None
        if movie_booking is not None:
            unit = html_str.xpath('//div[@class="movie-index-content box"]/span[2]/text()')
            unit = unit[0] if len(unit) > 0 else ''
            item['movie_booking'] = re.sub(r'(\*[a-z0-9]+?\*)',
                                           lambda x: self.font_dict[x.group(1).strip('*')],
                                           movie_booking) + unit
        else:
            item['movie_booking'] = '电影票房未知'
        movie_director = html_str.xpath('//div[@class="celebrity-container"]//div[1]//div[@class="info"]//a/text()')
        movie_director = movie_director[0] if len(movie_director) > 0 else None
        if movie_director is not None:
            item['movie_director'] = movie_director.strip()
        else:
            item['movie_director'] = '导演未知'
        return item
    # Save the data to MongoDB (the loop body was lost in the original listing;
    # inserting into test.cat_movie matches the collection analysis.py reads back)
    def save(self, content_list):
        for i in content_list:
            self.client['test']['cat_movie'].insert_one(i)
    # Main method of the crawler
    def run(self):
        url_list = self.get_url_list()
        for i in url_list:
            time.sleep(0.5)
            html_str = self.parse_url(i)
            item_list = self.get_content_list(html_str)
            # self.save(item_list)
if __name__ == '__main__':
    movie = CatMovie()
    movie.run()
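The most unusual part of the crawler is the font de-obfuscation: Maoyan serves the digits in scores, comment counts and box-office figures as private-use glyph entities such as &#xe7a1;, which only render as the right digit through the site's font file. Below is a minimal, self-contained sketch of the two-step re.sub decoding used in parse_detail. The sample HTML fragment is invented for illustration, and the mapping values change whenever the site rotates its font file.

import re

# Same mapping the crawler builds in __init__ (values are tied to one font file)
font_dict = {'unif4ef': '6', 'unif848': '3', 'unif88a': '7', 'unie7a1': '9',
             'unie343': '1', 'unie137': '8', 'unif489': '0', 'unie5e2': '4',
             'unif19b': '2', 'unie8cd': '5'}

# Hypothetical raw fragment: the score "9.1" rendered with the custom font
raw = '<span class="stonefont">&#xe7a1;.&#xe343;</span>'

# Step 1: turn every &#x...; entity into a *uni...* marker
marked = re.sub(r'&#x(\w+)?;', r'*uni\1*', raw)
# Step 2: replace every marker with the digit it maps to
decoded = re.sub(r'(\*[a-z0-9]+?\*)', lambda x: font_dict[x.group(1).strip('*')], marked)
print(decoded)  # <span class="stonefont">9.1</span>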
Part 2: Data Analysis and Data Visualization
Notes on the data analysis and visualization:
1. This post uses the Flask framework for the data analysis and data visualization (a minimal serving sketch follows this list)
2. The project architecture diagram (figure omitted)
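As a hedged sketch of how the Flask side could expose one of the analysis results below to a charting front end: the route name, the load_df helper, and the wiring are illustrative assumptions, not the post's exact code.

from flask import Flask, jsonify
import pandas as pd
from pymongo import MongoClient

from analysis import movie_date_publish_count  # defined in analysis.py below

app = Flask(__name__)

def load_df():
    # Hypothetical helper: in the real project, the cleaning steps from the
    # __main__ block of analysis.py must run before the analysis functions
    movies = MongoClient()['test']['cat_movie'].find({}, {'_id': 0})
    return pd.DataFrame(movies)

@app.route('/api/publish_count')
def publish_count():
    # The analysis functions return [[x, y], ...] pairs, which chart
    # libraries such as ECharts can consume directly
    return jsonify(movie_date_publish_count(load_df()))

if __name__ == '__main__':
    app.run(debug=True)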
Code
Data analysis code (analysis.py)
import re
from pymongo import MongoClient
import pandas as pd
import numpy as np
import pymysql
# Number of movies released per year
def movie_date_publish_count(df):
    grouped = df.groupby('movie_publish_year')['movie_type'].count().reset_index()
    data = grouped.to_dict(orient='records')
    # Convert the result into nested [year, count] pairs
    data = [[str(i['movie_publish_year']), i['movie_type']] for i in data]
    return data
# Top 10 regions by number of movies released
def movie_country_publish_top10(df):
    # A movie may be released in several regions, separated by commas,
    # so split the movie_area column into lists
    series_country = df['movie_area'].str.split(',').tolist()
    # A set keeps only one copy of each region
    list_country = set([j for i in series_country for j in i])
    # Build an all-zeros matrix to count releases per region
    zero_list = pd.DataFrame(np.zeros((len(series_country), len(list_country))),
                             columns=list(list_country))
    for i in range(len(zero_list)):
        zero_list.loc[i, series_country[i]] = 1
    # Aggregate: column sums give the number of movies per region
    country_movie_counts = zero_list.sum().astype(int)
    country_movie_counts = country_movie_counts.reset_index()
    country_movie_counts.columns = ['movie_area', 'count']
    # Sort and keep the ten regions with the most releases
    country_movie_counts = country_movie_counts.sort_values(by='count', ascending=False)[:10]
    data = [[i['movie_area'], i['count']] for i in country_movie_counts.to_dict(orient='records')]
    return data
# Top 10 movies by box office
def movie_booking_top10(df):
    # Sort by box office and keep the top ten
    df = df.sort_values(by='movie_booking', ascending=False)
    movie_name_to_booking = df[['movie_name', 'movie_booking']][:10]
    data = [[i['movie_name'], i['movie_booking']] for i in movie_name_to_booking.to_dict(orient='records')]
    print(data)
    return data
# Top 10 movies by number of comments
def movie_comment_top10(df):
    # Sort by comment count and keep the top ten
    df = df.sort_values(by='movie_comments', ascending=False)
    movie_name_to_comments = df[['movie_name', 'movie_comments']][:10]
    data = [[i['movie_name'], i['movie_comments']] for i in movie_name_to_comments.to_dict(orient='records')]
    print(data)
    return data
# Number of movies in each score band
def movie_score_count(df):
    # Split the scores into three bands: <7.0, 7.0-8.0, and >8.0
    grouped1 = df[df['movie_score'] < 7.0]['movie_score']
    grouped2 = df[(df['movie_score'] >= 7.0) & (df['movie_score'] <= 8.0)]['movie_score']
    grouped3 = df[df['movie_score'] > 8]['movie_score']
    movie_score_to_count = [{'movie_score': '<7.0', 'count': len(grouped1)},
                            {'movie_score': '7.0-8.0', 'count': len(grouped2)},
                            {'movie_score': '>8.0', 'count': len(grouped3)}]
    data = [[i['movie_score'], i['count']] for i in movie_score_to_count]
    return data
# Top 10 movie types by number of movies
def movie_type_count(df):
    # A movie may have several types, separated by '·',
    # so split the movie_type column into lists
    series_movie_type = df['movie_type'].str.split('·').tolist()
    movie_type_list = [j for i in series_movie_type for j in i]
    # A set keeps only one copy of each type
    movie_type = set(movie_type_list)
    # Build an all-zeros matrix to count movies per type
    zero_list = pd.DataFrame(np.zeros((len(df), len(movie_type))),
                             columns=list(movie_type))
    for i in range(len(df)):
        zero_list.loc[i, series_movie_type[i]] = 1
    # Aggregate: column sums give the number of movies per type
    movie_type_counts = zero_list.sum().astype(int)
    movie_type_counts = movie_type_counts.reset_index()
    movie_type_counts.columns = ['movie_type', 'count']
    # Sort and keep the ten most common types
    movie_type_counts = movie_type_counts.sort_values(by='count', ascending=False)[:10]
    data = [[i['movie_type'], i['count']] for i in movie_type_counts.to_dict(orient='records')]
    return data
if __name__ == '__main__':
    # Connect to MongoDB
    client = MongoClient()
    collection = client['test']['cat_movie']
    # Fetch the data
    movies = collection.find({}, {'_id': 0})
    df = pd.DataFrame(movies)
    # Print basic information
    print(df.info())
    print(df.head())
    # Keep only the useful columns
    df = df[['movie_name', 'movie_type', 'movie_area', 'movie_duration', 'movie_publish',
             'movie_score', 'movie_comments', 'movie_booking']]
    # Filter out the rows that were marked unknown during crawling
    df = df[df['movie_type'].str.contains('类型未知') == False]
    df = df[df['movie_area'].str.contains('上映国家未知') == False]
    df = df[df['movie_duration'].str.contains('电影时长未知') == False]
    df = df[df['movie_publish'].str.contains('上映时间未知') == False]
    df = df[df['movie_score'].str.contains('电影评分未知') == False]
    df = df[df['movie_comments'].str.contains('评论人数未知') == False]
    df = df[df['movie_booking'].str.contains('暂无') == False]
    # Clean the data and convert column types
    # Strip '分钟' (minutes) from movie_duration and convert to int
    df['movie_duration'] = df['movie_duration'].apply(lambda x: int(re.findall(r'(\d+)分钟', x)[0]))
    # Convert movie_score to float
    df['movie_score'] = df['movie_score'].apply(lambda x: float(x))
    # Normalize movie_comments to plain counts ('万' = 10,000)
    df['movie_comments'] = df['movie_comments'].apply(
        lambda x: int(float(re.findall(r'(.*)万', x)[0]) * 10000) if len(re.findall(r'万', x)) > 0 else int(x))
    # Normalize movie_booking to a single unit of '亿' (1亿 = 10,000万)
    df['movie_booking'] = df['movie_booking'].apply(
        lambda x: float(re.findall(r'(.*)亿', x)[0]) if len(re.findall('亿', x)) > 0
        else round(float(x.split('万')[0]) / 10000, 2))
    # Convert movie_publish to a pandas datetime and extract the release year
    df['movie_publish'] = df['movie_publish'].apply(
        lambda x: re.findall(r'(.*)中国大陆上映', x)[0] if len(re.findall(r'中国大陆上映', x)) > 0 else x)
    df['movie_publish'] = pd.to_datetime(df['movie_publish'])
    date = pd.DatetimeIndex(df['movie_publish'])
    df['movie_publish_year'] = date.year
    # Number of movies released per year
    # data = movie_date_publish_count(df)
    # Top 10 regions by number of movies released
    # data = movie_country_publish_top10(df)
    # Top 10 movies by box office
    # data = movie_booking_top10(df)
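movie_country_publish_top10 and movie_type_count share one technique worth isolating: one-hot counting with a zero matrix. Build an all-zeros DataFrame with one column per category, set the matching cells to 1 row by row, then take column sums. A toy, self-contained sketch with made-up data:

import numpy as np
import pandas as pd

# Made-up sample: each movie can be released in several regions
series_country = [['中国大陆', '美国'], ['美国'], ['日本', '美国']]
categories = list(set(j for i in series_country for j in i))

# All-zeros matrix: one row per movie, one column per region
zero_df = pd.DataFrame(np.zeros((len(series_country), len(categories))), columns=categories)
for i in range(len(zero_df)):
    zero_df.loc[i, series_country[i]] = 1  # mark the regions of movie i

# Column sums = number of movies per region (tie order may vary)
print(zero_df.sum().astype(int).sort_values(ascending=False))
# 美国      3
# 中国大陆    1
# 日本      1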