User Usage Rate and Usage Probability Distribution
1 Objective
Compute the user usage rate and the usage probability distribution.
1.2 Program flow
df1: Sanhui (three-meetings) terminal user login data
df2: user information data
df3: announcement data
df4: df1+df2
df5: each user's first login time
df: left join of user login times with announcement times, dropping announcements published before the first login
df6: announcement count S; df deduplicated on user and announcement time, then grouped by user
df7: usage count T; rows of df where the gap between announcement time and login time is <= 7 days
df8: user usage rate, df7/df6
df9: effective announcement times; rows of df where the gap between announcement time and login time is <= 7 days
df10: all announcement times from df, ranked per user by announcement time (e.g. a user with S = 5 announcements gets ranks 1, 2, 3, 4, 5 in chronological order)
df11: left join of df10 and df9, with a new column marking where each effective announcement sits in the user's overall announcement sequence
df12: numerator of the normalization
df13: each user's total announcement count (last rank taken from df10)
df14: denominator of the normalization (df10 joined with df13)
df15: normalized result, df12/df14
df16: user information combined with the usage rate and the (normalized) distribution of usage along the announcement timeline
data: final output
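The two target quantities reduce to simple arithmetic. A minimal sketch with made-up counts (not taken from the real data) shows what df8 and df15 ultimately compute:
# Hypothetical user: S announcements after the first login, T of them preceded by a login within 7 days.
S = 5
T = 3
usage_rate = T / S  # df8: 0.6
# Normalized usage distribution: chronological positions of the "used" announcements,
# here assumed to be positions 1, 3 and 4, relative to the positions of all S announcements.
used_positions = [1, 3, 4]
numerator = sum(p / S for p in used_positions)      # df12: (1 + 3 + 4) / 5 = 1.6
denominator = sum(p / S for p in range(1, S + 1))   # df14: (1 + 2 + 3 + 4 + 5) / 5 = 3.0
probability_distribution = numerator / denominator  # df15: about 0.53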
2 Data Cleaning
2.1 Processing user login data
》 Download the Sanhui terminal user login data:
Log in to Kibana:
import pandas as pd
df1 = pd.read_csv("D:/0. data_analysis/2. data_analysis/sanhuizhongduan/data/table.csv", names = ["companyCode","date","loginTimes"])
df1 = df1.drop(index=0)  # drop the first row, which holds the file's own header
# df.columns = ["companyCode","date","loginTimes"]
df1['companyCode'] = df1['companyCode'].str[0:6]  # clean up the company code
"""
提取上市公司信息,去除⼲扰项
"""妈妈简笔画
def listedCompanyExtract(df3):  # listed companies
    import pandas as pd
    d1 = df3[df3.companyCode.str.startswith('00')]  # SME board and SZSE main board
    d2 = df3[df3.companyCode.str.startswith('30')]  # ChiNext
    d3 = df3[df3.companyCode.str.startswith('43')]  # NEEQ
    d4 = df3[df3.companyCode.str.startswith('60')]  # SSE main board
    d5 = df3[df3.companyCode.str.startswith('83')]  # NEEQ
    d6 = df3[df3.companyCode.str.startswith('20')]  # Shenzhen B shares
    d7 = df3[df3.companyCode.str.startswith('90')]  # Shanghai B shares
    df9 = pd.concat([d1,d2,d3,d4,d5,d6,d7]).reset_index(drop=True)
    df6 = df3[df3.companyCode.str.startswith('58')]  # rows whose companyCode starts with 58: regulators
    df4 = df3[df3.companyCode.str.startswith('1')]   # rows whose companyCode starts with 1: intermediaries
    df7 = df3[df3.companyCode.str.startswith('78')]  # rows whose companyCode starts with 78: pre-IPO companies
    df8 = df3.append([df9,df7,df6,df4])
    df8.drop_duplicates(keep = False, inplace = True)  # what is left over are the remaining, uncategorized rows
    return df9,df7,df6,df4,df8
d = listedCompanyExtract(df1)
df1 = d[0]
df1.shape
2.2 Processing user information data
》 Download the user data:
Log in to the internal management system:
df2 = pd.read_excel("D:/0. data_analysis/2. data_analysis/sanhuizhongduan/data/客户管理_191030165630.xls")
df2 = df2[["公司代码","公司简称","所在省市","所属区域","签约状态"]]  # keep only the required columns
df2.rename(columns = {"公司代码":"companyCode","公司简称":"companyName",
                      "所在省市":"provinceOrcity","所属区域":"region","签约状态":"contractStatus"}, inplace = True)
"""处理所在省市"""
provinceOrcity1 = df2["provinceOrcity"].tolist()
provinceOrcity = []
for i in provinceOrcity1:
provinceOrcity.append(i.split("-")[0])
df2["provinceOrcity"] = provinceOrcity
"""公司代码处理"""
def companyCodeDeal(df3):
    import numpy as np
    df3 = df3.dropna(subset=['companyCode'])  # drop rows where companyCode is empty
    df3['companyCode'] = df3['companyCode'].astype(np.int64)  # cast to integer to drop the trailing .0
    df3['companyCode'] = df3['companyCode'].astype(str).str.zfill(6)  # left-pad the code back to 6 digits
    return df3
df2 = companyCodeDeal(df2)
df2.shape
2.3 Processing user announcement counts
》 Download the user announcement data:
Log in to the Yidong platform:
"""
1 同⼀⽂件夹下多个格式相同的数据集合并
path为⽂件夹所在位置,例如path = 'D:/3. shiyan/data' ,data = MergeData(path)
调⽤:
path = 'D:/3. shiyan/data'
data = MergeData(path)
"""
def MergeData(path):
    import os
    import pandas as pd
    # path = 'D:/3. shiyan/data'  # folder that holds the data files
    files = os.listdir(path)  # list every file name in the folder
    df1 = pd.read_excel(path + '/' + files[0])  # read the first file into df1
    for file in files[1:]:
        df2 = pd.read_excel(path + '/' + file)  # read the next file into df2
        df1 = pd.concat([df1,df2],axis=0,ignore_index=True)  # append df2 to df1
    df1 = df1.drop_duplicates()  # drop duplicate rows
    df1 = df1.reset_index(drop=True)  # rebuild the index
    # df1.to_csv(path + '/' + 'total.csv')  # optionally save the merged result to a new csv file
    return df1
path = 'D:/0. data_analysis/2. data_analysis/sanhuizhongduan/data/sanhui'
df3 = MergeData(path)
# df4 = df3.drop(columns=["Unnamed: 10", "Unnamed: 9","企业性质","公告类型","公告类型/业务分类"])  # drop unneeded columns
df3 = df3[["公司代码","公告日期"]]
""" Handle rows that contain two company codes at once """
l1 = df3["公司代码"].tolist()
l23 = []
for i in l1:
    companyCode = i.split(",")
    if len(companyCode) == 2:
        # keep the A-share code: drop the B-share codes starting with 2 or 9
        companyCode = [c for c in companyCode if not c.startswith("2")]
        companyCode = [c for c in companyCode if not c.startswith("9")]
    l23.append(companyCode[0])
df3["公司代码"] = l23
df3.rename(columns = {"公司代码":"companyCode","公告日期":"bulletinData"}, inplace = True)  # the English names are used from here on
df3.drop_duplicates(subset = ["companyCode","bulletinData"], inplace = True)  # drop duplicate announcements on the same day
3 Data Integration and Calculation
3.1 Merging user information and selecting users within the time range
Choose the time range for the data.
"""Merge the user login records with the user information"""
df4 = pd.merge(df1, df2, on = "companyCode", how="left")
df4 = df4[["date","companyCode","companyName","provinceOrcity","region","contractStatus","loginTimes"]]
# df4["loginTimes"] = 1
df4 = df4.dropna(subset=['companyCode', 'companyName'])
"""选取时间范围内有效数据"""
import time
df4 = df4.sort_values(['companyCode','date'],ascending = True)
df4['date'] = pd.to_datetime(df4['date'])
df4 = df4.set_index('date')
df4 = df4['2019-10-14':'2019-10-20']
df4['date'] = df4.index.get_level_values(0).values
df4 = df4.reset_index(drop=True)
3.2 First login time of each user within the time range
df5 = df4.sort_values(['companyCode','date'],ascending = True)
df5 = df5.drop_duplicates(subset = ["companyCode"], keep='first')
df5.shape
3.3 Combining user login times with announcement times
df = pd.merge(df4, df3, on = "companyCode", how = "left")  # join on the users who logged in
df.shape
"""去除⽤户⾸次登陆前发布的公告时间"""
import numpy as np
df["bulletinData"] = pd.to_datetime(df["bulletinData"])
df["minus"] = df["bulletinData"] - df["date"]
# df["minus1"] = df["minus"]
df['minus'] = df['minus'].astype(np.int64)  # cast the timedelta to integer nanoseconds
df = df.loc[df["minus"] >= 0]
df['minus'] = df['minus'].astype(str)  # cast to string
df['minus'] = df['minus'].str[0:-9]  # drop the last 9 digits, i.e. convert nanoseconds to whole seconds
# df['minus'] = df['minus'].astype(np.int64)  # cast back to integer
""" 将时间差由秒改为天数"""
listd = df["minus"].tolist()
for i in listd:
if i =="":
listd[listd.index(i)] = 0
listd1= []
for i in listd:
i = int(i)
listd1.append(i/60/60/24)
df['minus'] = listd1
df.shape
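The nanosecond/string round trip above can also be done with a single timedelta division; a minimal alternative sketch, assuming bulletinData and date are still datetime columns at this point:
# Dividing a timedelta by one day yields the difference directly as fractional days.
df["minus"] = (df["bulletinData"] - df["date"]) / pd.Timedelta(days=1)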
3.4 Number of announcements published after login (meeting count S)
df6 = df.sort_values(['companyCode','bulletinData'],ascending = True)
df6 = df6.drop_duplicates(subset = ["companyCode","bulletinData"], keep='first')  # several announcements on the same day count as one
df6 = df6.groupby(["companyCode"], sort = False).count()
df6.shape
3.5 Number of uses before announcements, T (a login within the 7 days before an announcement counts as one use)
df7 = df.loc[df["minus"] <=7]
df7 = df7.sort_values(['companyCode','bulletinData'],ascending = True)
df7 = df7.drop_duplicates(subset = ["companyCode","bulletinData"], keep='first')
# df7 = df7.drop_duplicates(subset = ["companyCode","date"], keep='first')
df7 = df7.groupby(["companyCode"], sort = False).count()
df7.shape
3.6 User usage rate (T/S)
df8 = pd.merge(df6,df7,on = "companyCode",how = "left")
df8 = df8[["companyName_x","companyName_y"]]
df8.rename(columns = {"companyName_x":"S","companyName_y":"T"}, inplace = True)  # the row counts from df6 and df7 are S and T
df8 = df8.fillna(0)
df8["usageRate"] = df8["T"]/df8["S"]
3.7 Normalized comparison of usage intensity
"""Get the announcement times inside each user's effective window, i.e. announcements actually preceded by Sanhui usage"""
df9 = df.loc[df["minus"] <=7]
df9 = df9.sort_values(['companyCode','bulletinData'],ascending = True)
df9 = df9.drop_duplicates(subset = ["companyCode","bulletinData"], keep='first')
df9 = df9[["companyCode","bulletinData"]]
"""获取⽤户有效时间内发布公告时间"""
df10 = df.sort_values(['companyCode','bulletinData'],ascending = True)
df10 = df10.drop_duplicates(subset = ["companyCode","bulletinData"], keep='first')
df10 = df10[["companyCode","bulletinData"]]
df10['sort_id'] = df10['bulletinData'].groupby(df10['companyCode']).rank()  # rank announcements separately within each companyCode
df11 = pd.merge(df10,df9,on = "companyCode",how = "left")
"""获取时间范围内发布时间与使⽤时间统⼀的位置"""
df11['equl'] = df11[['bulletinData_x', 'bulletinData_y']].apply(lambda x: x['bulletinData_x'] == x['bulletinData_y'], axis=1) # 使⽤会议管理时间和发布公告时间是否相等
df11 = df11.loc[df11["equl"] == True]
df11 = df11[["companyCode","sort_id"]]
"""归⼀化分⼦,使⽤次数位置除以总体公告数之和"""
df12 = pd.merge(df6,df11, on ="companyCode",how = "left")
df12["n1/S"] = df12["sort_id"]/df12["minus"]
df12 = upby("companyCode",sort =True).sum()
df12 = df12[["n1/S"]]
df12.head(10)
"""归⼀化分母,公告次数位置除以总体公告数之和"""
df13 =df10
df13 = df13.drop_duplicates(subset = ["companyCode"], keep='last')
df14 = pd.merge(df10,df13, on = "companyCode", how = "left")
# df13["sort_id1"] = df13["sort_id"]
df14["n2/S"] = df14["sort_id_x"]/df14["sort_id_y"]
df14 = df14.groupby("companyCode", sort = True).sum()
df14 = df14[["n2/S"]]
df14.head(10)
"""归⼀化,计算⽤户使⽤概率分布"""
df15 = pd.merge(df12,df14,on = "companyCode")
df15["ProbabilityDistributions"] = df15["n1/S"]/df15["n2/S"]
df15.shape
df15.head(10)
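As a sanity check on the rank/merge/groupby steps above, the toy fragment below builds one hypothetical user with five announcements, of which the 1st, 3rd and 4th are treated as effective, and reproduces the same normalization; every value here is invented for illustration:
toy_all = pd.DataFrame({"companyCode": ["000001"] * 5,
                        "bulletinData": pd.to_datetime(["2019-10-14", "2019-10-15", "2019-10-16",
                                                        "2019-10-17", "2019-10-18"])})
toy_all["sort_id"] = toy_all.groupby("companyCode")["bulletinData"].rank()     # ranks 1..5
toy_used = toy_all.loc[[0, 2, 3], ["companyCode", "bulletinData"]]             # the "effective" announcements
S = toy_all.groupby("companyCode")["bulletinData"].count()                     # S = 5 per user
numerator = toy_all.merge(toy_used, on=["companyCode","bulletinData"])["sort_id"].sum() / S  # (1+3+4)/5
denominator = toy_all["sort_id"].sum() / S                                                   # (1+2+3+4+5)/5
print((numerator / denominator).iloc[0])  # about 0.53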
4 Output
df16 = pd.merge(df8,df15,on = "companyCode")
df16 = df16[["S","T","usageRate","ProbabilityDistributions"]]
data = pd.merge(df2,df16, on = "companyCode", how = "right")
data.shape
data["使⽤率"] = data["使⽤率"].apply(lambda x: format(x, '.2%'))
data["连续性归⼀化处理"] = data["连续性归⼀化处理"].apply(lambda x: format(x, '.2%'))
data.head(1)
# summary of the user usage distribution
list1 = data["companyCode"].tolist()
list2 = df5["companyCode"].tolist()
dd = data.loc[data["使用率"] != "0.00%"]
dict1 = {"登录三会模块总用户数": len(list2),
         "登陆后有发布公告用户数": len(list1),
         "登陆后有使用用户数": dd.shape[0]}
ddd = pd.DataFrame(dict1, index=["数量"])  # summary table
ddd = ddd.T
from datetime import datetime
writer = pd.ExcelWriter("D:/0. data_analysis/2. data_analysis/sanhuizhongduan/三会终端数据分析%s.xls" % (datetime.now().strftime("%Y%m%d")))
data.to_excel(writer, sheet_name='登陆后发布公告用户数')
df5.to_excel(writer, sheet_name='登陆总用户数')
ddd.to_excel(writer, sheet_name='数据统计')
writer.save()
writer.close()