pandas实现分类汇总--小计,总计
有一批数据需要分类汇总和总计,看了一下pandas的groupby,可以实现。具体思路:先分组,分组后计算该分类的汇总小计,然后对dataframe进行拼接;分类汇总计算好了之后,计算总体的汇总,然后再进行拼接。
具体代码:
"""
pandas 实现分类汇总总计
"""
import os
import sys
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(BASE_DIR)
import pandas as pd
def gen_pivot_table(df, calculate_fields, special_fields, groupby_fields):
    """Build per-group subtotals plus a grand total and export them to Excel.

    :param df: source DataFrame.
    :param calculate_fields: names of the columns that are summed.
    :param special_fields: iterable of ``(numerator, denominator)`` column
        pairs; for each pair the ratio of the two column sums is appended
        after the summed values (see ``calculation_total``).
    :param groupby_fields: column name(s) the data is grouped by.
    :return: the combined DataFrame that was written to the workbook.
    """
    # The grand total is computed on the untouched input, before any grouping.
    last_total_df = calculation_total(df, calculate_fields, special_fields, name=u"总计")
    print("last_total_df: {}\n".format(last_total_df))

    sub_total_df = calculation_sub_total(df, groupby_fields, calculate_fields, special_fields)
    print("sub_total_df: {}\n".format(sub_total_df))

    # Grouped detail rows and subtotals first, grand total at the bottom.
    new_df = pd.concat([sub_total_df, last_total_df], axis=0)
    new_df["⼈均播放时长(秒)"] = new_df["⼈均播放时长(秒)"].apply(conds_2_minutes)

    # ``encoding`` is intentionally not passed: DataFrame.to_excel no longer
    # accepts it in pandas 2.x.
    new_df.to_excel(os.path.join(BASE_DIR, u"分类汇总.xlsx"), index=False)
    return new_df
def calculation_total(df, calculation_fields, specical_fields, name=u"⼩计"):
    """Return a two-row DataFrame holding a total row and a blank spacer row.

    The total row is laid out positionally against ``df``'s columns: the
    label ``name`` in the first column, an empty string in the second, then
    one sum per entry of ``calculation_fields`` and one ratio per entry of
    ``specical_fields``. Any remaining columns are filled with empty strings.

    :param df: DataFrame to summarise; its columns become the result columns.
    :param calculation_fields: names of the columns to sum.
    :param specical_fields: iterable of ``(numerator, denominator)`` column
        pairs, or a falsy value to skip; each pair contributes
        ``round(sum(numerator) / sum(denominator), 1)``.
    :param name: label placed in the first column of the total row.
    :return: DataFrame with the same columns as ``df`` and exactly two rows.
    :raises ValueError: if the total row needs more cells than ``df`` has
        columns.
    """
    # Take the layout from the input frame so the result lines up with the
    # detail rows when the frames are concatenated.
    columns = list(df.columns)
    column_len = len(columns)

    record = [name, ""]
    record.extend(df[field].sum() for field in calculation_fields)
    for first, cond in specical_fields or ():
        record.append(round(df[first].sum() / df[cond].sum(), 1))

    if len(record) > column_len:
        raise ValueError(
            "total row has {} values but the frame only has {} columns".format(
                len(record), column_len
            )
        )
    # Pad so columns that are neither summed nor derived stay blank.
    record.extend([""] * (column_len - len(record)))

    # The second, all-blank row visually separates this total from what follows.
    blank_row = [""] * column_len
    return pd.DataFrame.from_records([record, blank_row], columns=columns)
def calculation_sub_total(df, gfields, calculation_fields, specical_fields):
    """Interleave each group's rows with that group's subtotal rows.

    :param df: source DataFrame.
    :param gfields: column name(s) passed to ``DataFrame.groupby``.
    :param calculation_fields: names of the columns to sum per group.
    :param specical_fields: ``(numerator, denominator)`` column pairs
        forwarded to ``calculation_total``.
    :return: DataFrame containing, for every group in groupby order, the
        group's rows followed by the frame returned by ``calculation_total``;
        an empty DataFrame when there are no groups.
    """
    pieces = []
    for _, group_df in df.groupby(gfields):
        subtotal_df = calculation_total(
            group_df, calculation_fields, specical_fields, name=u"⼩计"
        )
        pieces.extend((group_df, subtotal_df))

    # pd.concat raises on an empty list, so handle "no groups" explicitly.
    if not pieces:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(pieces, axis=0)
def conds_2_minutes(number):
    """Format a number of seconds as minutes-and-seconds text.

    :param number: value convertible with ``int()``.
    :return: ``"{}秒"`` for values below 60, ``"{}分"`` for exact minutes,
        otherwise ``"{}分{}秒"``. A value that ``int()`` cannot convert is
        returned unchanged.
    """
    try:
        seconds = int(number)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric cells (e.g. the blank spacer rows) pass through as-is.
        return number

    if seconds < 60:
        return "{}秒".format(seconds)

    minutes, remainder = divmod(seconds, 60)
    if remainder == 0:
        return "{}分".format(minutes)
    return "{}分{}秒".format(minutes, remainder)
if __name__ == "__main__":
    # Load the source workbook from BASE_DIR, then build and export the
    # subtotal / grand-total report.
    source_path = os.path.join(BASE_DIR, u"视频题⽬统计数据.xlsx")
    source_df = pd.read_excel(source_path)

    gen_pivot_table(
        source_df,
        # Columns that are summed for every subtotal and for the grand total.
        calculate_fields=[
            "页⾯展⽰pv",
            "页⾯展⽰uv",
            "播放点击按钮⼈数",
            "视频播放次数",
            "完整播放次数",
            "累计播放时间(秒)",
        ],
        # (numerator, denominator) pairs turned into a ratio of column sums.
        special_fields=[("累计播放时间(秒)", "播放点击按钮⼈数")],
        groupby_fields=["视频名称"],
    )
原始数据:
处理之后的数据: