逻辑回归预测银行客户是否开通定期存款账户
from datetime import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns  # plotting library
import tensorflow as tf
from sklearn import preprocessing  # data preprocessing
from sklearn.linear_model import LogisticRegression  # logistic regression estimator
from sklearn.model_selection import train_test_split  # splits data into train and test sets
# Global plot styling.
plt.rc("font", size=14)
sns.set()
# NOTE(review): the style argument was lost in the source text; "whitegrid" is
# assumed here -- confirm against the original notebook.
sns.set(style="whitegrid", color_codes=True)

# The file is read with pandas' default separator, so each ';'-delimited record
# arrives as one string in a single column (the printed shape is (41188, 1)).
# The header and the rows are split and type-converted by hand afterwards.
data = pd.read_csv('D:\\pycode\\data\\bank-additional\\bank-additional-full.csv', header=0)
data = data.dropna()  # drop rows that contain NaN
print(data.shape)
print(list(data.columns))
col_name = list(data.columns)
col1 = ''.join(col_name)  # the single header cell as a plain string
print(col1)
col2 = col1.split(';')  # one entry per column name, inner quotes still attached
print(col2)
# print(type(data))
data = np.array(data)
(41188, 1)
['age;"job";"marital";"education";"default";"housing";"loan";"contact";"month";"day_of_week";"duration";"campaign";"pdays";"previous";"poutcome";"emp.var.rate";"cons.price.idx";"cons.conf.idx";"euribor3m";"nr.employed";"y"']
age;"job";"marital";"education";"default";"housing";"loan";"contact";"month";"day_of_week";"duration";"campaign";"pdays";"previous";"poutcome";"emp.var.rate";"cons.price.idx";"cons.conf.idx";"euribor3m";"nr.employed";"y"
['age', '"job"', '"marital"', '"education"', '"default"', '"housing"', '"loan"', '"contact"', '"month"', '"day_of_week"', '"duration"', '"campaign"', '"pdays"', '"previous"', '"poutcome"', '"emp.var.rate"', '"cons.price.idx"', '"cons.conf.idx"', '"euribor3m"', '"nr.employed"', '"y"']
# Build the clean list of column names from the split header.
lis = []
# 'age' is the only header entry without surrounding quotes; wrap it so the
# [1:-1] slice below treats it like every other name.
col2[0] = '"age"'
for token in col2:
    if '0' <= token[0] <= '9' or token[0] == '-':
        # Numeric-looking token: a '.' marks it as float, otherwise int.
        lis.append(float(token) if '.' in token else int(token))
    else:
        # Text token: drop its first and last character (the quotes).
        lis.append(token[1:-1])
print(lis)
['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']
#print(data[0])
#str = ''.join(data[0]) #array转换成字符串
#print(str)
#tmp = str.split(';')
#print(tmp)
#print(len(tmp))
#print((tmp[0][0]>'0' and tmp[0][0]<'9'))
#print(type(tmp[1]))
#li = []
#for i in range(len(tmp)):
#    if (tmp[i][0] >= '0' and tmp[i][0] <= '9') or tmp[i][0]=='-':
#        if '.' in tmp[i]:
#            li.append(float(tmp[i]))
#        else:
#            li.append(int(tmp[i]))
#    else:
#        li.append(tmp[i][1:-1])
#print(li)
['56;"housemaid";"married";"basic.4y";"no";"no";"no";"telephone";"may";"mon";261;1;999;0;"nonexistent";1.1;93.994;-36.4;4.857;5191;"no"']
56;"housemaid";"married";"basic.4y";"no";"no";"no";"telephone";"may";"mon";261;1;999;0;"nonexistent";1.1;93.994;-36.4;4.857;5191;"no"
['56', '"housemaid"', '"married"', '"basic.4y"', '"no"', '"no"', '"no"', '"telephone"', '"may"', '"mon"', '261', '1', '999', '0', '"nonexistent"', '1.1', '93.994', '-36.4', '4.857', '5191', '"no"']
21
True
<class 'str'>
[56, 'housemaid', 'married', 'basic.4y', 'no', 'no', 'no', 'telephone', 'may', 'mon', 261, 1, 999, 0, 'nonexistent', 1.1, 93.994, -36.4, 4.857, 5191, 'no']
# Split every raw record (one ';'-joined string per row) into typed fields.
li = []
for row in data:
    record = ''.join(row)  # the row's cell(s) joined into one plain string
    fields = []
    for token in record.split(';'):
        if '0' <= token[0] <= '9' or token[0] == '-':
            # Numeric-looking token: a '.' marks it as float, otherwise int.
            fields.append(float(token) if '.' in token else int(token))
        else:
            # Text token: drop its first and last character (the quotes).
            fields.append(token[1:-1])
    li.append(fields)
# print(li)

import csv

# newline="" is what the csv module requires for files handed to csv.writer;
# without it an extra blank line follows every record on Windows.
with open("D:\\pycode\\data\\bank-additional\\mybank.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(lis)  # column names first
    writer.writerows(li)  # then all data rows in one call

# Reload the cleaned file so pandas sees one real column per field.
datat = pd.read_csv('D:\\pycode\\data\\bank-additional\\mybank.csv', header=0)
datat = datat.dropna()
print(datat.shape)
print(list(datat.columns))
(41188, 21)
['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']
datat['education'].unique()  # distinct values of the column (shown only as notebook cell output)
# Collapse the three "basic.*" schooling levels into a single 'Basic' category.
basic_levels = ['basic.9y', 'basic.6y', 'basic.4y']
datat['education'] = np.where(datat['education'].isin(basic_levels), 'Basic', datat['education'])
datat['education'].unique()
array(['Basic', 'high.school', 'professional.course', 'unknown',
       'university.degree', 'illiterate'], dtype=object)
# Encode the target labels in place: 'no' -> 0, then 'yes' -> 1.
# Any other value is left untouched.
for label, code in (('no', 0), ('yes', 1)):
    datat['y'] = np.where(datat['y'] == label, code, datat['y'])
datat['y'].value_counts()  # class counts (shown only as notebook cell output)
0 36548
1 4640
Name: y, dtype: int64
# Bar chart of the target class counts.
# NOTE(review): the plotting statement itself was missing from the source; a
# seaborn count plot of 'y' is assumed from the output file name -- confirm.
sns.countplot(x='y', data=datat)
# Save before show(): once show() returns the current figure may already be
# closed, in which case savefig() would write an empty image.
plt.savefig('count_plot')
plt.show()
[外链图⽚转存失败(img-TsH0RTkS-1565963315031)(output_10_0.png)]
# Subscription statistics: share of customers without / with a term deposit.
count_no_sub = int((datat['y'] == 0).sum())
count_sub = int((datat['y'] == 1).sum())
n_customers = count_no_sub + count_sub
pct_of_no_sub = count_no_sub / n_customers
print('未开户的百分⽐: %.2f%%' % (pct_of_no_sub*100))
pct_of_sub = count_sub / n_customers
print('开户的百分⽐: %.2f%%' % (pct_of_sub*100))
# Look at the per-class (0 vs 1) feature means.
# NOTE(review): only this heading survives in the source; the statement that
# produced the table of means is not present here.
age duration campaign pdays previous emp.var.rate cons.price.f.idx h_oct month_p day_of_week y
039.911185220.844807 2.633085984.1138780.1323740.24887593.603757-40.593097 3.0.0110270.008591
140.913147553.191164 2.051724792.0355600.492672-1.23344893.354386-39.789784 2.0.0678880.055172
2 rows × 61 columns
# Distribution of the outcome across other categorical features
# (e.g. education and marital status).
# datat.groupby('job').mean()
# datat.groupby('marital').mean()
# datat.groupby('education').mean()
# %matplotlib inline  -- IPython magic: valid only in a notebook cell, a syntax error in a .py file
table = pd.crosstab(datat.job, datat.y)
# Divide each row by its total so every stacked bar shows proportions.
table.div(table.sum(axis=1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Job title vs Purchase')
plt.xlabel('Job')
plt.ylabel('Proportion of Purchase')
plt.savefig('purchase_vs_job')
# Author's reading of the chart: purchase frequency differs between job
# titles, so job can be a good predictor.
[外链图⽚转存失败(img-JPv7Bo5B-1565963315040)(output_14_0.png)]
# Stacked proportion chart of the outcome per marital status.
table = pd.crosstab(datat.marital, datat.y)
# Divide each row by its total so every stacked bar shows proportions.
table.div(table.sum(axis=1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Marital Status vs Purchase')
plt.xlabel('Marital Status')
plt.ylabel('Proportion of Customers')
plt.savefig('mariral_vs_pur_stack')
# Author's reading of the chart: marital status does not look like a good predictor.
[外链图⽚转存失败(img-dEIRor9r-1565963315045)(output_15_0.png)]
# Education attribute: stacked proportion chart of the outcome per level.
table = pd.crosstab(datat.education, datat.y)
# Divide each row by its total so every stacked bar shows proportions.
table.div(table.sum(axis=1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Education vs Purchase')
plt.xlabel('Education')
plt.ylabel('Proportion of Customers')
plt.savefig('edu_vs_pur_stack')
# Author's reading of the chart: education looks like a good predictor of the outcome.
[外链图⽚转存失败(img-HRdTNPbZ-1565963315051)(output_16_0.png)]
# Time feature: stacked proportion chart of the outcome per weekday.
table = pd.crosstab(datat.day_of_week, datat.y)  # .plot(kind='bar')
# Divide each row by its total so every stacked bar shows proportions.
table.div(table.sum(axis=1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Day of Week vs Purchase')
plt.xlabel('Day of Week')
plt.ylabel('Proportion of Purchase')
plt.savefig('dow_vs_purchase')
# Author's reading of the chart: day of the week is not a good predictor.
[外链图⽚转存失败(img-miRajsCk-1565963315054)(output_17_0.png)]
# Categorical columns that are replaced by one-hot indicator columns.
cat_vars = ['job', 'marital', 'education', 'default', 'housing', 'loan',
            'contact', 'month', 'day_of_week', 'poutcome']
print(datat.shape)
print(datat[cat_vars].head())

# Start from the non-categorical columns only; drop() returns a new frame,
# so datat itself is left unchanged.
data_final = datat.drop(cat_vars, axis=1)
# Append the indicator columns of each categorical variable, named
# "<variable>_<value>", in the order the variables are listed.
for var in cat_vars:
    cat_list = pd.get_dummies(datat[var], prefix=var)
    # print(cat_list)
    data_final = data_final.join(cat_list)
# data_final = pd.get_dummies(datat[cat_vars], prefix=cat_vars)
print(data_final.columns)
print(data_final.shape)
(41188, 21)
         job  marital    education  default housing loan    contact month  \
0  housemaid  married        Basic       no      no   no  telephone   may
1   services  married  high.school  unknown      no   no  telephone   may
2   services  married  high.school       no     yes   no  telephone   may
3     admin.  married        Basic       no      no   no  telephone   may
4   services  married  high.school       no      no  yes  telephone   may
day_of_week poutcome
0 mon nonexistent
1 mon nonexistent
2 mon nonexistent
3 mon nonexistent
4 mon nonexistent
Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_Basic', 'education_high.school',
       'education_illiterate', 'education_professional.course',
       'education_university.degree', 'education_unknown', 'default_no',
       'default_unknown', 'default_yes', 'housing_no', 'housing_unknown',
       'housing_yes', 'loan_no', 'loan_unknown', 'loan_yes',
       'contact_cellular', 'contact_telephone', 'month_apr', 'month_aug',
       'month_dec', 'month_jul', 'month_jun', 'month_mar', 'month_may',
       'month_nov', 'month_oct', 'month_sep', 'day_of_week_fri',
       'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue',
       'day_of_week_wed', 'poutcome_failure', 'poutcome_nonexistent',
       'poutcome_success'],
      dtype='object')
(41188, 62)
# Sanity check: select the rows whose target is the string 'unknown'
# (the selection is shown only as notebook cell output).
unknown_target = data_final['y'] == 'unknown'
data_final[unknown_target]
age duration campaign pdays previous emp.var.rate cons.price.f.idx h_oct month_p day_of_week_fri 0 rows × 62 columns
# Separate the feature columns from the target column 'y'.
X = data_final.loc[:, data_final.columns != 'y']
y = data_final.loc[:, data_final.columns == 'y'].values.ravel()
print(X.shape)
print(y.shape)

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns
# Oversample the training split only; the test split keeps its original
# class balance. fit_resample() replaces fit_sample(), which newer
# imbalanced-learn releases no longer provide.
os_data_X, os_data_y = smote.fit_resample(X_train, y_train.astype('int'))
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['y'])

# Check the class balance of the oversampled training data.
n_total = len(os_data_X)
n_no_sub = len(os_data_y[os_data_y['y'] == 0])
n_sub = len(os_data_y[os_data_y['y'] == 1])
print("过采样以后的数据量: ", n_total)
print("未开户的⽤户数量: ", n_no_sub)
print("开户的⽤户数量: ", n_sub)
print("未开户的⽤户数量的百分⽐: ", n_no_sub / n_total)
print("开户的⽤户数量的百分⽐: ", n_sub / n_total)
(41188, 61)
(41188,)
过采样以后的数据量: 51158
未开户的⽤户数量: 25579
开户的⽤户数量: 25579
未开户的⽤户数量的百分⽐: 0.5
开户的⽤户数量的百分⽐: 0.5
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Train on the oversampled training data, then evaluate on the untouched test split.
logreg = LogisticRegression(solver='liblinear')
logreg.fit(os_data_X, os_data_y.values.reshape(-1))  # flatten the one-column target frame to 1-D
y_pred = logreg.predict(X_test)
print('在测试数据集上⾯的预测准确率: {:.2f}'.format(logreg.score(X_test, y_test.astype('int'))))
# Recorded output of the line above: 在测试数据集上⾯的预测准确率: 0.87

from sklearn.metrics import classification_report

print(classification_report(y_test.astype('int'), y_pred))
precision recall f1-score support
0 0.98 0.86 0.92 10969
1 0.45 0.89 0.60 1388
micro avg 0.87 0.87 0.87 12357
macro avg 0.72 0.88 0.76 12357
weighted avg 0.92 0.87 0.88 12357
# Final result: ROC curve of the fitted model on the test split.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

y_true = y_test.astype('int')
# Score with the positive-class probability for both the curve and the area,
# so the number in the legend is the area under the curve that is drawn.
# (Scoring the AUC with hard predict() labels reports a different value.)
y_score = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_true, y_score)
fpr, tpr, thresholds = roc_curve(y_true, y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
<Figure size 432x288 with 0 Axes>
[外链图⽚转存失败(img-QjlgmeqK-1565963315060)(output_23_1.png)]