「二分类算法」提供银行精准营销解决方案代码存档

未分类 3年前 (2022) 程序员胖胖胖虎阿

274 0 0

import mglearn
from numpy import int64
from sklearn import metrics
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, Lasso, LinearRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFECV
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
sns.set(style="darkgrid")
plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示

# 字段说明
#
# NO    字段名称    数据类型    字段描述
# 1    ID    Int    客户唯一标识
# 2    age    Int    客户年龄
# 3    job    String    客户的职业
# 4    marital    String    婚姻状况
# 5    education    String    受教育水平
# 6    default    String    是否有违约记录
# 7    balance    Int    每年账户的平均余额
# 8    housing    String    是否有住房贷款
# 9    loan    String    是否有个人贷款
# 10    contact    String    与客户联系的沟通方式
# 11    day    Int    最后一次联系的时间（几号）
# 12    month    String    最后一次联系的时间（月份）
# 13    duration    Int    最后一次联系的交流时长
# 14    campaign    Int    在本次活动中，与该客户交流过的次数
# 15    pdays    Int    距离上次活动最后一次联系该客户，过去了多久（999表示没有联系过）
# 16    previous    Int    在本次活动之前，与该客户交流过的次数
# 17    poutcome    String    上一次活动的结果
# 18    y    Int    预测客户是否会订购定期存款业务
from sklearn.tree import DecisionTreeClassifier

data_train = pd.read_csv('train_set.csv')
data_test = pd.read_csv('test_set.csv')
ids_test = data_test['ID']

print(data_train.shape[0])

# data_train['cppv']=data_train['campaign']+data_train['previous']
# data_test['cppv']=data_test['campaign']+data_test['previous']
# data_train.drop(['campaign','previous'], axis=1, inplace=True)
# data_test.drop(['campaign','previous'], axis=1, inplace=True)

# Rela_grouped=data_train.groupby(['cp'])
# Rela_Survival_Rate=(Rela_grouped.sum()/Rela_grouped.count())['y']
# Rela_count=Rela_grouped.count()['y']
#
# ax1=Rela_count.plot(kind='bar',color='g')
# ax2=ax1.twinx()
# ax2.plot(Rela_Survival_Rate.values,color='r')
# ax1.set_xlabel('Relatives')
# ax1.set_ylabel('Number')
# ax2.set_ylabel('Survival Rate')
# plt.title('Survival Rate by Relatives')
# plt.grid(True,linestyle='-',color='0.7')
# plt.show()

# g = sns.FacetGrid(data_train, col='y')
# g.map(plt.hist, 'day', bins=30)
# plt.show()


print("数值处理1：标签指标one-hot编码处理")


data_train.drop(['ID'], axis=1, inplace=True)
data_test.drop(['ID'], axis=1, inplace=True)

dummy = pd.get_dummies(data_train[['month','job','marital','education','default','housing','loan','contact','poutcome']])
dummyTest = pd.get_dummies(data_test[['month','job','marital','education','default','housing','loan','contact','poutcome']])
data_train = pd.concat([dummy, data_train], axis=1)
data_train.drop(['job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)
data_test = pd.concat([dummyTest, data_test], axis=1)
data_test.drop(['job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)

data_train['day'].replace([30,13,15,4,14,12,18],4,inplace=True)
data_train['day'].replace([5,20,21,11,8,16,2,3],3,inplace=True)
data_train['day'].replace([17,9,6,27,7,22,28],2,inplace=True)
data_train['day'].replace([23,25,26,10,29,19],1,inplace=True)
data_train['day'].replace([1,24,31],0,inplace=True)

data_test['day'].replace([30,13,15,4,14,12,18],4,inplace=True)
data_test['day'].replace([5,20,21,11,8,16,2,3],3,inplace=True)
data_test['day'].replace([17,9,6,27,7,22,28],2,inplace=True)
data_test['day'].replace([23,25,26,10,29,19],1,inplace=True)
data_test['day'].replace([1,24,31],0,inplace=True)


# data_train['month1'] = data_train.month.apply(lambda x: 4 if x in ['may'] else 0)
# data_train['month1'] = data_train.month.apply(lambda x: 3 if x in ['aug','jul','apr'] else 0)
# data_train['month1'] = data_train.month.apply(lambda x: 2 if x in ['jun','feb','nov','oct'] else 0)
# data_train['month1'] = data_train.month.apply(lambda x: 1 if x in ['sep','mar'] else 0)
#
# data_test['month1'] = data_test.month.apply(lambda x: 4 if x in ['may'] else 0)
# data_test['month1'] = data_test.month.apply(lambda x: 3 if x in ['aug','jul','apr'] else 0)
# data_test['month1'] = data_test.month.apply(lambda x: 2 if x in ['jun','feb','nov','oct'] else 0)
# data_test['month1'] = data_test.month.apply(lambda x: 1 if x in ['sep','mar'] else 0)
# #
data_train.drop(['month'], inplace=True, axis=1)
data_test.drop(['month'], inplace=True, axis=1)
# data_train.drop(['day','job_management','marital_single'], axis=1, inplace=True)
# data_test.drop(['day','job_management','marital_single'], axis=1, inplace=True)


# data_train['month'].replace(['may'],4,inplace=True)
# data_train['month'].replace(['aug','jul','apr'],3,inplace=True)
# data_train['month'].replace(['jun','feb','nov','oct'],2,inplace=True)
# data_train['month'].replace(['sep','mar'],1,inplace=True)
# data_train['month'].replace(['jan','dec'],0,inplace=True)

# 多删特征
# data_train.drop(['age','balance','duration','pdays','previous','day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)
# data_test.drop(['age','balance','duration','pdays','previous','day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)

#default、housing、loan都是2分类的指标，删除其中一个即可
# data_train.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1)
# data_test.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1)


################################
########### 数据整理 ###########
################################

data_train['pdays'].replace(-1,9999,inplace=True)
data_test['pdays'].replace(-1,9999,inplace=True)
print("数值处理2：pdays将-1替换为999")
# data_train.drop(['pdays'], inplace=True, axis=1)
# data_test.drop(['pdays'], inplace=True, axis=1)


# g = sns.FacetGrid(data_train, col='y')
# g.map(plt.hist, 'pdays', bins=20)
# plt.show()
# data_train.drop(['pdays'], inplace=True, axis=1)
# data_test.drop(['pdays'], inplace=True, axis=1)

y = data_train['y']
X = data_train[data_train.columns[: -1]]
# # X.info()
# pdays的平均值先前看到是45，而-1距离45很近，距离max值854很远，故还是需要将所有的-1替换为999
#数据预处理：
#数据中pdays=-1表示从未联络过，替换为999



#对方差较大的数据指标进行变换，MinMaxScaler或者StandardScaler
print("数值处理3：数值指标Scaler变换")
scaler = MinMaxScaler()
# numerical = ['age','balance', 'duration', 'pdays', 'previous']
# X[numerical] = scaler.fit_transform(X[numerical])
# data_test[numerical] = scaler.fit_transform(data_test[numerical])
print(data_test.shape)
X = scaler.fit_transform(X)
data_test = scaler.fit_transform(data_test)

# tsvd = TruncatedSVD(n_components=46)
# data_test = tsvd.fit_transform(data_test)
#数据分割，用于测试
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.06, random_state=1)
# X_train = tsvd.fit_transform(X_train)
# X_test = tsvd.fit_transform(X_test)
# print(X_train.shape)

#增加二项式特征
# polynomial_interaction = PolynomialFeatures(degree=2,include_bias=False)
# #增加二项式特征，仅仅是交叉特征
# polynomial_interaction = PolynomialFeatures(degree=2,interaction_only=True,include_bias=False)
# X_train = polynomial_interaction.fit_transform(X_train)
# X_test = polynomial_interaction.fit_transform(X_test)
# data_test = polynomial_interaction.fit_transform(data_test)
# print('after Polynomial:',X_train.shape)
#
# # #保留99%的信息，进行朱成本分析
# pca = PCA(n_components=100,whiten=True)
# X_train = pca.fit_transform(X_train)
# X_test = pca.fit_transform(X_test)
# data_test = pca.fit_transform(data_test)
# print('after PCA:',X_train.shape)

# #卡方分类筛选
# selector = SelectKBest(f_classif,k=300)
# X_train = selector.fit_transform(X_train,y_train)
# X_test = selector.fit_transform(X_test,y_test)
# print('after SelectKBest:',X_train.shape)

# print(X_train['pdays'])

################################
########### 性能计算 ###########
################################


# print('决策树，分数不理想')
# clf = DecisionTreeClassifier(random_state=11)
# clf.fit(X_train, y_train)
# predictions = clf.predict(X_test)
# print(classification_report(y_test, predictions))
# print(cross_val_score(clf,X_train, y_train,scoring='f1'))
# print(cross_val_score(clf,X_test, y_test,scoring='f1'))
# print(clf.score(X_test, y_test))
#
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
#
print('随机森林，0.919203')
clf = RandomForestClassifier(n_estimators=90, random_state=0,oob_score=True,n_jobs=-1)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
# print(cross_val_score(clf,X_train, y_train,scoring='f1'))
# print(cross_val_score(clf,X_test, y_test,scoring='f1'))
print(clf.score(X_test, y_test))
y_predprob = clf.predict_proba(X_test)
y_predprob = y_predprob[:, 1]
print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))

#穷举随机森林的最佳参数,答案：90
# param_test1 ={'n_estimators':range(10,100,5)}
# gsearch1= GridSearchCV(estimator =RandomForestClassifier(min_samples_split=100,
#                                  min_samples_leaf=20,max_depth=8,max_features='sqrt',random_state=10),
#                        param_grid =param_test1,scoring='roc_auc',cv=5)
# gsearch1.fit(X_train, y_train)
# print(gsearch1.best_params_)
# y_predprob = gsearch1.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
# predictions = gsearch1.predict(X_test)
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
#
# print('逻辑回归,0.904655,0.915316')
# # print(X_train)
# #clf = Lasso(alpha=0.5)
# clf = LogisticRegression(random_state=0,solver='newton-cg',class_weight='balanced',penalty='l2',n_jobs=-1)
# # solver : str, {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, optional (default=’liblinear’).
# clf.fit(X_train, y_train)
# # clf.fit(X_train, y_train)
# predictions = clf.predict(X_test)
# # print(classification_report(y_test, predictions))
# # print(cross_val_score(clf,X_train, y_train,scoring='f1'))
# # print(cross_val_score(clf,X_test, y_test,scoring='f1'))
# print(clf.score(X_test, y_test))
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
#
# raletion = pd.DataFrame({"columns":list(data_train.columns)[0:-1], "coef":list(clf.coef_.T)})
# print('相关性：',raletion)

# #穷举逻辑回归的最佳参数,答案:
# # best C : LogisticRegression(C=7.742636826811269, class_weight=None, dual=False,
# #                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
# #                    max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
# #                    random_state=None, solver='warn', tol=0.0001, verbose=0,
# #                    warm_start=False)
# penalty = ['l1','l2']
# C=np.logspace(0,4,10)
# hyperparameters = dict(C=C,penalty=penalty)
# gridsearch = GridSearchCV(clf,hyperparameters,cv=5,verbose=0)
# best_clf= gridsearch.fit(X_train, y_train)
# print('best C :',best_clf.best_estimator_)
# print(gridsearch.best_params_)
# y_predprob = gridsearch.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
# predictions = gridsearch.predict(X_test)
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))

# print('AdaBoost')
# clf = AdaBoostClassifier(n_estimators=60, random_state=90)
#
# clf.fit(X_train, y_train)
# predictionsByadaBoost = clf.predict(X_test)
# print(classification_report(y_test, predictionsByadaBoost))
# print(cross_val_score(clf,X_train, y_train,scoring='f1'))
# print(cross_val_score(clf,X_test, y_test,scoring='f1'))
# print(clf.score(X_test, y_test))
# pred = clf.predict_proba(X_test)
# dataPred = pd.DataFrame(pred, columns=['pred0', 'pred'])
# dataPred.drop('pred0', axis=1, inplace=True)
# print(dataPred)
#
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# predictions_train =  clf.predict(X_train)
# y_predprob_train = clf.predict_proba(X_train)
# y_predprob_train = y_predprob_train[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictionsByadaBoost))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
# print("Accuracy y_train : %.4g" % metrics.accuracy_score(y_train, predictions_train))
# print("AUC Score (Train): %f" % metrics.roc_auc_score(y_train, y_predprob_train))
# #
#
#
# # #
# print('神经网络')
# # ‘lbfgs’ is an optimizer in the family of quasi-Newton methods.
# # ‘sgd’ refers to stochastic gradient descent.
# # ‘adam’ refers to a stochastic gradient-based optimizer proposed by Kingma, Diederik, and Jimmy Ba
# clf = MLPClassifier(solver='adam', hidden_layer_sizes=(80,80),
#                     random_state=1)
# clf.fit(X_train, y_train)
# predictions = clf.predict(X_test)
# print(clf.score(X_test, y_test))
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
# print('神经网络 end')
# # #导出结果
ID = list(range(25318,36170))
submission = pd.DataFrame(ID)
submission.rename(columns = {0: 'ID'}, inplace = True)
# 将pred_y从array转化成DataFrame
y_predprob_test = clf.predict_proba(data_test)
y_predprob_test = y_predprob_test[:, 1]
y_predprob_DataFrame = pd.DataFrame(y_predprob_test)
submission['pred'] =y_predprob_DataFrame
submission.to_csv('Result.csv', index = False)

#为防止过拟合而减半步长，最大迭代次数加倍
# gbm1 = GradientBoostingClassifier(learning_rate=0.001, n_estimators=10000, max_depth=7, min_samples_leaf=70,
#                                   min_samples_split=1300, subsample=0.8, random_state=10)
# gbm1.fit(X_train, y_train)
#
# y_pred = gbm1.predict(X_test)
# y_predprob = gbm1.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pred))
# print("AUC Score (Train): %f" % metrics.roc_auc_score(y_test, y_predprob))

# print('KNN近邻，分数不理想')
# clf = KNeighborsClassifier(n_neighbors=5)
# clf.fit(X_train,y_train)
# predictions = clf.predict(X_test)
# print(classification_report(y_test, predictions))
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]

# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))

# print('SVM支持向量机')
# clf = SVC(kernel='rbf',C=1,gamma='auto',probability=True).fit(X_train, y_train)
# predictions = clf.predict(X_test)
# print(classification_report(y_test, predictions))
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))

#朴素贝叶斯
# print('朴素贝叶斯')
# clf = GaussianNB()
#
# clf_sigmoid = CalibratedClassifierCV(clf,cv=5)
# clf_sigmoid.fit(X_train,y_train)
# predictions = clf_sigmoid.predict(X_test)
# y_predprob = clf_sigmoid.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))

################################
# AdaBoost选为第一次使用的算法,提交数据
################################
# print('AdaBoost')
# adaBoost = AdaBoostClassifier(n_estimators=50, random_state=11)
# adaBoost.fit(X_train, y_train)
#
# age_null = pd.isnull(data_test['age'])
# data_null = data_test[age_null == True]
# # print(data_null)
#
# id = data_test["ID"]
# print(id)
# X_test.drop(['ID'], axis=1, inplace=True)
#
# submission = pd.DataFrame({
#         "ID": id
#     })
#
# submission[['ID']].astype(int)
# # submission[['ID']] = submission[['ID']].astype(int)
# submission.to_csv('submission.csv', index=False)

# data_test.dropna(inplace=True)
# print(np.isnan(data_test).any())
# submission.replace(np.nan, 0, inplace=True)


# predictionsByadaBoost = adaBoost.predict_proba(X_test)
#
# submission = pd.DataFrame({
#         "ID": id,
#         "pred": predictionsByadaBoost
#     })
# submission.to_csv('submission.csv', index=False)

第一次提交，没做什么特征工程，分数还不太理想

0.9157894736842105
Accuracy : 0.9158
AUC Score (Test): 0.932477

过程分析

from numpy import int64
from sklearn import metrics from sklearn.model_selection import GridSearchCV from sklearn.model_selection import cross_val_score import matplotlib.pyplot as plt import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split from sklearn.metrics import classification_report import seaborn as sns from sklearn.preprocessing import PolynomialFeatures from sklearn.decomposition import PCA from sklearn.feature_selection import SelectKBest,chi2,f_classif from sklearn.metrics import roc_auc_score data_train = pd.read_csv('/home/kesci/input/firstdata1587/train_set.csv') data_test = pd.read_csv('/home/kesci/input/firstdata1587/test_set.csv') data_train.describe()

Out[4]:

	ID	age	balance	day	duration	campaign	pdays	previous	y
count	25317.000000	25317.000000	25317.000000	25317.000000	25317.000000	25317.000000	25317.000000	25317.000000	25317.000000
mean	12659.000000	40.935379	1357.555082	15.835289	257.732393	2.772050	40.248766	0.591737	0.116957
std	7308.532719	10.634289	2999.822811	8.319480	256.975151	3.136097	100.213541	2.568313	0.321375
min	1.000000	18.000000	-8019.000000	1.000000	0.000000	1.000000	-1.000000	0.000000	0.000000
25%	6330.000000	33.000000	73.000000	8.000000	103.000000	1.000000	-1.000000	0.000000	0.000000
50%	12659.000000	39.000000	448.000000	16.000000	181.000000	2.000000	-1.000000	0.000000	0.000000
75%	18988.000000	48.000000	1435.000000	21.000000	317.000000	3.000000	-1.000000	0.000000	0.000000
max	25317.000000	95.000000	102127.000000	31.000000	3881.000000	55.000000	854.000000	275.000000	1.000000

总计记录25317人。年龄分布：18-95； balance（存款）分布：-8019 - 102127，balance的标准差2999.822811，比较大，看到平均存款1357，上四分位1435，下四分位才只有73元，存款的差距还是蛮大的，万恶的资本主义； day（最后一次联系是几号）：1-31，很明显一个月从1号开始，从31号结束，这个特征很可能和预测无关联； duration（交流时长）：0-3881，这个猜测是持续的天数； campaign（交流次数）：1-55 pdays（上次联系后过了多久）：-1 - 854，这里没有999，应该是-1为没有联系，>-1就是期间几天前曾联系过； previous(活动前交流次数)：0-275，平均0.591737，不到1次；

In [5]:

#工作和购买理财的关系
y_0 = data_train.job[data_train.y == 0].value_counts() y_1 = data_train.job[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"job to buy") plt.ylabel(u"counts") plt.show()

In [14]:

#婚姻和购买理财的关系
#看不出啥结果
y_0 = data_train.marital[data_train.y == 0].value_counts() y_1 = data_train.marital[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"marital to buy") plt.ylabel(u"counts") plt.show()

In [15]:

#教育和购买理财的关系
y_0 = data_train.education[data_train.y == 0].value_counts() y_1 = data_train.education[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"education to buy") plt.ylabel(u"counts") plt.show()

In [24]:

#上次活动结果和购买理财的关系
#发现poutcome指标相当重要，上次活动成功的客户这次也购买理财的比例非常高
y_0 = data_train.poutcome[data_train.y == 0].value_counts() y_1 = data_train.poutcome[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"poutcome to buy") plt.ylabel(u"counts") plt.show()

判断day、month是和客户交流的月份和日份，很容易被当成噪音特征。用统计来说话。

In [3]:

#月份对结果的影响
y_0 = data_train.month[data_train.y == 0].value_counts() y_1 = data_train.month[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"poutcome to buy") plt.ylabel(u"counts") plt.show() print(y_1/data_train.shape[0]) #我们发现may（0.019789）和dec （0.001896）相差10倍，所以这个特征还是蛮重要的

may    0.019789
aug    0.014773
jul    0.014022
apr    0.012916
jun    0.011613
feb    0.009954
nov    0.009045
oct    0.007465
sep    0.006241
mar    0.005727
jan    0.003515
dec    0.001896
Name: month, dtype: float64

In [4]:

#日对结果的影响
y_0 = data_train.day[data_train.y == 0].value_counts() y_1 = data_train.day[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"poutcome to buy") plt.ylabel(u"counts") plt.show() print(y_1/data_train.shape[0]) #发现30号最容易出单，31号极不容易出单

30    0.005964
13    0.005253
15    0.005135
4     0.005016
14    0.004977
12    0.004898
18    0.004898
5     0.004661
20    0.004661
21    0.004621
11    0.004582
8     0.004463
16    0.004345
2     0.004345
3     0.004266
17    0.003950
9     0.003910
6     0.003792
27    0.003792
7     0.003476
22    0.003436
28    0.003160
23    0.002923
25    0.002646
26    0.002528
10    0.002528
29    0.002409
19    0.002370
1     0.001777
24    0.001303
31    0.000869
Name: day, dtype: float64

In [7]:

#'job','marital','education','default','housing','loan','contact','poutcome'这8个字段都
#要做one-hot编码预处理，暂时先将unknown作为一个特征项。

dummy = pd.get_dummies(data_train[['day','month','job','marital','education','default','housing','loan','contact','poutcome']]) dummyTest = pd.get_dummies(data_test[['day','month','job','marital','education','default','housing','loan','contact','poutcome']]) data_train = pd.concat([dummy, data_train], axis=1) data_train.drop(['day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1) data_test = pd.concat([dummyTest, data_test], axis=1) data_test.drop(['day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1) print("数值处理1：标签指标one-hot编码处理") #default、housing、loan都是2分类的指标，删除其中一个即可 #data_train.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1) #data_test.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1) data_train['pdays'].replace(-1,999,inplace=True) data_test['pdays'].replace(-1,999,inplace=True) print("数值处理2：pdays将-1替换为999")

数值处理1：标签指标one-hot编码处理
数值处理2：pdays将-1替换为999

In [20]:

data_train.head()

Out[20]:

	job_admin.	job_management	job_technician	...	poutcome_other	poutcome_unknown	age	balance	duration	campaign	pdays	previous
0	0	1	0	...	0	1	43	291	150	2	-1	0
1	0	0	1	...	1	0	42	5076	99	1	251	2
2	1	0	0	...	0	1	47	104	77	2	-1	0
3	0	1	0	...	0	1	28	-994	174	2	-1	0
4	0	0	1	...	0	1	42	2974	187	5	-1	0

5 rows × 39 columns

In [6]:

#测试单一特征和目标的关系
#print('无违约：',data_train[data_train['default_yes']==0].count())
#print('有违约：',data_train[data_train['default_yes']==1].count())
print(data_train['default_yes'].value_counts()) print(data_test['default_yes'].value_counts()) #data_train.groupby(["default_yes"], as_index=False)['y'].count()

0    24869
1      448
Name: default_yes, dtype: int64
0    24869
1      448
Name: default_yes, dtype: int64

Out[6]:

	job_admin.	job_blue-collar	job_entrepreneur	job_housemaid	job_management	job_retired	job_self-employed	job_services	job_student	job_technician	...	poutcome_other	poutcome_success	poutcome_unknown	age	balance	duration	campaign	pdays	previous	y
job_admin.	1.000000	-0.188846	-0.067402	-0.059086	-0.185311	-0.082905	-0.068534	-0.115037	-0.052838	-0.161626	...	0.013577	0.004200	-0.018840	-0.063839	-0.029366	-0.017629	-0.018559	0.021803	0.009821	0.000298
job_blue-collar	-0.188846	1.000000	-0.098047	-0.085951	-0.269568	-0.120600	-0.099695	-0.167341	-0.076863	-0.235113	...	-0.003148	-0.056453	0.025315	-0.044350	-0.056248	0.010505	0.009946	0.016488	-0.019208	-0.075065
job_entrepreneur	-0.067402	-0.098047	1.000000	-0.030677	-0.096212	-0.043044	-0.035583	-0.059726	-0.027433	-0.083915	...	-0.018659	-0.014969	0.013491	0.023331	0.010288	0.003927	-0.001803	-0.014705	-0.007958	-0.022519
job_housemaid	-0.059086	-0.085951	-0.030677	1.000000	-0.084342	-0.037733	-0.031193	-0.052357	-0.024049	-0.073562	...	-0.018467	-0.009511	0.029735	0.084754	0.008013	-0.001337	0.002692	-0.032321	-0.013129	-0.015041
job_management	-0.185311	-0.269568	-0.096212	-0.084342	1.000000	-0.118343	-0.097829	-0.164209	-0.075424	-0.230713	...	0.008288	0.025737	-0.019421	-0.027075	0.078719	-0.010090	0.016234	-0.003619	0.025946	0.035234
job_retired	-0.082905	-0.120600	-0.043044	-0.037733	-0.118343	1.000000	-0.043767	-0.073464	-0.033743	-0.103217	...	-0.001619	0.054668	-0.024616	0.451285	0.046370	0.026569	-0.031805	-0.003046	0.007511	0.083868
job_self-employed	-0.068534	-0.099695	-0.035583	-0.031193	-0.097829	-0.043767	1.000000	-0.060730	-0.027894	-0.085325	...	-0.002526	0.004632	0.000565	-0.009973	0.000782	0.002657	-0.003602	-0.007433	-0.004029	0.001078
job_services	-0.115037	-0.167341	-0.059726	-0.052357	-0.164209	-0.073464	-0.060730	1.000000	-0.046821	-0.143221	...	0.001367	-0.020796	0.005367	-0.060838	-0.036640	0.000364	-0.001615	0.011358	-0.006309	-0.026688
job_student	-0.052838	-0.076863	-0.027433	-0.024049	-0.075424	-0.033743	-0.027894	-0.046821	1.000000	-0.065784	...	0.030733	0.049948	-0.045026	-0.195720	0.000799	-0.005165	-0.021539	0.024643	0.014206	0.069058
job_technician	-0.161626	-0.235113	-0.083915	-0.073562	-0.230713	-0.103217	-0.085325	-0.143221	-0.065784	1.000000	...	-0.001704	-0.004072	0.011010	-0.063478	-0.015668	-0.011605	0.023601	-0.015579	-0.004059	-0.004942
job_unemployed	-0.060802	-0.088448	-0.031568	-0.027673	-0.086792	-0.038829	-0.032099	-0.053879	-0.024747	-0.075699	...	-0.012716	0.016013	0.009008	0.005462	0.013252	0.023554	-0.021663	-0.013660	-0.008230	0.023980
job_unknown	-0.029004	-0.042192	-0.015059	-0.013201	-0.041402	-0.018523	-0.015312	-0.025701	-0.011805	-0.036110	...	-0.016910	0.007256	0.011327	0.045026	0.015479	-0.003483	0.012938	-0.014763	-0.006241	0.001438
marital_divorced	0.027961	-0.062361	0.003040	0.016786	0.002196	0.053472	-0.017381	0.026199	-0.048590	0.007188	...	-0.001968	-0.002870	0.001999	0.165888	-0.028356	0.012815	-0.019830	0.003130	-0.004718	0.002723
marital_married	-0.056102	0.125532	0.044894	0.045362	-0.033545	0.073654	0.002060	-0.019572	-0.161869	-0.058949	...	-0.028606	-0.022959	0.028377	0.284516	0.026577	-0.022557	0.039452	-0.027329	-0.006380	-0.054746
marital_single	0.041159	-0.092241	-0.050951	-0.061204	0.034904	-0.117958	0.010081	0.002703	0.210381	0.058978	...	0.032488	0.026989	-0.032260	-0.426833	-0.008788	0.015434	-0.028825	0.027486	0.010278	0.057574
education_primary	-0.110105	0.348314	-0.011630	0.164128	-0.175814	0.119077	-0.040373	-0.058845	-0.042160	-0.161923	...	-0.004174	-0.033214	0.032773	0.194451	-0.026575	-0.000034	0.012495	-0.011621	-0.012038	-0.043154
education_secondary	0.220828	0.037604	-0.051630	-0.062505	-0.405359	-0.037429	-0.053990	0.200833	0.007825	0.155845	...	0.004079	-0.028471	0.002800	-0.093500	-0.074607	0.000568	-0.022185	0.017952	-0.011050	-0.038460
education_tertiary	-0.146154	-0.320429	0.061969	-0.055380	0.601275	-0.062459	0.095847	-0.170206	-0.024021	-0.036790	...	0.003128	0.050667	-0.030504	-0.083080	0.094686	-0.001067	0.011818	-0.006720	0.024955	0.066901
education_unknown	-0.021208	0.010760	0.008699	-0.012186	-0.041017	0.022015	-0.010919	-0.008502	0.110442	-0.014967	...	-0.009791	0.015287	0.003656	0.073640	0.018380	0.001066	0.006071	-0.008665	-0.007600	0.021087
default_yes	-0.005145	0.012717	0.029592	-0.007002	-0.008630	-0.008948	0.008743	-0.002526	-0.017596	-0.004049	...	-0.010326	-0.021432	0.038027	-0.019272	-0.068299	-0.011327	0.019978	-0.029440	-0.015293	-0.024608
housing_yes	0.043369	0.176937	0.017130	-0.074215	-0.063260	-0.159975	-0.023608	0.065284	-0.085328	-0.016506	...	0.032566	-0.096285	-0.060478	-0.187364	-0.068780	0.002778	-0.024708	0.121740	0.032667	-0.143589
loan_yes	0.032612	0.012896	0.040955	-0.012334	-0.032051	-0.016304	-0.006878	0.036603	-0.058082	0.009240	...	-0.011531	-0.053573	0.035315	-0.016286	-0.085854	-0.011356	0.020537	-0.024458	-0.006240	-0.065231
contact_cellular	-0.002431	-0.128760	-0.003751	-0.018765	0.101878	-0.010661	0.012462	-0.029756	0.027596	0.055623	...	0.107764	0.104342	-0.263887	-0.072573	0.015821	0.018666	-0.027461	0.225438	0.122062	0.134791
contact_telephone	-0.012570	-0.002537	-0.012075	0.044074	-0.031565	0.105808	0.001363	-0.015583	0.026084	-0.037147	...	0.025071	0.009642	-0.026306	0.174284	0.042785	-0.015570	0.056106	0.017672	0.021314	0.020747
contact_unknown	0.009411	0.137290	0.010535	-0.004194	-0.090346	-0.046364	-0.013896	0.039893	-0.043332	-0.038483	...	-0.127399	-0.115385	0.292862	-0.018304	-0.039998	-0.011223	-0.001567	-0.247577	-0.140445	-0.153572
poutcome_failure	0.012266	0.002967	0.003890	-0.019621	0.004027	0.000278	-0.001732	0.004389	0.007463	-0.010275	...	-0.073107	-0.064271	-0.734653	-0.006166	0.012700	-0.019398	-0.089085	0.704495	0.313898	0.011927
poutcome_other	0.013577	-0.003148	-0.018659	-0.018467	0.008288	-0.001619	-0.002526	0.001367	0.030733	-0.001704	...	1.000000	-0.038796	-0.443453	-0.021450	0.008611	-0.002584	-0.021604	0.384397	0.295747	0.038399
poutcome_success	0.004200	-0.056453	-0.014969	-0.009511	0.025737	0.054668	0.004632	-0.020796	0.049948	-0.004072	...	-0.038796	1.000000	-0.389856	0.039246	0.031758	0.045017	-0.058443	0.223025	0.174036	0.305806
poutcome_unknown	-0.018840	0.025315	0.013491	0.029735	-0.019421	-0.024616	0.000565	0.005367	-0.045026	0.011010	...	-0.443453	-0.389856	1.000000	-0.002015	-0.029327	-0.003872	0.109688	-0.868084	-0.485981	-0.170697
age	-0.063839	-0.044350	0.023331	0.084754	-0.027075	0.451285	-0.009973	-0.060838	-0.195720	-0.063478	...	-0.021450	0.039246	-0.002015	1.000000	0.093740	0.000416	0.006171	-0.026431	0.006575	0.029916
balance	-0.029366	-0.056248	0.010288	0.008013	0.078719	0.046370	0.000782	-0.036640	0.000799	-0.015668	...	0.008611	0.031758	-0.029327	0.093740	1.000000	0.026042	-0.010419	0.001032	0.015792	0.057564
duration	-0.017629	0.010505	0.003927	-0.001337	-0.010090	0.026569	0.002657	0.000364	-0.005165	-0.011605	...	-0.002584	0.045017	-0.003872	0.000416	0.026042	1.000000	-0.087780	0.000040	0.001315	0.394746
campaign	-0.018559	0.009946	-0.001803	0.002692	0.016234	-0.031805	-0.003602	-0.001615	-0.021539	0.023601	...	-0.021604	-0.058443	0.109688	0.006171	-0.010419	-0.087780	1.000000	-0.089224	-0.031667	-0.075173
pdays	0.021803	0.016488	-0.014705	-0.032321	-0.003619	-0.003046	-0.007433	0.011358	0.024643	-0.015579	...	0.384397	0.223025	-0.868084	-0.026431	0.001032	0.000040	-0.089224	1.000000	0.411688	0.107565
previous	0.009821	-0.019208	-0.007958	-0.013129	0.025946	0.007511	-0.004029	-0.006309	0.014206	-0.004059	...	0.295747	0.174036	-0.485981	0.006575	0.015792	0.001315	-0.031667	0.411688	1.000000	0.088337
y	0.000298	-0.075065	-0.022519	-0.015041	0.035234	0.083868	0.001078	-0.026688	0.069058	-0.004942	...	0.038399	0.305806	-0.170697	0.029916	0.057564	0.394746	-0.075173	0.107565	0.088337	1.000000

36 rows × 36 columns

In [8]:

#违约记录&订购理财的关系
fig = plt.figure() fig.set(alpha=0.2) # 设定图表颜色alpha参数 y_0 = data_atrain.default_yes[data_train.y == 0].value_counts() y_1 = data_train.default_yes[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"buy or not") plt.xlabel(u"default") plt.ylabel(u"counts") plt.show()

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-a047910fcfb8> in <module>
 2 fig = plt.figure()  3 fig.set(alpha=0.2) # 设定图表颜色alpha参数 ----> 4 y_0 = data_atrain.default_yes[data_train.y == 0].value_counts()  5 y_1 = data_train.default_yes[data_train.y == 1].value_counts()  6 df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) NameError: name 'data_atrain' is not defined

<Figure size 432x288 with 0 Axes>

In [9]:

#住房贷款&订购理财的关系
#可以看出没有房贷购买理财的比例稍微高一些，但不明显，可能是还房贷的人资金压力稍大
fig = plt.figure() fig.set(alpha=0.2) # 设定图表颜色alpha参数 y_0 = data_train.housing_yes[data_train.y == 0].value_counts() y_1 = data_train.housing_yes[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"buy or not") plt.xlabel(u"housing") plt.ylabel(u"counts") plt.show() #发现没有违约的人买理财比例略高

<Figure size 432x288 with 0 Axes>

In [19]:

#个人贷款&订购理财的关系
#可以看出两种情况差别不大
fig = plt.figure() fig.set(alpha=0.2) # 设定图表颜色alpha参数 y_0 = data_train.loan_yes[data_train.y == 0].value_counts() y_1 = data_train.loan_yes[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"buy or not") plt.xlabel(u"loan") plt.ylabel(u"counts") plt.show() data_train[["loan_yes", "y"]].groupby(['loan_yes'], as_index=False).mean().sort_values(by='y', ascending=False) #可以看出12.6%的无个人贷的人买了理财，有贷款的只有6.89%买了理财 #说明无个贷买理财的机会比较大

<Figure size 432x288 with 0 Axes>

Out[19]:

	loan_yes	y
0	0	0.126117
1	1	0.068983

In [7]:

#使用直方图来看看那个区段年龄的人最多购买或不购买
g = sns.FacetGrid(data_train, col='y') g.map(plt.hist, 'age', bins=20) plt.show() #貌似看不出什么问题，只能说明买理财的年龄不大集中，不买的集中在30-40岁之间

In [8]:

#使用直方图来看看“距离上次活动最后一次联系该客户，过去了多久”的人最多购买或不购买
#看来是时间越短，购买率越高，说明pdays是相当重要的指标
g = sns.FacetGrid(data_train, col='y') g.map(plt.hist, 'pdays', bins=20) plt.show() #pdays指标让人读不懂，以后重点解决

In [9]:

y = data_train['y'] X = data_train[data_train.columns[: -1]] X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25317 entries, 0 to 25316
Data columns (total 51 columns):
month_apr              25317 non-null uint8
month_aug              25317 non-null uint8
month_dec              25317 non-null uint8
month_feb              25317 non-null uint8
month_jan              25317 non-null uint8
month_jul              25317 non-null uint8
month_jun              25317 non-null uint8
month_mar              25317 non-null uint8
month_may              25317 non-null uint8
month_nov              25317 non-null uint8
month_oct              25317 non-null uint8
month_sep              25317 non-null uint8
job_admin.             25317 non-null uint8
job_blue-collar        25317 non-null uint8
job_entrepreneur       25317 non-null uint8
job_housemaid          25317 non-null uint8
job_management         25317 non-null uint8
job_retired            25317 non-null uint8
job_self-employed      25317 non-null uint8
job_services           25317 non-null uint8
job_student            25317 non-null uint8
job_technician         25317 non-null uint8
job_unemployed         25317 non-null uint8
job_unknown            25317 non-null uint8
marital_divorced       25317 non-null uint8
marital_married        25317 non-null uint8
marital_single         25317 non-null uint8
education_primary      25317 non-null uint8
education_secondary    25317 non-null uint8
education_tertiary     25317 non-null uint8
education_unknown      25317 non-null uint8
default_no             25317 non-null uint8
default_yes            25317 non-null uint8
housing_no             25317 non-null uint8
housing_yes            25317 non-null uint8
loan_no                25317 non-null uint8
loan_yes               25317 non-null uint8
contact_cellular       25317 non-null uint8
contact_telephone      25317 non-null uint8
contact_unknown        25317 non-null uint8
poutcome_failure       25317 non-null uint8
poutcome_other         25317 non-null uint8
poutcome_success       25317 non-null uint8
poutcome_unknown       25317 non-null uint8
ID                     25317 non-null int64
age                    25317 non-null int64
balance                25317 non-null int64
duration               25317 non-null int64
campaign               25317 non-null int64
pdays                  25317 non-null int64
previous               25317 non-null int64
dtypes: int64(7), uint8(44)
memory usage: 2.4 MB

In [ ]:

#查看相关矩阵，连带y也作为指标
#data_train.corr()
#查看相关矩阵热图
#colormap = plt.cm.RdBu
#plt.figure(figsize=(39,37)) #plt.title('Correlation of Features', y=1.05, size=37) #sns.heatmap(data_train.astype(float).corr(),linewidths=0.1,vmax=1.0, # square=True, cmap=colormap, linecolor='white', annot=True) #plt.show()

In [11]:

print("数值处理3：数值指标Scaler变换")
scaler = StandardScaler() X = scaler.fit_transform(X) data_test = scaler.fit_transform(data_test) #数据分割，用于测试 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=90)

数值处理3：数值指标Scaler变换

In [12]:

# print('决策树')
# clf = DecisionTreeClassifier(random_state=11)
# clf.fit(X_train, y_train)
# predictions = clf.predict(X_test)
# print(classification_report(y_test, predictions)) # print(cross_val_score(clf,X_train, y_train,scoring='f1')) # print(cross_val_score(clf,X_test, y_test,scoring='f1')) # print(clf.score(X_test, y_test)) # # y_predprob = clf.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions)) # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) # # print('随机森林') # clf = RandomForestClassifier(n_estimators=10, random_state=11) # clf.fit(X_train, y_train) # predictions = clf.predict(X_test) # print(classification_report(y_test, predictions)) # print(cross_val_score(clf,X_train, y_train,scoring='f1')) # print(cross_val_score(clf,X_test, y_test,scoring='f1')) # print(clf.score(X_test, y_test)) # y_predprob = clf.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions)) # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) # print('逻辑回归') # clf = LogisticRegression() # clf.fit(X_train, y_train) # predictions = clf.predict(X_test) # print(classification_report(y_test, predictions)) # print(cross_val_score(clf,X_train, y_train,scoring='f1')) # print(cross_val_score(clf,X_test, y_test,scoring='f1')) # print(clf.score(X_test, y_test)) # y_predprob = clf.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions)) # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) print('AdaBoost') adaBoost = AdaBoostClassifier(n_estimators=50, random_state=11) adaBoost.fit(X_train, y_train) predictionsByadaBoost = adaBoost.predict(X_test) print(classification_report(y_test, predictionsByadaBoost)) print(cross_val_score(adaBoost,X_train, y_train,scoring='f1')) print(cross_val_score(adaBoost,X_test, y_test,scoring='f1')) print(adaBoost.score(X_test, y_test)) pred = adaBoost.predict_proba(X_test) dataPred = pd.DataFrame(pred, columns=['pred0', 'pred']) dataPred.drop('pred0', axis=1, inplace=True) print(dataPred) y_predprob = adaBoost.predict_proba(X_test) y_predprob = y_predprob[:, 1] predictions_train = adaBoost.predict(X_train) y_predprob_train = adaBoost.predict_proba(X_train) y_predprob_train = y_predprob_train[:, 1] print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictionsByadaBoost)) print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) print("Accuracy y_train : %.4g" % metrics.accuracy_score(y_train, predictions_train)) print("AUC Score (Train): %f" % metrics.roc_auc_score(y_train, y_predprob_train)) ID = list(range(25318,36170)) submission = pd.DataFrame(ID) submission.rename(columns = {0: 'ID'}, inplace = True) # 将pred_y从array转化成DataFrame y_predprob_test = adaBoost.predict_proba(data_test) y_predprob_test = y_predprob_test[:, 1] y_predprob_DataFrame = pd.DataFrame(y_predprob_test) submission['pred'] =y_predprob_DataFrame submission.to_csv('Result.csv', index = False)

AdaBoost
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2249
           1       1.00      1.00      1.00       283

    accuracy                           1.00      2532
   macro avg       1.00      1.00      1.00      2532
weighted avg       1.00      1.00      1.00      2532

[1.         1.         0.99943915]
[1. 1. 1.]
1.0
              pred
0     2.220446e-16
1     1.000000e+00
2     2.220446e-16
3     2.220446e-16
4     2.220446e-16
5     2.220446e-16
6     2.220446e-16
7     2.220446e-16
8     2.220446e-16
9     2.220446e-16
10    2.220446e-16
11    2.220446e-16
12    2.220446e-16
13    2.220446e-16
14    2.220446e-16
15    2.220446e-16
16    2.220446e-16
17    2.220446e-16
18    2.220446e-16
19    2.220446e-16
20    2.220446e-16
21    2.220446e-16
22    2.220446e-16
23    2.220446e-16
24    2.220446e-16
25    2.220446e-16
26    2.220446e-16
27    2.220446e-16
28    2.220446e-16
29    2.220446e-16
...            ...
2502  2.220446e-16
2503  2.220446e-16
2504  2.220446e-16
2505  2.220446e-16
2506  2.220446e-16
2507  2.220446e-16
2508  2.220446e-16
2509  2.220446e-16
2510  2.220446e-16
2511  2.220446e-16
2512  2.220446e-16
2513  2.220446e-16
2514  2.220446e-16
2515  2.220446e-16
2516  2.220446e-16
2517  2.220446e-16
2518  2.220446e-16
2519  2.220446e-16
2520  1.000000e+00
2521  2.220446e-16
2522  2.220446e-16
2523  2.220446e-16
2524  2.220446e-16
2525  2.220446e-16
2526  2.220446e-16
2527  2.220446e-16
2528  2.220446e-16
2529  2.220446e-16
2530  1.000000e+00
2531  1.000000e+00

[2532 rows x 1 columns]
Accuracy : 1
AUC Score (Test): 1.000000
Accuracy y_train : 1
AUC Score (Train): 1.000000

/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.
  warnings.warn(CV_WARNING, FutureWarning)
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.
  warnings.warn(CV_WARNING, FutureWarning)