失眠网 > 信贷违约风险评估预测-kaggle项目

信贷违约风险评估预测-kaggle项目

时间：2022-05-06 18:29:37

相关推荐

信贷违约风险评估预测-kaggle项目

kaggle原案例

目标：为了确保贷款的安全性，需要对客户的信用或者还款能力进行评估

数据导入并预览

import pandas as pd

# Load the Home Credit data set straight from the course file server.
# NOTE(review): the host part of this URL ("labfile.") looks truncated by the
# page scraper and will not resolve as written -- confirm the real host
# before running.
df = pd.read_csv("https://labfile./courses/1363/HomeCredit.csv")

# Quick preview of the data. These were notebook cells: when run together in
# one cell only the last expression is displayed.
df.head()
df.describe()
df.shape
df.columns

数据可视化分析

查看贷款金额的分布情况

import matplotlib.pyplot as pltimport seaborn as snsimport warningswarnings.filterwarnings("ignore")%matplotlib inlineplt.figure(figsize=(12, 5))plt.title("Distribution of AMT_CREDIT")ax = sns.distplot(df["AMT_CREDIT"]) # 画出数据分布图#同样的，查看收入情况的分布情况plt.figure(figsize=(12, 5))plt.title("Distribution of AMT_INCOME_TOTAL")# 画出数据分布图ax = sns.distplot(df["AMT_INCOME_TOTAL"].dropna())#如果贷款的对象是货物的话，看一下这些货物的价格分布。plt.figure(figsize=(12,5))plt.title('Distribution of AMT_GOODS_PRICE')ax = sns.distplot(df['AMT_GOODS_PRICE'].dropna())

import plotly.offline as offline
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot

# Enable plotly rendering inside the notebook.
# NOTE(review): init_notebook_mode is called twice, once with connected=True
# and once with the default arguments; the second call presumably overrides
# the first -- confirm which mode is actually wanted.
init_notebook_mode(connected=True)
offline.init_notebook_mode()

查看陪同人员情况

# Share of each NAME_TYPE_SUITE value (who accompanied the client).
temp = df["NAME_TYPE_SUITE"].value_counts()

# Bar chart with the counts converted to percentages.
trace = [go.Bar(x=temp.index, y=(temp / temp.sum())*100,)]

# Titles, fonts and colours of the figure.
layout = go.Layout(
    title="Who accompanied client when applying for the application in % ",
    xaxis=dict(
        title='Name of type of the Suite',
        tickfont=dict(size=14, color='rgb(107, 107, 107)'),
    ),
    yaxis=dict(
        title='Count of Name of type of the Suite in %',
        titlefont=dict(size=16, color='rgb(107, 107, 107)'),
        tickfont=dict(size=14, color='rgb(107, 107, 107)'),
    ),
)
fig = go.Figure(data=trace, layout=layout)
iplot(fig, filename='schoolStateNames')
# Author's conclusion: almost 80% of applicants came unaccompanied; only a
# small share were accompanied by family or a partner.

申请人的还款能力

# Class balance of the TARGET column.
temp = df["TARGET"].value_counts()

# Pie chart of the two classes.
trace = [go.Pie(labels=temp.index, values=temp.values)]
# Figure title.
layout = go.Layout(title='Loan Repayed or not',)
# Show the figure.
fig = go.Figure(data=trace, layout=layout)
iplot(fig)
# Author's conclusion: more than 90% of applicants fall into a single class,
# which the original write-up describes as "no repayment ability".
# NOTE(review): confirm the meaning of TARGET = 0 / 1 against the data set
# documentation before relying on that reading.

查看贷款类型（合同类型），做环形图

查看贷款类型

# Counts per loan (contract) type.
temp = df["NAME_CONTRACT_TYPE"].value_counts()

# Donut chart (pie with a hole).
trace = [go.Pie(labels=temp.index, values=temp.values, hole=0.6)]
# Figure title.
layout = go.Layout(title='Types of loan',)
# Show the figure.
fig = go.Figure(data=trace, layout=layout)
iplot(fig)
# Author's notes: "Revolving loans" are recurring loans, similar to
# instalment credit; "Cash loans" are cash loans. More than 90% of the
# applications are cash loans.

查看申请人贷款的目的

# Counts of the FLAG_OWN_CAR and FLAG_OWN_REALTY flags.
temp1 = df["FLAG_OWN_CAR"].value_counts()
temp2 = df["FLAG_OWN_REALTY"].value_counts()

# Two donut charts side by side: car flag on the left, realty on the right.
trace = [
    go.Pie(labels=temp1.index, values=temp1.values,
           domain={"x": [0, .48]}, hole=0.6),
    go.Pie(labels=temp2.index, values=temp2.values,
           domain={"x": [0.5, 1]}, hole=0.6),
]

# Title plus one text annotation inside each donut.
# NOTE(review): the title says "Purpose of loan" but the column names suggest
# these are ownership flags rather than loan purposes -- confirm and retitle
# if needed.
layout = go.Layout(
    title='Purpose of loan',
    annotations=[
        {
            "font": {"size": 20},
            "showarrow": False,
            "text": "Own Car",
            "x": 0.15,
            "y": 0.5,
        },
        {
            "font": {"size": 20},
            "showarrow": False,
            "text": "Own Realty",
            "x": 0.85,
            "y": 0.5,
        },
    ],
)
# Show the figure.
fig = go.Figure(data=trace, layout=layout)
iplot(fig)
# Author's conclusion: close to 34% of applicants are associated with a car
# and about 30% with real estate.

查看申请人的收入来源。

# Counts per income source.
temp = df["NAME_INCOME_TYPE"].value_counts()

# Donut chart of the income sources.
trace = [go.Pie(labels=temp.index, values=temp.values, hole=0.4)]
# Figure title.
layout = go.Layout(title='Income sources of Applicant',)
# Show the figure.
fig = go.Figure(data=trace, layout=layout)
iplot(fig)
# Author's conclusion: 52.1% of applicants earn their income from working,
# 23.5% are commercial associates and 18% live mainly on a pension.

查看申请人的婚姻状况

# Counts per family (marital) status.
temp = df["NAME_FAMILY_STATUS"].value_counts()

# Pie chart of the family status.
trace = [go.Pie(labels=temp.index, values=temp.values)]
# Figure title.
layout = go.Layout(title='Family Status of Applicant',)
# Show the figure.
fig = go.Figure(data=trace, layout=layout)
iplot(fig)
# Author's conclusion: 63.7% of applicants are married and 14.7% are single
# or not married.

查看申请者的职业

# Counts per occupation.
temp = df["OCCUPATION_TYPE"].value_counts()

# Bar chart of the occupations.
trace = [go.Bar(x=temp.index, y=temp.values)]
# Figure title.
layout = go.Layout(title='Occupation of Applicant',)
# Show the figure.
fig = go.Figure(data=trace, layout=layout)
iplot(fig)
# Author's conclusion: labourers are the largest group, followed by sales
# staff and so on.

查看一下申请人的受教育情况

# Counts per education level.
temp = df["NAME_EDUCATION_TYPE"].value_counts()

# Donut chart of the education levels.
trace = [go.Pie(labels=temp.index, values=temp.values, hole=0.5)]
# Figure title.
layout = go.Layout(title='Education of Applicant',)
# Show the figure.
fig = go.Figure(data=trace, layout=layout)
iplot(fig)
# Author's conclusion: 71.5% of applicants have secondary education and 24%
# have higher education.

看这些申请人的房子类型

# Counts per housing type.
temp = df["NAME_HOUSING_TYPE"].value_counts()

# Pie chart of the housing types.
trace = [go.Pie(labels=temp.index, values=temp.values)]
# Figure title. The original reused 'Loan Repayed or not' from the TARGET
# chart, which mislabelled this housing-type figure.
layout = go.Layout(title='Housing type of Applicant',)
# Show the figure.
fig = go.Figure(data=trace, layout=layout)
iplot(fig)
# Author's conclusion: 88.7% of applicants own a house or live in an
# apartment, and 4.54% live with their parents.

上面通过可视化观察了数据集中的一些基本信息。接下来分析各个特征与偿还能力（TARGET）之间的关系，首先看收入来源与偿还能力的关系。

import numpy as np

# Income source versus TARGET.
temp = df["NAME_INCOME_TYPE"].value_counts()

# Count TARGET == 0 / TARGET == 1 per income source in a single pass instead
# of scanning the whole frame twice for every category. Rows are put in the
# same order as `temp` so the bars line up with the x axis labels.
target_counts = pd.crosstab(df["NAME_INCOME_TYPE"], df["TARGET"]).reindex(
    index=temp.index, columns=[0, 1], fill_value=0
)
# The original author labels TARGET == 0 as "cannot repay" and TARGET == 1 as
# "can repay".
# NOTE(review): confirm that reading against the data set documentation.
temp_y0 = target_counts[0].values
temp_y1 = target_counts[1].values

# Grouped bar chart: share of all applicants per class, plus the TARGET == 1
# rate inside each category.
trace = [
    go.Bar(x=temp.index, y=(temp_y1 / temp.sum()) * 100, name='YES'),
    go.Bar(x=temp.index, y=(temp_y0 / temp.sum()) * 100, name='NO'),
    go.Bar(x=temp.index, y=(temp_y1 / (temp_y0+temp_y1)) * 100, name='RATE'),
]

# Titles and fonts.
layout = go.Layout(
    title="Income sources of Applicant's in terms of loan is repayed or not in %",
    xaxis=dict(
        title='Income source',
        tickfont=dict(size=14, color='rgb(107, 107, 107)'),
    ),
    yaxis=dict(
        title='Count in %',
        titlefont=dict(size=16, color='rgb(107, 107, 107)'),
        tickfont=dict(size=14, color='rgb(107, 107, 107)'),
    ),
)
# Show the figure.
fig = go.Figure(data=trace, layout=layout)
iplot(fig)
# Author's reading: YES / NO are the shares of TARGET == 1 / TARGET == 0
# applicants, and RATE is the TARGET == 1 fraction within a category; e.g. a
# higher RATE for "Working" means an applicant whose income comes from
# working is more likely to be in the TARGET == 1 class.

婚姻状况与是否有偿还能力的关系

# Family (marital) status versus TARGET.
temp = df["NAME_FAMILY_STATUS"].value_counts()

# Count TARGET == 0 / TARGET == 1 per family status in a single pass instead
# of scanning the whole frame twice for every category. Rows are put in the
# same order as `temp` so the bars line up with the x axis labels.
target_counts = pd.crosstab(df["NAME_FAMILY_STATUS"], df["TARGET"]).reindex(
    index=temp.index, columns=[0, 1], fill_value=0
)
# The original author labels TARGET == 0 as "cannot repay" and TARGET == 1 as
# "can repay".
# NOTE(review): confirm that reading against the data set documentation.
temp_y0 = target_counts[0].values
temp_y1 = target_counts[1].values

# Grouped bar chart: share of all applicants per class, plus the TARGET == 1
# rate inside each category.
trace = [
    go.Bar(x=temp.index, y=(temp_y1 / temp.sum()) * 100, name='YES'),
    go.Bar(x=temp.index, y=(temp_y0 / temp.sum()) * 100, name='NO'),
    go.Bar(x=temp.index, y=(temp_y1 / (temp_y0+temp_y1)) * 100, name='RATE'),
]

# Titles and fonts.
layout = go.Layout(
    title="Family Status of Applicant's in terms of loan is repayed or not in %",
    xaxis=dict(
        title='Family Status',
        tickfont=dict(size=14, color='rgb(107, 107, 107)'),
    ),
    yaxis=dict(
        title='Count in %',
        titlefont=dict(size=16, color='rgb(107, 107, 107)'),
        tickfont=dict(size=14, color='rgb(107, 107, 107)'),
    ),
)
# Show the figure.
fig = go.Figure(data=trace, layout=layout)
iplot(fig)
# Author's conclusion: repayment ability seems unrelated to family status.

同样的，看申请者职业与偿还能力的关系

# Occupation versus TARGET.
temp = df["OCCUPATION_TYPE"].value_counts()

# Count TARGET == 0 / TARGET == 1 per occupation in a single pass instead of
# scanning the whole frame twice for every category. Rows are put in the same
# order as `temp` so the bars line up with the x axis labels.
target_counts = pd.crosstab(df["OCCUPATION_TYPE"], df["TARGET"]).reindex(
    index=temp.index, columns=[0, 1], fill_value=0
)
# The original author labels TARGET == 0 as "cannot repay" and TARGET == 1 as
# "can repay".
# NOTE(review): confirm that reading against the data set documentation.
temp_y0 = target_counts[0].values
temp_y1 = target_counts[1].values

# Grouped bar chart: share of all applicants per class, plus the TARGET == 1
# rate inside each category.
trace = [
    go.Bar(x=temp.index, y=(temp_y1 / temp.sum()) * 100, name='YES'),
    go.Bar(x=temp.index, y=(temp_y0 / temp.sum()) * 100, name='NO'),
    go.Bar(x=temp.index, y=(temp_y1 / (temp_y0+temp_y1)) * 100, name='RATE'),
]

# Titles and fonts; the figure is widened because there are many categories.
layout = go.Layout(
    title="Occupation of Applicant's in terms of loan is repayed or not in %",
    width=1000,
    xaxis=dict(
        title='Occupation of Applicant\'s',
        tickfont=dict(size=14, color='rgb(107, 107, 107)'),
    ),
    yaxis=dict(
        title='Count in %',
        titlefont=dict(size=16, color='rgb(107, 107, 107)'),
        tickfont=dict(size=14, color='rgb(107, 107, 107)'),
    ),
)
# Show the figure.
fig = go.Figure(data=trace, layout=layout)
iplot(fig)
# Author's conclusion: occupations such as managers and core staff show a
# lower RATE, while labourers, drivers and similar occupations are somewhat
# higher.

预测模型

删除掉存在缺失值的特征列

# Drop every feature column that contains at least one missing value.
# An explicit copy is taken because the next step assigns encoded columns
# back into df_drop; this keeps that write independent of the raw frame.
df_drop = df.dropna(axis=1).copy()
df_drop.head()

编码特征

from sklearn import preprocessing

# Collect the non-numeric (object dtype) columns.
categorical_feats = [f for f in df_drop.columns if df_drop[f].dtype == 'object']

# Replace each non-numeric column by integer codes. A fresh encoder is used
# per column, and fit_transform avoids building the stringified value list
# twice as the separate fit() / transform() calls did.
for col in categorical_feats:
    lb = preprocessing.LabelEncoder()
    df_drop[col] = lb.fit_transform(df_drop[col].values.astype('str'))

# Inspect the encoded result.
df_drop.head()

划分数据

# SK_ID_CURR is the customer ID, so it is removed from the data.
df_drop1 = df_drop.drop('SK_ID_CURR', axis=1)

# Separate the training features from the target value. The target is the
# applicant's repayment ability, stored in the TARGET column.
data_X = df_drop1.drop("TARGET", axis=1)
data_y = df_drop1['TARGET']

划分数据集为训练数据集和测试数据集。因为数据集较大，只取了 20% 的数据来作为训练集。

from sklearn import model_selection

# Hold out 80% of the rows for testing, i.e. train on only 20% of the data;
# the fixed random_state keeps the split reproducible.
# NOTE(review): the split is not stratified on the target; consider
# stratify=data_y.values if the classes are imbalanced.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    data_X.values,
    data_y.values,
    test_size=0.8,
    random_state=0,
)

随机森林

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

# Build and train the model.
model = RandomForestClassifier()
model.fit(train_x, train_y)

# Measure the accuracy on the held-out test set. Arguments follow the
# scikit-learn (y_true, y_pred) convention; accuracy itself is symmetric, so
# the value is the same as with the original swapped order.
y_pred = model.predict(test_x)
metrics.accuracy_score(test_y, y_pred)

# Use sklearn's classification report for a fuller evaluation.
# The true labels must come first: with the original (y_pred, test_y) order
# the reported precision and recall were swapped.
print(metrics.classification_report(test_y, y_pred))

# Analyse the feature importances.
features = data_X.columns.values  # column names, i.e. the feature names

# Pair each importance with its feature name and sort ascending, so the most
# important feature ends up at the top of the horizontal bar chart.
ranked = sorted(zip(model.feature_importances_, features), reverse=False)
x, y = (list(values) for values in zip(*ranked))

# Horizontal bar chart, coloured by importance.
trace2 = go.Bar(
    x=x,
    y=y,
    marker=dict(color=x, colorscale='Viridis', reversescale=True),
    name='Random Forest Feature importance',
    orientation='h',
)

# Title, size and margins; the wide left margin leaves room for the feature
# names.
layout = dict(
    title='Barplot of Feature importances',
    width=900,
    height=2000,
    yaxis=dict(showgrid=False, showline=False, showticklabels=True,),
    margin=dict(l=300,),
)

# Show the figure.
fig1 = go.Figure(data=[trace2], layout=layout)
iplot(fig1, filename='plots')

使用多种模型进行预测，并比较它们的效果

from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# Build the 7 algorithms to compare.
models = [
    LogisticRegression(solver='lbfgs'),        # logistic regression
    RandomForestClassifier(n_estimators=100),  # random forest
    DecisionTreeClassifier(),                  # decision tree
    MLPClassifier(max_iter=100),               # multi-layer perceptron
    AdaBoostClassifier(),                      # adaptive boosting
    BaggingClassifier(),                       # bagging
    GradientBoostingClassifier(),              # gradient boosting
]
model_name = [
    'LogisticRegression',
    'RandomForestClassifier',
    "DecisionTreeClassifier",
    'MLPClassifier',
    'AdaBoostClassifier',
    'BaggingClassifier',
    'GradientBoostingClassifier',
]

acc = []     # accuracy of each algorithm
f1 = []      # f1 score of each algorithm
recall = []  # recall of each algorithm

# Train and evaluate every algorithm. The loop variable is deliberately not
# called `model`, so the random forest fitted earlier is not overwritten.
for clf in models:
    clf.fit(train_x, train_y)
    # Predict once and reuse the predictions for all three metrics.
    y_pred = clf.predict(test_x)
    acc.append(metrics.accuracy_score(test_y, y_pred))
    # scikit-learn metrics take (y_true, y_pred). The original passed them
    # swapped, which made the "recall" column actually hold the precision.
    f1.append(metrics.f1_score(test_y, y_pred))
    recall.append(metrics.recall_score(test_y, y_pred))

# Show the evaluation results of every algorithm.
pd.DataFrame({"name": model_name, "acc": acc, "f1": f1, "recall": recall})
# Author's conclusion: apart from DecisionTreeClassifier and MLPClassifier,
# most algorithms reach an accuracy above 90%.

如果觉得《信贷违约风险评估预测-kaggle项目》对你有帮助，请点赞、收藏，并留下你的观点哦！

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。