失眠网 > 基于Kaggle心脏病数据集的数据分析和分类预测-StatisticalLearning统计学习实验报告

基于Kaggle心脏病数据集的数据分析和分类预测-StatisticalLearning统计学习实验报告

时间：2019-02-16 02:51:54

一、实验准备

本数据集来源于 Kaggle，共 303 个样本、14 个变量（13 个特征变量和 1 个目标变量 target），具体的变量说明如下表所示。

'''-*- coding: utf-8 -*-@Author: DouGang@E-mail: dorza@@Software : PyCharm, Python3.6@Time : -07-24'''

导入相关库

# 数据集特征分析相关库import pandas as pdimport matplotlib.pyplot as pltimport seaborn as sns# 数据集预处理相关库from sklearn.preprocessing import StandardScalerfrom sklearn.model_selection import train_test_split# K近邻算法相关库from sklearn.neighbors import KNeighborsClassifierfrom sklearn.model_selection import cross_val_scorefrom sklearn.metrics import precision_score,recall_score,f1_scorefrom sklearn.metrics import precision_recall_curve,roc_curve,average_precision_score,auc# 决策树相关库from sklearn.tree import DecisionTreeClassifier# 随机森林相关库from sklearn.ensemble import RandomForestClassifier# 逻辑回归相关库from sklearn.linear_model import LogisticRegression# SGD分类相关库from sklearn.linear_model import SGDClassifier

二、数据展示

# Select SimHei as the sans-serif font for every chart (several plots in
# this script use Chinese titles and axis labels).
plt.rcParams.update({'font.sans-serif': ['SimHei']})

# Load the heart-disease dataset and take a first look at it.
heart_df = pd.read_csv("./dataSet/heart.csv")
print(heart_df.shape)           # dimensions (rows, columns)
print(heart_df.head())          # first 5 rows
# DataFrame.info() prints its report itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None".
heart_df.info()
print(heart_df.describe())      # descriptive statistics
print(heart_df.isnull().sum())  # number of missing values per column

# Visual check for missing values.
sns.heatmap(heart_df.isnull())
plt.show()

# Pairwise relationships between all columns, colored by the target label.
sns.pairplot(heart_df, hue='target')
plt.show()

(303, 14)
   age  sex  cp  trestbps  chol  fbs  ...  exang  oldpeak  slope  ca  thal  target
0   63    1   3       145   233    1  ...      0      2.3      0   0     1       1
1   37    1   2       130   250    0  ...      0      3.5      0   0     2       1
2   41    0   1       130   204    0  ...      0      1.4      2   0     2       1
3   56    1   1       120   236    0  ...      0      0.8      2   0     2       1
4   57    0   0       120   354    0  ...      1      0.6      2   0     2       1

[5 rows x 14 columns]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   age       303 non-null    int64
 1   sex       303 non-null    int64
 2   cp        303 non-null    int64
 3   trestbps  303 non-null    int64
 4   chol      303 non-null    int64
 5   fbs       303 non-null    int64
 6   restecg   303 non-null    int64
 7   thalach   303 non-null    int64
 8   exang     303 non-null    int64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64
 11  ca        303 non-null    int64
 12  thal      303 non-null    int64
 13  target    303 non-null    int64
dtypes: float64(1), int64(13)
memory usage: 33.3 KB

              age         sex          cp  ...          ca        thal      target
count  303.000000  303.000000  303.000000  ...  303.000000  303.000000  303.000000
mean    54.366337    0.683168    0.966997  ...    0.729373    2.313531    0.544554
std      9.082101    0.466011    1.032052  ...    1.022606    0.612277    0.498835
min     29.000000    0.000000    0.000000  ...    0.000000    0.000000    0.000000
25%     47.500000    0.000000    0.000000  ...    0.000000    2.000000    0.000000
50%     55.000000    1.000000    1.000000  ...    0.000000    2.000000    1.000000
75%     61.000000    1.000000    2.000000  ...    1.000000    3.000000    1.000000
max     77.000000    1.000000    3.000000  ...    4.000000    3.000000    1.000000

[8 rows x 14 columns]
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

# Heat map of the missing-value mask (True where a cell is null).
missing_mask = heart_df.isnull()
sns.heatmap(missing_mask)
plt.show()

# Scatter-plot matrix of every column pair, colored by the target label.
sns.pairplot(data=heart_df, hue='target')
plt.show()

三、数据的描述性信息

# Correlation coefficients between the variables, drawn as an annotated
# heat map with one decimal per cell.
correlations = heart_df.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(correlations, annot=True, fmt='.1f')
plt.show()

# Age distribution of the samples: one bar per distinct age value.
age_counts = heart_df.age.value_counts()
sns.barplot(x=age_counts.index, y=age_counts.values)
plt.xlabel('Age')
plt.ylabel('Age Counter')
plt.title('Age Analysis System')
plt.show()

# Minimum, maximum and mean of the age column.
minage = min(heart_df.age)
maxage = max(heart_df.age)
meanage = round(heart_df.age.mean(), 2)
print('最小年龄:', minage)
print('最大年龄:', maxage)
print('平均年龄:', meanage)

# Turn the continuous age into a categorical "age state".
# The column is created with object dtype (placeholder 0 for rows that match
# no range) so that string labels can be stored without a dtype conflict.
heart_df['age_states'] = pd.Series(0, index=heart_df.index, dtype=object)
# Write through a single .loc call per range. The chained form
# heart_df['age_states'][mask] = ... may assign into a temporary copy and
# silently leave heart_df unchanged.
age = heart_df['age']
heart_df.loc[(age >= 29) & (age < 40), 'age_states'] = 'young ages'
heart_df.loc[(age >= 40) & (age < 55), 'age_states'] = 'middle ages'
heart_df.loc[(age >= 55) & (age <= 77), 'age_states'] = 'old ages'

# Number of samples in each age range.
print(heart_df['age_states'].value_counts())

# countplot arguments: x / y name the column drawn along that axis,
# order fixes the left-to-right order of the categories,
# hue splits each bar by a second column and hue_order fixes that order.
sns.countplot(x='age_states', data=heart_df,
              order=['young ages', 'middle ages', 'old ages'])
plt.xlabel('Age Range')
plt.ylabel('Age Counts')
plt.title('Age State in Dataset')
plt.show()

最小年龄: 29
最大年龄: 77
平均年龄: 54.37
old ages       159
middle ages    128
young ages      16
Name: age_states, dtype: int64

# Observation from the age-range chart: the number of samples grows with
# age - 16 young, 128 middle-aged and 159 old samples.

# Share of each sex in the samples (0 = female, 1 = male).
sex_counts = heart_df['sex'].value_counts()
print(sex_counts)
sns.countplot(data=heart_df, y='sex')
plt.title('Sex Count in Dataset')
plt.show()

1    207
0     96
Name: sex, dtype: int64

# Relationship between sex and heart disease.
# Rows of the cross table are the sex (0 = female, 1 = male); columns are
# the target (whether the sample has heart disease).
sex_target = pd.crosstab(heart_df['sex'], heart_df['target'])
sex_target.plot(kind="bar", figsize=(12, 8), color=['#1CA53B', '#AA1111'])
plt.title('Heart Disease Frequency for Sex')
plt.xlabel('sex(0=female, 1=male)')
plt.xticks(rotation=0)
# The first legend label used to start with a stray quote character.
plt.legend(["Haven't Disease", "Have Disease"])
plt.ylabel('Frequency')
plt.show()

# Heart-disease prediction: distribution of the target label,
# as a bar chart (left) and a pie chart (right).
target_counts = heart_df.target.value_counts()
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
ax = target_counts.plot(kind="bar", ax=axes[0])
ax.set_title("患病分布")
ax.set_xlabel("1：患病，0：未患病")
# value_counts() orders by frequency, so derive each pie label from the
# class value instead of assuming that class 1 comes first.
target_names = {1: '患病', 0: '未患病'}
pie_labels = [target_names.get(value, value) for value in target_counts.index]
target_counts.plot(kind="pie", autopct="%.2f%%", labels=pie_labels, ax=axes[1])
plt.show()

# Distribution of sex versus heart disease: grouped counts on the left,
# sex share among non-diseased (top right) and diseased (bottom right).
sex_names = {0: '女性', 1: '男性'}
ax1 = plt.subplot(121)
ax = sns.countplot(x="sex", hue='target', data=heart_df, ax=ax1)
ax.set_xlabel("0：女性，1：男性")
for position, target_value, title in ((222, 0, "未患病性别比例"),
                                      (224, 1, "患病性别比例")):
    ax2 = plt.subplot(position)
    sex_counts = heart_df[heart_df['target'] == target_value].sex.value_counts()
    # value_counts() orders by frequency; label each wedge from its sex code
    # rather than assuming that males are always the larger group.
    pie_labels = [sex_names.get(value, value) for value in sex_counts.index]
    sex_counts.plot(kind="pie", autopct="%.2f%%", labels=pie_labels, ax=ax2)
    ax2.set_title(title)
plt.show()

# Heart disease by age: per exact age (top) and per age group (bottom).
fig, axes = plt.subplots(2, 1, figsize=(20, 10))
sns.countplot(x="age", hue="target", data=heart_df, ax=axes[0])
# Left-closed bins: [0, 45) young, [45, 60) middle-aged, [60, 100) old.
age_type = pd.cut(heart_df.age, bins=[0, 45, 60, 100], include_lowest=True,
                  right=False, labels=['青年人', '中年人', '老年人'])
age_target_df = pd.concat([age_type, heart_df.target], axis=1)
# Name the target axes explicitly instead of relying on whichever axes
# happens to be the current one.
sns.countplot(x="age", hue='target', data=age_target_df, ax=axes[1])
plt.show()

# Distribution (histogram plus KDE) of the first 14 columns, laid out on a
# 7 x 2 grid of subplots.
fig, axes = plt.subplots(7, 2, figsize=(10, 20))
for position, column in enumerate(heart_df.columns[:14], start=1):
    plt.subplot(7, 2, position)
    sns.distplot(heart_df[column], kde=True)
plt.tight_layout()
plt.show()

# Correlation heat map of the numeric columns.
# By this point heart_df also holds the string column 'age_states'.
# Restricting to numeric dtypes keeps corr() working on pandas versions that
# no longer drop non-numeric columns silently.
numeric_df = heart_df.select_dtypes(include='number')
plt.figure(figsize=(8, 5))
sns.heatmap(numeric_df.corr(), cmap="Blues", annot=True)
plt.show()

四、特征预处理

# Data preprocessing: separate the label, one-hot encode the categorical
# columns, standardize, and hold out 25% of the samples as a test set.
features = heart_df.drop(columns=['target'])
targets = heart_df['target']

# Replace numeric codes by readable category names where the mapping is
# unambiguous. Each column is rebuilt as a whole (codes without a name are
# kept as they are), which avoids writing strings into an int64 column
# through .loc.
named_codes = {
    'sex': {0: 'female', 1: 'male'},
    'fbs': {0: 'false', 1: 'true'},
    'exang': {0: 'false', 1: 'true'},
    # 0: normal, 1: ST-T wave abnormality, 2: possible left ventricular hypertrophy
    'restecg': {0: 'normal', 1: 'ST-T abnormal',
                2: 'Left ventricular hypertrophy'},
}
for column, names in named_codes.items():
    features[column] = features[column].map(
        lambda code, names=names: names.get(code, code))

# cp, slope, thal and ca are one-hot encoded by their raw code. The earlier
# mappings were unreliable:
#   - cp was mapped for codes 1-4, while describe() reports cp in 0-3;
#   - every slope code was renamed to 'true', merging distinct categories;
#   - thal tested '== 3' three times, so only one code was ever renamed;
#   - the astype("object") results for ca / thal were discarded.
# NOTE(review): confirm the code -> name tables against the dataset
# documentation before restoring readable names for these columns.
for column in ('cp', 'slope', 'thal', 'ca'):
    features[column] = features[column].astype('object')

# One-hot encode every object column, then standardize all features.
features = pd.get_dummies(features)
features_temp = StandardScaler().fit_transform(features)

# A fixed seed makes the split, and every metric computed from it,
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    features_temp, targets, test_size=0.25, random_state=42)

五、各种分类方法实现分类预测和算法评估

5.1 K近邻预测

def plotting(estimator, y_test, X=None):
    """Draw the precision-recall curve and the ROC curve of a classifier.

    Args:
        estimator: fitted classifier that provides ``predict_proba``.
        y_test: true binary labels of the evaluation samples.
        X: feature matrix matching ``y_test``. Defaults to the module-level
            ``X_test``, which keeps existing two-argument calls working.

    The figure is only created; the caller is expected to call ``plt.show()``.
    """
    if X is None:
        X = X_test
    # Predicted probability of the positive class (second column).
    positive_scores = estimator.predict_proba(X)[:, 1]

    fig, axes = plt.subplots(1, 2, figsize=(10, 5))

    # Left: precision-recall curve. Recall goes on the x axis and precision
    # on the y axis so that the curve agrees with the axis labels.
    precisions, recalls, _ = precision_recall_curve(y_test, positive_scores)
    axes[0].plot(recalls, precisions)
    axes[0].set_title("平均精准率：%.2f" % average_precision_score(y_test, positive_scores))
    axes[0].set_xlabel("召回率")
    axes[0].set_ylabel("精准率")

    # Right: ROC curve, with the area under it in the title.
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    axes[1].plot(fpr, tpr)
    axes[1].set_title("AUC值：%.2f" % auc(fpr, tpr))
    axes[1].set_xlabel("FPR")
    axes[1].set_ylabel("TPR")

# K-nearest neighbours with k = 5.
knn = KNeighborsClassifier(n_neighbors=5)

# Mean accuracy over a 5-fold cross-validation on the full feature matrix.
scores = cross_val_score(knn, features_temp, targets, cv=5)
print("准确率：", scores.mean())

# Fit on the training split and score the held-out split.
knn.fit(X_train, y_train)
y_predict = knn.predict(X_test)
for label, metric in (("精准率：", precision_score),   # precision
                      ("召回率：", recall_score),      # recall
                      ("F1得分：", f1_score)):         # F1 score
    print(label, metric(y_test, y_predict))

plotting(knn, y_test)
plt.show()

准确率： 0.7985245901639344
精准率： 0.8
召回率： 0.8421052631578947
F1得分： 0.8205128205128205

5.2 决策树算法评估

# Decision tree limited to depth 10, fitted on the training split and
# evaluated through its PR and ROC curves.
tree = DecisionTreeClassifier(max_depth=10).fit(X_train, y_train)
plotting(estimator=tree, y_test=y_test)
plt.show()

5.3 随机森林算法评估

# Random forest of 100 trees, fitted on the training split and evaluated
# through its PR and ROC curves.
rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
plotting(estimator=rf, y_test=y_test)
plt.show()

5.4 逻辑回归算法评估

# Logistic regression with a very tight stopping tolerance, fitted on the
# training split and evaluated through its PR and ROC curves.
logic = LogisticRegression(tol=1e-10).fit(X_train, y_train)
plotting(estimator=logic, y_test=y_test)
plt.show()

5.5 SGD分类算法评估

# Linear classifier trained with SGD on the logistic loss, which is the
# setting that provides the predict_proba used by the evaluation plots.
# scikit-learn renamed loss="log" to loss="log_loss" and later removed the
# old spelling, so try the current name first and fall back to the old one
# on releases that do not know it yet.
try:
    sgd = SGDClassifier(loss="log_loss")
    sgd.fit(X_train, y_train)
except ValueError:
    sgd = SGDClassifier(loss="log")
    sgd.fit(X_train, y_train)
plotting(sgd, y_test)
plt.show()

5.6 特征重要性分析

# 5.6 Heart-disease prediction: feature importance analysis.
# Importances come from the fitted random forest, labelled with the encoded
# feature names and sorted from most to least important.
importances = pd.Series(data=rf.feature_importances_, index=features.columns)
importances = importances.sort_values(ascending=False)
sns.barplot(x=importances.values, y=importances.index, orient='h')
plt.show()

如果觉得《基于Kaggle心脏病数据集的数据分析和分类预测-StatisticalLearning统计学习实验报告》对你有帮助，请点赞、收藏，并留下你的观点哦！

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。