Анализ данных диабета

# importing all the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and later
# releases dropped the old name, so use whichever name this installation offers.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, f1_score

Прочитать набор данных

# Load the diabetes records from the CSV file into a DataFrame.
df = pd.read_csv('Diabetes.csv')
# Preview the first five rows as a quick sanity check of the values.
df.head(n=5)

# Show the DataFrame metadata: index range, column names, non-null counts,
# dtypes and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   No. of times pregnant                           768 non-null    int64  
 1   Plasma glucose concentration                    768 non-null    int64  
 2   Diastolic blood pressure (mm Hg)                768 non-null    int64  
 3   Triceps skin fold thickness (mm)                768 non-null    int64  
 4   2-Hour serum insulin (mu U/ml)                  768 non-null    int64  
 5   Body mass index (weight in kg/(height in m)^2)  768 non-null    float64
 6   Diabetes pedigree function                      768 non-null    float64
 7   Age (years)                                     768 non-null    int64  
 8   Class variable (0 or 1)                         768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
# Descriptive statistics for every column; include='all' makes describe()
# report on all columns rather than only the numeric ones.
df.describe(include='all')

# Count how many rows belong to each value of the target column.
df['Class variable (0 or 1)'].value_counts()
0    500
1    268
Name: Class variable (0 or 1), dtype: int64
# Plot how many records fall into each target class.
sns.set(rc={'figure.figsize': (6.7, 4.27)})  # smaller canvas for this chart
sns.countplot(data=df, x='Class variable (0 or 1)', palette='Set2')
<matplotlib.axes._subplots.AxesSubplot at 0x7f160cb16690>

Поиск пропущенных значений

# Flag, per column, whether at least one value is missing (NaN).
df.isna().any()
No. of times pregnant                             False
Plasma glucose concentration                      False
Diastolic blood pressure (mm Hg)                  False
Triceps skin fold thickness (mm)                  False
2-Hour serum insulin (mu U/ml)                    False
Body mass index (weight in kg/(height in m)^2)    False
Diabetes pedigree function                        False
Age (years)                                       False
Class variable (0 or 1)                           False
dtype: bool
# Count the missing (NaN) values in each column.
df.isna().sum()
No. of times pregnant                             0
Plasma glucose concentration                      0
Diastolic blood pressure (mm Hg)                  0
Triceps skin fold thickness (mm)                  0
2-Hour serum insulin (mu U/ml)                    0
Body mass index (weight in kg/(height in m)^2)    0
Diabetes pedigree function                        0
Age (years)                                       0
Class variable (0 or 1)                           0
dtype: int64
# Total number of missing cells in the whole DataFrame: the first sum() counts
# the NaNs in each column and the second adds those per-column counts together.
# (Chaining .any() before the sums would only count how many columns contain
# at least one NaN, not how many values are missing.)
df.isna().sum().sum()
0
# Visual check for missing values using the missingno library:
# msno.bar draws a bar chart of the DataFrame's columns.
msno.bar(df) 
# The chart shows no NaN values, in line with the isna() counts computed earlier.
<matplotlib.axes._subplots.AxesSubplot at 0x7f160c0bfb50>

# Flag, per column, whether it contains at least one value equal to zero.
# NOTE(review): zeros in the measurement columns may stand for missing
# readings rather than real values - confirm against the dataset description.
df.eq(0).any()
No. of times pregnant                              True
Plasma glucose concentration                       True
Diastolic blood pressure (mm Hg)                   True
Triceps skin fold thickness (mm)                   True
2-Hour serum insulin (mu U/ml)                     True
Body mass index (weight in kg/(height in m)^2)     True
Diabetes pedigree function                        False
Age (years)                                       False
Class variable (0 or 1)                            True
dtype: bool

Расхождения между средним и медианой признаков

# Report, for every feature (all columns except the target), its mean, its
# median and their difference ("discrepancy" = mean - median).
# The statistics come from vectorised DataFrame reductions instead of `apply`
# with a list-returning lambda followed by `.tolist()`: how `apply` wraps list
# results depends on the pandas version (when it yields a DataFrame there is
# no `.tolist()` to call), and the feature frame is now built only once.
features = df.drop(columns=['Class variable (0 or 1)'])
discrepancy_report = pd.DataFrame({'Mean': features.mean(), 'Median': features.median()})
discrepancy_report['Discrepancy'] = discrepancy_report['Mean'] - discrepancy_report['Median']
discrepancy_report

Визуализация взаимосвязей признаков

# Pairwise plots of all columns, with points coloured by the target class (hue).
sns.pairplot(df, hue='Class variable (0 or 1)')
<seaborn.axisgrid.PairGrid at 0x7f1607852410>

# Correlation between every pair of columns, drawn as a heatmap with the
# coefficient written inside each cell (annot=True).
sns.set(rc={'figure.figsize': (18.7, 15.27)})  # large canvas so the annotations stay readable
sns.heatmap(data=df.corr(), annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7f15ffcf3050>

Разделить данные

# Separate the input features (every column but the last) from the target
# (the last column).
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# Hold out one third of the rows for testing (test_size=1/3, i.e. roughly a
# 67% / 33% split). stratify=y keeps the ratio of class 0 to class 1 about the
# same in the train and test sets; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=42, stratify=y
)
# Min-max scale the features. The scaler is fitted on the training rows only
# and then applied to both splits, so no information about the test set leaks
# into preprocessing. The scaled values are wrapped in new DataFrames (keeping
# column names and row index) instead of being written back into `df`, which
# avoids assigning floats into integer-typed columns; `df` stays unscaled.
minmax_scale = preprocessing.MinMaxScaler().fit(X_train)
X_train = pd.DataFrame(
    minmax_scale.transform(X_train), index=X_train.index, columns=X_train.columns
)
X_test = pd.DataFrame(
    minmax_scale.transform(X_test), index=X_test.index, columns=X_test.columns
)

Реализуйте модель SVM

# Baseline: an SVC with default hyper-parameters trained on the training split.
# fit() returns the estimator itself, so construction and training are chained.
model = SVC().fit(X_train, y_train)
# Mean accuracy of the baseline on the held-out test split.
model.score(X_test, y_test)
0.74609375
# Hyper-parameter search space for the SVC: every combination of C, gamma and
# kernel listed here is evaluated.
param_grid = [
    {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'gamma': [0.01, 0.001],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    }
]
# Exhaustive search with 11-fold cross-validation, using all CPU cores (n_jobs=-1).
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=11, n_jobs=-1)
# Run the search on the training split only; the test split stays unseen.
grid.fit(X_train, y_train)
# Score of the best configuration found, measured on the test split.
print(f'Score is  : {grid.score(X_test, y_test)}')
Score is  : 0.73046875
# Show the estimator configured with the best parameter combination found by the search.
grid.best_estimator_
SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
# Show just the winning parameter combination from the grid search.
grid.best_params_
{'C': 100, 'gamma': 0.01, 'kernel': 'linear'}
# Predict the labels of the test split with the tuned model; the result is
# compared against y_test in the evaluation cells.
y_pred=grid.predict(X_test)

Оценка точности модели

# Evaluate the test-set predictions: confusion matrix (rows are the actual
# classes, columns the predicted ones) and the overall accuracy.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(f'Accuracy is : {accuracy_score(y_test, y_pred)}')
print(matrix)
Accuracy is : 0.73046875
[[140  27]
 [ 42  47]]
# Print per-class precision, recall, F1 score and support, plus the overall
# accuracy and the macro / weighted averages.
print(classification_report(y_test,y_pred))
precision    recall  f1-score   support

           0       0.77      0.84      0.80       167
           1       0.64      0.53      0.58        89

    accuracy                           0.73       256
   macro avg       0.70      0.68      0.69       256
weighted avg       0.72      0.73      0.72       256
# Cross-tabulate actual vs. predicted labels; margins=True adds the "All"
# row and column holding the totals.
# The table gets its own name: binding it to `confusion_matrix` would shadow
# the sklearn function of that name imported at the top of the file and break
# any later (or re-run) call to confusion_matrix(...).
confusion_table = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusion_table)
Predicted    0   1  All
Actual                 
0          140  27  167
1           42  47   89
All        182  74  256
# Draw the actual-vs-predicted counts (without the margin totals) as a heatmap,
# with integer annotations in each cell and no colour bar.
sns.set(rc={'figure.figsize': (6.7, 4.27)})
sns.heatmap(
    pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted']),
    annot=True,
    fmt='d',
    cbar=False,
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f15ffb71210>

print('END OF NOTEBOOK')
END OF NOTEBOOK

Анализ данных диабета

Прочитать набор данных

Поиск пропущенных значений

Расхождения между средним и медианой признаков

Визуализация взаимосвязей признаков

Разделить данные

Реализуйте модель SVM

Оценка точности модели

Вопросы по теме