# importing all the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, f1_score

Прочитать набор данных

# reading the dataset through pandas read csv API
# displaying top 5 records for data values check

# checking the dataframe metadata information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   No. of times pregnant                           768 non-null    int64  
 1   Plasma glucose concentration                    768 non-null    int64  
 2   Diastolic blood pressure (mm Hg)                768 non-null    int64  
 3   Triceps skin fold thickness (mm)                768 non-null    int64  
 4   2-Hour serum insulin (mu U/ml)                  768 non-null    int64  
 5   Body mass index (weight in kg/(height in m)^2)  768 non-null    float64
 6   Diabetes pedigree function                      768 non-null    float64
 7   Age (years)                                     768 non-null    int64  
 8   Class variable (0 or 1)                         768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
# checking descriptive statistics

# Number of records in each target class (0 and 1)
df.loc[:, 'Class variable (0 or 1)'].value_counts()
0    500
1    268
Name: Class variable (0 or 1), dtype: int64
# Bar chart of how many records belong to each target class
sns.countplot(
    data=df,
    x="Class variable (0 or 1)",
    palette="Set2",
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f160cb16690>

поиск пропущенных значений

No. of times pregnant                             False
Plasma glucose concentration                      False
Diastolic blood pressure (mm Hg)                  False
Triceps skin fold thickness (mm)                  False
2-Hour serum insulin (mu U/ml)                    False
Body mass index (weight in kg/(height in m)^2)    False
Diabetes pedigree function                        False
Age (years)                                       False
Class variable (0 or 1)                           False
dtype: bool
# printing the null values count for all the columns
No. of times pregnant                             0
Plasma glucose concentration                      0
Diastolic blood pressure (mm Hg)                  0
Triceps skin fold thickness (mm)                  0
2-Hour serum insulin (mu U/ml)                    0
Body mass index (weight in kg/(height in m)^2)    0
Diabetes pedigree function                        0
Age (years)                                       0
Class variable (0 or 1)                           0
dtype: int64
# printing the total null values count
# for checking null values
# Visualize missing values (NaN) values using Missingno Library
# the graph below shows that there are no NaN values
<matplotlib.axes._subplots.AxesSubplot at 0x7f160c0bfb50>

# checking whether each column contains any zero values
No. of times pregnant                              True
Plasma glucose concentration                       True
Diastolic blood pressure (mm Hg)                   True
Triceps skin fold thickness (mm)                   True
2-Hour serum insulin (mu U/ml)                     True
Body mass index (weight in kg/(height in m)^2)     True
Diabetes pedigree function                        False
Age (years)                                       False
Class variable (0 or 1)                            True
dtype: bool

Сообщить о расхождениях между средним и медианой для признаков

# the discrepancy is the difference between the mean and the median of a feature
pd.DataFrame(df.drop(['Class variable (0 or 1)'], axis=1)
             .apply(lambda x : [x.mean(), x.median(), x.mean()-x.median()]).tolist(),
            index=df.drop(['Class variable (0 or 1)'],axis=1).columns,

Визуализируйте взаимосвязи признаков

# Pairwise scatter/distribution plots of all columns, coloured by the target class
sns.pairplot(
    data=df,
    hue='Class variable (0 or 1)',
)
<seaborn.axisgrid.PairGrid at 0x7f1607852410>

# calculating the correlation of all the features and displaying it through heatmap
<matplotlib.axes._subplots.AxesSubplot at 0x7f15ffcf3050>

Разделить данные

# Scale every feature column (all columns except the last one, which is the
# target) with min-max scaling.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so the test rows influence the scaling bounds. Fitting on the training
# split only would avoid that leakage, but it would also change the scores
# recorded further down — confirm before changing.
feature_columns = list(df.columns[:-1])
minmax_scale = preprocessing.MinMaxScaler().fit(df[feature_columns])
# Replace the columns by label instead of writing through .iloc: the scaled
# values are floats, and setting floats in place into the integer columns is
# deprecated in recent pandas releases.
df[feature_columns] = minmax_scale.transform(df[feature_columns])
# Separate the input columns (features) from the output column (target).
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# Hold out one third of the rows for testing (test_size=1/3) and train on the
# remaining two thirds. stratify=y keeps the ratio of class 0 to class 1
# roughly the same in both the train and the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=42, stratify=y
)

Реализуйте модель SVM

# Baseline: an SVC with default hyper-parameters, fitted on the training split
model = SVC().fit(X_train, y_train)
# Candidate hyper-parameters for the grid search: every combination of
# C, gamma and kernel listed here is evaluated.
param_grid = [
    {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'gamma': [0.01, 0.001],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    }
]
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=11, n_jobs=-1)
# Run the 11-fold cross-validated search on the training split, then report
# the score of the fitted search object on the held-out test split.
grid.fit(X_train, y_train)
print(f'Score is  : {grid.score(X_test, y_test)!s}')
Score is  : 0.73046875
SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
# getting the best params from the grid search
{'C': 100, 'gamma': 0.01, 'kernel': 'linear'}
# predicting test datasets based on model trained on train datasets

Измерение точности

# Compare the test-set predictions with the true labels: print the overall
# accuracy first, then the raw confusion matrix.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(f'Accuracy is : {accuracy_score(y_true=y_test, y_pred=y_pred)!s}')
print(matrix)
Accuracy is : 0.73046875
[[140  27]
 [ 42  47]]
# Displaying summary of the precision, recall, F1 score for each class
precision    recall  f1-score   support

           0       0.77      0.84      0.80       167
           1       0.64      0.53      0.58        89

    accuracy                           0.73       256
   macro avg       0.70      0.68      0.69       256
weighted avg       0.72      0.73      0.72       256
# Cross-tabulate actual against predicted labels; margins=True adds the
# row/column totals labelled "All".
# The table is stored under its own name so that it does not overwrite the
# confusion_matrix function imported from sklearn.metrics (which would make
# any later call to that function fail).
confusion_table = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusion_table)
Predicted    0   1  All
0          140  27  167
1           42  47   89
All        182  74  256
# Heat map of the actual-vs-predicted cross-tabulation, annotated with the
# integer cell counts and drawn without a colour bar
sns.heatmap(
    data=pd.crosstab(index=y_test, columns=y_pred, rownames=['Actual'], colnames=['Predicted']),
    annot=True,
    fmt="d",
    cbar=False,
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f15ffb71210>