# importing all the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# 3.8 removed the old names, so prefer the new name and fall back on older versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, f1_score
Прочитать набор данных
# Load the diabetes dataset from the CSV file into a DataFrame.
df = pd.read_csv('Diabetes.csv')
# Preview the first five rows as a quick sanity check of the loaded values.
df.head(5)
![](https://i2.wp.com/miro.medium.com/1*IAc1J5R0SjN46Ywj7pdO2Q.png)
# Print the DataFrame summary: index range, column names, non-null counts, dtypes and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 No. of times pregnant 768 non-null int64
1 Plasma glucose concentration 768 non-null int64
2 Diastolic blood pressure (mm Hg) 768 non-null int64
3 Triceps skin fold thickness (mm) 768 non-null int64
4 2-Hour serum insulin (mu U/ml) 768 non-null int64
5 Body mass index (weight in kg/(height in m)^2) 768 non-null float64
6 Diabetes pedigree function 768 non-null float64
7 Age (years) 768 non-null int64
8 Class variable (0 or 1) 768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
# Descriptive statistics for every column (include='all' covers all dtypes).
df.describe(include='all')
![](https://i2.wp.com/miro.medium.com/1*dz23mBaFvxdi1lmD6dtbiQ.png)
# Count how many rows belong to each target class.
df['Class variable (0 or 1)'].value_counts()
0 500
1 268
Name: Class variable (0 or 1), dtype: int64
# Bar chart of the number of records in each target class, drawn on a smaller figure.
sns.set(rc={'figure.figsize': (6.7, 4.27)})
sns.countplot(data=df, x="Class variable (0 or 1)", palette="Set2")
<matplotlib.axes._subplots.AxesSubplot at 0x7f160cb16690>
![](https://i2.wp.com/miro.medium.com/1*v0aQbomH3lUVO0Uwyr67fw.png)
Поиск пропущенных значений
# Flag, per column, whether it contains at least one missing (NaN) value.
df.isna().any()
No. of times pregnant False
Plasma glucose concentration False
Diastolic blood pressure (mm Hg) False
Triceps skin fold thickness (mm) False
2-Hour serum insulin (mu U/ml) False
Body mass index (weight in kg/(height in m)^2) False
Diabetes pedigree function False
Age (years) False
Class variable (0 or 1) False
dtype: bool
# Number of missing (NaN) values in each column.
df.isna().sum()
No. of times pregnant 0
Plasma glucose concentration 0
Diastolic blood pressure (mm Hg) 0
Triceps skin fold thickness (mm) 0
2-Hour serum insulin (mu U/ml) 0
Body mass index (weight in kg/(height in m)^2) 0
Diabetes pedigree function 0
Age (years) 0
Class variable (0 or 1) 0
dtype: int64
# Total number of missing (NaN) cells in the whole DataFrame: the first sum() gives the
# per-column counts, the second adds them up. (any().sum() would only count how many
# columns contain a missing value, not how many values are missing.)
df.isna().sum().sum()
0
# Visual check for missing values using the missingno library:
# the bar chart shows, per column, how many values are present (non-NaN).
msno.bar(df)
# The chart below shows that no column contains NaN values.
<matplotlib.axes._subplots.AxesSubplot at 0x7f160c0bfb50>
![](https://i2.wp.com/miro.medium.com/1*SfS43cz8rwy-AMm8z3Zmkw.png)
# Report, for every column, whether at least one entry is equal to zero.
# NOTE(review): zeros in the measurement columns may be placeholders for missing
# readings rather than real values — confirm against the dataset description.
(df == 0).any()
No. of times pregnant True
Plasma glucose concentration True
Diastolic blood pressure (mm Hg) True
Triceps skin fold thickness (mm) True
2-Hour serum insulin (mu U/ml) True
Body mass index (weight in kg/(height in m)^2) True
Diabetes pedigree function False
Age (years) False
Class variable (0 or 1) True
dtype: bool
Расхождения между средним и медианой по признакам
# Per-feature mean, median and their difference ("discrepancy" = mean - median).
# The target column is excluded because it is the class label, not a feature.
# Built from column-wise mean()/median() rather than apply() with a list-returning
# lambda, whose return type (Series of lists vs. DataFrame) differs across pandas
# versions and makes the chained .tolist() call fragile.
features = df.drop(columns=['Class variable (0 or 1)'])
discrepancy = pd.DataFrame({'Mean': features.mean(), 'Median': features.median()})
discrepancy['Discrepancy'] = discrepancy['Mean'] - discrepancy['Median']
discrepancy
![](https://i2.wp.com/miro.medium.com/1*Wg3x-nNPU2eqsBoRIldULw.png)
Визуализация взаимосвязей между признаками
# Pairwise scatter plots of all columns, with points coloured by the target class.
sns.pairplot(data=df, hue='Class variable (0 or 1)')
<seaborn.axisgrid.PairGrid at 0x7f1607852410>
![](https://i2.wp.com/miro.medium.com/1*R054llR8cVTIt7xfOcRApw.png)
# Enlarge the figure, then draw the pairwise correlation matrix of all columns
# as a heatmap with the coefficient written in every cell.
sns.set(rc={'figure.figsize': (18.7, 15.27)})
sns.heatmap(data=df.corr(), annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7f15ffcf3050>
![](https://i2.wp.com/miro.medium.com/1*IDyarjNTxrALuyKRZ6iJMw.png)
Разделить данные
# Separate the input features (all columns but the last) from the target (last column).
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# Hold out one third of the rows for testing (test_size=1/3, i.e. roughly a 67/33 split).
# stratify=y keeps the ratio of class 0 to class 1 about the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=42, stratify=y
)
# Min-max scale the features. The scaler is fitted on the training rows only and then
# applied to both sets, so the test set's min/max values do not leak into preprocessing.
# The scaled values go into new DataFrames instead of being written back into df, which
# avoids assigning floats into int64 columns and leaves the raw data in df untouched.
minmax_scale = preprocessing.MinMaxScaler().fit(X_train)
X_train = pd.DataFrame(minmax_scale.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(minmax_scale.transform(X_test), index=X_test.index, columns=X_test.columns)
Реализуйте модель SVM
# Baseline: an SVC with default hyper-parameters, fitted on the training split
# (fit() returns the estimator itself) and scored by accuracy on the test split.
model = SVC().fit(X_train, y_train)
model.score(X_test, y_test)
0.74609375
# Hyper-parameter search space for the SVM: regularisation strength C,
# kernel coefficient gamma and kernel type.
param_grid = [
    {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'gamma': [0.01, 0.001],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    }
]
# Exhaustive grid search with 11-fold cross-validation, using all available cores.
grid = GridSearchCV(model, param_grid=param_grid, cv=11, n_jobs=-1)
# Run the search on the training split, then report the accuracy on the test split.
grid.fit(X_train, y_train)
print(f'Score is : {grid.score(X_test, y_test)}')
Score is : 0.73046875
# Show the estimator that the grid search selected as best.
grid.best_estimator_
SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma=0.01, kernel='linear',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
# Show the hyper-parameter combination that the grid search selected as best.
grid.best_params_
{'C': 100, 'gamma': 0.01, 'kernel': 'linear'}
# Predict the class of every test row with the fitted grid-search object.
y_pred=grid.predict(X_test)
Оценка точности
# Confusion matrix of actual vs. predicted test labels, printed after the overall accuracy.
matrix = confusion_matrix(y_test, y_pred)
print(f'Accuracy is : {accuracy_score(y_test, y_pred)}')
print(matrix)
Accuracy is : 0.73046875
[[140 27]
[ 42 47]]
# Print precision, recall, F1 score and support for each class, plus the averaged rows.
print(classification_report(y_test,y_pred))
precision recall f1-score support
0 0.77 0.84 0.80 167
1 0.64 0.53 0.58 89
accuracy 0.73 256
macro avg 0.70 0.68 0.69 256
weighted avg 0.72 0.73 0.72 256
# Cross-tabulate actual vs. predicted labels, with row/column totals (margins=True).
# Stored as `confusion_table` so the imported sklearn `confusion_matrix` function is not
# overwritten — rebinding that name would break any later (or repeated) call to it.
confusion_table = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusion_table)
Predicted 0 1 All
Actual
0 140 27 167
1 42 47 89
All 182 74 256
# Draw the actual-vs-predicted cross-tabulation (without totals) as an annotated
# heatmap with integer cell labels and no colour bar, on a smaller figure.
sns.set(rc={'figure.figsize': (6.7, 4.27)})
sns.heatmap(
    pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted']),
    annot=True,
    fmt="d",
    cbar=False,
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f15ffb71210>
![](https://i2.wp.com/miro.medium.com/1*FDR07QFoeVRuLKsfZEKGAA.png)
# Print a marker line after the last cell.
print('END OF NOTEBOOK')
END OF NOTEBOOK