The following analysis uses data from a hospital that tracks the prevalence of diabetes in patients, along with measurements of suspected contributing factors. My goal is to help patients prevent diabetes. I am going to build a logistic regression model so I can predict when a patient is approaching the diabetic range.
Type 2 diabetes is largely a preventable disease, since most of the contributing factors can be controlled. The predictor variables in our dataset are: Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, and Age.
I will create multiple models and visualizations to find the best predictors of diabetes. Before I do this, I need to prepare the data.
To clean the data I will use pandas, NumPy, and base Python. I will use scikit-learn to create the models and matplotlib/seaborn to visualize.
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
Let's take a look at our data.
# load dataset
warnings.filterwarnings('ignore')
diab_df = pd.read_csv("Hospital.csv")
diab_df.head()
|   | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
It is clear that Outcome will be our target variable, and the remaining columns will be our features. Next, I will separate the features from the target, as well as splitting out testing and training data.
I will start with a single predictor. The Diabetes Pedigree Function is supposed to be a good predictor for diabetes, so I will start with it.
diab_cols = ['DiabetesPedigreeFunction']
X = diab_df[diab_cols]# Features
y = diab_df.Outcome # Target variable
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)
# instantiate the model
logreg = LogisticRegression(solver='liblinear')
# fit the model with data
logreg.fit(X_train,y_train)
# predicting
y_pred=logreg.predict(X_test)
y_pred
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix
array([[124,   6],
       [ 55,   7]], dtype=int64)
Looking at the confusion matrix, this model identifies negative cases very well, with over 95% of non-diabetic patients classified correctly. However, the number of false negatives is alarmingly high: most of the patients who actually have diabetes were missed, which would not be acceptable. I am going to need to use a different variable, or several together.
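To put numbers on that claim, here is a small optional sketch that unpacks the cnf_matrix computed above into its four cells and reports sensitivity and specificity; the variable names are only illustrative.
# Unpack the confusion matrix: rows are actual (0, 1), columns are predicted (0, 1)
tn, fp, fn, tp = cnf_matrix.ravel()
sensitivity = tp / (tp + fn)  # fraction of actual diabetics the model catches
specificity = tn / (tn + fp)  # fraction of non-diabetics correctly cleared
print("Sensitivity:", sensitivity)  # roughly 0.11 here: most positive cases are missed
print("Specificity:", specificity)  # roughly 0.95 here: negatives are identified well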
#split dataset in features and target variable
diab_cols = ['BMI', 'Age','DiabetesPedigreeFunction']
X = diab_df[diab_cols]# Features
y = diab_df.Outcome # Target variable
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)
# instantiate the model
logreg = LogisticRegression(solver='liblinear')
# fit the model with data
logreg.fit(X_train,y_train)
# predicting
y_pred=logreg.predict(X_test)
y_pred
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix
array([[121,   9],
       [ 45,  17]], dtype=int64)
Adding BMI and Age reduced the false negatives (from 55 to 45), but the miss rate is still poor. Next, I'm going to try using all of the feature variables and analyze the results.
#split dataset in features and target variable
diab_cols = ['Pregnancies', 'Insulin', 'BMI', 'Age','Glucose','BloodPressure','DiabetesPedigreeFunction']
X = diab_df[diab_cols]# Features
y = diab_df.Outcome # Target variable
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)
# instantiate the model
logreg = LogisticRegression(solver='liblinear')
# fit the model with data
logreg.fit(X_train,y_train)
# predicting
y_pred=logreg.predict(X_test)
y_pred
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix
array([[119,  11],
       [ 26,  36]], dtype=int64)
These are our best results by far: the model is very good at identifying negatives and decent at identifying positives. I'm going to analyze this one further since it is yielding the best results.
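As an optional aside, the three experiments above can be condensed into a single loop. The sketch below is not part of the original run; it refits a model for each feature set with the same split settings and prints its recall.
# Sketch: compare recall across the feature sets tried above (illustrative only)
feature_sets = {
    'DPF only': ['DiabetesPedigreeFunction'],
    'BMI + Age + DPF': ['BMI', 'Age', 'DiabetesPedigreeFunction'],
    'All features': ['Pregnancies', 'Insulin', 'BMI', 'Age', 'Glucose',
                     'BloodPressure', 'DiabetesPedigreeFunction'],
}
for name, cols in feature_sets.items():
    X_tr, X_te, y_tr, y_te = train_test_split(
        diab_df[cols], diab_df.Outcome, test_size=0.25, random_state=0)
    model = LogisticRegression(solver='liblinear').fit(X_tr, y_tr)
    print(name, "recall:", metrics.recall_score(y_te, model.predict(X_te)))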
class_names=[0,1] # name of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
Accuracy: 0.8072916666666666
Precision: 0.7659574468085106
Recall: 0.5806451612903226
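For a per-class breakdown of precision, recall, and F1 on the same predictions, scikit-learn's classification_report can also be printed; this is an optional extra, not output shown in the original run.
# Optional: per-class precision/recall/F1 summary for the same test predictions
print(metrics.classification_report(y_test, y_pred))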
The model is performing with about 80% accuracy. We can now use this model for patients who do not have diabetes yet but may be on the verge. By inputting their statistics into the model, we have a much better chance of intervening before the disease develops. The last thing I want to do is take a look at how this would work. I am going to input three healthy patients and three unhealthy patients who are likely to get diabetes.
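# note: predict() interprets the values in the same column order as diab_cols above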
healthy1=logreg.predict([[1,148,11,23,199,76,0.6]])
healthy1
array([0], dtype=int64)
healthy2=logreg.predict([[2,132,15,22,201,72,0.4]])
healthy2
array([0], dtype=int64)
healthy3=logreg.predict([[0,110,14,25,180,71,0.3]])
healthy3
array([0], dtype=int64)
unhealthy1=logreg.predict([[6,10,35.6,95,100,86,2]])
unhealthy1
array([1], dtype=int64)
unhealthy2=logreg.predict([[5,20,37,36,90,92,3.2]])
unhealthy2
array([1], dtype=int64)
unhealthy3=logreg.predict([[4,22,44,48,110,91,1.55]])
unhealthy3
array([1], dtype=int64)
These six patients were all correctly classified by the model. We can see that the statistics of the healthy and unhealthy patients vary greatly.
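Since the goal is prevention rather than a hard diagnosis, it is worth noting that the fitted model can also return a probability instead of a 0/1 label. Here is a brief sketch, not part of the original analysis, using predict_proba with the last unhealthy patient's values from above:
# predict_proba returns [P(no diabetes), P(diabetes)]; column 1 is the risk estimate
risk = logreg.predict_proba([[4, 22, 44, 48, 110, 91, 1.55]])[:, 1]
print("Estimated diabetes probability:", risk[0])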