Here is the approach I took for this project:

1. Data Processing
2. Exploratory Data Analysis
3. Methods
4. Results & Discussion
Data Processing

# Standard imports for data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# The file is semicolon-delimited, so sep must be set explicitly
df_diabetes = pd.read_csv("diabetes_data.csv", sep=";")
df_diabetes.head()

print(df_diabetes.shape)
print(df_diabetes["class"].unique())

# Encode gender as a numeric feature (Male = 1, Female = 0)
df_diabetes["gender"] = df_diabetes["gender"].map({"Male": 1, "Female": 0})
df_diabetes.head()
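Because map returns NaN for any value outside {"Male", "Female"}, a one-line check that the encoding introduced no missing values is cheap insurance (an optional step, not in the original notebook):

print(df_diabetes["gender"].isna().sum())  # expect 0 if only Male/Female occur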
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df_diabetes[df_diabetes.columns[:-1]]
Y = df_diabetes[df_diabetes.columns[-1]]

# Split first, then standardize: fitting the scaler on the training
# split only keeps test-set information from leaking into the scaling
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.4,
                                                    random_state=101)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
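A tidier pattern for the same idea (optional, not part of the original notebook) is to bundle the scaler with each model in a scikit-learn Pipeline; the pipeline refits the scaler on whatever data it is trained on, so cross-validation utilities handle the scaling per fold automatically:

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Scaling and model fitting happen together in one estimator
pipe = make_pipeline(StandardScaler(), LogisticRegression())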
sns.countplot(x="gender", data=df_diabetes)
sns.countplot(x="class", hue="gender", data=df_diabetes)
sns.histplot(df_diabetes["age"], kde=False, bins=10)
Methods

# Model 1: logistic regression as a linear baseline
from sklearn.linear_model import LogisticRegression

logmodel = LogisticRegression()
logmodel.fit(X_train, Y_train)
predictions_log = logmodel.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix, f1_score

print(classification_report(Y_test, predictions_log))
print(confusion_matrix(Y_test, predictions_log))
f1_log = f1_score(Y_test, predictions_log)
print(f1_log)

# Fitted coefficients (displayed inline in the notebook)
logmodel.coef_
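The raw coefficient array is hard to read on its own; pairing it with the feature names (a small optional step, not in the original notebook) shows each feature's direction of effect, and because the inputs were standardized the magnitudes are roughly comparable:

# coef_ has shape (1, n_features) for a binary problem
coef_by_feature = pd.Series(logmodel.coef_[0], index=X.columns)
print(coef_by_feature.sort_values())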
# Model 2: k-nearest neighbours, starting with a single neighbour
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, Y_train)
predictions_knn1 = knn.predict(X_test)

print(classification_report(Y_test, predictions_knn1))
print(confusion_matrix(Y_test, predictions_knn1))
f1_knn1 = f1_score(Y_test, predictions_knn1)
print(f1_knn1)
# Sweep k from 1 to 39 and record the test-set error rate for each
error_rate = []
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, Y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != Y_test))

plt.figure(figsize=(10, 6))
plt.plot(range(1, 40), error_rate, color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
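One caveat with this sweep is that every k is scored against the same test set, so the chosen k is tuned to that particular split. A minimal alternative sketch (optional; it assumes the variables above and uses scikit-learn's stock cross_val_score helper) picks k from the training data alone:

from sklearn.model_selection import cross_val_score

# Mean 5-fold accuracy on the training split for each candidate k
cv_scores = [cross_val_score(KNeighborsClassifier(n_neighbors=k),
                             X_train, Y_train, cv=5).mean()
             for k in range(1, 40)]
best_k = int(np.argmax(cv_scores)) + 1  # k values start at 1
print(best_k)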
# Refit with the k chosen from the error-rate plot
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
predictions_knn2 = knn.predict(X_test)

print(classification_report(Y_test, predictions_knn2))
print(confusion_matrix(Y_test, predictions_knn2))
f1_knn2 = f1_score(Y_test, predictions_knn2)
print(f1_knn2)
# Model 3: a single decision tree
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier()
dtree.fit(X_train, Y_train)
predictions_dt = dtree.predict(X_test)

print(classification_report(Y_test, predictions_dt))
print(confusion_matrix(Y_test, predictions_dt))
f1_dt = f1_score(Y_test, predictions_dt)
print(f1_dt)
# Model 4: a random forest of 100 trees
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, Y_train)
predictions_rfc = rfc.predict(X_test)

print(classification_report(Y_test, predictions_rfc))
print(confusion_matrix(Y_test, predictions_rfc))
f1_rfc = f1_score(Y_test, predictions_rfc)
print(f1_rfc)
data = {"Method":["Logistic Regression", "KNN1", "KNN2", "Decision Trees", "Random Forest"],
"f1-score":[f1_log, f1_knn1, f1_knn2, f1_dt, f1_rfc]}
results = pd.DataFrame(data)
print(results)
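Sorting the same table makes the ranking immediate:

print(results.sort_values("f1-score", ascending=False))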
# Random-forest feature importances, largest first
plt.figure(figsize=(12, 6))
feature_imp = pd.Series(rfc.feature_importances_, index=X.columns).sort_values(ascending=False)
feature_plot = sns.barplot(x=feature_imp, y=feature_imp.index)
feature_plot.set_title("Feature Importance Plot")
feature_plot.set_xlabel("Score")
feature_plot.set_ylabel("Feature")