import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import figure
from matplotlib import pyplot
from PIL import Image
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')
Attributes = pd.read_csv("heart.csv")
Attributes
age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 63 | 1 | 3 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | 0 | 1 | 1 |
1 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 | 1 |
2 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 | 1 |
3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
298 | 57 | 0 | 0 | 140 | 241 | 0 | 1 | 123 | 1 | 0.2 | 1 | 0 | 3 | 0 |
299 | 45 | 1 | 3 | 110 | 264 | 0 | 1 | 132 | 0 | 1.2 | 1 | 0 | 3 | 0 |
300 | 68 | 1 | 0 | 144 | 193 | 1 | 1 | 141 | 0 | 3.4 | 1 | 2 | 3 | 0 |
301 | 57 | 1 | 0 | 130 | 131 | 0 | 1 | 115 | 1 | 1.2 | 1 | 1 | 3 | 0 |
302 | 57 | 0 | 1 | 130 | 236 | 0 | 0 | 174 | 0 | 0.0 | 1 | 1 | 2 | 0 |
303 rows × 14 columns
Attributes = Attributes.rename(columns={"cp": "Chest_Pain", "trestbps": "rest_blood_pres", "fbs" : "fast_blood_pres",
"thalach" : "max_HR", "exang" : "exercise_angina", "ca" : "vessel_flouro"})
Attributes["id"] = 1
counter = 1
for i, row in Attributes.iterrows():
Attributes["id"][i] = counter
counter = counter + 1
col_name = "id"
first_col = Attributes.pop(col_name)
Attributes.insert(0, col_name, first_col)
Attributes
id | age | sex | Chest_Pain | rest_blood_pres | chol | fast_blood_pres | restecg | max_HR | exercise_angina | oldpeak | slope | vessel_flouro | thal | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 63 | 1 | 3 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | 0 | 1 | 1 |
1 | 2 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 | 1 |
2 | 3 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 | 1 |
3 | 4 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
4 | 5 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
298 | 299 | 57 | 0 | 0 | 140 | 241 | 0 | 1 | 123 | 1 | 0.2 | 1 | 0 | 3 | 0 |
299 | 300 | 45 | 1 | 3 | 110 | 264 | 0 | 1 | 132 | 0 | 1.2 | 1 | 0 | 3 | 0 |
300 | 301 | 68 | 1 | 0 | 144 | 193 | 1 | 1 | 141 | 0 | 3.4 | 1 | 2 | 3 | 0 |
301 | 302 | 57 | 1 | 0 | 130 | 131 | 0 | 1 | 115 | 1 | 1.2 | 1 | 1 | 3 | 0 |
302 | 303 | 57 | 0 | 1 | 130 | 236 | 0 | 0 | 174 | 0 | 0.0 | 1 | 1 | 2 | 0 |
303 rows × 15 columns
in20 = Attributes[Attributes["age"] < 30]
in30 = Attributes[Attributes["age"] > 29]
in30 = in30[in30["age"] < 40]
in40 = Attributes[Attributes["age"] > 39]
in40 =in40[in40["age"] < 50]
in50 = Attributes[Attributes["age"] > 49]
in50 =in50[in50["age"] < 60]
in60 = Attributes[Attributes["age"] > 59]
in60 =in60[in60["age"] < 70]
in70 = Attributes[Attributes["age"] > 69]
in70 =in70[in70["age"] < 80]
each_age = [len(in20.index), len(in30.index), len(in40.index),len(in50.index),
len(in60.index), len(in70.index)]
age_range = ["20's", "30's","40's","50's", "60's", "70's"]
sns.barplot(y=each_age, x=age_range)
in60
id | age | sex | Chest_Pain | rest_blood_pres | chol | fast_blood_pres | restecg | max_HR | exercise_angina | oldpeak | slope | vessel_flouro | thal | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 63 | 1 | 3 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | 0 | 1 | 1 |
13 | 14 | 64 | 1 | 3 | 110 | 211 | 0 | 0 | 144 | 1 | 1.8 | 1 | 0 | 2 | 1 |
17 | 18 | 66 | 0 | 3 | 150 | 226 | 0 | 1 | 114 | 0 | 2.6 | 0 | 0 | 2 | 1 |
19 | 20 | 69 | 0 | 3 | 140 | 239 | 0 | 1 | 151 | 0 | 1.8 | 2 | 2 | 2 | 1 |
23 | 24 | 61 | 1 | 2 | 150 | 243 | 1 | 1 | 137 | 1 | 1.0 | 1 | 0 | 2 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
290 | 291 | 61 | 1 | 0 | 148 | 203 | 0 | 1 | 161 | 0 | 0.0 | 2 | 1 | 3 | 0 |
293 | 294 | 67 | 1 | 2 | 152 | 212 | 0 | 0 | 150 | 0 | 0.8 | 1 | 0 | 3 | 0 |
295 | 296 | 63 | 1 | 0 | 140 | 187 | 0 | 0 | 144 | 1 | 4.0 | 2 | 2 | 3 | 0 |
296 | 297 | 63 | 0 | 0 | 124 | 197 | 0 | 1 | 136 | 1 | 0.0 | 1 | 0 | 2 | 0 |
300 | 301 | 68 | 1 | 0 | 144 | 193 | 1 | 1 | 141 | 0 | 3.4 | 1 | 2 | 3 | 0 |
80 rows × 15 columns
sns.lineplot(x='age', y='target', data=Attributes)
pyplot.ylabel('Heart Disease (1 = Not present, 0 = Present)')
pyplot.xlabel('Age')
pyplot.title("Age Correlation W/ Heart Disease ")
Text(0.5, 1.0, 'Age Correlation W/ Heart Disease ')
sns.lineplot(x='Chest_Pain', y='target', data=Attributes)
pyplot.ylabel('Heart Disease (1 = Not present, 0 = Present)')
pyplot.xlabel('Chest Pain (0 = No Pain, 3 = Bad Pain)')
pyplot.title("Chest pain Correlation W/ Heart Disease ")
Text(0.5, 1.0, 'Chest pain Correlation W/ Heart Disease ')
sns.lineplot(x='rest_blood_pres', y='target', data=Attributes)
pyplot.ylabel('Heart Disease (1 = Not present, 0 = Present)')
pyplot.xlabel('Resting Blood Pressure')
pyplot.title("Resting Blood Pressure Correlation W/ Heart Disease ")
Text(0.5, 1.0, 'Resting Blood Pressure Correlation W/ Heart Disease ')
sns.lineplot(x='max_HR', y='target', data=Attributes)
pyplot.ylabel('Heart Disease (1 = Not present, 0 = Present)')
pyplot.xlabel('Max Heart Rate')
pyplot.title("Max Heart Rate Correlation W/ Heart Disease ")
Text(0.5, 1.0, 'Max Heart Rate Correlation W/ Heart Disease ')
sns.regplot(x='chol', y='target', data=Attributes)
pyplot.ylabel('Heart Disease (1 = Not present, 0 = Present)')
pyplot.xlabel('Cholesteral Level')
pyplot.title("Cholesterol Level Correlation W/ Heart Disease ")
Text(0.5, 1.0, 'Cholesterol Level Correlation W/ Heart Disease ')
age = 0
counter = 0
for i, row in Attributes.iterrows():
if (Attributes["target"][i] == 0):
age += Attributes["age"][i]
counter += 1
print("The average age of people with heart disease in this dataset is " + str(age/counter))
The average age of people with heart disease in this dataset is 56.60144927536232
figure(figsize=(10,10))
sns.regplot(x='chol', y='rest_blood_pres', data=Attributes)
pyplot.xlabel('Cholesteral Level')
pyplot.ylabel('Resting Blood Pressure')
pyplot.title("Cholesteral Levels vs. Resting Blood Pressure")
Text(0.5, 1.0, 'Cholesteral Levels vs. Resting Blood Pressure')
figure(figsize=(10,10))
sns.regplot(x='oldpeak', y='rest_blood_pres', data=Attributes)
pyplot.ylabel('Cholesteral Level')
pyplot.xlabel('ST Depression Induced by Exercise')
pyplot.title("ST Depression Induced by Exercise vs. Resting Blood Pressure")
Text(0.5, 1.0, 'ST Depression Induced by Exercise vs. Resting Blood Pressure')
try:
img = Image.open("1200px-SinusRhythmLabels.svg.png")
except IOError:
pass
img
knn = KNeighborsClassifier()
#Labels and Attributes
X = Attributes.drop('target',axis=1).values
y = Attributes['target'].values
#Split into training and testing sets
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4,random_state=42, stratify=y)
neighbors = np.arange(1,9)
train_accuracy =np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))
for i,k in enumerate(neighbors):
#Setup a knn classifier with k neighbors
knn = KNeighborsClassifier(n_neighbors=k)
#Fit model
knn.fit(X_train, y_train)
#Find training set accuraccy
train_accuracy[i] = knn.score(X_train, y_train)
#Find testing set accuraccy
test_accuracy[i] = knn.score(X_test, y_test)
#creat plot to find optimal k
plt.title('k-NN Varying number of neighbors')
plt.plot(neighbors, test_accuracy, label='Testing Accuracy')
plt.plot(neighbors, train_accuracy, label='Training accuracy')
plt.legend()
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')
Text(0, 0.5, 'Accuracy')
knn = KNeighborsClassifier(n_neighbors=3)
best_fit = knn.fit(X_train,y_train)
ten_fold = cross_val_score(best_fit, X,y,cv=10)
print("Optimal k parameter is 3")
print("Avg 10 fold CV Score is " + str(ten_fold.mean()))
print("STD Error is " + str(ten_fold.std()/ np.sqrt(10)))
Optimal k parameter is 3 Avg 10 fold CV Score is 0.9475268817204301 STD Error is 0.033192487730841345
estimators = np.arange(1,9)
train_accuracy =np.empty(len(estimators))
test_accuracy = np.empty(len(estimators))
for i,k in enumerate(estimators):
#Setup a f classifier k estimators
rf = RandomForestClassifier(n_estimators=k)
#Fit model
rf.fit(X_train, y_train)
#Compute accuracy of raining set
train_accuracy[i] = rf.score(X_train, y_train)
#Compute accuracy of test set
test_accuracy[i] = rf.score(X_test, y_test)
#Create plot to find optimal k value
plt.title('Random Forests of Varying Estmators')
plt.plot(neighbors, test_accuracy, label='Testing Accuracy')
plt.plot(neighbors, train_accuracy, label='Training accuracy')
plt.legend()
plt.xlabel('Number of Estimators')
plt.ylabel('Accuracy')
plt.show()
rf = RandomForestClassifier(n_estimators=7)
best_fit = rf.fit(X_train,y_train)
ten_fold = cross_val_score(best_fit, X,y,cv=10)
print("Optimal k parameter is 7")
print("Avg 10 fold CV Score is " + str(ten_fold.mean()))
print("STD Error is " + str(ten_fold.std()/ np.sqrt(10)))
Optimal k parameter is 7 Avg 10 fold CV Score is 0.9637634408602151 STD Error is 0.0240754825253329