import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from IPython.display import display
data = pd.read_csv('ASD.csv')
display(data.head(n=5))
| | id | A1_Score | A2_Score | A3_Score | A4_Score | A5_Score | A6_Score | A7_Score | A8_Score | A9_Score | ... | gender | ethnicity | jundice | austim | contry_of_res | used_app_before | result | age_desc | relation | Class/ASD |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | ... | f | White-European | no | no | United States | no | 6 | 18 and more | Self | NO |
1 | 2 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | m | Latino | no | yes | Brazil | no | 5 | 18 and more | Self | NO |
2 | 3 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | ... | m | Latino | yes | yes | Spain | no | 8 | 18 and more | Parent | YES |
3 | 4 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | ... | f | White-European | no | yes | United States | no | 6 | 18 and more | Self | NO |
4 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | f | ? | no | no | Egypt | no | 2 | 18 and more | ? | NO |
5 rows × 22 columns
# Total number of records
n_records = len(data.index)
print("Total number of records:", n_records)
# Number of records where the individual is diagnosed with ASD
n_asd_yes = len(data[data['Class/ASD'] == 'YES'])
print("Individuals diagnosed with ASD:", n_asd_yes)
# Number of records where the individual is not diagnosed with ASD
n_asd_no = len(data[data['Class/ASD'] == 'NO'])
print("Individuals not diagnosed with ASD:", n_asd_no)
# Percentage of individuals diagnosed with ASD
yes_percent = float(n_asd_yes) / n_records * 100
print("Percentage of individuals diagnosed with ASD:", yes_percent)
Total number of records: 704
Individuals diagnosed with ASD: 189
Individuals not diagnosed with ASD: 515
Percentage of individuals diagnosed with ASD: 26.84659090909091
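For reference, pandas' value_counts gives the same class balance in a single call; a quick optional cross-check of the counts and percentages above:
# Optional cross-check of the class balance with value_counts
print(data['Class/ASD'].value_counts())
print(data['Class/ASD'].value_counts(normalize=True) * 100)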
# Preprocess the data by reading '?' entries as NaN values
asd_data = pd.read_csv('ASD.csv', na_values=['?'])
asd_data.head(n=5)
| | id | A1_Score | A2_Score | A3_Score | A4_Score | A5_Score | A6_Score | A7_Score | A8_Score | A9_Score | ... | gender | ethnicity | jundice | austim | contry_of_res | used_app_before | result | age_desc | relation | Class/ASD |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | ... | f | White-European | no | no | United States | no | 6 | 18 and more | Self | NO |
1 | 2 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | m | Latino | no | yes | Brazil | no | 5 | 18 and more | Self | NO |
2 | 3 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | ... | m | Latino | yes | yes | Spain | no | 8 | 18 and more | Parent | YES |
3 | 4 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | ... | f | White-European | no | yes | United States | no | 6 | 18 and more | Self | NO |
4 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | f | NaN | no | no | Egypt | no | 2 | 18 and more | NaN | NO |
5 rows × 22 columns
# Check whether the data needs cleaning
asd_data.describe()
| | id | A1_Score | A2_Score | A3_Score | A4_Score | A5_Score | A6_Score | A7_Score | A8_Score | A9_Score | A10_Score | age | result |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 704.000000 | 704.000000 | 704.000000 | 704.000000 | 704.000000 | 704.000000 | 704.000000 | 704.000000 | 704.000000 | 704.000000 | 704.000000 | 702.000000 | 704.000000 |
mean | 352.500000 | 0.721591 | 0.453125 | 0.457386 | 0.495739 | 0.498580 | 0.284091 | 0.417614 | 0.649148 | 0.323864 | 0.573864 | 29.698006 | 4.875000 |
std | 203.371581 | 0.448535 | 0.498152 | 0.498535 | 0.500337 | 0.500353 | 0.451301 | 0.493516 | 0.477576 | 0.468281 | 0.494866 | 16.507465 | 2.501493 |
min | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 17.000000 | 0.000000 |
25% | 176.750000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 21.000000 | 3.000000 |
50% | 352.500000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 27.000000 | 4.000000 |
75% | 528.250000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 35.000000 | 7.000000 |
max | 704.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 383.000000 | 10.000000 |
The summary above shows missing values (for example, age has only 702 non-null entries), so the data needs to go through a cleaning pass.
# Locate rows that contain a null value in any of these columns
asd_data.loc[(asd_data['age'].isnull()) |(asd_data['gender'].isnull()) |(asd_data['ethnicity'].isnull())
|(asd_data['jundice'].isnull())|(asd_data['austim'].isnull()) |(asd_data['contry_of_res'].isnull())
|(asd_data['used_app_before'].isnull())|(asd_data['result'].isnull())|(asd_data['age_desc'].isnull())
|(asd_data['relation'].isnull())]
| | id | A1_Score | A2_Score | A3_Score | A4_Score | A5_Score | A6_Score | A7_Score | A8_Score | A9_Score | ... | gender | ethnicity | jundice | austim | contry_of_res | used_app_before | result | age_desc | relation | Class/ASD |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | f | NaN | no | no | Egypt | no | 2 | 18 and more | NaN | NO |
12 | 13 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | ... | f | NaN | no | no | Bahamas | no | 6 | 18 and more | NaN | NO |
13 | 14 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | ... | m | NaN | no | no | Austria | no | 4 | 18 and more | NaN | NO |
14 | 15 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | ... | f | NaN | no | no | Argentina | no | 4 | 18 and more | NaN | NO |
19 | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | ... | m | NaN | yes | no | United Arab Emirates | no | 3 | 18 and more | NaN | NO |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
652 | 653 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | f | NaN | no | no | United States | no | 1 | 18 and more | NaN | NO |
658 | 659 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | ... | m | NaN | no | no | Azerbaijan | no | 3 | 18 and more | NaN | NO |
659 | 660 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | ... | m | NaN | no | no | Pakistan | no | 8 | 18 and more | NaN | YES |
666 | 667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | m | NaN | no | no | Iraq | no | 1 | 18 and more | NaN | NO |
701 | 702 | 1 | 0 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | ... | f | NaN | no | no | Russia | no | 7 | 18 and more | NaN | YES |
95 rows × 22 columns
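For reference, the same rows can be selected more compactly by asking pandas for any null in a row; a small equivalent sketch:
# Equivalent, more compact selection of rows with any missing value
asd_data[asd_data.isnull().any(axis=1)]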
The rows with missing values are scattered across the dataset, so we drop them with the dropna function and then check again whether the data is clean.
asd_data.dropna(inplace=True)
asd_data.describe()
| | id | A1_Score | A2_Score | A3_Score | A4_Score | A5_Score | A6_Score | A7_Score | A8_Score | A9_Score | A10_Score | age | result |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 609.000000 | 609.000000 | 609.000000 | 609.000000 | 609.000000 | 609.000000 | 609.000000 | 609.000000 | 609.000000 | 609.000000 | 609.000000 | 609.000000 | 609.000000 |
mean | 349.725780 | 0.740558 | 0.469622 | 0.481117 | 0.520525 | 0.525452 | 0.307061 | 0.428571 | 0.665025 | 0.341544 | 0.597701 | 30.215107 | 5.077176 |
std | 207.856238 | 0.438689 | 0.499487 | 0.500054 | 0.499989 | 0.499762 | 0.461654 | 0.495278 | 0.472370 | 0.474617 | 0.490765 | 17.287470 | 2.522717 |
min | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 17.000000 | 0.000000 |
25% | 166.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 22.000000 | 3.000000 |
50% | 329.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 27.000000 | 5.000000 |
75% | 533.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 35.000000 | 7.000000 |
max | 704.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 383.000000 | 10.000000 |
# Reminder of the features:
print(asd_data.dtypes)
# Total number of records in clean dataset
n_records = len(asd_data.index)
# Number of records where the individual is diagnosed with ASD in the clean dataset
n_asd_yes = len(asd_data[asd_data['Class/ASD'] == 'YES'])
# Number of records where the individual is not diagnosed with ASD in the clean dataset
n_asd_no = len(asd_data[asd_data['Class/ASD'] == 'NO'])
# Print the results
print("Total number of records:", n_records)
print("Individuals diagnosed with ASD:", n_asd_yes)
print("Individuals not diagnosed with ASD:", n_asd_no)
id                   int64
A1_Score             int64
A2_Score             int64
A3_Score             int64
A4_Score             int64
A5_Score             int64
A6_Score             int64
A7_Score             int64
A8_Score             int64
A9_Score             int64
A10_Score            int64
age                float64
gender              object
ethnicity           object
jundice             object
austim              object
contry_of_res       object
used_app_before     object
result               int64
age_desc            object
relation            object
Class/ASD           object
dtype: object
Total number of records: 609
Individuals diagnosed with ASD: 180
Individuals not diagnosed with ASD: 429
# Split the data into features and target label
asd_raw = asd_data['Class/ASD']
features_raw = asd_data[['age', 'gender', 'ethnicity', 'jundice', 'austim', 'contry_of_res', 'result',
'relation','A1_Score','A2_Score','A3_Score','A4_Score','A5_Score','A6_Score','A7_Score','A8_Score',
'A9_Score','A10_Score']]
Before building a model, the numerical features should be normalized, so here we apply MinMaxScaler as a preprocessing step.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
numerical = ['age', 'result']
features_minmax_transform = pd.DataFrame(data = features_raw)
features_minmax_transform[numerical] = scaler.fit_transform(features_raw[numerical])
features_minmax_transform
# Show an example of a record with scaling applied
display(features_minmax_transform.head(n = 5))
| | age | gender | ethnicity | jundice | austim | contry_of_res | result | relation | A1_Score | A2_Score | A3_Score | A4_Score | A5_Score | A6_Score | A7_Score | A8_Score | A9_Score | A10_Score |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.024590 | f | White-European | no | no | United States | 0.6 | Self | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |
1 | 0.019126 | m | Latino | no | yes | Brazil | 0.5 | Self | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
2 | 0.027322 | m | Latino | yes | yes | Spain | 0.8 | Parent | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 |
3 | 0.049180 | f | White-European | no | yes | United States | 0.6 | Self | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 |
5 | 0.051913 | m | Others | yes | no | United States | 0.9 | Self | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 |
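MinMaxScaler rescales each numerical column to the range [0, 1] using x' = (x - min) / (max - min). As a quick sanity check against the describe() output above (age minimum 17, maximum 383), an age of 26 should map to roughly 0.0246, which matches the scaled age in the first row:
# Sanity check of the MinMax formula on the age column (min 17, max 383 from describe())
print((26 - 17) / (383 - 17))   # 0.024590..., same as the scaled age in row 0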
From the cleaned data above we can see that several features are non-numeric. Learning algorithms expect numeric input, so these features have to be converted. Here we use one-hot encoding, which turns each non-numeric value into a "dummy" variable that a learning algorithm can use.
In addition, the target also needs to be numeric, but one-hot encoding is unnecessary there: since it only takes the values 'YES' and 'NO', a simple binary encoding is enough.
#One-hot encode the 'features_minmax_transform' data using pandas.get_dummies()
features_final = pd.get_dummies(features_minmax_transform)
display(features_final.head(5))
# Encode the 'asd_raw' target labels to numerical values (YES -> 1, NO -> 0)
asd_classes = asd_raw.apply(lambda x: 1 if x == 'YES' else 0)
# Print the number of features after one-hot encoding
encoded = list(features_final.columns)
print("total features after one-hot encoding:",len(encoded))
# Show the encoded feature names
print(encoded)
| | age | result | A1_Score | A2_Score | A3_Score | A4_Score | A5_Score | A6_Score | A7_Score | A8_Score | ... | contry_of_res_United Arab Emirates | contry_of_res_United Kingdom | contry_of_res_United States | contry_of_res_Uruguay | contry_of_res_Viet Nam | relation_Health care professional | relation_Others | relation_Parent | relation_Relative | relation_Self |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.024590 | 0.6 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 0.019126 | 0.5 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 | 0.027322 | 0.8 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | 0.049180 | 0.6 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
5 | 0.051913 | 0.9 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
5 rows × 94 columns
total features after one-hot encoding: 94
['age', 'result', 'A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'gender_f', 'gender_m', 'ethnicity_Asian', 'ethnicity_Black', 'ethnicity_Hispanic', 'ethnicity_Latino', 'ethnicity_Middle Eastern ', 'ethnicity_Others', 'ethnicity_Pasifika', 'ethnicity_South Asian', 'ethnicity_Turkish', 'ethnicity_White-European', 'ethnicity_others', 'jundice_no', 'jundice_yes', 'austim_no', 'austim_yes', 'contry_of_res_Afghanistan', 'contry_of_res_AmericanSamoa', 'contry_of_res_Angola', 'contry_of_res_Armenia', 'contry_of_res_Aruba', 'contry_of_res_Australia', 'contry_of_res_Austria', 'contry_of_res_Bahamas', 'contry_of_res_Bangladesh', 'contry_of_res_Belgium', 'contry_of_res_Bolivia', 'contry_of_res_Brazil', 'contry_of_res_Burundi', 'contry_of_res_Canada', 'contry_of_res_Chile', 'contry_of_res_China', 'contry_of_res_Costa Rica', 'contry_of_res_Cyprus', 'contry_of_res_Czech Republic', 'contry_of_res_Ecuador', 'contry_of_res_Egypt', 'contry_of_res_Ethiopia', 'contry_of_res_Finland', 'contry_of_res_France', 'contry_of_res_Germany', 'contry_of_res_Iceland', 'contry_of_res_India', 'contry_of_res_Indonesia', 'contry_of_res_Iran', 'contry_of_res_Ireland', 'contry_of_res_Italy', 'contry_of_res_Jordan', 'contry_of_res_Malaysia', 'contry_of_res_Mexico', 'contry_of_res_Nepal', 'contry_of_res_Netherlands', 'contry_of_res_New Zealand', 'contry_of_res_Nicaragua', 'contry_of_res_Niger', 'contry_of_res_Oman', 'contry_of_res_Pakistan', 'contry_of_res_Philippines', 'contry_of_res_Portugal', 'contry_of_res_Romania', 'contry_of_res_Russia', 'contry_of_res_Saudi Arabia', 'contry_of_res_Serbia', 'contry_of_res_Sierra Leone', 'contry_of_res_South Africa', 'contry_of_res_Spain', 'contry_of_res_Sri Lanka', 'contry_of_res_Sweden', 'contry_of_res_Tonga', 'contry_of_res_Turkey', 'contry_of_res_Ukraine', 'contry_of_res_United Arab Emirates', 'contry_of_res_United Kingdom', 'contry_of_res_United States', 'contry_of_res_Uruguay', 'contry_of_res_Viet Nam', 'relation_Health care professional', 'relation_Others', 'relation_Parent', 'relation_Relative', 'relation_Self']
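A small design note: pandas.get_dummies also accepts drop_first=True, which drops one dummy column per categorical feature, since that column is redundant (for example, gender_f is just 1 - gender_m). Whether to use it is a judgment call; the results below use the full 94-column encoding. A sketch of the alternative:
# Alternative encoding that drops one redundant dummy per category (not used below)
features_alt = pd.get_dummies(features_minmax_transform, drop_first=True)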
# Histogram of the encoded Class/ASD target
plt.hist(asd_classes, bins=10)
# x-axis limited to the 0-1 range of the encoded classes
plt.xlim(0, 1)
plt.title('Histogram of Class/ASD')
plt.xlabel('Class/ASD from processed data')
plt.ylabel('Frequency')
plt.show()
Now all non-numerical values have been converted into numerical ones, and the numerical values are normalized. Next we split the data into training and test sets, using 80% of the data for training and 20% for testing.
from sklearn.model_selection import train_test_split
np.random.seed(1234)
X_train, X_test, y_train, y_test = train_test_split(features_final, asd_classes, train_size=0.80, random_state=1)
# Show the results of the split
print("Training set has samples:",X_train.shape[0])
print("Testing set has samples:",X_test.shape[0])
Training set has samples: 487
Testing set has samples: 122
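Since only about 30% of the records are labelled YES, one could also stratify the split so that the training and test sets keep the same class ratio; a minimal variant of the same call (the results below use the plain split shown above):
# Optional stratified variant of the split (sketch, not used for the results below)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    features_final, asd_classes, train_size=0.80, random_state=1, stratify=asd_classes)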
from sklearn import svm
from sklearn.model_selection import cross_val_score
C = 1.0
svc = svm.SVC(kernel='linear', C=C, gamma=2)  # note: gamma has no effect with a linear kernel
cv_scores = cross_val_score(svc, features_final, asd_classes, cv=10)
cv_scores.mean()
1.0
AUC Score:
# calculate cross-validated AUC
cross_val_score(svc, features_final, asd_classes, cv=10, scoring='roc_auc').mean()
1.0
F-beta Score:
svc.fit(X_train, y_train)
from sklearn.metrics import fbeta_score
predictions_test = svc.predict(X_test)
fbeta_score(y_test, predictions_test, average='binary', beta=0.5)
1.0
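For reference, the F-beta score combines precision P and recall R as F_beta = (1 + beta^2) * P * R / (beta^2 * P + R); with beta = 0.5 precision is weighted more heavily than recall. A quick check of that relationship on the same SVC test predictions:
# Verify the F-beta relationship on the SVC test predictions
from sklearn.metrics import precision_score, recall_score
p = precision_score(y_test, predictions_test)
r = recall_score(y_test, predictions_test)
print((1 + 0.5**2) * p * r / (0.5**2 * p + r))   # equals fbeta_score(..., beta=0.5) above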
from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=10)
cv_scores = cross_val_score(knn, features_final, asd_classes, cv=10)
cv_scores.mean()
0.9474590163934427
AUC Score:
# calculate cross-validated AUC
cross_val_score(knn, features_final, asd_classes, cv=10, scoring='roc_auc').mean()
0.9930078749846192
F-beta Score:
knn.fit(X_train, y_train)
from sklearn.metrics import fbeta_score
predictions_test = knn.predict(X_test)
fbeta_score(y_test, predictions_test, average='binary', beta=0.5)
0.9192825112107623
Choosing K is tricky, so we shouldn't discard KNN until we have tried different values of K. Below we run KNN in a loop with K ranging from 10 to 49 and check whether K makes a substantial difference.
for n in range(10, 50):
knn = neighbors.KNeighborsClassifier(n_neighbors=n)
cv_scores = cross_val_score(knn, features_final, asd_classes, cv=10)
print (n, cv_scores.mean())
10 0.9474590163934427
11 0.9507377049180328
12 0.9507377049180328
13 0.9540437158469945
14 0.9507650273224044
15 0.944207650273224
16 0.9507650273224044
17 0.9523770491803278
18 0.9523770491803278
19 0.9540163934426229
20 0.9523770491803278
21 0.9523770491803278
22 0.9474590163934424
23 0.9490983606557375
24 0.9507377049180326
25 0.9507377049180328
26 0.9523770491803278
27 0.9507377049180328
28 0.9507377049180326
29 0.9507377049180328
30 0.9523770491803278
31 0.9474863387978143
32 0.9491256830601094
33 0.9474863387978143
34 0.9507650273224044
35 0.9491256830601094
36 0.9491256830601091
37 0.9507650273224044
38 0.9540710382513661
39 0.9524316939890708
40 0.9524316939890708
41 0.9524316939890708
42 0.9507923497267757
43 0.9507923497267757
44 0.9507923497267757
45 0.9507923497267757
46 0.9524316939890708
47 0.9524316939890708
48 0.9557103825136611
49 0.9524316939890708
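To make the trend easier to see, the same loop can collect the mean scores and plot them against K; a small sketch reusing the imports above:
# Plot mean 10-fold CV accuracy against K
k_values = list(range(10, 50))
k_scores = [cross_val_score(neighbors.KNeighborsClassifier(n_neighbors=k),
                            features_final, asd_classes, cv=10).mean() for k in k_values]
plt.plot(k_values, k_scores)
plt.xlabel('K (n_neighbors)')
plt.ylabel('Mean CV accuracy')
plt.title('KNN accuracy vs K')
plt.show()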
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
cv_scores = cross_val_score(logreg, features_final, asd_classes, cv=10)
cv_scores.mean()
0.9934426229508198
AUC Score:
# calculate cross-validated AUC
cv_scores_roc = cross_val_score(logreg, features_final, asd_classes, cv=10, scoring='roc_auc')
cv_scores_roc.mean()
1.0
F-beta Score:
logreg.fit(X_train, y_train)
from sklearn.metrics import fbeta_score
predictions_test = logreg.predict(X_test)
fbeta_score(y_test, predictions_test, average='binary', beta=0.5)
0.9641255605381165
from sklearn.tree import DecisionTreeClassifier
dectree = DecisionTreeClassifier(random_state=1)
# Train the classifier on the training set
dectree.fit(X_train, y_train)
# make class predictions for the testing set
y_pred_class = dectree.predict(X_test)
dectree.score(X_test, y_test)
1.0
from sklearn.model_selection import cross_val_score
dectree = DecisionTreeClassifier(random_state=1)
cv_scores = cross_val_score(dectree, features_final, asd_classes, cv=10)
cv_scores.mean()
1.0
AUC Score:
# calculate cross-validated AUC
cross_val_score(dectree, features_final, asd_classes, cv=10, scoring='roc_auc').mean()
1.0
F-beta Score:
dectree.fit(X_train, y_train)
from sklearn.metrics import fbeta_score
predictions_test = dectree.predict(X_test)
fbeta_score(y_test, predictions_test, average='binary', beta=0.5)
1.0
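A perfect score on a 122-record test set is worth a second look; one simple check is the confusion matrix for the same decision-tree predictions (a sketch using sklearn's confusion_matrix):
# Confusion matrix for the decision tree test predictions
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, predictions_test))   # rows: actual 0/1, columns: predicted 0/1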
# Import 'GridSearchCV', 'make_scorer', and the other metrics we need
from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
def f_beta_score(y_true, y_predict):
return fbeta_score(y_true, y_predict, beta = 0.5)
# Initialize the classifier
clf = SVC(random_state = 1)
# Create the dictionary of parameters to tune
parameters = {'C':range(1,6),'kernel':['linear','poly','rbf','sigmoid'],'degree':range(1,6)}
# Make an fbeta_score scoring object using make_scorer()
scorer = make_scorer(f_beta_score)
# Perform grid search on the classifier using 'scorer' as the scoring method
grid_obj = GridSearchCV(estimator = clf, param_grid = parameters, scoring = scorer)
# Fit the grid search object to the training data and find the optimal parameters
grid_fit = grid_obj.fit(X_train, y_train)
# Get the estimator
best_clf = grid_fit.best_estimator_
# Make predictions using the unoptimized and optimized models
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)
# Report the before-and-after scores
print("Unoptimized model\n------")
print("Accuracy score on testing data:",accuracy_score(y_test, predictions))
print("F-score on testing data:",fbeta_score(y_test, predictions, beta = 0.5))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data:",accuracy_score(y_test, best_predictions))
print("Final F-score on the testing data:",fbeta_score(y_test, best_predictions, beta = 0.5))
Unoptimized model
------
Accuracy score on testing data: 0.9590163934426229
F-score on testing data: 0.9360730593607306

Optimized Model
------
Final accuracy score on the testing data: 1.0
Final F-score on the testing data: 1.0
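To see which parameter combination the grid search actually selected, the fitted GridSearchCV object exposes best_params_ and best_score_; a short follow-up one could run:
# Inspect the parameters and cross-validated F-0.5 score of the best estimator
print(grid_fit.best_params_)
print(grid_fit.best_score_)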