import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from IPython.display import display
data = pd.read_csv('ASD.csv')
display(data.head(n=5))
| | id | A1_Score | A2_Score | A3_Score | A4_Score | A5_Score | A6_Score | A7_Score | A8_Score | A9_Score | ... | gender | ethnicity | jundice | austim | contry_of_res | used_app_before | result | age_desc | relation | Class/ASD |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | ... | f | White-European | no | no | United States | no | 6 | 18 and more | Self | NO |
1 | 2 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | m | Latino | no | yes | Brazil | no | 5 | 18 and more | Self | NO |
2 | 3 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | ... | m | Latino | yes | yes | Spain | no | 8 | 18 and more | Parent | YES |
3 | 4 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | ... | f | White-European | no | yes | United States | no | 6 | 18 and more | Self | NO |
4 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | f | ? | no | no | Egypt | no | 2 | 18 and more | ? | NO |
5 rows × 22 columns
# Total number of records
n_records = len(data.index)
print("Total number of records:", n_records)
# Number of records where the individual is diagnosed with ASD
n_asd_yes = len(data[data['Class/ASD'] == 'YES'])
print("Individuals diagnosed with ASD:", n_asd_yes)
# Number of records where the individual is not diagnosed with ASD
n_asd_no = len(data[data['Class/ASD'] == 'NO'])
print("Individuals not diagnosed with ASD:", n_asd_no)
# Percentage of individuals diagnosed with ASD
yes_percent = float(n_asd_yes) / n_records * 100
print("Percentage of individuals diagnosed with ASD:", yes_percent)
Total number of records: 704
Individuals diagnosed with ASD: 189
Individuals not diagnosed with ASD: 515
Percentage of individuals diagnosed with ASD: 26.84659090909091
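For reference, pandas' value_counts gives the same class balance in a single call; a quick optional cross-check of the counts and percentages above:
# Optional cross-check of the class balance with value_counts
print(data['Class/ASD'].value_counts())
print(data['Class/ASD'].value_counts(normalize=True) * 100)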
# Preprocess the data by reading '?' entries as NaN values
asd_data = pd.read_csv('ASD.csv', na_values=['?'])
asd_data.head(n=5)
| | id | A1_Score | A2_Score | A3_Score | A4_Score | A5_Score | A6_Score | A7_Score | A8_Score | A9_Score | ... | gender | ethnicity | jundice | austim | contry_of_res | used_app_before | result | age_desc | relation | Class/ASD |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | ... | f | White-European | no | no | United States | no | 6 | 18 and more | Self | NO |
1 | 2 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | m | Latino | no | yes | Brazil | no | 5 | 18 and more | Self | NO |
2 | 3 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | ... | m | Latino | yes | yes | Spain | no | 8 | 18 and more | Parent | YES |
3 | 4 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | ... | f | White-European | no | yes | United States | no | 6 | 18 and more | Self | NO |
4 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | f | NaN | no | no | Egypt | no | 2 | 18 and more | NaN | NO |
5 rows × 22 columns
# Check whether the data needs cleaning
asd_data.describe()
| | id | A1_Score | A2_Score | A3_Score | A4_Score | A5_Score | A6_Score | A7_Score | A8_Score | A9_Score | A10_Score | age | result |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 704.000000 | 704.000000 | 704.000000 | 704.000000 | 704.000000 | 704.000000 | 704.000000 | 704.000000 | 704.000000 | 704.000000 | 704.000000 | 702.000000 | 704.000000 |
mean | 352.500000 | 0.721591 | 0.453125 | 0.457386 | 0.495739 | 0.498580 | 0.284091 | 0.417614 | 0.649148 | 0.323864 | 0.573864 | 29.698006 | 4.875000 |
std | 203.371581 | 0.448535 | 0.498152 | 0.498535 | 0.500337 | 0.500353 | 0.451301 | 0.493516 | 0.477576 | 0.468281 | 0.494866 | 16.507465 | 2.501493 |
min | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 17.000000 | 0.000000 |
25% | 176.750000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 21.000000 | 3.000000 |
50% | 352.500000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 27.000000 | 4.000000 |
75% | 528.250000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 35.000000 | 7.000000 |
max | 704.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 383.000000 | 10.000000 |
The summary above shows missing values (for example, age has only 702 non-null entries), so the data needs to go through a cleaning pass.
# Locate rows that contain a null value in any of these columns
asd_data.loc[(asd_data['age'].isnull()) |(asd_data['gender'].isnull()) |(asd_data['ethnicity'].isnull())
|(asd_data['jundice'].isnull())|(asd_data['austim'].isnull()) |(asd_data['contry_of_res'].isnull())
|(asd_data['used_app_before'].isnull())|(asd_data['result'].isnull())|(asd_data['age_desc'].isnull())
|(asd_data['relation'].isnull())]
| | id | A1_Score | A2_Score | A3_Score | A4_Score | A5_Score | A6_Score | A7_Score | A8_Score | A9_Score | ... | gender | ethnicity | jundice | austim | contry_of_res | used_app_before | result | age_desc | relation | Class/ASD |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | f | NaN | no | no | Egypt | no | 2 | 18 and more | NaN | NO |
12 | 13 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | ... | f | NaN | no | no | Bahamas | no | 6 | 18 and more | NaN | NO |
13 | 14 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | ... | m | NaN | no | no | Austria | no | 4 | 18 and more | NaN | NO |
14 | 15 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | ... | f | NaN | no | no | Argentina | no | 4 | 18 and more | NaN | NO |
19 | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | ... | m | NaN | yes | no | United Arab Emirates | no | 3 | 18 and more | NaN | NO |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
652 | 653 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | f | NaN | no | no | United States | no | 1 | 18 and more | NaN | NO |
658 | 659 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | ... | m | NaN | no | no | Azerbaijan | no | 3 | 18 and more | NaN | NO |
659 | 660 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | ... | m | NaN | no | no | Pakistan | no | 8 | 18 and more | NaN | YES |
666 | 667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | m | NaN | no | no | Iraq | no | 1 | 18 and more | NaN | NO |
701 | 702 | 1 | 0 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | ... | f | NaN | no | no | Russia | no | 7 | 18 and more | NaN | YES |
95 rows × 22 columns
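For reference, the same rows can be selected more compactly by asking pandas for any null in a row; a small equivalent sketch:
# Equivalent, more compact selection of rows with any missing value
asd_data[asd_data.isnull().any(axis=1)]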
The rows with missing values are scattered across the dataset, so we drop them with the dropna function and then check again whether the data is clean.
asd_data.dropna(inplace=True)
asd_data.describe()
| | id | A1_Score | A2_Score | A3_Score | A4_Score | A5_Score | A6_Score | A7_Score | A8_Score | A9_Score | A10_Score | age | result |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 609.000000 | 609.000000 | 609.000000 | 609.000000 | 609.000000 | 609.000000 | 609.000000 | 609.000000 | 609.000000 | 609.000000 | 609.000000 | 609.000000 | 609.000000 |
mean | 349.725780 | 0.740558 | 0.469622 | 0.481117 | 0.520525 | 0.525452 | 0.307061 | 0.428571 | 0.665025 | 0.341544 | 0.597701 | 30.215107 | 5.077176 |
std | 207.856238 | 0.438689 | 0.499487 | 0.500054 | 0.499989 | 0.499762 | 0.461654 | 0.495278 | 0.472370 | 0.474617 | 0.490765 | 17.287470 | 2.522717 |
min | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 17.000000 | 0.000000 |
25% | 166.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 22.000000 | 3.000000 |
50% | 329.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 27.000000 | 5.000000 |
75% | 533.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 35.000000 | 7.000000 |
max | 704.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 383.000000 | 10.000000 |
# Reminder of the features:
print(asd_data.dtypes)
# Total number of records in clean dataset
n_records = len(asd_data.index)
# Number of records where the individual is diagnosed with ASD in the clean dataset
n_asd_yes = len(asd_data[asd_data['Class/ASD'] == 'YES'])
# Number of records where the individual is not diagnosed with ASD in the clean dataset
n_asd_no = len(asd_data[asd_data['Class/ASD'] == 'NO'])
# Print the results
print("Total number of records:", n_records)
print("Individuals diagnosed with ASD:", n_asd_yes)
print("Individuals not diagnosed with ASD:", n_asd_no)
id                   int64
A1_Score             int64
A2_Score             int64
A3_Score             int64
A4_Score             int64
A5_Score             int64
A6_Score             int64
A7_Score             int64
A8_Score             int64
A9_Score             int64
A10_Score            int64
age                float64
gender              object
ethnicity           object
jundice             object
austim              object
contry_of_res       object
used_app_before     object
result               int64
age_desc            object
relation            object
Class/ASD           object
dtype: object
Total number of records: 609
Individuals diagnosed with ASD: 180
Individuals not diagnosed with ASD: 429
# Split the data into features and target label
asd_raw = asd_data['Class/ASD']
features_raw = asd_data[['age', 'gender', 'ethnicity', 'jundice', 'austim', 'contry_of_res', 'result',
'relation','A1_Score','A2_Score','A3_Score','A4_Score','A5_Score','A6_Score','A7_Score','A8_Score',
'A9_Score','A10_Score']]
Before building a model, the numerical features should be normalized, so here we apply MinMaxScaler as a preprocessing step.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
numerical = ['age', 'result']
features_minmax_transform = pd.DataFrame(data = features_raw)
features_minmax_transform[numerical] = scaler.fit_transform(features_raw[numerical])
features_minmax_transform
# Show an example of a record with scaling applied
display(features_minmax_transform.head(n = 5))
| | age | gender | ethnicity | jundice | austim | contry_of_res | result | relation | A1_Score | A2_Score | A3_Score | A4_Score | A5_Score | A6_Score | A7_Score | A8_Score | A9_Score | A10_Score |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.024590 | f | White-European | no | no | United States | 0.6 | Self | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |
1 | 0.019126 | m | Latino | no | yes | Brazil | 0.5 | Self | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
2 | 0.027322 | m | Latino | yes | yes | Spain | 0.8 | Parent | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 |
3 | 0.049180 | f | White-European | no | yes | United States | 0.6 | Self | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 |
5 | 0.051913 | m | Others | yes | no | United States | 0.9 | Self | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 |
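MinMaxScaler rescales each numerical column to the range [0, 1] using x' = (x - min) / (max - min). As a quick sanity check against the describe() output above (age minimum 17, maximum 383), an age of 26 should map to roughly 0.0246, which matches the scaled age in the first row:
# Sanity check of the MinMax formula on the age column (min 17, max 383 from describe())
print((26 - 17) / (383 - 17))   # 0.024590..., same as the scaled age in row 0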
From the cleaned data above we can see that several features are non-numeric. Learning algorithms expect numeric input, so these features have to be converted. Here we use one-hot encoding, which turns each non-numeric value into a "dummy" variable that a learning algorithm can use.
In addition, the target also needs to be numeric, but one-hot encoding is unnecessary there: since it only takes the values 'YES' and 'NO', a simple binary encoding is enough.
#One-hot encode the 'features_minmax_transform' data using pandas.get_dummies()
features_final = pd.get_dummies(features_minmax_transform)
display(features_final.head(5))
# Encode the 'asd_raw' target labels to numerical values (YES -> 1, NO -> 0)
asd_classes = asd_raw.apply(lambda x: 1 if x == 'YES' else 0)
# Print the number of features after one-hot encoding
encoded = list(features_final.columns)
print("total features after one-hot encoding:",len(encoded))
# Show the encoded feature names
print(encoded)
| | age | result | A1_Score | A2_Score | A3_Score | A4_Score | A5_Score | A6_Score | A7_Score | A8_Score | ... | contry_of_res_United Arab Emirates | contry_of_res_United Kingdom | contry_of_res_United States | contry_of_res_Uruguay | contry_of_res_Viet Nam | relation_Health care professional | relation_Others | relation_Parent | relation_Relative | relation_Self |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.024590 | 0.6 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 0.019126 | 0.5 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 | 0.027322 | 0.8 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | 0.049180 | 0.6 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
5 | 0.051913 | 0.9 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
5 rows × 94 columns
total features after one-hot encoding: 94
['age', 'result', 'A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'gender_f', 'gender_m', 'ethnicity_Asian', 'ethnicity_Black', 'ethnicity_Hispanic', 'ethnicity_Latino', 'ethnicity_Middle Eastern ', 'ethnicity_Others', 'ethnicity_Pasifika', 'ethnicity_South Asian', 'ethnicity_Turkish', 'ethnicity_White-European', 'ethnicity_others', 'jundice_no', 'jundice_yes', 'austim_no', 'austim_yes', 'contry_of_res_Afghanistan', 'contry_of_res_AmericanSamoa', 'contry_of_res_Angola', 'contry_of_res_Armenia', 'contry_of_res_Aruba', 'contry_of_res_Australia', 'contry_of_res_Austria', 'contry_of_res_Bahamas', 'contry_of_res_Bangladesh', 'contry_of_res_Belgium', 'contry_of_res_Bolivia', 'contry_of_res_Brazil', 'contry_of_res_Burundi', 'contry_of_res_Canada', 'contry_of_res_Chile', 'contry_of_res_China', 'contry_of_res_Costa Rica', 'contry_of_res_Cyprus', 'contry_of_res_Czech Republic', 'contry_of_res_Ecuador', 'contry_of_res_Egypt', 'contry_of_res_Ethiopia', 'contry_of_res_Finland', 'contry_of_res_France', 'contry_of_res_Germany', 'contry_of_res_Iceland', 'contry_of_res_India', 'contry_of_res_Indonesia', 'contry_of_res_Iran', 'contry_of_res_Ireland', 'contry_of_res_Italy', 'contry_of_res_Jordan', 'contry_of_res_Malaysia', 'contry_of_res_Mexico', 'contry_of_res_Nepal', 'contry_of_res_Netherlands', 'contry_of_res_New Zealand', 'contry_of_res_Nicaragua', 'contry_of_res_Niger', 'contry_of_res_Oman', 'contry_of_res_Pakistan', 'contry_of_res_Philippines', 'contry_of_res_Portugal', 'contry_of_res_Romania', 'contry_of_res_Russia', 'contry_of_res_Saudi Arabia', 'contry_of_res_Serbia', 'contry_of_res_Sierra Leone', 'contry_of_res_South Africa', 'contry_of_res_Spain', 'contry_of_res_Sri Lanka', 'contry_of_res_Sweden', 'contry_of_res_Tonga', 'contry_of_res_Turkey', 'contry_of_res_Ukraine', 'contry_of_res_United Arab Emirates', 'contry_of_res_United Kingdom', 'contry_of_res_United States', 'contry_of_res_Uruguay', 'contry_of_res_Viet Nam', 'relation_Health care professional', 'relation_Others', 'relation_Parent', 'relation_Relative', 'relation_Self']
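A small design note: pandas.get_dummies also accepts drop_first=True, which drops one dummy column per categorical feature, since that column is redundant (for example, gender_f is just 1 - gender_m). Whether to use it is a judgment call; the results below use the full 94-column encoding. A sketch of the alternative:
# Alternative encoding that drops one redundant dummy per category (not used below)
features_alt = pd.get_dummies(features_minmax_transform, drop_first=True)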
# Histogram of the encoded Class/ASD target
plt.hist(asd_classes, bins=10)
# x-axis limited to the 0-1 range of the encoded classes
plt.xlim(0, 1)
plt.title('Histogram of Class/ASD')
plt.xlabel('Class/ASD from processed data')
plt.ylabel('Frequency')
plt.show()
Now all non-numerical values have been converted into numerical ones, and the numerical values are normalized. Next we split the data into training and test sets, using 80% of the data for training and 20% for testing.
from sklearn.model_selection import train_test_split
np.random.seed(1234)
X_train, X_test, y_train, y_test = train_test_split(features_final, asd_classes, train_size=0.80, random_state=1)
# Show the results of the split
print("Training set has samples:",X_train.shape[0])
print("Testing set has samples:",X_test.shape[0])
Training set has samples: 487
Testing set has samples: 122
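Since only about 30% of the records are labelled YES, one could also stratify the split so that the training and test sets keep the same class ratio; a minimal variant of the same call (the results below use the plain split shown above):
# Optional stratified variant of the split (sketch, not used for the results below)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    features_final, asd_classes, train_size=0.80, random_state=1, stratify=asd_classes)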
from sklearn import svm
from sklearn.model_selection import cross_val_score
C = 1.0
svc = svm.SVC(kernel='linear', C=C, gamma=2)  # note: gamma has no effect with a linear kernel
cv_scores = cross_val_score(svc, features_final, asd_classes, cv=10)
cv_scores.mean()
1.0
AUC Score:
# calculate cross-validated AUC
cross_val_score(svc, features_final, asd_classes, cv=10, scoring='roc_auc').mean()
1.0
F-beta Score:
svc.fit(X_train, y_train)
from sklearn.metrics import fbeta_score
predictions_test = svc.predict(X_test)
fbeta_score(y_test, predictions_test, average='binary', beta=0.5)
1.0
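For reference, the F-beta score combines precision P and recall R as F_beta = (1 + beta^2) * P * R / (beta^2 * P + R); with beta = 0.5 precision is weighted more heavily than recall. A quick check of that relationship on the same SVC test predictions:
# Verify the F-beta relationship on the SVC test predictions
from sklearn.metrics import precision_score, recall_score
p = precision_score(y_test, predictions_test)
r = recall_score(y_test, predictions_test)
print((1 + 0.5**2) * p * r / (0.5**2 * p + r))   # equals fbeta_score(..., beta=0.5) above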
from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=10)
cv_scores = cross_val_score(knn, features_final, asd_classes, cv=10)
cv_scores.mean()
0.9474590163934427
AUC Score:
# calculate cross-validated AUC
cross_val_score(knn, features_final, asd_classes, cv=10, scoring='roc_auc').mean()
0.9930078749846192
F-beta Score:
knn.fit(X_train, y_train)
from sklearn.metrics import fbeta_score
predictions_test = knn.predict(X_test)
fbeta_score(y_test, predictions_test, average='binary', beta=0.5)
0.9192825112107623
Choosing K is tricky, so we shouldn't discard KNN until we have tried different values of K. Below we run KNN in a loop with K ranging from 10 to 49 and check whether K makes a substantial difference.
for n in range(10, 50):
knn = neighbors.KNeighborsClassifier(n_neighbors=n)
cv_scores = cross_val_score(knn, features_final, asd_classes, cv=10)
print (n, cv_scores.mean())
10 0.9474590163934427
11 0.9507377049180328
12 0.9507377049180328
13 0.9540437158469945
14 0.9507650273224044
15 0.944207650273224
16 0.9507650273224044
17 0.9523770491803278
18 0.9523770491803278
19 0.9540163934426229
20 0.9523770491803278
21 0.9523770491803278
22 0.9474590163934424
23 0.9490983606557375
24 0.9507377049180326
25 0.9507377049180328
26 0.9523770491803278
27 0.9507377049180328
28 0.9507377049180326
29 0.9507377049180328
30 0.9523770491803278
31 0.9474863387978143
32 0.9491256830601094
33 0.9474863387978143
34 0.9507650273224044
35 0.9491256830601094
36 0.9491256830601091
37 0.9507650273224044
38 0.9540710382513661
39 0.9524316939890708
40 0.9524316939890708
41 0.9524316939890708
42 0.9507923497267757
43 0.9507923497267757
44 0.9507923497267757
45 0.9507923497267757
46 0.9524316939890708
47 0.9524316939890708
48 0.9557103825136611
49 0.9524316939890708
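To make the trend easier to see, the same loop can collect the mean scores and plot them against K; a small sketch reusing the imports above:
# Plot mean 10-fold CV accuracy against K
k_values = list(range(10, 50))
k_scores = [cross_val_score(neighbors.KNeighborsClassifier(n_neighbors=k),
                            features_final, asd_classes, cv=10).mean() for k in k_values]
plt.plot(k_values, k_scores)
plt.xlabel('K (n_neighbors)')
plt.ylabel('Mean CV accuracy')
plt.title('KNN accuracy vs K')
plt.show()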
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
cv_scores = cross_val_score(logreg, features_final, asd_classes, cv=10)
cv_scores.mean()
0.9934426229508198
AUC Score:
# calculate cross-validated AUC
cv_scores_roc = cross_val_score(logreg, features_final, asd_classes, cv=10, scoring='roc_auc')
cv_scores_roc.mean()
1.0
F-beta Score:
logreg.fit(X_train, y_train)
from sklearn.metrics import fbeta_score
predictions_test = logreg.predict(X_test)
fbeta_score(y_test, predictions_test, average='binary', beta=0.5)
0.9641255605381165
from sklearn.tree import DecisionTreeClassifier
dectree = DecisionTreeClassifier(random_state=1)
# Train the classifier on the training set
dectree.fit(X_train, y_train)
# make class predictions for the testing set
y_pred_class = dectree.predict(X_test)
dectree.score(X_test, y_test)
1.0
from sklearn.model_selection import cross_val_score
dectree = DecisionTreeClassifier(random_state=1)
cv_scores = cross_val_score(dectree, features_final, asd_classes, cv=10)
cv_scores.mean()
1.0
AUC Score:
# calculate cross-validated AUC
cross_val_score(dectree, features_final, asd_classes, cv=10, scoring='roc_auc').mean()
1.0
F-beta Score:
dectree.fit(X_train, y_train)
from sklearn.metrics import fbeta_score
predictions_test = dectree.predict(X_test)
fbeta_score(y_test, predictions_test, average='binary', beta=0.5)
1.0
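A perfect score on a 122-record test set is worth a second look; one simple check is the confusion matrix for the same decision-tree predictions (a sketch using sklearn's confusion_matrix):
# Confusion matrix for the decision tree test predictions
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, predictions_test))   # rows: actual 0/1, columns: predicted 0/1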
# Import 'GridSearchCV', 'make_scorer', and the other metrics we need
from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
def f_beta_score(y_true, y_predict):
return fbeta_score(y_true, y_predict, beta = 0.5)
# Initialize the classifier
clf = SVC(random_state = 1)
# Create the dictionary of parameters to tune
parameters = {'C':range(1,6),'kernel':['linear','poly','rbf','sigmoid'],'degree':range(1,6)}
# Make an fbeta_score scoring object using make_scorer()
scorer = make_scorer(f_beta_score)
# Perform grid search on the classifier using 'scorer' as the scoring method
grid_obj = GridSearchCV(estimator = clf, param_grid = parameters, scoring = scorer)
# Fit the grid search object to the training data and find the optimal parameters
grid_fit = grid_obj.fit(X_train, y_train)
# Get the estimator
best_clf = grid_fit.best_estimator_
# Make predictions using the unoptimized and optimized models
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)
# Report the before-and-after scores
print("Unoptimized model\n------")
print("Accuracy score on testing data:",accuracy_score(y_test, predictions))
print("F-score on testing data:",fbeta_score(y_test, predictions, beta = 0.5))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data:",accuracy_score(y_test, best_predictions))
print("Final F-score on the testing data:",fbeta_score(y_test, best_predictions, beta = 0.5))
Unoptimized model
------
Accuracy score on testing data: 0.9590163934426229
F-score on testing data: 0.9360730593607306

Optimized Model
------
Final accuracy score on the testing data: 1.0
Final F-score on the testing data: 1.0
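To see which parameter combination the grid search actually selected, the fitted GridSearchCV object exposes best_params_ and best_score_; a short follow-up one could run:
# Inspect the parameters and cross-validated F-0.5 score of the best estimator
print(grid_fit.best_params_)
print(grid_fit.best_score_)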