This post evaluates several classification algorithms on the MNIST digit dataset.
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
In [2]:
# importing the train dataset
train = pd.read_csv(r'C:\Users\piush\Desktop\Dataset\DigitRecognizer\train.csv')
train.head(10)
Out[2]:
 | label | pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | … | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
7 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
8 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
9 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10 rows × 785 columns
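Each row of the table above is a 28×28 grayscale image flattened into 784 pixel columns, with the digit in the label column. A minimal sketch (using the imports above) to render one row back into an image:
# Reshape the 784 pixel values of the first training row into a 28x28 grid
# and display it; the title shows the corresponding label.
sample = train.iloc[0, 1:].values.reshape(28, 28)
plt.imshow(sample, cmap='gray')
plt.title('Label: {}'.format(train.iloc[0, 0]))
plt.axis('off')
plt.show()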
In [3]:
# importing the test dataset
test = pd.read_csv(r'C:\Users\piush\Desktop\Dataset\DigitRecognizer\test.csv')
test.head(3)
Out[3]:
 | pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | pixel9 | … | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 rows × 784 columns
In [4]:
print('Reading train data')
print('\nSize of training data: ' + str(train.shape))
print('Columns:' + str(train.columns.values))
print('dtypes')
print('\n')
print(train.dtypes)
print('\n')
print('Info: ')
print('\n')
train.info()
print('Shape: ')
print('\n')
print(train.shape)
print('\n')
print('numerical columns statistics')
print('\n')
print(train.describe())
Reading train data

Size of training data: (42000, 785)
Columns:['label' 'pixel0' 'pixel1' 'pixel2' 'pixel3' 'pixel4' 'pixel5' 'pixel6' 'pixel7' 'pixel8' 'pixel9' 'pixel10' ... 'pixel26' 'pixel27' ...]

(dtypes and Info output truncated)

Shape: (42000, 785)

numerical columns statistics

       label         pixel0   ...  pixel774    pixel775    pixel776    pixel777   pixel778    pixel779   ...  pixel783
count  42000.000000  42000.0  ...  42000.0     42000.0     42000.0     42000.0    42000.0     42000.0    ...  42000.0
mean   4.456643      0.0      ...  0.219286    0.117095    0.059024    0.02019    0.017238    0.002857   ...  0.0
std    2.887730      0.0      ...  6.312890    4.633819    3.274488    1.75987    1.894498    0.414264   ...  0.0
min    0.000000      0.0      ...  0.000000    0.000000    0.000000    0.00000    0.000000    0.000000   ...  0.0
25%    2.000000      0.0      ...  0.000000    0.000000    0.000000    0.00000    0.000000    0.000000   ...  0.0
50%    4.000000      0.0      ...  0.000000    0.000000    0.000000    0.00000    0.000000    0.000000   ...  0.0
75%    7.000000      0.0      ...  0.000000    0.000000    0.000000    0.00000    0.000000    0.000000   ...  0.0
max    9.000000      0.0      ...  254.000000  254.000000  253.000000  253.00000  254.000000  62.000000  ...  0.0

[8 rows x 785 columns]
So there are 785 columns, with the first column being the label and the remaining 784 columns being pixel values. This is a multiclass classification problem, and all of the data is numerical.
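Since this is a multiclass problem, it is also worth a quick look at how balanced the ten digit classes are; a minimal sketch using the seaborn import from above:
# Plot the number of training examples per digit class.
ax = sns.countplot(x='label', data=train)
ax.set_title('Digit class distribution in the training set')
plt.show()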
import re
# Review input features (train set) - Part 2A
missing_values = []
nonumeric_values = []

print("TRAINING SET INFORMATION")
print("========================\n")

for column in train:
    # Find all the unique feature values
    uniq = train[column].unique()
    print("'{}' has {} unique values".format(column, uniq.size))

    # Find features with missing values
    if True in pd.isnull(uniq):
        s = "{} has {} missing".format(column, pd.isnull(train[column]).sum())
        missing_values.append(s)

    # Find features with non-numeric values
    for i in range(1, np.prod(uniq.shape)):
        if re.match('nan', str(uniq[i])):
            break
        if not re.search(r'(^\d+\.?\d*$)|(^\d*\.?\d+$)', str(uniq[i])):
            nonumeric_values.append(column)
            break

print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
print("Features with missing values:\n{}\n\n".format(missing_values))
print("Features with non-numeric values:\n{}".format(nonumeric_values))
print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
TRAINING SET INFORMATION
========================

'label' has 10 unique values
'pixel0' has 1 unique values
...
'pixel781' has 1 unique values
'pixel782' has 1 unique values
'pixel783' has 1 unique values

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Features with missing values:
[]

Features with non-numeric values:
[]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
There are no missing or non-numeric values.
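The same conclusion can be reached more directly with pandas; a short equivalent check, assuming the train frame loaded above:
# Total count of missing cells and a list of any non-numeric columns.
print('Total missing values: {}'.format(train.isnull().sum().sum()))
non_numeric = train.select_dtypes(exclude=[np.number]).columns.tolist()
print('Non-numeric columns: {}'.format(non_numeric))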
Feature Selection
As there are 784 pixel features, let us select the most important ones.
In [23]:
data_cl = train.copy() # create a copy of data frame
target = data_cl['label'].copy()
data_cl.drop('label', axis=1, inplace=True)
In [59]:
# Separate dataset for validation: rows with a missing label would be held out,
# but this dataset has none, so the mask below is all False
unknown_mask = train['label'].isnull()
#data_submit = data_cl[unknown_mask]
# Separate dataset for training (here this keeps all 42000 rows)
X = data_cl[~unknown_mask]
Y = target[~unknown_mask]
In [37]:
from sklearn.feature_selection import VarianceThreshold, RFE, SelectKBest, chi2
Variance Threshold
In [38]:
# Keep features whose variance exceeds threshold * (1 - threshold) = 0.09,
# i.e. drop the (near-)constant pixels
threshold = 0.90
vt = VarianceThreshold().fit(X)

# Find feature names
feat_var_threshold = data_cl.columns[vt.variances_ > threshold * (1 - threshold)]
feat_var_threshold
Out[38]:
Index(['pixel100', 'pixel101', 'pixel102', 'pixel103', 'pixel104', 'pixel105', 'pixel106', 'pixel107', 'pixel108', 'pixel109', ... 'pixel90', 'pixel91', 'pixel92', 'pixel93', 'pixel94', 'pixel95', 'pixel96', 'pixel97', 'pixel98', 'pixel99'], dtype='object', length=689)
Top 20 most important features
According to RandomForestClassifier
In [39]:
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier, RandomForestClassifier, AdaBoostClassifier
In [40]:
model = RandomForestClassifier()
model.fit(X, Y)
feature_imp = pd.DataFrame(model.feature_importances_, index=X.columns, columns=["importance"])
feat_imp_20 = feature_imp.sort_values("importance", ascending=False).head(20).index
feat_imp_20
Out[40]:
Index(['pixel405', 'pixel461', 'pixel346', 'pixel430', 'pixel350', 'pixel657', 'pixel318', 'pixel437', 'pixel183', 'pixel489', 'pixel409', 'pixel569', 'pixel515', 'pixel431', 'pixel152', 'pixel408', 'pixel349', 'pixel433', 'pixel211', 'pixel541'], dtype='object')
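To see how the importance is distributed across these pixels, the scores themselves can be plotted; a short sketch reusing the feature_imp frame computed above:
# Horizontal bar plot of the 20 largest random-forest feature importances.
top20 = feature_imp.sort_values('importance', ascending=False).head(20)
top20.plot(kind='barh', legend=False, figsize=(8, 6))
plt.title('Top 20 pixel importances (RandomForestClassifier)')
plt.xlabel('importance')
plt.show()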
Univariate feature selection
Select the top 20 features using the chi2 test.
In [41]:
from sklearn.preprocessing import MinMaxScaler
In [42]:
X_minmax = MinMaxScaler(feature_range=(0,1)).fit_transform(X)
X_scored = SelectKBest(score_func=chi2, k='all').fit(X_minmax, Y)
feature_scoring = pd.DataFrame({
    'feature': X.columns,
    'score': X_scored.scores_
})
feat_scored_20 = feature_scoring.sort_values('score', ascending=False).head(20)['feature'].values
feat_scored_20
Out[42]:
array(['pixel386', 'pixel358', 'pixel414', 'pixel350', 'pixel539', 'pixel413', 'pixel511', 'pixel330', 'pixel567', 'pixel385', 'pixel568', 'pixel427', 'pixel455', 'pixel514', 'pixel483', 'pixel542', 'pixel540', 'pixel596', 'pixel428', 'pixel510'], dtype=object)
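It is worth checking how much the chi2 ranking agrees with the random-forest ranking; a quick sketch comparing the two top-20 lists from above:
# Pixels selected by both the random-forest importance ranking and the chi2 test.
common = np.intersect1d(feat_imp_20, feat_scored_20)
print('Selected by both methods ({}): {}'.format(len(common), common))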
Final feature selection
Finally, the features selected by all three methods are merged together and deduplicated.
In [43]:
features = np.hstack([
    feat_var_threshold,
    feat_imp_20,
    feat_scored_20,
])
features = np.unique(features)
print('Final features set:\n')
for f in features:
    print("\t-{}".format(f))
Final features set:

	-pixel100
	-pixel101
	...
	-pixel98
	-pixel99
Prepare dataset for further analysis
In [60]:
data_cl = data_cl.loc[:, features]
data_submit = test.loc[:, features]
X = X.loc[:, features]

print('Clean dataset shape: {}'.format(data_cl.shape))
print('Submittable dataset shape: {}'.format(data_submit.shape))
print('Train features shape: {}'.format(X.shape))
print('Target label shape: {}'.format(Y.shape))
Clean dataset shape: (42000, 689)
Submittable dataset shape: (28000, 689)
Train features shape: (42000, 689)
Target label shape: (42000,)
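Cross-validation is used for model evaluation below, but a simple hold-out split is a handy extra sanity check; a hedged sketch using train_test_split, imported here from the deprecated sklearn.cross_validation module to match the rest of this notebook (in current scikit-learn it lives in sklearn.model_selection):
from sklearn.cross_validation import train_test_split

# Hold back 20% of the labelled rows as a validation set.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=7)
print('Train: {}, Validation: {}'.format(X_train.shape, X_val.shape))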
PCA Visualization
In [47]:
from sklearn.decomposition import PCA, KernelPCA
In [48]:
components = 8
pca = PCA(n_components=components).fit(X)
In [49]:
#Show explained variance for each component
pca_variance_explained_df = pd.DataFrame({
    "component": np.arange(1, components+1),
    "variance_explained": pca.explained_variance_ratio_
})
ax = sns.barplot(x='component', y='variance_explained', data=pca_variance_explained_df)
ax.set_title("PCA - Variance explained")
plt.show()
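Projecting the data onto just the first two components also gives a rough picture of how well the digit classes separate; a minimal sketch that sub-samples the rows to keep the scatter plot readable:
# Project a random sample of 3000 digits onto the first two principal
# components and colour each point by its label.
idx = np.random.RandomState(7).choice(len(X), 3000, replace=False)
X_2d = PCA(n_components=2).fit_transform(X.iloc[idx])
plt.figure(figsize=(8, 6))
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=Y.iloc[idx], cmap='jet', s=10)
plt.colorbar()
plt.title('PCA projection onto 2 components')
plt.show()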
Evaluate Algorithms
In [50]:
from sklearn.cross_validation import KFold, cross_val_score
seed = 7
processors=1
num_folds=3
num_instances=len(X)
scoring='log_loss'
kfold = KFold(n=num_instances, n_folds=num_folds, random_state=seed)
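This uses the pre-0.18 scikit-learn API. In current versions the same setup lives in sklearn.model_selection, KFold no longer takes the number of instances, and the log-loss scorer is called 'neg_log_loss'; a sketch of the equivalent, only needed on a newer scikit-learn:
# Equivalent cross-validation setup for scikit-learn >= 0.18.
from sklearn.model_selection import KFold, cross_val_score

kfold = KFold(n_splits=3, shuffle=True, random_state=7)
scoring = 'neg_log_loss'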
Algorithms spot-check
In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
In [52]:
# Prepare some basic models
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('K-NN', KNeighborsClassifier(n_neighbors=5)))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
# Evaluate each model in turn
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
    results.append(cv_results)
    names.append(name)
    print("{0}: ({1:.3f}) +/- ({2:.3f})".format(name, cv_results.mean(), cv_results.std()))
LR: (-0.548) +/- (0.032)
LDA: (-0.635) +/- (0.017)
K-NN: (-0.396) +/- (0.031)
CART: (-5.281) +/- (0.014)
NB: (-14.664) +/- (0.364)
The scores are negative log loss, so values closer to zero are better: KNeighborsClassifier has the lowest log loss, followed by LogisticRegression and LinearDiscriminantAnalysis, while DecisionTreeClassifier and GaussianNB perform much worse.
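The per-fold scores collected in results can also be compared visually; a short sketch of the usual box-and-whisker comparison:
# Box plot of the per-fold (negative) log loss for each spot-checked model.
plt.figure(figsize=(8, 5))
plt.boxplot(results, labels=names)
plt.title('Algorithm comparison (negative log loss)')
plt.show()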
Ensembles
Bagged Decision Trees
In [53]:
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))
(-0.279) +/- (0.011)
AdaBoost
In [54]:
model = AdaBoostClassifier(n_estimators=100, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))
(-2.123) +/- (0.002)
Stochastic Gradient Boosting
In [55]:
model = GradientBoostingClassifier(n_estimators=100, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))
(-0.209) +/- (0.003)
Random Forest Classifier
In [56]:
num_trees = 100
num_features = 10
model = RandomForestClassifier(n_estimators=num_trees, max_features=num_features)
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))
(-0.353) +/- (0.005)
Based on these cross-validation scores, the best-performing algorithms are:
Stochastic Gradient Boosting
Bagged Decision Trees
RandomForestClassifier
KNeighborsClassifier
LogisticRegression
AdaBoost scored noticeably worse on its own, but it is still included below with a small weight. Let us combine these models into a voting ensemble.
In [64]:
#Voting Ensemble
# Create sub models
estimators = []
estimators.append(('lr', LogisticRegression(penalty='l2', C=1)))
estimators.append(('knn', KNeighborsClassifier(n_neighbors=5)))
estimators.append(('gbm', GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.1, max_features=15, warm_start=True, random_state=seed)))
estimators.append(('rf', RandomForestClassifier(bootstrap=True, max_depth=8, n_estimators=200, max_features=20, criterion='entropy', random_state=seed)))
estimators.append(('ada', AdaBoostClassifier(algorithm='SAMME.R', learning_rate=1e-2, n_estimators=10, random_state=seed)))
# create the ensemble model
ensemble = VotingClassifier(estimators, voting='soft', weights=[3,2,3,3,1])
results = cross_val_score(ensemble, X, Y, cv=kfold, scoring=scoring,n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))
(-0.330) +/- (0.001)
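To turn the ensemble into a Kaggle submission, it can be refitted on all labelled rows and used to predict the prepared test features; a hedged sketch assuming the usual Digit Recognizer submission format with ImageId and Label columns:
# Fit the voting ensemble on the full training data and predict the test images.
ensemble.fit(X, Y)
predictions = ensemble.predict(data_submit)

# Write the predictions in the ImageId/Label format (assumed here) expected by
# the Digit Recognizer competition.
submission = pd.DataFrame({
    'ImageId': np.arange(1, len(predictions) + 1),
    'Label': predictions
})
submission.to_csv('submission.csv', index=False)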