This notebook contains my notes for Predictive Analysis on Binary Classification. It acts as a cookbook. It is a continuation from the previous post on assessing performance of Predictive Models.
For Deployment
Coefficient Trajectories for ElasticNet
#Convert normalized labels to numpy array
Y = numpy.array(labelNormalized)
#Convert normalized attributes to numpy array
X = numpy.array(xNormalized)
alphas, coefs, _ = enet_path(X, Y,l1_ratio=0.4, fit_intercept=False, return_models=False)
plot.plot(alphas,coefs.T)
plot.xlabel('alpha')
plot.ylabel('Coefficients')
plot.axis('tight')
plot.semilogx()
ax = plot.gca()
ax.invert_xaxis()
plot.show()
nattr, nalpha = coefs.shape
#find coefficient ordering
nzList = []
for iAlpha in range(1,nalpha):
coefList = list(coefs[: ,iAlpha])
nzCoef = [index for index in range(nattr) if coefList[index] != 0.0]
for q in nzCoef:
if not(q in nzList):
nzList.append(q)
#make up names for columns of X
names = ['V' + str(i) for i in range(ncol)]
nameList = [names[nzList[i]] for i in range(len(nzList))]
print("Attributes Ordered by How Early They Enter the Model")
print(nameList)
print('')
One is the order in which variables come into the solution as alpha is decremented downward. The other ordering is according to the magnitude of the coefficients at the optimum solution.
Apparently, V10 is important when the coefficient penalty is so large that the algorithm only permits a single attribute, but when the coefficient penalty has shrunk to the point that a multitude of attributes are included, the attribute V10 levels off and drops in importance somewhat as other attributes are added to the mix.
The value for alpha at which coefficients are sought is hard-coded and comes directly from the results generated previously.
Best alpha for AUC = 0.0203348835893
#find coefficients corresponding to best alpha value. alpha value corresponding to
#normalized X and normalized Y is 0.020334883589342503
alphaStar = 0.020334883589342503
indexLTalphaStar = [index for index in range(100) if alphas[index] > alphaStar]
indexStar = max(indexLTalphaStar)
#here's the set of coefficients to deploy
coefStar = list(coefs[:,indexStar])
print("Best Coefficient Values ")
print(coefStar)
print('')
#The coefficients on normalized attributes give another slightly different ordering
absCoef = [abs(a) for a in coefStar]
#sort by magnitude
coefSorted = sorted(absCoef, reverse=True)
idxCoefSize = [absCoef.index(a) for a in coefSorted if not(a == 0.0)]
namesList2 = [names[idxCoefSize[i]] for i in range(len(idxCoefSize))]
print("Attributes Ordered by Coef Size at Optimum alpha")
print(namesList2)
Penalized Logistic Regression – Glmnet
The process has to be iterated until the probabilities (and corresponding weights) stop changing. Basically, the IRLS for logistic regression adds another layer of iteration to the algorithm for (not logistic) penalized regression
from math import sqrt, fabs, exp
def S(z,gamma):
if gamma >= fabs(z):
return 0.0
if z > 0.0:
return z - gamma
else:
return z + gamma
def Pr(b0,b,x):
n = len(x)
sum = b0
for i in range(n):
sum += b[i]*x[i]
if sum < -100: sum = -100
return 1.0/(1.0 + exp(-sum))
#Do Not Normalize labels but do calculate averages
meanLabel = sum(labels)/nrow
sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrow)])/nrow)
#initialize probabilities and weights
sumWxr = [0.0] * ncol
sumWxx = [0.0] * ncol
sumWr = 0.0
sumW = 0.0
#calculate starting points for betas
for iRow in range(nrow):
p = meanLabel
w = p * (1.0 - p)
#residual for logistic
r = (labels[iRow] - p) / w
x = xNormalized[iRow]
sumWxr = [sumWxr[i] + w * x[i] * r for i in range(ncol)]
sumWxx = [sumWxx[i] + w * x[i] * x[i] for i in range(ncol)]
sumWr = sumWr + w * r
sumW = sumW + w
avgWxr = [sumWxr[i]/nrow for i in range(ncol)]
avgWxx = [sumWxx[i]/nrow for i in range(ncol)]
maxWxr = 0.0
for i in range(ncol):
val = abs(avgWxr[i])
if val > maxWxr:
maxWxr = val
alpha = 0.8
#calculate starting value for lambda
lam = maxWxr/alpha
#this value of lambda corresponds to beta = list of 0's
#initialize a vector of coefficients beta
beta = [0.0] * ncol
beta0 = sumWr/sumW
#initialize matrix of betas at each step
betaMat = []
betaMat.append(list(beta))
beta0List = []
beta0List.append(beta0)
#begin iteration
nSteps = 100
lamMult = 0.93 #100 steps gives reduction by factor of 1000 in lambda (recommended by authors)
nzList = []
for iStep in range(nSteps):
#decrease lambda
lam = lam * lamMult
#Use incremental change in betas to control inner iteration
#set middle loop values for betas = to outer values
# values are used for calculating weights and probabilities
#inner values are used for calculating penalized regression updates
#take pass through data to calculate averages over data require for iteration
#initilize accumulators
betaIRLS = list(beta)
beta0IRLS = beta0
distIRLS = 100.0
#Middle loop to calculate new betas with fixed IRLS weights and probabilities
iterIRLS = 0
while distIRLS > 0.01:
iterIRLS += 1
iterInner = 0.0
betaInner = list(betaIRLS)
beta0Inner = beta0IRLS
distInner = 100.0
while distInner > 0.01:
iterInner += 1
if iterInner > 100: break
#cycle through attributes and update one-at-a-time
#record starting value for comparison
betaStart = list(betaInner)
for iCol in range(ncol):
sumWxr = 0.0
sumWxx = 0.0
sumWr = 0.0
sumW = 0.0
for iRow in range(nrow):
x = list(xNormalized[iRow])
y = labels[iRow]
p = Pr(beta0IRLS, betaIRLS, x)
if abs(p) < 1e-5:
p = 0.0
w = 1e-5
elif abs(1.0 - p) < 1e-5:
p = 1.0
w = 1e-5
else:
w = p * (1.0 - p)
z = (y - p) / w + beta0IRLS + sum([x[i] * betaIRLS[i] for i in range(ncol)])
r = z - beta0Inner - sum([x[i] * betaInner[i] for i in range(ncol)])
sumWxr += w * x[iCol] * r
sumWxx += w * x[iCol] * x[iCol]
sumWr += w * r
sumW += w
avgWxr = sumWxr / nrow
avgWxx = sumWxx / nrow
beta0Inner = beta0Inner + sumWr / sumW
uncBeta = avgWxr + avgWxx * betaInner[iCol]
betaInner[iCol] = S(uncBeta, lam * alpha) / (avgWxx + lam * (1.0 - alpha))
sumDiff = sum([abs(betaInner[n] - betaStart[n]) for n in range(ncol)])
sumBeta = sum([abs(betaInner[n]) for n in range(ncol)])
distInner = sumDiff/sumBeta
#print number of steps for inner and middle loop convergence to monitor behavior
#print(iStep, iterIRLS, iterInner)
#if exit inner while loop, then set betaMiddle = betaMiddle and run through middle loop again.
#Check change in betaMiddle to see if IRLS is converged
a = sum([abs(betaIRLS[i] - betaInner[i]) for i in range(ncol)])
b = sum([abs(betaIRLS[i]) for i in range(ncol)])
distIRLS = a / (b + 0.0001)
dBeta = [betaInner[i] - betaIRLS[i] for i in range(ncol)]
gradStep = 1.0
temp = [betaIRLS[i] + gradStep * dBeta[i] for i in range(ncol)]
betaIRLS = list(temp)
beta = list(betaIRLS)
beta0 = beta0IRLS
betaMat.append(list(beta))
beta0List.append(beta0)
nzBeta = [index for index in range(ncol) if beta[index] != 0.0]
for q in nzBeta:
if not(q in nzList):
nzList.append(q)
#make up names for columns of xNum
names = ['V' + str(i) for i in range(ncol)]
nameList = [names[nzList[i]] for i in range(len(nzList))]
print("Attributes Ordered by How Early They Enter the Model")
print(nameList)
for i in range(ncol):
#plot range of beta values for each attribute
coefCurve = [betaMat[k][i] for k in range(nSteps)]
xaxis = range(nSteps)
plot.plot(xaxis, coefCurve)
plot.xlabel("Steps Taken")
plot.ylabel("Coefficient Values")
plot.show()
Ensemble Methods
Classifying using Random Forest
from sklearn.cross_validation import train_test_split
from sklearn import ensemble
#number of rows and columns in x matrix
nrows = len(xNum)
ncols = len(xNum[1])
#form x and y into numpy arrays and make up column names
X = np.array(xNum)
y = np.array(labels)
rocksVMinesNames = np.array(['V' + str(i) for i in range(ncols)])
#break into training and test sets.
xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.30, random_state=531)
auc = []
nTreeList = range(50, 2000, 50)
for iTrees in nTreeList:
depth = None
maxFeat = 8 #try tweaking
rocksVMinesRFModel = ensemble.RandomForestClassifier(n_estimators=iTrees, max_depth=depth, max_features=maxFeat,
oob_score=False, random_state=531)
rocksVMinesRFModel.fit(xTrain,yTrain)
#Accumulate auc on test set
prediction = rocksVMinesRFModel.predict_proba(xTest)
aucCalc = roc_auc_score(yTest, prediction[:,1:2])
auc.append(aucCalc)
print("AUC" )
print(auc[-1])
#plot training and test errors vs number of trees in ensemble
plot.plot(nTreeList, auc)
plot.xlabel('Number of Trees in Ensemble')
plot.ylabel('Area Under ROC Curve - AUC')
#plot.ylim([0.0, 1.1*max(mseOob)])
plot.show()
For AUC, 1.0 is perfect, and 0.5 is perfectly bad. So, with AUC, larger is better, and instead of looking for a valley in the plot, you’re looking for a peak.
Figure shows a peak toward the left side of the plot.
However, because Random Forest only reduces variance and does not overfit, the peak can be attributed to random fluctuation.
The best choice of model is the one including all the trees whose performance is the rightmost point on the curve.
# Plot feature importance
featureImportance = rocksVMinesRFModel.feature_importances_
# normalize by max importance
featureImportance = featureImportance / featureImportance.max()
#plot importance of top 30
idxSorted = numpy.argsort(featureImportance)[30:60]
idxTemp = numpy.argsort(featureImportance)[::-1]
print(idxTemp)
barPos = numpy.arange(idxSorted.shape[0]) + .5
plot.barh(barPos, featureImportance[idxSorted], align='center')
plot.yticks(barPos, rocksVMinesNames[idxSorted])
plot.xlabel('Variable Importance')
plot.show()
The different attributes in the mine detection problem correspond to different frequencies of sonar signal and therefore different wavelengths. If you were given the problem of designing the machine learning for this problem, your next step might be to determine the wavelengths corresponding to these variables and compare those wavelengths to the characteristic dimensions of the rocks and mines in the test and training set.
#plot best version of ROC curve
fpr, tpr, thresh = roc_curve(yTest, list(prediction[:,1:2]))
ctClass = [i*0.01 for i in range(101)]
plot.plot(fpr, tpr, linewidth=2)
plot.plot(ctClass, ctClass, linestyle=':')
plot.xlabel('False Positive Rate')
plot.ylabel('True Positive Rate')
plot.show()
#pick some threshold values and calc confusion matrix for best predictions
#notice that GBM predictions don't fall in range of (0, 1)
#pick threshold values at 25th, 50th and 75th percentiles
idx25 = int(len(thresh) * 0.25)
idx50 = int(len(thresh) * 0.50)
idx75 = int(len(thresh) * 0.75)
#calculate total points, total positives and total negatives
totalPts = len(yTest)
P = sum(yTest)
N = totalPts - P
print('')
print('Confusion Matrices for Different Threshold Values')
#25th
TP = tpr[idx25] * P; FN = P - TP; FP = fpr[idx25] * N; TN = N - FP
print('')
print('Threshold Value = ', thresh[idx25])
print('TP = ', TP/totalPts, 'FP = ', FP/totalPts)
print('FN = ', FN/totalPts, 'TN = ', TN/totalPts)
#50th
TP = tpr[idx50] * P; FN = P - TP; FP = fpr[idx50] * N; TN = N - FP
print('')
print('Threshold Value = ', thresh[idx50])
print('TP = ', TP/totalPts, 'FP = ', FP/totalPts)
print('FN = ', FN/totalPts, 'TN = ', TN/totalPts)
#75th
TP = tpr[idx75] * P; FN = P - TP; FP = fpr[idx75] * N; TN = N - FP
print('')
print('Threshold Value = ', thresh[idx75])
print('TP = ', TP/totalPts, 'FP = ', FP/totalPts)
print('FN = ', FN/totalPts, 'TN = ', TN/totalPts)
Detecting Unexploded Mines with Python Gradient Boosting
#form x and y into numpy arrays and make up column names
X = numpy.array(xNum)
y = numpy.array(labels)
rockVMinesNames = numpy.array(['V' + str(i) for i in range(ncols)])
#break into training and test sets.
xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.30, random_state=531)
#instantiate model
nEst = 2000
depth = 3
learnRate = 0.007
maxFeatures = 20
rockVMinesGBMModel = ensemble.GradientBoostingClassifier(n_estimators=nEst, max_depth=depth,
learning_rate=learnRate,
max_features=maxFeatures)
#train
rockVMinesGBMModel.fit(xTrain, yTrain)
# compute auc on test set as function of ensemble size
auc = []
aucBest = 0.0
predictions = rockVMinesGBMModel.staged_decision_function(xTest)
for p in predictions:
aucCalc = roc_auc_score(yTest, p)
auc.append(aucCalc)
#capture best predictions
if aucCalc > aucBest:
aucBest = aucCalc
pBest = p
idxBest = auc.index(max(auc))
#print best values
print("Best AUC" )
print(auc[idxBest])
print("Number of Trees for Best AUC")
print(idxBest)
#plot training deviance and test auc's vs number of trees in ensemble
plot.figure()
plot.plot(range(1, nEst + 1), rockVMinesGBMModel.train_score_, label='Training Set Deviance', linestyle=":")
plot.plot(range(1, nEst + 1), auc, label='Test Set AUC')
plot.legend(loc='upper right')
plot.xlabel('Number of Trees in Ensemble')
plot.ylabel('Deviance / AUC')
plot.show()
Deviance is related to how far the probability estimates are from correct but differs slightly from misclassification error. Deviance is plotted because that quantity is what gradient boosting is training to improve. It’s included in the plot to show the progress of training. The AUC (on oos data) is also plotted to show how the oos performance is changing as the number of trees increases (or equivalently more gradient steps are taken; each step results in training an additional tree).
# Plot feature importance
featureImportance = rockVMinesGBMModel.feature_importances_
# normalize by max importance
featureImportance = featureImportance / featureImportance.max()
#plot importance of top 30
idxSorted = numpy.argsort(featureImportance)[30:60]
barPos = numpy.arange(idxSorted.shape[0]) + .5
plot.barh(barPos, featureImportance[idxSorted], align='center')
plot.yticks(barPos, rockVMinesNames[idxSorted])
plot.xlabel('Variable Importance')
plot.show()
The variable importances have a somewhat different order than the ones for Random Forest.
There is some commonality; for example, variables V10, V11, V20 and V51 are near the top of both lists although not in quite the same order.
#pick some threshold values and calc confusion matrix for best predictions
#notice that GBM predictions don't fall in range of (0, 1)
#plot best version of ROC curve
fpr, tpr, thresh = roc_curve(yTest, list(pBest))
ctClass = [i*0.01 for i in range(101)]
plot.plot(fpr, tpr, linewidth=2)
plot.plot(ctClass, ctClass, linestyle=':')
plot.xlabel('False Positive Rate')
plot.ylabel('True Positive Rate')
plot.show()
Gradient Boosting does get better results using Random Forest base learners, but the difference isn’t large enough to be obvious in the graph.
#pick some threshold values and calc confusion matrix for best predictions
#notice that GBM predictions don't fall in range of (0, 1)
#pick threshold values at 25th, 50th and 75th percentiles
idx25 = int(len(thresh) * 0.25)
idx50 = int(len(thresh) * 0.50)
idx75 = int(len(thresh) * 0.75)
#calculate total points, total positives and total negatives
totalPts = len(yTest)
P = sum(yTest)
N = totalPts - P
print('')
print('Confusion Matrices for Different Threshold Values')
#25th
TP = tpr[idx25] * P; FN = P - TP; FP = fpr[idx25] * N; TN = N - FP
print('')
print('Threshold Value = ', thresh[idx25])
print('TP = ', TP/totalPts, 'FP = ', FP/totalPts)
print('FN = ', FN/totalPts, 'TN = ', TN/totalPts)
#50th
TP = tpr[idx50] * P; FN = P - TP; FP = fpr[idx50] * N; TN = N - FP
print('')
print('Threshold Value = ', thresh[idx50])
print('TP = ', TP/totalPts, 'FP = ', FP/totalPts)
print('FN = ', FN/totalPts, 'TN = ', TN/totalPts)
#75th
TP = tpr[idx75] * P; FN = P - TP; FP = fpr[idx75] * N; TN = N - FP
print('')
print('Threshold Value = ', thresh[idx75])
print('TP = ', TP/totalPts, 'FP = ', FP/totalPts)
print('FN = ', FN/totalPts, 'TN = ', TN/totalPts)
Excellent post. It helped me a lot. Thanks