The goal of this competition is to build a classification algorithm that accurately identifies which customers have the most potential business value for Red Hat, based on their characteristics and activities.
For more information, please visit: https://www.kaggle.com/c/predicting-red-hat-business-value
In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
Loading the dataset
In [14]:
print("Read people.csv...")
# Load people.csv into `people`.
# - IDs are read as strings and `char_38` as int32; `date` is parsed to datetime.
# - The built-in `str` is used instead of `np.str`: that alias was deprecated in
#   NumPy 1.20 and removed in 1.24, where it raises AttributeError.
# - Forward slashes keep the relative path valid on Windows, Linux and macOS.
people = pd.read_csv(
    '../RedHat/people.csv',
    dtype={'people_id': str,
           'activity_id': str,
           'char_38': np.int32},
    parse_dates=['date'],
)
In [15]:
print("Load train.csv...")
# Load act_train.csv into `train`.
# - IDs are read as strings and `outcome` as int8; `date` is parsed to datetime.
# - The built-in `str` is used instead of `np.str`: that alias was deprecated in
#   NumPy 1.20 and removed in 1.24, where it raises AttributeError.
# - Forward slashes keep the relative path valid on Windows, Linux and macOS.
train = pd.read_csv(
    '../RedHat/act_train.csv',
    dtype={'people_id': str,
           'activity_id': str,
           'outcome': np.int8},
    parse_dates=['date'],
)
In [16]:
print("Load test.csv...")
# Load act_test.csv into `test`.
# - IDs are read as strings; `date` is parsed to datetime.
# - The built-in `str` is used instead of `np.str`: that alias was deprecated in
#   NumPy 1.20 and removed in 1.24, where it raises AttributeError.
# - Forward slashes keep the relative path valid on Windows, Linux and macOS.
test = pd.read_csv(
    '../RedHat/act_test.csv',
    dtype={'people_id': str,
           'activity_id': str},
    parse_dates=['date'],
)
Review input features
In [6]:
# Quick structural overview of the training frame: banner, shape,
# column names, and the dtype pandas assigned to every column.
banner_lines = (
    "\n\n---------------------",
    "TRAIN SET INFORMATION",
    "---------------------",
)
print(*banner_lines, sep="\n")

# Trailing "\n" arguments reproduce the blank line after each summary row.
print("Shape of training set:", train.shape, "\n")
print("Column Headers:", train.columns.tolist(), "\n")
print(train.dtypes)
In [7]:
import re

# Per-column scan of `train`: prints a sample of unique values and collects
#   missing_values   -> "<column> has <n> missing" for columns containing nulls
#   nonumeric_values -> names of columns holding at least one value whose
#                       string form is not a plain decimal number
missing_values = []
nonumeric_values = []

# Optionally signed decimal number: "7", "-3", "2.5", ".5", "4.".
# Raw string so that \d and \. are regex escapes rather than (invalid) Python
# string escapes, which emit a SyntaxWarning on recent Python versions.
numeric_pattern = re.compile(r'^[+-]?(\d+\.?\d*|\d*\.?\d+)$')

print("TRAINING SET INFORMATION")
print("========================\n")

for column in train:
    # Find all the unique feature values
    uniq = train[column].unique()
    print("'{}' has {} unique values".format(column, uniq.size))
    if uniq.size > 10:
        print("~~Listing up to 10 unique values~~")
    print(uniq[0:10])
    print("\n-----------------------------------------------------------------------\n")

    # Find features with missing values
    if pd.isnull(uniq).any():
        missing_values.append(
            "{} has {} missing".format(column, pd.isnull(train[column]).sum())
        )

    # Find features with non-numeric values.
    # Every unique value is inspected (including the first one), and nulls are
    # skipped rather than ending the scan, so values listed after a NaN are
    # still checked.
    for value in uniq:
        if pd.isnull(value):
            continue
        if not numeric_pattern.match(str(value)):
            nonumeric_values.append(column)
            break

print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
print("Features with missing values:\n{}\n\n".format(missing_values))
print("Features with non-numeric values:\n{}".format(nonumeric_values))
print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")