import pandas as pd
import numpy as np
# Take a few samples for the visualization: an independent copy of the first
# 10000 entries of the source table.
# NOTE(review): `fbcheckin_train_tbl` is not defined anywhere in the visible
# source before this line - confirm it is created earlier (e.g. in another
# notebook cell); otherwise this statement raises NameError.
sample_fbcheckin_train_tbl = fbcheckin_train_tbl[:10000].copy()
# Load the training set, using the `row_id` column as the index, then print a
# short textual overview of it (size, columns, dtypes, info, summary stats).
df = pd.read_csv('train.csv', index_col='row_id')
df.head()  # notebook-style preview; the result is discarded when run as a script
print('Reading train data')
print('\nSize of training data: ' + str(df.shape))
print('Columns:' + str(df.columns.values))
# nunique() counts the distinct place ids directly instead of building an
# intermediate list -> set -> list of every value.
print('Number of places: ' + str(df['place_id'].nunique()))
print('\n')
print('dtypes')
print('\n')
print(df.dtypes)
print('\n')
print('Info: ')
print('\n')
# DataFrame.info is a method that prints its own report; it has to be called.
# Printing the attribute only showed the bound-method object.
df.info()
print('Shape: ')
print('\n')
print(df.shape)
print('\n')
print('numerical columns statistcs')
print('\n')
print(df.describe())
OK: so we have ~29 million records.
A few notes:
- row_id seems to be … a row ID. It is TRUE that the number of unique row_ids is the same as the number of rows in the data frame.
- x is presumably bounded between [0, 10] as the x-axis on the 10-km square.
- y looks to be the same as x, just the other dimension.
- accuracy is interesting: it’s all over the place. The smallest value is 1.00; the biggest value is 1,033.00. We’ll have to look into that.
- time has no units. Since Facebook notes that time and accuracy are “intentionally left vague in their definitions”, we will have to look into that as well.
- place_id is probably a unique identifier. There are 108390 unique values.
Let’s start by examining the “vague” variables.
Accuracy: We already know that accuracy isn’t exactly defined. From first principles, we could think of it in a few ways –
Error on some arbitrary scale. This seems unlikely, since the max is > 1,000. Error on the same scale as x and y. Now, this could be an estimated radius (with the x and y values as the center); either normal or squared. Since we have a lot of data, and we’re running this in Kaggle scripts, we can randomly sample 1% of the data and look at that sample. The pattern will (almost certainly) be the same.
import matplotlib.pyplot as plt
%matplotlib inline
# Histogram of the accuracy column: 50 bins, red bars, scaled so the bars
# integrate to 1. `density=True` is the supported spelling of the old
# `normed=1` flag, which matplotlib has removed.
n, bins, patches = plt.hist(df['accuracy'], 50, density=True, facecolor='r', alpha=0.75)
It looks like we have three peaks. Do we think there are underlying parameters in the simulation at these peaks?
We might also think that there are different measures of accuracy at different locations.
Let’s see how even the accuracy is over x and y.
# Box plot of accuracy on a fresh figure: no notch, outliers drawn with 'o'
# markers, box laid out horizontally (vert=0).
plt.figure()
plt.boxplot(df['accuracy'], notch=0, sym='o', vert=0)
plt.show()
from scipy.stats import kurtosis,skew
# Shape statistics. The return values are not stored, so they are only
# visible when this runs as notebook cells.
kurtosis(df['accuracy'])
skew(df, axis=0, bias=True)
# Histogram (100 bins) of x restricted to the rows where x < 0.1.
df.loc[df.x < 0.1, 'x'].hist(bins=100)
plt.xlabel('x')
plt.ylabel('Number of checkins');
# 3D bar chart of check-in counts, grouping the rows by x and y rounded to the
# nearest integer.
from mpl_toolkits.mplot3d import Axes3D  # NOTE(review): presumably needed for projection='3d' on older matplotlib - confirm
plt.rcParams['figure.figsize'] = (16,5)
# Series.round() does the rounding in one vectorised call instead of invoking
# a Python lambda per row. Like the built-in round() it rounds halves to even.
# The int64 cast keeps integer cell labels.
df['grid_x'] = df.x.round().astype('int64')
df['grid_y'] = df.y.round().astype('int64')
grouped_grid = df.groupby(['grid_x', 'grid_y'])
# size() yields one count per (grid_x, grid_y) cell. After reset_index() the
# count lives in the column labelled 0.
grid_freq = grouped_grid.size().reset_index()
fig = plt.figure()
ax1 = fig.add_subplot(111, projection='3d')
# One bar per cell: anchored at (grid_x, grid_y, 0), footprint 1 x 1,
# height equal to the number of check-ins in the cell.
xpos = grid_freq.grid_x
ypos = grid_freq.grid_y
zpos = np.zeros(len(grouped_grid))
dx = np.ones(len(grouped_grid))
dy = np.ones(len(grouped_grid))
dz = grid_freq[0]
ax1.bar3d(xpos, ypos, zpos, dx, dy, dz, color='#00ceaa')
plt.show()
I didn’t see many people experimenting with the accuracy so I was curious if I could find anything from it. My goal was to find some sort of relationship between accuracy and the variation in x and y. The purpose of this was so that I could eventually make a way to artificially populate the data set with ‘ghosts’ of the same data point in a localized radius (obtained from this relationship between accuracy and distance).
# Per-place aggregates: column means and column standard deviations.
mean_train_data = df.groupby('place_id').mean()
std_train_data = df.groupby('place_id').std()
# One row per place: mean accuracy next to the spread of x and of y.
# Missing standard deviations are replaced by 0.
acc_df = (
    pd.concat(
        [mean_train_data['accuracy'], std_train_data['x'], std_train_data['y']],
        axis=1,
    )
    .rename(columns={'accuracy': 'mean_accuracy', 'x': 'std_x', 'y': 'std_y'})
    .fillna(0)
)
# Histogram of mean accuracy using unit-wide bins spanning the observed range.
p = plt.hist(
    acc_df.mean_accuracy,
    bins=np.arange(acc_df.mean_accuracy.min(), acc_df.mean_accuracy.max() + 1, 1),
)
plt.xlabel('Mean Accuracy')
plt.ylabel('Count')
plt.title('Counts of mean accuracy for places')
plt.show()
# Histogram of the per-place standard deviation of x, using bins 0.01 wide
# across the observed range.
p = plt.hist(acc_df.std_x, bins=np.arange(min(acc_df.std_x), max(acc_df.std_x) + .01, .01))
# The plotted quantity is std(x); the axis label previously read 'Std(y)'.
plt.xlabel('Std(x)')
plt.ylabel('Count')
plt.title('Counts of std(x) for places')
plt.show()
# Histogram of the per-place standard deviation of y, using bins 0.001 wide
# across the observed range.
p = plt.hist(
    acc_df.std_y,
    bins=np.arange(acc_df.std_y.min(), acc_df.std_y.max() + .001, .001),
)
plt.xlabel('Std(y)')
plt.ylabel('Count')
plt.title('Counts of std(y) for places')
plt.show()

# Scatter of std(x) against mean accuracy, one point per place.
p = plt.scatter(x=acc_df.mean_accuracy, y=acc_df.std_x)
plt.xlabel('Mean Accuracy')
plt.ylabel('Std(x)')
plt.title('std(x) vs Mean accuracy for each place')
plt.show()
There are some outliers in this graph, but for the most part this is about what was expected: higher accuracy generally leads to lower deviation in distance, and the opposite holds for lower accuracy. The variation in this data is larger than would be ideal, which could cause some distance approximations to be very inaccurate, but it is a start.
# Scatter of std(y) against mean accuracy, one point per place.
p = plt.scatter(x=acc_df.mean_accuracy, y=acc_df.std_y)
plt.xlabel('Mean Accuracy')
plt.ylabel('Std(y)')
plt.title('std(y) vs mean accuracy for each place')
plt.show()
# Summary of the time column (displayed only when run as a notebook cell),
# followed by the span between the earliest and latest timestamp.
df.time.describe()
# Named `time_range` so the built-in `range` is not shadowed for the rest of
# the module.
time_range = df.time.max() - df.time.min()
print(time_range)
# Derive calendar-like features from the raw time value.
# NOTE(review): the divisors treat `time` as minutes (60 per hour, 1440 per
# day, 43200 per 30-day month, 525600 per 365-day year) - the unit itself is
# an assumption; confirm.
print('Calculate hour, weekday, month and year for train and test')
df['hour'] = df['time'].floordiv(60).mod(24).add(1)  # 1 to 24
df['weekday'] = df['time'].floordiv(1440).mod(7).add(1)  # 1 to 7
df['month'] = df['time'].floordiv(43200).mod(12).add(1)  # rough estimate, month = 30 days
df['year'] = df['time'].floordiv(525600).add(1)  # starts at 1
import seaborn as sns

# Number of check-ins per place, stored in a column named 'count'.
print('group by place_id and get count')
places = (
    df[['place_id', 'time']]
    .groupby('place_id')
    .count()
    .rename(columns={'time': 'count'})
)

# 2 x 3 grid of weekday-vs-hour density plots, one panel for each of the six
# places with the most check-ins.
print('plot weekday Vs hour for 6 place_ids with highest counts')
plt.figure(1, figsize=(14,10))
placeindex = places['count'].sort_values(ascending=False).iloc[:6]
for i, placeid in enumerate(placeindex.index):
    df_place = df.query('place_id == @placeid')
    # df_place = train.query('place_id == @placeid and year==1') # to separate by year
    ax = plt.subplot(2, 3, i + 1)
    sns.kdeplot(df_place.weekday, df_place.hour, shade=True, ax=ax)
    plt.title("place_id " + str(placeid))
    # Axis limits leave a margin around weekday 1..7 and hour 1..24.
    ax.set(xlim=(0, 8), ylim=(0, 25))
KDEs of weekday vs. hour for the 6 place ids with the highest counts. The plots show a preference for certain hours and weekdays for each place id. Maybe weekends, weekdays and holidays can be separated if more place ids are analyzed.
# 2 x 3 grid of weekday-vs-month density plots for the six places with the
# most check-ins, restricted to rows whose `year` column equals 1.
print('plot weekday Vs month for 6 place_ids with highest counts')
plt.figure(2, figsize=(14,10))
placeindex = places['count'].sort_values(ascending=False).iloc[:6]
for i, placeid in enumerate(placeindex.index):
    df_place = df.query('place_id == @placeid and year==1')
    ax = plt.subplot(2, 3, i + 1)
    sns.kdeplot(df_place.weekday, df_place.month, shade=True, ax=ax)
    plt.title("place_id " + str(placeid))
Estimate the autocorrelation of all of the timestamps to see if there are any repeating patterns
# Pull out the timestamps as a NumPy array. np.squeeze removes the
# single-dimensional column axis of the one-column selection.
# `.values` on the column selection replaces DataFrame.as_matrix(), which
# pandas removed in 1.0.
times = np.squeeze(df[['time']].values)
# Distribution of a 1% random sample of the timestamps.
sns.distplot(df.time.sample(frac=0.01))
Simply looking at the KDE distribution of time does not provide much information. The only things to notice are two valleys that probably indicate some kind of seasonality.
FFT (Fast Fourier Transform) refers to a way the discrete Fourier Transform (DFT) can be calculated efficiently, by using symmetries in the calculated terms. The symmetry is highest when n is a power of 2, and the transform is therefore most efficient for these sizes.
# Number of check-ins per distinct timestamp, ordered by timestamp, then the
# discrete Fourier transform of that count sequence.
ts_count = df['time'].value_counts().sort_index()
ts_fft = np.fft.fft(ts_count.values)
print(ts_fft)
import matplotlib as mpl
def discrete_cmap(N, base_cmap=None):
    """Create an N-bin discrete colormap from the specified input map.

    Parameters
    ----------
    N : int
        Number of colours sampled from the base map (evenly spaced on [0, 1]).
    base_cmap : str, Colormap or None
        Name of a registered colormap, a colormap instance, or None for
        matplotlib's default colormap.

    Returns
    -------
    LinearSegmentedColormap
        A colormap named ``<base name><N>`` built from the N sampled colours.
    """
    # By Jake VanderPlas
    # License: BSD-style
    from matplotlib.colors import LinearSegmentedColormap

    # plt.get_cmap accepts a string, None, or a colormap instance, and is
    # available where plt.cm.get_cmap no longer is.
    base = plt.get_cmap(base_cmap)
    color_list = base(np.linspace(0, 1, N))
    cmap_name = base.name + str(N)
    # Call from_list on the class itself: not every colormap type exposes it
    # (e.g. ListedColormap bases have no `from_list` attribute).
    return LinearSegmentedColormap.from_list(cmap_name, color_list, N)
def getSamplePlaces(numberOfPlaces=1):
    """Return the rows of the module-level ``train`` frame for several places.

    The places are the entries at positions 1 .. ``numberOfPlaces`` of
    ``train.place_id.unique()``; the entry at position 0 is skipped.
    """
    distinct_places = train.place_id.unique()
    chosen = distinct_places[1:numberOfPlaces + 1]
    row_mask = train.place_id.isin(chosen)
    return train[row_mask]
def getSamplePlace(numberOfPlaces=1):
    """Return the rows of the module-level ``train`` frame for a single place.

    Despite its name, ``numberOfPlaces`` is used as a position: the place
    selected is ``train.place_id.unique()[numberOfPlaces]``.
    """
    distinct_places = train.place_id.unique()
    wanted_place = distinct_places[numberOfPlaces]
    return train[train.place_id == wanted_place]
def normalize(series):
    """Min-max scale ``series`` so its smallest value maps to 0 and its largest to 1.

    ``series`` must support element-wise subtraction and division (e.g. a
    NumPy array or pandas Series). When every value is equal the denominator
    is zero, so the result is not finite.
    """
    lowest = min(series)
    highest = max(series)
    return (series - lowest) / (highest - lowest)
# 3D scatter (x, y, time) of the check-ins of 30 sample places. Colour is
# derived from place_id through a 30-bin version of the 'jet' colormap, and the
# fourth colour component is overwritten with the min-max normalised accuracy.
train=df  # getSamplePlaces / getSamplePlace read the module-level name `train`
sample=getSamplePlaces(30)
norm = mpl.colors.Normalize(vmin=min(sample.place_id),vmax=max(sample.place_id))
# plt.get_cmap is used because plt.cm.get_cmap is no longer provided by newer
# matplotlib releases.
cp=plt.get_cmap('jet')
cp=discrete_cmap(30,cp)
cols=cp(norm(sample['place_id']))
cols[:,3]=normalize(sample['accuracy'])
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
plt.xlabel('x')
plt.ylabel('y')
ax.set_zlabel('time')
ax.scatter(sample.x,sample.y,sample.time,c=cols,marker='o',depthshade=False,lw = 0)
plt.show()
#plt.savefig('output.png')
# Draw one million random rows from the training frame for quicker
# visualisations, then load the test set and sample it the same way.
df_train_sample = df.sample(n=1000000)
df_test = pd.read_csv("test.csv")
df_test_sample = df_test.sample(n=1000000)
Part 2 – Quick visualisations
Let’s start with some basic histograms, showing the distribution of accuracy and time.
# Side-by-side bar charts of the accuracy distribution in train and test.
# Each chart uses 50 bins with bars centred on the bin midpoints, and heights
# are expressed as a fraction of all rows.
plt.figure(0, figsize=(14,4))

# Left panel: training set.
counts1, bins1 = np.histogram(df["accuracy"], bins=50)
binsc1 = bins1[:-1] + np.diff(bins1)/2.
plt.subplot(121)
plt.bar(binsc1, counts1 / float(counts1.sum()), width=np.diff(bins1)[0])
plt.grid(True)
plt.xlabel("Accuracy")
plt.ylabel("Fraction")
plt.title("Train")

# Right panel: test set.
counts2, bins2 = np.histogram(df_test["accuracy"], bins=50)
binsc2 = bins2[:-1] + np.diff(bins2)/2.
plt.subplot(122)
plt.bar(binsc2, counts2 / float(counts2.sum()), width=np.diff(bins2)[0])
plt.grid(True)
plt.xlabel("Accuracy")
plt.ylabel("Fraction")
plt.title("Test")

plt.show()
The two dips of time in the training set are curious; when looking at counts per unit time, they might need to be normalised.
Another thing we can look at is how frequently different locations appear.