Kobe Bryant

Kaggle Tutorial using Kobe Bryant Dataset – Part 3

Posted in Kaggle
# Running list of column names judged redundant during exploration.
notNeeded = list()
In [183]:
# Action type column: show every distinct label it contains.
print(pd.unique(df['action_type']))
['Jump Shot' 'Driving Dunk Shot' 'Layup Shot' 'Running Jump Shot'
 'Driving Layup Shot' 'Reverse Layup Shot' 'Reverse Dunk Shot'
 'Slam Dunk Shot' 'Turnaround Jump Shot' 'Tip Shot' 'Running Hook Shot'
 'Alley Oop Dunk Shot' 'Dunk Shot' 'Alley Oop Layup shot'
 'Running Dunk Shot' 'Driving Finger Roll Shot' 'Running Layup Shot'
 'Finger Roll Shot' 'Fadeaway Jump Shot' 'Follow Up Dunk Shot' 'Hook Shot'
 'Turnaround Hook Shot' 'Running Tip Shot' 'Jump Hook Shot'
 'Running Finger Roll Shot' 'Jump Bank Shot' 'Turnaround Finger Roll Shot'
 'Hook Bank Shot' 'Driving Hook Shot' 'Running Reverse Layup Shot'
 'Driving Finger Roll Layup Shot' 'Fadeaway Bank shot' 'Pullup Jump shot'
 'Finger Roll Layup Shot' 'Turnaround Fadeaway shot'
 'Driving Reverse Layup Shot' 'Driving Slam Dunk Shot'
 'Step Back Jump shot' 'Reverse Slam Dunk Shot' 'Turnaround Bank shot'
 'Running Finger Roll Layup Shot' 'Floating Jump shot'
 'Putback Slam Dunk Shot' 'Running Bank shot' 'Driving Bank shot'
 'Putback Layup Shot' 'Driving Jump shot' 'Putback Dunk Shot'
 'Pullup Bank shot' 'Running Slam Dunk Shot' 'Cutting Layup Shot'
 'Driving Floating Jump Shot' 'Running Pull-Up Jump Shot' 'Tip Layup Shot'
 'Driving Floating Bank Jump Shot' 'Turnaround Fadeaway Bank Jump Shot'
 'Cutting Finger Roll Layup Shot']
In [184]:
# Combined shot type column: show every distinct label it contains.
print(pd.unique(df['combined_shot_type']))
['Jump Shot' 'Dunk' 'Layup' 'Tip Shot' 'Hook Shot' 'Bank Shot']
In [185]:
# The game event and game identifiers are not needed; queue them for removal.
notNeeded += ['game_event_id', 'game_id']
In [29]:
#loc_x,loc_y,lat,lon
# Pairwise scatter plots of the four location columns, coloured by shot_distance.
# `height` is the current name of the per-facet size argument; seaborn renamed
# it from `size` in version 0.9 and the old spelling is deprecated.
sns.pairplot(df, vars=['loc_x', 'loc_y', 'lat', 'lon'], hue='shot_distance', height=3)
Out[29]:
<seaborn.axisgrid.PairGrid at 0x197094bbd30>
In [187]:
# loc_x and loc_y only, coloured by shot_made_flag.
sns.set_style('whitegrid')
sns.pairplot(data=df, vars=['loc_x', 'loc_y'], hue='shot_made_flag')
Out[187]:
<seaborn.axisgrid.PairGrid at 0x27093a33630>
In [188]:
# loc_x is correlated with lon, and loc_y with lat, so lon and lat
# carry no extra information and are queued for removal.
notNeeded += ['lon', 'lat']
In [189]:
# Merge minutes_remaining and seconds_remaining into one column,
# timeRemaining, expressed as a total number of seconds.
df['timeRemaining'] = df['minutes_remaining'] * 60 + df['seconds_remaining']
In [190]:
# The two source columns are now covered by timeRemaining.
notNeeded += ['minutes_remaining', 'seconds_remaining']
In [191]:
# Period: distinct values present in the column.
pd.unique(df['period'])
Out[191]:
array([1, 2, 3, 4, 5, 6, 7], dtype=int64)
In [192]:
# Playoffs: distinct values present in the column.
pd.unique(df['playoffs'])
Out[192]:
array([0, 1], dtype=int64)
In [193]:
# Shot made flag: this is the target we want to predict,
# and later the data will be split based on it.
pd.unique(df['shot_made_flag'])
Out[193]:
array([ nan,   0.,   1.])
In [194]:
#Season
# Keep only the year in which the season started, i.e. the first four
# characters of the season string. The vectorised .str accessor replaces a
# per-row lambda and lets missing values pass through instead of raising.
# pd.to_numeric then turns the year strings into a numeric column.
df['season'] = pd.to_numeric(df['season'].str[:4])
In [195]:
# Shot distance
# shot_distance looks like the floored Euclidean distance derived from the
# x/y location of the shot (after dividing the coordinates by 10), so the
# unrounded value is computed here next to the original for comparison.
distance = pd.DataFrame({
    'trueDistance': np.sqrt((df['loc_x'] / 10).pow(2) + (df['loc_y'] / 10).pow(2)),
    'shotDistance': df['shot_distance'],
})
In [196]:
# Compare the first five rows of both distance measures.
distance.head(5)
Out[196]:
shotDistance trueDistance
0 18 18.185984
1 15 15.700000
2 16 16.860012
3 22 22.286543
4 0 0.000000
In [197]:
# Store the unrounded distance on df as a new column, shotDistance.
df['shotDistance'] = distance['trueDistance']
In [198]:
# The floored shot_distance is superseded by shotDistance.
notNeeded += ['shot_distance']
In [199]:
# Columns queued for removal so far.
notNeeded
Out[199]:
['game_event_id',
 'game_id',
 'lon',
 'lat',
 'minutes_remaining',
 'seconds_remaining',
 'shot_distance']
In [200]:
# Shot type: distinct values present in the column.
pd.unique(df['shot_type'])
Out[200]:
array(['2PT Field Goal', '3PT Field Goal'], dtype=object)
In [201]:
# Encode shot_type as a new column, 3ptGoal: 1 when the label contains
# '3PT' and 0 otherwise. The original shot_type column can then be dropped.
is_three_pointer = df['shot_type'].str.contains('3PT')
df['3ptGoal'] = is_three_pointer.astype(int)
notNeeded += ['shot_type']
In [202]:
# Shot zone: print the distinct values of the range, area and basic columns.
for zone_column in ('shot_zone_range', 'shot_zone_area', 'shot_zone_basic'):
    print(df[zone_column].unique())
['16-24 ft.' '8-16 ft.' 'Less Than 8 ft.' '24+ ft.' 'Back Court Shot']
['Right Side(R)' 'Left Side(L)' 'Left Side Center(LC)'
 'Right Side Center(RC)' 'Center(C)' 'Back Court(BC)']
['Mid-Range' 'Restricted Area' 'In The Paint (Non-RA)' 'Above the Break 3'
 'Right Corner 3' 'Backcourt' 'Left Corner 3']
In [203]:
# shot_zone_range only puts the shot distance into 5 bins, so it is not needed.
notNeeded += ['shot_zone_range']
In [204]:
#Let's visualize shot_zone_area and shot_zone_basic.
#We'll put loc_y = 0 near the top, so right and left sides show correctly in the graph.
areaGroup = df.groupby('shot_zone_area')
basicGroup = df.groupby('shot_zone_basic')
colors = list('rgbcmyk')

# Draw directly on the axes returned by plt.subplots instead of re-selecting
# them with plt.subplot(121/122); that way the panels created here (with the
# shared y-axis) are guaranteed to be the ones that get plotted on.
fig, axes = plt.subplots(1, 2, figsize=(15, 7), sharey=True)
panels = zip(axes, ('shot_zone_area', 'shot_zone_basic'), (areaGroup, basicGroup))
for ax, title, groups in panels:
    # Reversed limits flip the y-axis so that loc_y = 0 sits near the top.
    ax.set_ylim(500, -50)
    ax.set_title(title)
    # One colour per zone; shots are drawn faintly so dense regions stand out.
    for i, (_, zone) in enumerate(groups):
        ax.scatter(zone['loc_x'], zone['loc_y'], alpha=0.1, color=colors[i])
In [205]:
# Team ID and name
print(pd.unique(df['team_id']))
print(pd.unique(df['team_name']))
# Both columns hold a single value for every entry, so they can be dropped.
notNeeded += ['team_id', 'team_name']
[1610612747]
['Los Angeles Lakers']
In [206]:
# Game date
# Parse game_date as datetime and split it into year, month and weekday
# (0 = Monday, 6 = Sunday); after that the original column is not needed.
df['game_date'] = pd.to_datetime(df['game_date'])
date_parts = df['game_date'].dt
df['game_year'] = date_parts.year
df['game_month'] = date_parts.month
df['game_day'] = date_parts.weekday
notNeeded += ['game_date']
In [207]:
# Matchup and opponent
# The matchup and opponent columns give us almost the same data. What matchup
# adds is whether the game was home or away (it contains 'vs' or '@'), so that
# is kept as a new 0/1 column and matchup itself can be dropped.
played_at_home = df['matchup'].str.contains('vs')
df['homeGame'] = played_at_home.astype(int)
notNeeded += ['matchup']
In [208]:
#Shot ID
#We can set shot_id as index:
# Note: this mutates df in place and moves shot_id out of the columns, so
# running the cell a second time raises KeyError.
df.set_index('shot_id',inplace = True)
In [209]:
# Full list of columns queued for removal.
notNeeded
Out[209]:
['game_event_id',
 'game_id',
 'lon',
 'lat',
 'minutes_remaining',
 'seconds_remaining',
 'shot_distance',
 'shot_type',
 'shot_zone_range',
 'team_id',
 'team_name',
 'game_date',
 'matchup']
In [210]:
# Exploring the columns - summary
# Finally, remove every column collected in notNeeded in one go.
df = df.drop(columns=notNeeded)
In [211]:
# (rows, columns) of df after dropping the columns in notNeeded.
df.shape
Out[211]:
(30697, 18)
In [212]:
# Show every column when a frame is displayed.
pd.set_option('display.max_columns',None)
# Ten randomly chosen rows as a sanity check of the cleaned frame.
# DataFrame.sample replaces the hand-rolled permutation + take, and the fixed
# random_state makes the displayed rows the same on every run.
random_sample = df.sample(n=10, random_state=0)
random_sample
Out[212]:
action_type combined_shot_type loc_x loc_y period playoffs season shot_made_flag shot_zone_area shot_zone_basic opponent timeRemaining shotDistance 3ptGoal game_year game_month game_day homeGame
shot_id
26948 Jump Shot Jump Shot -69 281 4 1 2001 1.0 Center(C) Above the Break 3 NJN 10 28.934754 1 2002 6 4 1
23561 Jump Shot Jump Shot -91 246 2 0 1997 1.0 Left Side Center(LC) Above the Break 3 DAL 590 26.229182 1 1997 12 6 1
2989 Jump Shot Jump Shot 24 85 1 0 2001 0.0 Center(C) In The Paint (Non-RA) BOS 287 8.832327 0 2002 4 4 0
18539 Jump Shot Jump Shot 162 -6 1 0 2011 1.0 Right Side(R) Mid-Range MIL 312 16.211107 0 2012 1 5 0
19683 Reverse Layup Shot Layup -24 1 4 0 2012 1.0 Center(C) Restricted Area IND 147 2.402082 0 2012 11 1 1
14241 Layup Shot Layup 0 0 3 0 2008 NaN Center(C) Restricted Area UTA 247 0.000000 0 2009 2 2 0
2313 Layup Shot Layup 0 0 1 0 2001 NaN Center(C) Restricted Area SAS 226 0.000000 0 2002 1 4 1
20902 Turnaround Fadeaway shot Jump Shot -172 17 2 0 2012 1.0 Left Side(L) Mid-Range MEM 7 17.283807 0 2013 4 4 1
19212 Pullup Jump shot Jump Shot -4 121 3 0 2011 1.0 Center(C) In The Paint (Non-RA) MEM 418 12.106610 0 2012 3 6 1
27739 Jump Shot Jump Shot -232 -1 4 1 2003 NaN Left Side(L) Left Corner 3 DET 572 23.200216 1 2004 6 6 0
In [213]:
#Splitting the data
# submission_data holds the shots whose shot_made_flag is missing, i.e. the
# shots where we don't know if he scored or not.
submission_data = df[df['shot_made_flag'].isnull()]
# The flag is entirely NaN in these rows, so remove the column. The axis is
# given through the `columns` keyword: passing it positionally (drop(label, 1))
# is deprecated since pandas 1.3 and raises TypeError in pandas 2.0.
submission_data = submission_data.drop(columns='shot_made_flag')
submission_data.shape
Out[213]:
(5000, 17)
In [214]:
# The remaining rows are the shots with a known shot_made_flag.
has_outcome = df['shot_made_flag'].notnull()
data = df.loc[has_outcome]
data.shape
Out[214]:
(25697, 18)

Leave a Reply

Your email address will not be published. Required fields are marked *