This script creates features for the machine learning algorithms to train on. It will create a rolling average of the points per game for the last 5 games of the home and away teams. Also, a proxy of their defense by averaging the number of points that was scored against the team.
This script will also create the Elo rating for the home and away team. The higher the rating, the better the team.
This will calculate the number of days between games. Sometimes teams will have to play games on consecutive days. Teams generally play worse if they play on consecutive days.
import numpy as np
import pandas as pd
import pickle
def RollingAvg(df,teams,column,window):
""" Calculates the rolling average of a column for both teams.
:param df: The dataframe to be added to
:param teams: A list of all the teams
:param column: Basketball statistic to average (i.e. points)
:param window: Number of games to use for the average
"""
# Column names for new variables
new_colA = 'RollAvg_A_'+str(window)+'_'+column
new_colB = 'RollAvg_B_'+str(window)+'_'+column
new_col_oppA = 'RollAvg_A_'+str(window)+'_opp_'+column
new_col_oppB = 'RollAvg_B_'+str(window)+'_opp_'+column
# Default value is NAN
df[new_colA] = np.nan
df[new_colB] = np.nan
df[new_col_oppA] = np.nan
df[new_col_oppB] = np.nan
for team in teams:
# Indices of the games for this team
fran_indices = df[(df['fran_id']==team)].index
opp_indices = df[(df['opp_fran']==team)].index
# Get the statistic that will be averaged
fran_stats = df[(df['fran_id']==team)][column]
opp_stats = df[(df['opp_fran']==team)]['opp_'+column]
# Order and calculate moving average
stats = fran_stats.append(opp_stats)
stats.sort_index(inplace = True)
stats = pd.rolling_mean(stats,window)
# Add the information to the columns
df.ix[fran_indices,new_colA] = stats[fran_indices].tolist()
df.ix[opp_indices,new_colB] = stats[opp_indices].tolist()
# Get the statistic that will be averaged for the opposing team
fran_stats = df[(df['fran_id']==team)]['opp_'+column]
opp_stats = df[(df['opp_fran']==team)][column]
# Order and calculate moving average for the opposing team
stats = fran_stats.append(opp_stats)
stats.sort_index(inplace = True)
stats = pd.rolling_mean(stats,window)
# Add the information to the columns
df.ix[fran_indices,new_col_oppA] = stats[fran_indices].tolist()
df.ix[opp_indices,new_col_oppB] = stats[opp_indices].tolist()
def add_elo_columns(df):
""" Add columns to dataframe, containing the elo ratings of the home team
and away team at the current time of each game. Gets elo scores out
of the pickled elo_dict.p, so be sure it is up to date (trained on most
recent data).
:param df: The dataframe to be added to
"""
# Get elo dictionary
elodict = pickle.load(open('elo_dict.p', 'rb'))
# Add columns to dataframe
df['fran_elo'] = np.zeros(len(df))
df['opp_elo'] = np.zeros(len(df))
# Add elo rating for each team for each game
for team in elodict.keys():
# Find all games with this team and get indices where home and where away
games = df[(df['fran_id'] == team) | (df['opp_fran'] == team)]
home_actual_inds = games[(games['fran_id'] == team)].index
away_actual_inds = games[(games['opp_fran'] == team)].index
# Reindex from 0 to n
games.index = range(len(games))
home_elo_inds = games[(games['fran_id'] == team)].index
away_elo_inds = games[(games['opp_fran'] == team)].index
# Get the elo scores for each home and away game
away_elos = np.array(elodict[team])[away_elo_inds]
home_elos = np.array(elodict[team])[home_elo_inds]
# Add this team's elo ratings to df
df.ix[home_actual_inds, 'fran_elo'] = home_elos
df.ix[away_actual_inds, 'opp_elo'] = away_elos
def days_between_games(df,teams):
""" Adds the days since the last game for each team used to measure
fatigue of players.
:param df: The dataframe to be added to
:param teams: A list of all the teams
"""
df['Days_Since_Last'] = np.nan
for team in teams:
df['date'] = pd.to_datetime(df['date'])
team_df = df[(df['fran_id']==team) | (df['opp_fran']==team)]
teamTime = (team_df['date'] - team_df['date'].shift(1))
days = teamTime.iloc[1:].map(lambda x: x.days if x.days<=10 else 10)
fran_indices = team_df.index[1:]
df.ix[fran_indices,'Days_Since_Last'] = days.tolist()
def who_wins(df):
""" Adds the Win column. If Win is true then team A (away team) won
the game.
:param df: The dataframe to be added to
"""
df['Win'] = df['pts']>df['opp_pts']
if __name__ == "__main__":
# Read in data
filename = 'data/all_games.csv'
df = pd.read_csv(filename)
teams = df['fran_id'].unique()
# Add new columns here
colToAdd = ['pts']
for col in colToAdd:
RollingAvg(df,teams,col,5)
add_elo_columns(df)
days_between_games(df,teams)
who_wins(df)
# Save final dataframe
df.to_csv('Algorithms_Data.csv')
#print df
The Kristaps class is used to calculate and store Elo ratings, predict future wins and losses using only the Elo rating, plote elo ratings over time, and compare our predictions with the predictions of 538.
import elo
import numpy as np
import pandas as pd
import datetime as dt
import pickle
from matplotlib import pyplot as plt
from scipy.signal import savgol_filter
class Kristaps(object):
""" Used to store and calculate elo ratings, predict future wins and losses,
plot elo ratings, and compare out predictions to 538's predictions.
"""
def __init__(self, elo_dict_file=None):
if elo_dict_file is not None:
self.elo_dict = pickle.load(open('elo_dict.p', 'rb'))
self.init = True
else:
self.init = False
self.elo_dict = dict()
def train_all(self, data, write=1):
""" Calculates the current Elo rating for each of the teams.
:param filename: Name of csv file with data, or pandas dataframe. Columns include 'fran_id',
'opp_fran', 'pts', and 'opp_pts'
:return: Update the self.elo_dict where the team names are the keys and the elo
ratings are the values.
"""
# Initialize score
try:
# If data is filename, open it in pandas
data = pd.read_csv(data)
except:
pass
if not self.init:
teams = np.unique(data['fran_id'])
self.elo_dict = dict(zip(teams, [[1500] for _ in range(len(teams))]))
data['date'] = pd.to_datetime(data['date'])
data.sort_values('date', inplace=True)
for i in range(len(data)):
row = data.iloc[i]
RA = self.elo_dict[row['fran_id']][-1]
RB = self.elo_dict[row['opp_fran']][-1]
newRA, newRB = elo.update_elo_ratings(RA, RB, row['pts'] > row['opp_pts'], row['pts'] < row['opp_pts'])
self.elo_dict[row['fran_id']].append(newRA)
self.elo_dict[row['opp_fran']].append(newRB)
if write == 1:
pickle.dump(self.elo_dict, open('elo_dict.p', 'wb'))
def train_yesterday(self, filename='data/historical_data.csv', write=1):
""" Updates elo ratings based on results of yesterday's games.
:param filename:
:return:
"""
df = pd.read_csv(filename)
yesterday = dt.datetime.today() - dt.timedelta(days=1)
df = df[(df['date'] == str(yesterday.date()))]
for i in range(len(df)):
row = df.iloc[i]
r_fran = self.elo_dict[row['fran_id']][-1]
r_opp = self.elo_dict[row['opp_fran']][-1]
newRA, newRB = elo.update_elo_ratings(r_fran, r_opp, row['pts'] > row['opp_pts'], row['pts'] < row['opp_pts'])
self.elo_dict[row['fran_id']].append(newRA)
self.elo_dict[row['opp_fran']].append(newRB)
if write == 1:
pickle.dump(self.elo_dict, open('elo_dict.p', 'wb'))
def simulate_games(self, future_games, n):
""" Predict outcomes of future games. Average of n runs.
:param future_games: Data frame
:param n: int
:return: Two lists
"""
game_scores = np.zeros(len(future_games))
for it in range(n):
tmp_elo_dict = self.elo_dict.copy()
for i in range(len(future_games)):
game = future_games.iloc[i]
A = game['fran_id']
B = game['opp_fran']
rA = tmp_elo_dict[A][-1]
rB = tmp_elo_dict[B][-1]
pA, pB = elo.predict_score(rA, rB)
winner = np.random.choice([A, B], p=[pA, pB])
A_score = 1 if winner == A else 0
B_score = A_score - 1
game_scores[i] += A_score
# TODO Should we update elo ratings as we go?
# rA, rB = elo.update_elo_ratings(rA, rB, A_score, B_score)
# tmp_elo_dict[A] = rA
# tmp_elo_dict[B] = rB
A_probs = game_scores / float(n)
return A_probs, 1 - A_probs
def check(self, Aprobs, Bprobs, info):
""" Compare given predictions for a series of games against a real outcome.
:param Aprobs: Probability of the Away team winning each game
:param Bprobs: Probability of the Home team winning each game
:param info: A pandas dataframe with game outcome information
:return:
"""
correct = 0.
for i in range(len(Aprobs)):
truth = np.argmax([info.iloc[i]['pts'], info.iloc[i]['opp_pts']])
pred = np.argmax([Aprobs[i], Bprobs[i]])
correct += (truth == pred)
return correct / len(Aprobs)
def predict_today(self, filename='data/upcoming_games.csv', write=1):
""" Predict today's games, as pulled from the upcoming_games file.
Saves predictions to csv as today_predictions.csv for use on website.
:param filename: Input file with future games. Defaults to 'upcoming_games.csv'
:return: A pandas dataframe containing the probability of each team winning each game
"""
# TODO Append each day's predictions to the same file? Or store in unique file names?
df = pd.read_csv(filename)
today = dt.datetime.today()
df = df[(df['date'] == str(today.date()))]
preds = []
for i in range(len(df)):
elo_A = self.elo_dict[df.iloc[i]['fran_id']][-1]
elo_B = self.elo_dict[df.iloc[i]['opp_fran']][-1]
sc = elo.predict_score(elo_A - 46, elo_B + 46)
preds.append([df.iloc[i]['fran_id'], df.iloc[i]['opp_fran'], sc[0], sc[1]])
table = pd.DataFrame(preds, columns=['fran_id', 'opp_fran', 'prob', 'opp_prob'])
table[['opp_prob']] = np.rint(table[['opp_prob']] * 100).astype(np.int32)
table[['prob']] = np.rint(table[['prob']] * 100).astype(np.int32)
if write == 1:
table.to_csv('data/today_predictions.csv', index=None)
return table
def current_WL(self, filename='data/historical_data.csv'):
""" Count the current number of wins and losses for all teams in the 2016-2017 season.
Returns a dictionary with the team names as the keys and [wins,losses] as the values.
:param filename: Input file with games of the current season. Defaults to 'historical_data.csv'
:return: A dictionary with team names as the keys and a list [wins,losses] as the values.
"""
data = pd.read_csv(filename)
data['date'] = pd.to_datetime(data['date'])
teams = np.unique(data['fran_id'])
team_WL = {}
data['Won'] = (data['pts'] > data['opp_pts'])
for team in teams:
won = data[(data['fran_id'] == team) & (data['date'] >= dt.datetime(2016, 9, 1))]['Won'].sum()
won += (data[(data['opp_fran'] == team) & (data['date'] >= dt.datetime(2016, 9, 1))]['Won'] == False).sum()
lost = (data[(data['fran_id'] == team) & (data['date'] >= dt.datetime(2016, 9, 1))]['Won'] == False).sum()
lost += (data[(data['opp_fran'] == team) & (data['date'] >= dt.datetime(2016, 9, 1))]['Won'] == True).sum()
team_WL[team] = [won, lost]
return team_WL
def simulate_seasons(self, filename='data/upcoming_games.csv', n=100):
""" This simulates the rest of 2016-2017 season n times. This function assumes
that train has been run as it uses self.elo_dict. Elo scores are not updated
during the simulated season. This will also calculate the current wins and
losses through current_WL(). A pandas dataframe will be saved and returned.
:param filename: Input file with future games. Defaults to 'upcoming_games.csv'
:param n: Number of times the 2016-2017 season will be simulated. Defaults to 100
:return: Pandas dataframe with columns as Team name, Projected Wins, Projected Losses, and Elo rating
"""
future_games = pd.read_csv(filename)
teams = np.unique(future_games['fran_id'])
team_WL_Predicted = dict(zip(teams, np.zeros((len(teams),2))))
for it in range(n):
for i in range(len(future_games)):
game = future_games.iloc[i]
A = game['fran_id']
B = game['opp_fran']
rA = self.elo_dict[A][-1]
rB = self.elo_dict[B][-1]
pA, pB = elo.predict_score(rA, rB)
winner = np.random.choice([A, B], p=[pA, pB])
A_score = 1 if winner == A else 0
B_score = 1 - A_score
team_WL_Predicted[A][0] += A_score
team_WL_Predicted[A][1] += B_score
team_WL_Predicted[B][0] += B_score
team_WL_Predicted[B][1] += A_score
for team in teams:
team_WL_Predicted[team] = np.round(team_WL_Predicted[team]/float(n))
team_WL = self.current_WL()
total_WL = {}
for team in teams:
total_WL[team] = team_WL_Predicted[team]+team_WL[team]
Projected_WL = pd.DataFrame({'fran_id': teams, 'Projected W': [total_WL[team][0] for team in teams],
'Projected L': [total_WL[team][1] for team in teams], 'elo': [self.elo_dict[team][-1] for team in teams]})
table = Projected_WL.sort_values('elo', ascending=False)
table.to_csv('data/ProjectedWL.csv', index=False)
return table
def compare_to_538(self):
""" Create chart showing our predictions and 538's predictions side by side.
Shows predictions from scrape_538() and from predict_today(), which functions
are required to have been previously run.
:return:
"""
us = pd.read_csv('data/today_predictions.csv')
five38 = pd.read_csv('data/pred_538.csv')
del five38['date']
del five38['fran_city']
del five38['opp_city']
five38.sort_values('fran', inplace=True)
us.sort_values('fran_id', inplace=True)
five38.index = us.index
newd = pd.concat([us, five38], axis=1, join_axes=[us.index])
del newd['fran']
del newd['opp']
newd.columns = ['fran_id', 'opp_fran', 'Our prob', 'Our opp prob', '538 prob', '538 opp prob', '538 spread']
newd.to_csv('data/daily_pred_comparison.csv', index=False)
return newd
def plot_Elo(self, team_names, games=None, filename=None, window=None, order=None, figsize=None, legend=None):
""" Plots the elo history of a team. x-axis will be the game number.
:param team_names: List of names of the teams to be plotted
:param games: If None then the entire history of the teams is plotted.
Otherwise games will be the number of games plotted.
Default is None.
:param filename: If not None then the picture is saved as filename, otherwise it is
shown. Default is None.
"""
if team_names is None:
team_names = self.elo_dict.keys()
fig, ax = plt.subplots(figsize=figsize)
for team_name in team_names:
if games == None:
elo_history = self.elo_dict[team_name]
else:
elo_history = self.elo_dict[team_name][-games:]
if window is not None:
elo_history = savgol_filter(elo_history, window, order)
ax.plot(elo_history, label=team_name)
if legend is not None:
plt.legend(loc='center left', bbox_to_anchor=(1,.5))
else:
plt.legend()
if filename is not None:
plt.savefig(filename)
else:
plt.show()