Skip to article content

Étude des relations entre l'entraîneur sportif et la performance du club

Statistical analysis

Imports

import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
sns.set_theme(context = 'paper', style = 'ticks', palette = 'deep', color_codes = True)
plt.rcParams['figure.autolayout'] = True
plt.rcParams['figure.dpi'] = 300

Loading data

# Load the head coach table, parsing the appointment start/end columns as
# dates, and keep only rows whose 'Tenure' is at most 3000.
head_coach = (
    pd.read_csv('data/head_coach.csv', parse_dates=['Appointed', 'EndDate'])
    .loc[lambda df: df['Tenure'] <= 3000]
)

General plotting function

from sklearn.preprocessing import PolynomialFeatures
import statsmodels.api as sm


def prepare_data(data, x_value, y_value, degree=2):
    """Build the polynomial design matrix and the response for a regression.

    Args:
        data: Table holding the predictor and response columns.
        x_value: Name of the predictor column.
        y_value: Name of the response column.
        degree: Degree handed to ``PolynomialFeatures``.

    Returns:
        A ``(xp, y, polynomial_features)`` tuple: the polynomial expansion of
        the predictor column, the response column, and the fitted transformer
        so that new predictor values can be expanded the same way.
    """
    # The transformer expects a 2-D array, hence the single-column reshape.
    predictor_column = data[x_value].values.reshape(-1, 1)
    response = data[y_value]
    polynomial_features = PolynomialFeatures(degree=degree)
    expanded = polynomial_features.fit_transform(predictor_column)
    return expanded, response, polynomial_features

def fit_model(xp, y):
    """Fit an ordinary least squares model of ``y`` on ``xp``.

    Args:
        xp: Design matrix (the regressors).
        y: Response values.

    Returns:
        The fitted results object returned by ``sm.OLS(...).fit()``.
    """
    return sm.OLS(y, xp).fit()

def create_predictions(results, polynomial_features, x_min, x_max):
    """Evaluate a fitted model on an evenly spaced grid between two bounds.

    Args:
        results: Fitted model exposing ``predict`` and ``get_prediction``.
        polynomial_features: Transformer used to expand the grid before
            prediction.
        x_min: Lower bound of the grid.
        x_max: Upper bound of the grid.

    Returns:
        A ``(xs, ys, ci)`` tuple: the *expanded* grid (not the raw one), the
        predicted values on it, and the bounds returned by ``conf_int()``.
    """
    # np.linspace is called without a count, so NumPy's default applies.
    grid = np.linspace(x_min, x_max).reshape(-1, 1)
    design = polynomial_features.transform(grid)
    fitted_values = results.predict(design)
    interval_bounds = results.get_prediction(design).conf_int()
    return design, fitted_values, interval_bounds

def create_polynomial_regression_plot(data, x_value, y_value, y_leg, color, title, xlabel, degree=2, integer_ticks=False):
    """Plot a scatter of two columns with a polynomial OLS fit and its band.

    A new figure is created and left as the current matplotlib figure;
    nothing is returned.

    Args:
        data: Table holding both columns.
        x_value: Name of the column plotted on the x axis.
        y_value: Name of the column plotted on the y axis (formatted as a
            percentage out of 100).
        y_leg: Outcome name used to build the y-axis label.
        color: Colour used for the points, the fitted curve and the band.
        title: Figure title.
        xlabel: Label of the x axis.
        degree: Degree of the polynomial regression.
        integer_ticks: When true, put an x tick on every unit step between
            the smallest and largest x value.
    """
    x_column = data[x_value]
    design, response, transformer = prepare_data(data, x_value, y_value, degree)
    ols_results = fit_model(design, response)
    grid, fitted, band = create_predictions(ols_results, transformer, x_column.min(), x_column.max())
    # Column 1 of the expanded grid is used as the x coordinate of the curve
    # (with scikit-learn's defaults, column 0 is the bias term).
    grid_x = grid[:, 1]

    plt.figure()
    ax = plt.gca()
    ax.scatter(x_column, response, color=color, alpha=0.2)
    fit_line, = ax.plot(grid_x, fitted, color=color)
    ax.fill_between(grid_x, band[:, 0], band[:, 1], color=color, alpha=0.3)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(f'{y_leg.capitalize()} percentage')
    ax.yaxis.set_major_formatter(mticker.PercentFormatter(xmax=100))

    if integer_ticks:
        first_tick = min(x_column)
        last_tick = max(x_column)
        ax.set_xticks(np.arange(first_tick, last_tick + 1, 1.0))

    r, p = pearsonr(x_column, data[y_value])
    # The fitted line is reused as the handle of the second entry so that the
    # correlation text shows up in the legend.
    legend_handles = [fit_line, fit_line]
    legend_labels = [
        f'Polynomial regression (degree {degree})',
        f'Pearson correlation : $r = {r:.2f}$, $p = {p:.2f}$',
    ]
    ax.legend(legend_handles, legend_labels, loc='upper right')

Relation between Head Coaches appointments results and Head Coaches Tenure in Club

# Share of matches won, drawn and lost for each row of head_coach, on a 0-100 scale.
for outcome, count_column in (('Win', 'Wins'), ('Draw', 'Draws'), ('Loss', 'Losses')):
    head_coach[f'{outcome}Percentage'] = head_coach[count_column] / head_coach['Matches'] * 100

title = "{} Ratio of Head Coach Appointment versus Head Coach Appointment Tenure"
x_label = 'Head Coach Appointment tenure (days)'
create_polynomial_regression_plot(head_coach, 'Tenure', 'WinPercentage', 'Win', 'green', title.format('Win'), x_label, degree=2)
<Figure size 1920x1440 with 1 Axes>
create_polynomial_regression_plot(head_coach, 'Tenure', 'DrawPercentage', 'Draw', 'gray', title.format('Draw'), x_label, degree=2)
<Figure size 1920x1440 with 1 Axes>
create_polynomial_regression_plot(head_coach, 'Tenure', 'LossPercentage', 'Loss', 'red', title.format('Loss'), x_label, degree=2)
<Figure size 1920x1440 with 1 Axes>

Relation between Clubs Results and Number of Head Coaches

# One row per team: result totals summed over the team's rows in head_coach,
# plus the number of non-null 'HeadCoach' entries (stored as 'CoachCount').
club_results = head_coach.groupby('Team').agg(
    Wins=('Wins', 'sum'),
    Draws=('Draws', 'sum'),
    Losses=('Losses', 'sum'),
    Matches=('Matches', 'sum'),
    CoachCount=('HeadCoach', 'count'),
)
# Share of matches won, drawn and lost per team, on a 0-100 scale.
for outcome, count_column in (('Win', 'Wins'), ('Draw', 'Draws'), ('Loss', 'Losses')):
    club_results[f'{outcome}Percentage'] = club_results[count_column] / club_results['Matches'] * 100

title = '{} Ratio of Clubs versus Number of Head Coaches Appointment (2015-2023)'
x_label = 'Number of Head Coach Appointment per Club'
create_polynomial_regression_plot(club_results, 'CoachCount', 'WinPercentage', 'Win', 'green', title.format('Win'), x_label, degree=2, integer_ticks=True)
<Figure size 1920x1440 with 1 Axes>
create_polynomial_regression_plot(club_results, 'CoachCount', 'DrawPercentage', 'Draw', 'gray', title.format('Draw'), x_label, degree=2, integer_ticks=True)
<Figure size 1920x1440 with 1 Axes>
create_polynomial_regression_plot(club_results, 'CoachCount', 'LossPercentage', 'Loss', 'red', title.format('Loss'), x_label, degree=2, integer_ticks=True)
<Figure size 1920x1440 with 1 Axes>

Relation between Head Coach Aggregated Performance and Total Number of Clubs Head Coaches Worked for

# Win, draw and loss percentages of each head coach against the number of
# rows that coach has in head_coach.

# One row per head coach: result totals summed over the coach's rows, plus the
# number of non-null 'Team' entries (stored as 'ClubCount').
# NOTE(review): this counts rows, not distinct teams — confirm that is intended.
hc_results = head_coach.groupby('HeadCoach').agg(
    Matches=('Matches', 'sum'),
    Wins=('Wins', 'sum'),
    Draws=('Draws', 'sum'),
    Losses=('Losses', 'sum'),
    ClubCount=('Team', 'count'),
).reset_index()
for outcome, count_column in (('Win', 'Wins'), ('Draw', 'Draws'), ('Loss', 'Losses')):
    hc_results[f'{outcome}Percentage'] = hc_results[count_column] / hc_results['Matches'] * 100

title = '{} Ratio of Head Coach Career versus Number of Head Coach Appointments (2015-2023)'
x_label = 'Number of Head Coach Appointments'
create_polynomial_regression_plot(hc_results, 'ClubCount', 'WinPercentage', 'Win', 'green', title.format('Win'), x_label, degree=2, integer_ticks=True)
<Figure size 1920x1440 with 1 Axes>
create_polynomial_regression_plot(hc_results, 'ClubCount', 'DrawPercentage', 'Draw', 'gray', title.format('Draw'), x_label, degree=2, integer_ticks=True)
<Figure size 1920x1440 with 1 Axes>
create_polynomial_regression_plot(hc_results, 'ClubCount', 'LossPercentage', 'Loss', 'red', title.format('Loss'), x_label, degree=2, integer_ticks=True)
<Figure size 1920x1440 with 1 Axes>

Relation between Head Coach Appointments Results and Head Coach Appointments Counts

title = '{} Ratio of Head Coaches Appointments versus Head Coach Appointment Count (2015-2023)'
x_label = 'Head Coach Appointment Count'
create_polynomial_regression_plot(head_coach, 'AppointmentNumber', 'WinPercentage', 'Win', 'green', title.format('Win'), x_label, degree=2, integer_ticks=True)
<Figure size 1920x1440 with 1 Axes>
create_polynomial_regression_plot(head_coach, 'AppointmentNumber', 'DrawPercentage', 'Draw', 'gray', title.format('Draw'), x_label, degree=2, integer_ticks=True)
<Figure size 1920x1440 with 1 Axes>
create_polynomial_regression_plot(head_coach, 'AppointmentNumber', 'LossPercentage', 'Loss', 'red', title.format('Loss'), x_label, degree=2, integer_ticks=True)
<Figure size 1920x1440 with 1 Axes>

Loading data

match_results = pd.read_csv('data/match_results.csv', parse_dates=['Date'], dtype = {'HeadCoach' : 'str'})
match_results.head()
Loading...

Relation between match outcomes and head coaches days in post during match

# Drop matches with no 'DaysInPost' value, then drop matches where it exceeds 4000.
# Why the cap: the records include Arsenal head coach Arsene Wenger, who was
# in post for 22 years, while the match data only starts in 2015, so some of
# the matches begin at a tenure of about 5000 days.
match_results = (
    match_results
    .dropna(subset=['DaysInPost'])
    .loc[lambda df: df['DaysInPost'] <= 4000]
)

# 0/1 indicator columns, one per possible value of 'Result'.
for indicator, result_label in (('Win', 'win'), ('Loss', 'loss'), ('Draw', 'draw')):
    match_results[indicator] = (match_results['Result'] == result_label).astype('int64')
# Histogram of the share of matches per slice of 'DaysInPost'
# (16 bins over 0-4000, i.e. 250 days per bin).
plt.figure()
sns.histplot(data=match_results, x='DaysInPost', bins=16, stat='proportion', binrange=(0, 4000), alpha=1)
hist_ax = plt.gca()
hist_ax.yaxis.set_major_formatter(mticker.PercentFormatter(xmax=1))
hist_ax.set_xlim(0, 4000)
hist_ax.set_xlabel('Days in Post')
hist_ax.set_ylabel('Proportion of Matches')
# NOTE(review): this title says 2017 - 2022 while other titles in this
# analysis say 2015-2023 — confirm which period the match data covers.
hist_ax.set_title('Distribution of Matches versus Head Coach Days in Posts (2017 - 2022)')
plt.show()
<Figure size 1920x1440 with 1 Axes>
def plot_match_outcome_over_coach_tenure(data, y_value, y_label, color):
    """Draw a regression joint plot of a 0/1 match outcome against days in post.

    Args:
        data: Table with a 'DaysInPost' column and the outcome column.
        y_value: Name of the outcome column plotted on the y axis.
        y_label: Text used for the y tick at 1; the tick at 0 reads
            ``'not ' + y_label``.
        color: Colour of the scatter points and of the regression line.
    """
    scatter_style = {'alpha': 0.5, 'color': color}
    line_style = {'color': color}
    grid = sns.jointplot(
        data=data, x='DaysInPost', y=y_value, kind='reg',
        scatter_kws=scatter_style,
        line_kws=line_style,
        ratio=3, marginal_ticks=False,
    )

    # Resize to a wide, short figure and title it.
    figure = grid.figure
    figure.set_figwidth(6)
    figure.set_figheight(2)
    figure.suptitle('Match outcome over Head Coach Days in Post', x=0.4, y=1.1)
    grid.set_axis_labels('Head Coach Days in Post', 'Match Outcome')

    # The legend only carries the Pearson correlation, anchored outside the
    # joint axes.
    joint_ax = grid.ax_joint
    r, p = pearsonr(data['DaysInPost'], data[y_value])
    legend = joint_ax.legend(
        [f'r = {r:.2f}, p = {p:.2f}'],
        loc='upper left',
        bbox_to_anchor=(1, 1.6),
    )
    legend.set_title("Pearson correlation")

    # Only the two outcome values get a y tick.
    joint_ax.set_yticks([0, 1])
    joint_ax.set_yticklabels(['not ' + y_label, y_label])
plot_match_outcome_over_coach_tenure(match_results, 'Win', 'won', 'green')
<Figure size 1800x600 with 3 Axes>
plot_match_outcome_over_coach_tenure(match_results, 'Draw', 'draw', 'gray')
<Figure size 1800x600 with 3 Axes>
plot_match_outcome_over_coach_tenure(match_results, 'Loss', 'loss', 'red')
<Figure size 1800x600 with 3 Axes>
# Per-'DaysInPost' outcome rates and match counts, restricted to matches
# played with fewer than 1500 days in post.
match_results_bis = (
    match_results[match_results['DaysInPost'] < 1500]
    .groupby('DaysInPost')
    .agg(
        WinRate=('Win', 'mean'),
        DrawRate=('Draw', 'mean'),
        LossRate=('Loss', 'mean'),
        MatchCount=('Result', 'count'),
    )
)
# Insert the days that have no match between the first and the last observed
# day; every column of an inserted row is 0.
first_day = match_results_bis.index.min()
last_day = match_results_bis.index.max()
match_results_bis = match_results_bis.reindex(range(int(first_day), int(last_day + 1)), fill_value=0)
def weighted_rolling_mean(data, weights, window_size=50):
    """Return the rolling mean of ``data`` weighted by ``weights``.

    Each window's value is ``np.average`` of the data in the window, using
    the weights found at the same index labels. Windows are allowed to be
    shorter than ``window_size`` at the start of the series.

    Args:
        data: Series of values to average.
        weights: Series of weights, looked up by the index labels of ``data``.
        window_size: Number of consecutive rows in each rolling window.

    Returns:
        A Series aligned with ``data``. A window whose weights sum to zero
        has no defined weighted mean and yields NaN instead of raising.
    """
    def weighted_mean(window):
        labels = window.index
        window_weights = weights.loc[labels]
        # np.average raises ZeroDivisionError when the weights sum to zero,
        # which happens when a whole window has zero weight.
        if window_weights.sum() == 0:
            return np.nan
        return np.average(data.loc[labels], weights=window_weights)

    return data.rolling(window_size, min_periods=1).apply(weighted_mean, raw=False)

# Number of consecutive rows of match_results_bis in each rolling window.
window_size = 100

# Match-count-weighted rolling average of each outcome rate.
for outcome in ('Win', 'Draw', 'Loss'):
    match_results_bis[f'{outcome}RateRA'] = weighted_rolling_mean(
        match_results_bis[f'{outcome}Rate'], match_results_bis['MatchCount'], window_size
    )

# Sanity check: the three smoothed rates should add up to 1 on every row.
(match_results_bis['WinRateRA'] + match_results_bis['DrawRateRA'] + match_results_bis['LossRateRA']).value_counts()
1.0 1465 1.0 33 Name: count, dtype: int64
# Overall outcome counts for matches played with fewer than 1500 days in post.
stats = match_results[match_results['DaysInPost'] < 1500]
n_match = stats['Result'].count()
n_win = (stats['Result'] == 'win').sum()
n_draw = (stats['Result'] == 'draw').sum()
n_loss = (stats['Result'] == 'loss').sum()

Parmi l’ensemble des matchs où l’on possède des informations sur l’entraîneur sportif et où l’entraîneur sportif avait moins de 1500 jours d’ancienneté lors du match :

  • le pourcentage de matchs gagnés est de 41.14 % ;
  • le pourcentage de matchs nuls est de 25.01 % ;
  • le pourcentage de matchs perdus est de 33.85 %.
# Stacked areas of the smoothed loss, draw and win rates (bottom to top).
plt.figure()
ax = plt.gca()
ax.stackplot(
    match_results_bis.index,
    match_results_bis['LossRateRA'],
    match_results_bis['DrawRateRA'],
    match_results_bis['WinRateRA'],
    colors=['red', 'gray', 'green'],
    alpha=0.60,
)
ax.set_xlabel('Head Coach Tenure on Match Day')
ax.set_ylabel('Proportion of Match Outcome')
ax.set_title(f'{window_size} Days Weighted Rolling Average of Match Outcome versus Head Coach Tenure on Match Day')

# Place each spine at the matching axis limit set below (x: 0-1500, y: 0-1).
for side, data_position in (('left', 0), ('bottom', 0), ('top', 1), ('right', 1500)):
    ax.spines[side].set_position(('data', data_position))
ax.yaxis.set_major_formatter(mticker.PercentFormatter(xmax=1))
ax.set_xlim(0, 1500)
ax.set_ylim(0, 1)

# Label each band at mid-width, in a shade taken from a dark palette built on
# the band's colour.
colors = ['green', 'gray', 'red']
light_colors = [sns.dark_palette(color, as_cmap=True)(0.3) for color in colors]
for band_name, band_height, band_color in zip(('Win', 'Draw', 'Loss'), (0.8, 0.45, 0.13), light_colors):
    ax.text(750, band_height, band_name, color=band_color, ha='center', fontsize=14)
<Figure size 1920x1440 with 1 Axes>
Étude des relations entre l'entraîneur sportif et la performance du club
Head Coaches