Skip to article content

Étude des relations entre l'entraîneur sportif et la performance du club

Preprocessing

import pandas as pd
from IPython.display import display, HTML

Preprocessing match results

# Column names : 'League', 'Country', 'Season', 'Date', 'Home', 'HomeGoals', 'Away', 'AwayGoals'
match_results = pd.read_csv('data/extracted_match_results.csv', parse_dates=['Date'])

# Fix encoding issue : renaming 'Fu\303\237ball-Bundesliga' to 'Bundesliga'
match_results['League'] = match_results['League'].replace('Fu<U+00C3><U+009F>ball-Bundesliga', 'Bundesliga')

# Map country codes to country names
country_map = {'ENG': 'England', 'ITA': 'Italy', 'FRA': 'France', 'GER': 'Germany', 'ESP': 'Spain', 'POR': 'Portugal', 'SCO': 'Scotland', 'POL': 'Poland', 'GRE': 'Greece', 'TUR': 'Turkey', 'SUI': 'Switzerland', 'NED': 'Netherlands', 'BEL': 'Belgium', 'AUT': 'Austria'}
match_results['Country'] = match_results['Country'].map(country_map)
match_results.describe()
Loading...
match_results.head()
Loading...
def return_result(goal1, goal2):
    if goal1 > goal2:
        return 'win'
    elif goal1 < goal2:
        return 'loss'
    else:
        return 'draw'
    
match_results['HomeResult'] = match_results.apply(lambda x: return_result(x['HomeGoals'], x['AwayGoals']), axis=1)
match_results['AwayResult'] = match_results.apply(lambda x: return_result(x['AwayGoals'], x['HomeGoals']), axis=1)

home_results = match_results[['League', 'Country', 'Date', 'Home', 'HomeGoals', 'HomeResult']]
home_results = home_results.rename(columns={'Home': 'Team', 'HomeGoals': 'Goals', 'HomeResult': 'Result'})
home_results['isHome'] = True

away_results = match_results[['League', 'Country', 'Away', 'Date', 'AwayGoals', 'AwayResult']]
away_results = away_results.rename(columns={'Away': 'Team', 'AwayGoals': 'Goals', 'AwayResult': 'Result'})
away_results['isHome'] = False

match_results = pd.concat([home_results, away_results], ignore_index=True)

Preprocessing head coach

# Column names : 'Team', 'League', 'Country', 'HeadCoach', 'Appointed', 'EndDate', 'Tenure', 'Matches', 'Wins', 'Draws', 'Losses'
head_coach = pd.read_csv('data/extracted_head_coach.csv', parse_dates=['Appointed', 'EndDate'])
head_coach.head()
Loading...
head_coach.describe()
Loading...

We need to filter head coach that were not active between 2015 to 2023.

last_match = match_results['Date'].max()
first_match = match_results['Date'].min()

# Remove head coach that were appointed after 2023 season
head_coach = head_coach[head_coach['Appointed'] <= last_match]
# Keep head coach that were dimissed after 2015 or that are still active
head_coach = head_coach[(head_coach['EndDate'] >= first_match) | (head_coach['EndDate'].isna())]
# Ensuring there is only 1 head coach at a time in any given team.

head_coach_bis = head_coach.copy()
# Sort data by 'Team' and 'Appointed'
head_coach_bis = head_coach_bis.sort_values(['Team', 'Appointed'])
# Fillna with end date of 2022-2023 season
head_coach_bis['EndDate'] = head_coach_bis['EndDate'].fillna('2024-01-14')
# Check if the next appointment is overlapping with the current one
head_coach_bis['OverlapDuration'] = head_coach_bis.groupby('Team')['Appointed'].shift(-1) - head_coach_bis['EndDate']
head_coach_bis['Overlap'] = head_coach_bis['OverlapDuration'].dt.days < 0

# Show team with overlapping appointments
overlapping = head_coach_bis[head_coach_bis['Overlap']]

Nous avons trouvé 11 enregistrements de mandats d’entraîneurs sportifs qui avaient lieu alors qu’un autre chef-entraîneur assurait l’entraînement de l’équipe. Ces enregistrements concernent les équipes suivante : 'FC Empoli, Hellas Verona, LOSC Lille, Newcastle United, Rayo Vallecano, SSC Napoli, Stade Reims, Stade Rennais FC, Torino FC, West Ham United'. Ces enregistrements sont exclus du jeu de données.

head_coach_bis.sort_values('OverlapDuration').head()
Loading...
head_coach_bis[head_coach_bis['Team'].isin(overlapping['Team'])][['Team', 'Appointed', 'EndDate', 'Overlap', 'OverlapDuration']].sort_values(['Team', 'Appointed']).head(10)
Loading...
head_coach_bis[head_coach_bis['Team'] == 'Stade Reims'][['Team', 'Appointed', 'EndDate', 'Overlap']].sort_values(['Appointed']).iloc[2:6]
Loading...
# Check records that are overlapping
head_coach[~head_coach.index.isin(head_coach_bis[head_coach_bis['OverlapDuration'].dt.days <= -20].index)]
# Drop overlapping records with a duration of more than 20 days
head_coach = head_coach[head_coach.index.isin(head_coach_bis[head_coach_bis['OverlapDuration'].dt.days > -20].index)]

Next we would like to add to each head coach record the number of appointment he is completing.

# Using cronological information about appointment
# Add a column to head coach records that tells us about how many appointment head coach has done

head_coach = head_coach.sort_values(['HeadCoach', 'Appointed'])
head_coach['AppointmentNumber'] = head_coach.groupby('HeadCoach').cumcount() + 1
head_coach[head_coach['HeadCoach'].isin(head_coach[head_coach['AppointmentNumber'] > 3]['HeadCoach'])].sort_values('HeadCoach').head()
Loading...
# Check if total_matches = wins + draws + losses
head_coach[head_coach['Matches'] != head_coach['Wins'] + head_coach['Draws'] + head_coach['Losses']].shape[0]
0
# Display Head Coach with lowest number of match
display(head_coach.sort_values('Matches').head(20))
# Remove head coach records with less than 5 matches
# We consider them to be not relevant as it a short stay of a head coach indicate either a temporary replacement or a very bad performance
head_coach = head_coach[head_coach['Matches'] > 5]
Loading...

Joining head coach with match results

# Compute number of team that are in head_coach but not in match_results
coach_teams = set(head_coach['Team'])
match_teams = set(match_results['Team'])

coach_team_not_in_match = coach_teams - match_teams
match_team_not_in_coach = match_teams - coach_teams

len(coach_team_not_in_match), len(match_team_not_in_coach)
(59, 153)

Les résultats de matchs contiennent 161 équipes et les mandats de coachs contiennent 67 équipes. Cependant, le nom de certaines équipes est différent entre les deux jeux de données. Par exemple, « Liverpool » dans les résultats de match devient « Liverpool FC » dans les mandats des entraîneurs.

Il existe 59 équipes présentes dans les mandats de coachs qui n’ont pas de correspondance dans les résultats de match.

# Group teams by country
coach_teams_by_country = head_coach.groupby('Country')['Team'].unique()
match_teams_by_country = match_results.groupby('Country')['Team'].unique()

coach_teams_by_country, match_teams_by_country
(Country England [Newcastle United, Crystal Palace, Chelsea FC,... France [OGC Nice, FC Toulouse, Olympique Marseille, F... Germany [1.FC Köln, 1.FSV Mainz 05, Eintracht Frankfur... Italy [Frosinone Calcio, FC Empoli, Genoa CFC, Hella... Spain [Granada CF, Valencia CF, Celta de Vigo, Real ... Name: Team, dtype: object, Country England [Manchester Utd, Stoke City, Leicester City, Q... France [Reims, Nice, Guingamp, Nantes, Evian, Lille, ... Germany [Bayern Munich, Hoffenheim, Hannover 96, Köln,... Italy [Chievo, Roma, Atalanta, Milan, Genoa, Palermo... Spain [Málaga, Granada, Sevilla, Almería, Eibar, Cel... Name: Team, dtype: object)
from thefuzz import process

team_name_mapping = {}

# For each country
for country in coach_teams_by_country.index:
    # Get teams for this country
    coach_teams = coach_teams_by_country[country]
    match_teams = match_teams_by_country.get(country, [])

    # For each team in coach_teams
    for coach_team in coach_teams:
        # Find the best match in match_teams
        matching_scores = process.extract(coach_team, match_teams, limit=1)

        if len(matching_scores) != 0 and matching_scores[0][1] >= 60:
            team_name_mapping[coach_team] = matching_scores[0][0]
        else:
            team_name_mapping[coach_team] = None
            print(f"No match found for {coach_team} among {match_teams} in {country}")
name_match = pd.DataFrame(team_name_mapping.items(), columns=['Team in head coach records', 'Team in match results'])
display(HTML(name_match.head().to_html(index=False)))
Loading...
# Map head_coach['team'] with name_match
head_coach['Team'] = head_coach['Team'].map(team_name_mapping)
head_coach.head()
Loading...

Nous pouvons maintenant ajouter le nombres de jours au poste d’entraîneur avec les résultats de match.

# Check match with NaN goals
display(match_results[match_results['Goals'].isna()])
# Remove match with NaN goals
match_results = match_results[~match_results['Goals'].isna()]
Loading...
# Merge head_coach with match_results
match = match_results.merge(head_coach[['Team', 'HeadCoach', 'Appointed', 'EndDate']], on=['Team'], how='left')
# Put aside team that don't have a head coach
no_headcoach = match[match['HeadCoach'].isna()]
match = match[~match['HeadCoach'].isna()]
# Filter match_results_bis to keep only head coach that were appointed before the match and with no end date or end date after the match
match = match[
    (match['Date'] >= match['Appointed']) &
    ((match['Date'] <= match['EndDate']) | match['EndDate'].isna())]
# Join back the team that don't have a head coach
match = pd.concat([match, no_headcoach], ignore_index=True)
# Compute daysInPost
match['DaysInPost'] = (match['Date'] - match['Appointed']).dt.days
match = match.drop(columns=['Appointed', 'EndDate'])
match.head()
Loading...

Missing head coach data

Il existe des matchs sur les lesquels nous n’avons pas d’information sur le coach en poste.

# Values count of coach record / no coach record per team
team_hc_count = match.groupby('Team')['HeadCoach'].count()
# Display list of team with no head coach record
team_no_hc_record = team_hc_count[team_hc_count == 0].index.tolist()
print(len(team_no_hc_record))
print(team_no_hc_record)
95
['Ajaccio', 'Alavés', 'Almería', 'Amiens', 'Angers', 'Arminia', 'Atlético Madrid', 'Auxerre', 'Benevento', 'Bochum', 'Bordeaux', 'Braunschweig', 'Brentford', 'Brescia', 'Brest', 'Brighton', 'Burnley', 'Caen', 'Cagliari', 'Cardiff City', 'Carpi', 'Cesena', 'Clermont Foot', 'Cremonese', 'Crotone', 'Cádiz', 'Córdoba', 'Dijon', 'Düsseldorf', 'Eibar', 'Elche', 'Espanyol', 'Evian', 'Freiburg', 'Fulham', 'Gazélec Ajaccio', 'Girona', 'Greuther Fürth', 'Guingamp', 'Hamburger SV', 'Hannover 96', 'Heidenheim', 'Hertha BSC', 'Holstein Kiel', 'Huddersfield', 'Huesca', 'Hull City', 'Ingolstadt 04', 'Inter', 'Karlsruher', 'La Coruña', 'Lecce', 'Leeds United', 'Leganés', 'Leicester City', 'Lens', 'Levante', 'Mallorca', 'Metz', 'Middlesbrough', 'Monza', 'Málaga', 'Nancy', 'Norwich City', "Nott'ham Forest", 'Nîmes', 'Nürnberg', 'Osasuna', 'Paderborn 07', 'Palermo', 'Parma', 'Pescara', 'QPR', 'RB Leipzig', 'Rennes', 'SPAL', 'Saint-Étienne', 'Salernitana', 'Sampdoria', 'Schalke 04', 'Sheffield Utd', 'Southampton', 'Spezia', 'Sporting Gijón', 'Stoke City', 'Strasbourg', 'Sunderland', 'Swansea City', 'Troyes', 'Union Berlin', 'Valladolid', 'Venezia', 'Watford', 'West Brom', 'Wolves']

Saving preprocessed data

# Save match_results
match.to_csv('data/match_results.csv', index=False)
head_coach.to_csv('data/head_coach.csv', index=False)
Étude des relations entre l'entraîneur sportif et la performance du club
Data extraction
Étude des relations entre l'entraîneur sportif et la performance du club
Exploratory Data Analysis