Data extraction
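Nous collectons les données de FBref et de Transfermarkt à l'aide de la bibliothèque worldfootballR.
Nous collectons des données de 2015 à 2023 pour les principales ligues européennes de première division : Angleterre, Espagne, Italie, Allemagne, France, Portugal, Écosse, Pologne, Grèce, Turquie, Suisse, Pays-Bas, Belgique, Autriche. Dans sa configuration actuelle, le code ci-dessous se limite toutefois à cinq pays (Angleterre, Espagne, Italie, Allemagne, France) et aux saisons se terminant entre 2015 et 2025.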
# Install any missing dependency first, then attach everything.
# requireNamespace() checks availability without attaching, which keeps the
# install step separate from the library() calls; library() errors if a
# package is still missing after the install attempt.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
if (!requireNamespace("worldfootballR", quietly = TRUE)) {
  # worldfootballR is installed from GitHub through devtools.
  devtools::install_github("JaseZiv/worldfootballR")
}
if (!requireNamespace("readr", quietly = TRUE)) {
  install.packages("readr")
}
library(devtools)
library(worldfootballR)
library(readr)
# [notebook output] Le chargement a nécessité le package : worldfootballR
#   (French locale for "Loading required package")
Le chargement a nécessité le package : readr
Collecting match results
# Adjust these two vectors to study other leagues or seasons.
# Wider selection kept for reference:
# country <- c("ENG", "ESP", "ITA", "GER", "FRA", "POR", "SCO", "POL", "GRE", "SUI", "NED", "BEL", "AUT")
# year <- c(2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023)
country <- c(
  "ENG", "ESP", "ITA", "GER", "FRA"
)
# Season end years 2015 through 2025, one per season.
year <- seq(2015, 2025, by = 1)
# Men's first-tier match results for the selected countries and seasons.
match_result <- load_match_results(
  country = country,
  gender = "M",
  season_end_year = year,
  tier = "1st"
)
# [notebook output] → Data last updated 2025-02-04 17:32:08.29795408248901 UTC
# Source column (value) -> short name used from here on (name).
match_columns <- c(
  league = "Competition_Name",
  country = "Country",
  season = "Season_End_Year",
  date = "Date",
  home = "Home",
  home_goals = "HomeGoals",
  away = "Away",
  away_goals = "AwayGoals"
)
# Keep only the mapped columns, in this order, then apply the short names.
columns_to_keep <- unname(match_columns)
match_result <- match_result[, columns_to_keep]
colnames(match_result) <- names(match_columns)
head(match_result)
# [notebook output] Loading...
# Show the distinct league names present in the match results.
unique(match_result$league)
# Shorten the German league's name to plain "Bundesliga".
match_result$league <- gsub(
  pattern = "Fußball-Bundesliga",
  replacement = "Bundesliga",
  x = match_result$league
)
# [notebook output] Loading...
summary(match_result)
league country season date
Length:19969 Length:19969 Min. :2015 Min. :2014-08-08
Class :character Class :character 1st Qu.:2017 1st Qu.:2017-03-18
Mode :character Mode :character Median :2020 Median :2019-12-21
Mean :2020 Mean :2019-12-28
3rd Qu.:2023 3rd Qu.:2022-09-30
Max. :2025 Max. :2025-05-25
home home_goals away away_goals
Length:19969 Min. : 0.000 Length:19969 Min. :0.000
Class :character 1st Qu.: 1.000 Class :character 1st Qu.:0.000
Mode :character Median : 1.000 Mode :character Median :1.000
Mean : 1.542 Mean :1.228
3rd Qu.: 2.000 3rd Qu.:2.000
Max. :10.000 Max. :9.000
NA's :806 NA's :806
# Saving the data
write_csv(match_result, "../data/extracted_match_results.csv")
Collecting head coach data
# Country names handed to get_team_url() when collecting team URLs.
countries <- c(
  "England",
  "Spain",
  "Italy",
  "Germany",
  "France"
)
#' Fetch the league team URLs of one country for one season.
#'
#' Thin wrapper around `tm_league_team_urls()` that downgrades any error to a
#' warning, so a single failing country does not abort the whole collection.
#'
#' NOTE(review): with a single `start_year`, clubs that only reached the league
#' in later seasons are presumably not returned -- confirm against the
#' worldfootballR documentation and call once per season if they are needed.
#'
#' @param country Country name, passed on as `country_name`.
#' @param start_year Season start year, passed on as `start_year`. Defaults to
#'   2015, the value that used to be hard-coded here.
#' @return The result of `tm_league_team_urls()` (presumably a character
#'   vector of team URLs), or `character(0)` when the call fails.
get_team_url <- function(country, start_year = 2015) {
  tryCatch(
    tm_league_team_urls(country_name = country, start_year = start_year),
    error = function(e) {
      warning(
        "Failed to fetch URLs for ", country, ": ", conditionMessage(e),
        call. = FALSE
      )
      character(0)
    }
  )
}
# Query each country in turn, then flatten the per-country results.
team_urls_by_country <- lapply(countries, get_team_url)
teams_url <- unlist(team_urls_by_country)
head(teams_url)
# [notebook output] Loading...
#' Fetch the "Manager" staff URL(s) for one team.
#'
#' Thin wrapper around `tm_team_staff_urls()` that downgrades any error to a
#' warning, so a single failing team does not abort the whole collection.
#'
#' @param team_url Team URL, passed on as `team_urls`.
#' @return The result of `tm_team_staff_urls()` (presumably a character vector
#'   of staff URLs), or `character(0)` when the call fails.
get_team_staff_url <- function(team_url) {
  tryCatch(
    tm_team_staff_urls(team_urls = team_url, staff_role = "Manager"),
    error = function(e) {
      warning(
        "Failed to fetch staff URLs for ", team_url, ": ", conditionMessage(e),
        call. = FALSE
      )
      # Empty character vector rather than an empty data.frame, matching the
      # fallback used by get_team_url().
      character(0)
    }
  )
}
# Resolve the staff URL(s) team by team, then flatten into a single vector.
staff_urls_by_team <- lapply(teams_url, get_team_staff_url)
teams_staff_url <- unlist(staff_urls_by_team)
head(teams_staff_url)
# [notebook output] Loading...
# Download the manager history for every collected staff URL.
head_coach <- tm_team_staff_history(
  team_urls = teams_staff_url,
  staff_role = "Manager"
)
# Inspect which leagues and teams came back.
unique(head_coach$league)
# The team column is selected as `team_name` further down in this file; the
# previous `$team` presumably only worked through `$` partial matching, which
# returns NULL on tibbles, so the full name is spelled out.
unique(head_coach$team_name)
# [notebook output] Error: indice hors limites
#   (French locale for "subscript out of bounds")
Error: indice hors limites
Error: indice hors limites
Error: indice hors limites
Error: indice hors limites
Error: indice hors limites
Error: indice hors limites
There is some missing information about country and league in the data. We will add this information manually.
# Number of missing values per column. vapply() pins the result to one integer
# per column, whereas sapply()'s return type depends on its input.
vapply(head_coach, function(x) sum(is.na(x)), integer(1))
# Show unique teams with missing league and/or country.
# `team_name` is written out in full instead of relying on `$team` partial
# matching.
missing_origin <- is.na(head_coach$league) | is.na(head_coach$country)
unique(head_coach$team_name[missing_origin])
# [notebook output] Loading...
# Fill in league and country by hand for 'Chievo Verona' and 'GFC Ajaccio'.
# The team column is addressed by its full name `team_name` instead of relying
# on `$team` partial matching.
is_chievo <- head_coach$team_name == "Chievo Verona"
head_coach$league[is_chievo] <- "Serie A"
head_coach$country[is_chievo] <- "Italy"

# NOTE(review): "Ligue 2" is not in the first-division league list used by the
# filter later in this file, so these rows end up being dropped -- confirm
# that excluding GFC Ajaccio is intended.
is_gfc_ajaccio <- head_coach$team_name == "GFC Ajaccio"
head_coach$league[is_gfc_ajaccio] <- "Ligue 2"
head_coach$country[is_gfc_ajaccio] <- "France"
# [notebook heading] Filter leagues that are not First Division Leagues
# Keep only rows whose league is one of the first-division leagues below.
# Broader league list kept for reference:
# first_division_teams <- c(
# 'Premier League', 'LaLiga', 'Serie A', 'Bundesliga', 'Ligue 1',
# 'Liga Portugal', 'Scottish Premiership', 'PKO BP Ekstraklasa', 'Super League 1',
# 'Super League', 'Eredivisie', 'Jupiler Pro League')
first_division_teams <- c(
  "Premier League", "LaLiga", "Serie A", "Bundesliga", "Ligue 1"
)
# Sanity check: TRUE only if every listed league occurs in head_coach$league.
all(is.element(first_division_teams, head_coach$league))
# Drop the rows that belong to any other league.
in_first_division <- is.element(head_coach$league, first_division_teams)
head_coach <- head_coach[in_first_division, ]
head(head_coach, 5)
# [notebook output] Loading...
# Source column (value) -> name used in the exported table (name).
head_coach_columns <- c(
  Team = "team_name",
  League = "league",
  Country = "country",
  HeadCoach = "staff_name",
  Appointed = "appointed",
  EndDate = "end_date",
  Tenure = "days_in_post",
  Matches = "matches",
  Wins = "wins",
  Draws = "draws",
  Losses = "losses"
)
# Keep only the mapped columns, in this order, then apply the new names.
columns_to_keep <- unname(head_coach_columns)
head_coach <- head_coach[, columns_to_keep]
colnames(head_coach) <- names(head_coach_columns)
summary(head_coach)
Team League Country HeadCoach
Length:3532 Length:3532 Length:3532 Length:3532
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
Appointed EndDate Tenure Matches
Min. :1886-06-26 Min. :1893-08-01 Min. : -242.0 Min. : 0.00
1st Qu.:1961-11-02 1st Qu.:1963-06-30 1st Qu.: 186.0 1st Qu.: 10.00
Median :1987-07-01 Median :1988-03-06 Median : 364.0 Median : 29.00
Mean :1982-05-15 Mean :1983-04-16 Mean : 608.2 Mean : 51.59
3rd Qu.:2004-12-29 3rd Qu.:2005-06-30 3rd Qu.: 730.0 3rd Qu.: 67.00
Max. :2024-04-23 Max. :2024-06-30 Max. :14613.0 Max. :1490.00
NA's :64
Wins Draws Losses
Min. : 0.00 Min. : 0.00 Min. : 0.00
1st Qu.: 2.00 1st Qu.: 2.00 1st Qu.: 4.00
Median : 10.00 Median : 7.00 Median : 10.00
Mean : 22.53 Mean : 13.01 Mean : 16.05
3rd Qu.: 28.00 3rd Qu.: 17.00 3rd Qu.: 21.00
Max. :895.00 Max. :323.00 Max. :272.00
# Write the cleaned head-coach table to CSV.
head_coach_csv <- "../data/extracted_head_coach.csv"
write_csv(head_coach, head_coach_csv)