from joblib import parallel_backend
parallel_backend("loky", n_jobs=-1)<joblib.parallel.parallel_backend at 0x13835b110>import sys
sys.path.append("./../src/")
from get_dataset import dataset_loaders
dataset = list(dataset_loaders.keys())[6]
dataset'bankmarketing'# Parameters
dataset = "wdbc"
from get_dataset import load_dataset
X, y = load_dataset(dataset)Data presentation¶
The *{dataset}* dataset contains n = {X.shape[0]} samples and p = {X.shape[1]} features.
The target variable is binary and {y.mean() * 100:.2f}% of the samples are positive.
from sklearn.model_selection import train_test_split
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
from sklearn.preprocessing import StandardScaler
# Normalize data using only the training set
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)Prepare model results storage¶
MODELS = dict()
def store_results(name, grid):
    """Record a fitted search's test-set outputs in the global ``MODELS`` dict.

    Args:
        name: Key under which the entry is stored in ``MODELS``.
        grid: Fitted search object exposing ``best_params_``, ``predict``
            and ``predict_proba``.

    The module-level ``X_test`` and ``y_test`` are read at call time, so the
    entry reflects whatever split is current when the function is called.
    """
    tuned_params = grid.best_params_
    predicted_labels = grid.predict(X_test)
    predicted_probas = grid.predict_proba(X_test)
    entry = {
        "best_params": tuned_params,
        "X_test": X_test,
        "y_true": y_test,
        "y_pred": predicted_labels,
        "y_proba": predicted_probas,
    }
    MODELS[name] = entry


from sklearn.model_selection import GridSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
def get_grid(model, params, random_state=42):
    """Build the hyper-parameter search shared by every classifier.

    Args:
        model: Estimator (or pipeline) to tune.
        params: Mapping of parameter names to lists of candidate values.
        random_state: Seed forwarded to ``HalvingRandomSearchCV``. It drives
            both the random draw of candidates and the subsampling of the
            training data at each halving iteration, so fixing it makes the
            selected parameters reproducible from run to run.

    Returns:
        An unfitted ``HalvingRandomSearchCV`` using 5-fold cross-validation,
        accuracy scoring and a final refit of the best candidate.
    """
    # Exhaustive alternative, if the halving search is not wanted:
    # GridSearchCV(model, params, n_jobs=-1, cv=5)
    return HalvingRandomSearchCV(
        model,
        params,
        n_jobs=-1,
        cv=5,
        verbose=1,
        scoring="accuracy",
        refit=True,
        random_state=random_state,
    )


# --- Section heading: training the classifiers ("Entraînement des classifieurs") ---
Classifieurs non paramétriques¶
K-Nearest Neighbors¶
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(weights='uniform', algorithm='auto')
param_grid = {
'n_neighbors': [3, 5, 7, 9],
}
grid_search = get_grid(model, param_grid)
grid_search.fit(X_train, y_train)
store_results('KNN', grid_search)n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 3
min_resources_: 20
max_resources_: 398
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 4
n_resources: 20
/Users/mathisderenne/Documents/02 - Scolaire/M1 MIASHS/02 - Guillaume Mezler/Projet/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py:317: UserWarning: The total space of parameters 4 is smaller than n_iter=19. Running 4 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 1
n_candidates: 2
n_resources: 60
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Distance-Weighted KNN¶
model = KNeighborsClassifier(weights='distance', algorithm='auto')
param_grid = {
'n_neighbors': [3, 5, 7, 9],
}
grid_search = get_grid(model, param_grid)
grid_search.fit(X_train, y_train)
store_results('KNN Distance Weighted', grid_search)n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 3
min_resources_: 20
max_resources_: 398
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 4
n_resources: 20
Fitting 5 folds for each of 4 candidates, totalling 20 fits
/Users/mathisderenne/Documents/02 - Scolaire/M1 MIASHS/02 - Guillaume Mezler/Projet/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py:317: UserWarning: The total space of parameters 4 is smaller than n_iter=19. Running 4 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(
----------
iter: 1
n_candidates: 2
n_resources: 60
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Condensed Nearest Neighbor¶
from imblearn.under_sampling import CondensedNearestNeighbour
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_X_y
from sklearn.utils.validation import validate_data
# Wrap CondensedNearestNeighbour into an sklearn compatible transformer for use in pipelines
class CondensedNearestNeighbourTransformer(BaseEstimator, TransformerMixin):
    """Transformer-shaped wrapper around imblearn's ``CondensedNearestNeighbour``.

    The constructor arguments are stored unchanged and handed to a fresh
    ``CondensedNearestNeighbour`` each time ``transform`` receives targets.

    NOTE(review): ``transform`` returns ``X`` untouched whenever ``y`` is None.
    scikit-learn pipelines appear to call ``transform(X)`` without ``y``, in
    which case this step would never resample anything - confirm against the
    caller before relying on it.
    """

    def __init__(self, sampling_strategy="auto", random_state=42, n_neighbors=None, n_seeds_S=1):
        self.sampling_strategy = sampling_strategy
        self.random_state = random_state
        self.n_neighbors = n_neighbors
        self.n_seeds_S = n_seeds_S

    def fit(self, X, y=None):
        """Remember the number of input columns; no validation is performed."""
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X, y=None):
        """Resample ``(X, y)`` when ``y`` is given, otherwise pass ``X`` through.

        Returns:
            ``X`` itself when ``y`` is None; otherwise whatever
            ``CondensedNearestNeighbour.fit_resample(X, y)`` returns.
        """
        # Without targets there is nothing to condense.
        if y is None:
            return X

        sampler = CondensedNearestNeighbour(
            sampling_strategy=self.sampling_strategy,
            random_state=self.random_state,
            n_neighbors=self.n_neighbors,
            n_seeds_S=self.n_seeds_S,
        )
        return sampler.fit_resample(X, y)
from sklearn.utils.estimator_checks import check_estimator
# Estimator-API compliance check, left disabled:
# check_estimator(CondensedNearestNeighbourTransformer())
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline

# A scikit-learn Pipeline only calls transform(X) on intermediate steps, never
# transform(X, y). CondensedNearestNeighbourTransformer.transform returns X
# unchanged when y is None, so the former pipeline trained a plain KNN on the
# full training set. imblearn's Pipeline accepts samplers directly and calls
# fit_resample during fit only, so the test data is never resampled.
model = ImbPipeline([
    ('cnn', CondensedNearestNeighbour(
        sampling_strategy='auto',
        random_state=42,  # same seed the wrapper used by default
        n_neighbors=3,
        n_seeds_S=1,
    )),
    ('knn', KNeighborsClassifier(weights='uniform', algorithm='auto')),
])

# NOTE(review): on the smallest halving budget the condensed set may hold
# fewer samples than knn__n_neighbors; such candidates would fail to score
# and be ranked last - confirm this is acceptable.
param_grid = {
    'cnn__n_neighbors': [3, 5, 7, 9],
    'knn__n_neighbors': [3, 5, 7, 9],
}

grid_search = get_grid(model, param_grid)
grid_search.fit(X_train, y_train)
store_results('KNN Condensed Nearest Neighbor', grid_search)
# n_iterations: 3   <- first line of the captured log from the original run
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 20
max_resources_: 398
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 16
n_resources: 20
Fitting 5 folds for each of 16 candidates, totalling 80 fits
/Users/mathisderenne/Documents/02 - Scolaire/M1 MIASHS/02 - Guillaume Mezler/Projet/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py:317: UserWarning: The total space of parameters 16 is smaller than n_iter=19. Running 16 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(
----------
iter: 1
n_candidates: 6
n_resources: 60
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 2
n_candidates: 2
n_resources: 180
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Locally Adaptive KNN¶
class LocallyAdaptiveKNN(KNeighborsClassifier):
    """k-NN variant that re-votes inside each query point's neighbourhood.

    For every sample, the ``n_neighbors`` nearest training points are looked
    up, a small default ``KNeighborsClassifier`` is fitted on just those
    points with half as many neighbours, and its vote is the prediction.

    Only ``predict`` is overridden. ``predict_proba`` is inherited from
    ``KNeighborsClassifier``, so probabilities come from the ordinary vote
    and may disagree with the labels returned here.
    """

    def predict(self, X):
        """Predict class labels for ``X``.

        Args:
            X: Query samples, array-like with one row per sample.

        Returns:
            numpy array of labels taken from ``classes_``: one entry per row
            of ``X``, or one row per sample when the model was fitted on
            multi-output targets.
        """
        import numpy as np  # local import: numpy is not imported at file level

        rows = np.asarray(X)  # positional row access; X[i] breaks on DataFrames
        neighbor_idx = self.kneighbors(X, return_distance=False)

        # Half the neighbourhood, but never 0 (n_neighbors=1 used to give
        # local_k == 0, which KNeighborsClassifier rejects).
        local_k = max(1, neighbor_idx.shape[1] // 2)

        encoded = []
        for row, neighbors in zip(rows, neighbor_idx):
            local_knn = KNeighborsClassifier(n_neighbors=local_k)
            # self._y holds labels encoded as indices into classes_, so the
            # local model predicts encoded labels as well.
            local_knn.fit(self._fit_X[neighbors], self._y[neighbors])
            encoded.append(local_knn.predict(row.reshape(1, -1))[0])
        encoded = np.asarray(encoded, dtype=np.intp)

        # Decode back to the labels seen at fit time.
        if self.outputs_2d_:
            encoded = encoded.reshape(len(rows), -1)
            return np.column_stack(
                [classes[column] for classes, column in zip(self.classes_, encoded.T)]
            )
        return self.classes_[encoded]
model = LocallyAdaptiveKNN(weights='uniform', algorithm='auto')
param_grid = {
'n_neighbors': [3, 5, 7, 9],
}
grid_search = get_grid(model, param_grid)
grid_search.fit(X_train, y_train)
store_results('KNN Locally Adaptive', grid_search)n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 3
min_resources_: 20
max_resources_: 398
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 4
n_resources: 20
Fitting 5 folds for each of 4 candidates, totalling 20 fits
/Users/mathisderenne/Documents/02 - Scolaire/M1 MIASHS/02 - Guillaume Mezler/Projet/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py:317: UserWarning: The total space of parameters 4 is smaller than n_iter=19. Running 4 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(
----------
iter: 1
n_candidates: 2
n_resources: 60
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Classifieurs binaires non linéaires¶
Arbre de décision (Decision Tree)¶
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=42)
param_grid = {
'max_depth': [3, 5, 7, 9],
'min_samples_split': [2, 5, 10]
}
grid_search = get_grid(model, param_grid)
grid_search.fit(X_train, y_train)
store_results('Decision Tree', grid_search)n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 20
max_resources_: 398
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 12
n_resources: 20
Fitting 5 folds for each of 12 candidates, totalling 60 fits
/Users/mathisderenne/Documents/02 - Scolaire/M1 MIASHS/02 - Guillaume Mezler/Projet/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py:317: UserWarning: The total space of parameters 12 is smaller than n_iter=19. Running 12 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(
----------
iter: 1
n_candidates: 4
n_resources: 60
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 2
n_candidates: 2
n_resources: 180
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Forêt aléatoire (RandomForest)¶
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42, class_weight=None)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [3, 5, 7, 9],
'min_samples_split': [2, 5, 10],
}
grid_search = get_grid(model, param_grid)
grid_search.fit(X_train, y_train)
store_results('Random Forest', grid_search)n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 20
max_resources_: 398
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 19
n_resources: 20
Fitting 5 folds for each of 19 candidates, totalling 95 fits
----------
iter: 1
n_candidates: 7
n_resources: 60
Fitting 5 folds for each of 7 candidates, totalling 35 fits
----------
iter: 2
n_candidates: 3
n_resources: 180
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Forêt aléatoire avec cost-sensitive learning¶
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42, class_weight='balanced')
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [3, 5, 7, 9],
'min_samples_split': [2, 5, 10],
}
grid_search = get_grid(model, param_grid)
grid_search.fit(X_train, y_train)
store_results('Random Forest - cost-sensitive learning', grid_search)n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 20
max_resources_: 398
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 19
n_resources: 20
Fitting 5 folds for each of 19 candidates, totalling 95 fits
----------
iter: 1
n_candidates: 7
n_resources: 60
Fitting 5 folds for each of 7 candidates, totalling 35 fits
----------
iter: 2
n_candidates: 3
n_resources: 180
Fitting 5 folds for each of 3 candidates, totalling 15 fits
AdaBoost¶
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(random_state=42)
param_grid = {
'n_estimators': [50, 100, 200],
'learning_rate': [0.01, 0.1, 1.0]
}
grid_search = get_grid(model, param_grid)
grid_search.fit(X_train, y_train)
store_results('AdaBoost', grid_search)n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 20
max_resources_: 398
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 9
n_resources: 20
Fitting 5 folds for each of 9 candidates, totalling 45 fits
/Users/mathisderenne/Documents/02 - Scolaire/M1 MIASHS/02 - Guillaume Mezler/Projet/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py:317: UserWarning: The total space of parameters 9 is smaller than n_iter=19. Running 9 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(
----------
iter: 1
n_candidates: 3
n_resources: 60
Fitting 5 folds for each of 3 candidates, totalling 15 fits
----------
iter: 2
n_candidates: 1
n_resources: 180
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Gradient Boosting¶
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(random_state=42)
param_grid = {
'loss': ['log_loss', 'exponential'],
'n_estimators': [50, 100, 200],
'learning_rate': [0.01, 0.1, 1.0],
'max_depth': [3, 5, 7, 9]
}
grid_search = get_grid(model, param_grid)
grid_search.fit(X_train, y_train)
store_results('Gradient Boosting', grid_search)n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 20
max_resources_: 398
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 19
n_resources: 20
Fitting 5 folds for each of 19 candidates, totalling 95 fits
----------
iter: 1
n_candidates: 7
n_resources: 60
Fitting 5 folds for each of 7 candidates, totalling 35 fits
----------
iter: 2
n_candidates: 3
n_resources: 180
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Classifieurs binaires paramétriques¶
SVM Linéaire¶
from sklearn.svm import SVC

# Linear-kernel SVM; probability=True is required because store_results
# calls predict_proba on the fitted search.
model = SVC(kernel='linear', random_state=42, probability=True)

# Only C is tuned: 'degree' applies to the 'poly' kernel alone and is ignored
# by the linear kernel, so searching over it only compared identical models.
param_grid = {
    'C': [0.1, 0.5, 1],
}

grid_search = get_grid(model, param_grid)
grid_search.fit(X_train, y_train)
store_results('SVM', grid_search)
# n_iterations: 3   <- first line of the captured log from the original run
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 20
max_resources_: 398
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 9
n_resources: 20
Fitting 5 folds for each of 9 candidates, totalling 45 fits
----------
iter: 1
n_candidates: 3
n_resources: 60
Fitting 5 folds for each of 3 candidates, totalling 15 fits
/Users/mathisderenne/Documents/02 - Scolaire/M1 MIASHS/02 - Guillaume Mezler/Projet/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py:317: UserWarning: The total space of parameters 9 is smaller than n_iter=19. Running 9 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(
----------
iter: 2
n_candidates: 1
n_resources: 180
Fitting 5 folds for each of 1 candidates, totalling 5 fits
SVM non linéaire¶
from sklearn.svm import SVC
model = SVC(random_state=42, probability=True)
param_grid = {
'kernel': ['poly', 'rbf', 'sigmoid'],
'C': [0.1, 0.5, 1],
'gamma': ['scale', 'auto']
}
grid_search = get_grid(model, param_grid)
grid_search.fit(X_train, y_train)
store_results('SVM non linéaire', grid_search)n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 20
max_resources_: 398
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 18
n_resources: 20
Fitting 5 folds for each of 18 candidates, totalling 90 fits
/Users/mathisderenne/Documents/02 - Scolaire/M1 MIASHS/02 - Guillaume Mezler/Projet/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py:317: UserWarning: The total space of parameters 18 is smaller than n_iter=19. Running 18 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(
----------
iter: 1
n_candidates: 6
n_resources: 60
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 2
n_candidates: 2
n_resources: 180
Fitting 5 folds for each of 2 candidates, totalling 10 fits
SVM non linéaire avec sur-échantillonnage¶
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
from sklearn.preprocessing import StandardScaler
# Normalize data using only the training set
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
model = ImbPipeline([
('smote', SMOTE(sampling_strategy='auto', k_neighbors=1, random_state=42)),
('svm', SVC(random_state=42, probability=True))
])
param_grid = {
'svm__kernel': ['poly', 'rbf', 'sigmoid'],
'svm__C': [0.1, 0.5, 1],
'svm__gamma': ['scale', 'auto']
}
grid_search = get_grid(model, param_grid)
grid_search.fit(X_train, y_train)
store_results('SVM non linéaire avec SMOTE', grid_search)n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 20
max_resources_: 398
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 18
n_resources: 20
Fitting 5 folds for each of 18 candidates, totalling 90 fits
/Users/mathisderenne/Documents/02 - Scolaire/M1 MIASHS/02 - Guillaume Mezler/Projet/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py:317: UserWarning: The total space of parameters 18 is smaller than n_iter=19. Running 18 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(
----------
iter: 1
n_candidates: 6
n_resources: 60
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 2
n_candidates: 2
n_resources: 180
Fitting 5 folds for each of 2 candidates, totalling 10 fits
SVM avec cost-sensitive learning (ajustement pénalité C)¶
from sklearn.svm import SVC
model = SVC(random_state=42, probability=True, class_weight='balanced')
param_grid = {
'kernel': ['poly', 'rbf', 'sigmoid'],
'C': [0.1, 0.5, 1],
'gamma': ['scale', 'auto']
}
grid_search = get_grid(model, param_grid)
grid_search.fit(X_train, y_train)
store_results('SVM cost-sensitive learning', grid_search)n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 20
max_resources_: 398
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 18
n_resources: 20
Fitting 5 folds for each of 18 candidates, totalling 90 fits
/Users/mathisderenne/Documents/02 - Scolaire/M1 MIASHS/02 - Guillaume Mezler/Projet/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py:317: UserWarning: The total space of parameters 18 is smaller than n_iter=19. Running 18 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(
----------
iter: 1
n_candidates: 6
n_resources: 60
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 2
n_candidates: 2
n_resources: 180
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Régression logistique¶
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state=42, solver='liblinear', dual=False)
param_grid = {
'C': [0.1, 0.5, 1], # Inverse de la force de régularisation
'penalty': ['l1', 'l2'], # Type de régularisation
'class_weight': [None, 'balanced'] # Poids des classes
}
grid_search = get_grid(model, param_grid)
grid_search.fit(X_train, y_train)
store_results('Logistic Regression', grid_search)n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 20
max_resources_: 398
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 12
n_resources: 20
Fitting 5 folds for each of 12 candidates, totalling 60 fits
/Users/mathisderenne/Documents/02 - Scolaire/M1 MIASHS/02 - Guillaume Mezler/Projet/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py:317: UserWarning: The total space of parameters 12 is smaller than n_iter=19. Running 12 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(
----------
iter: 1
n_candidates: 4
n_resources: 60
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 2
n_candidates: 2
n_resources: 180
Fitting 5 folds for each of 2 candidates, totalling 10 fits
/Users/mathisderenne/Documents/02 - Scolaire/M1 MIASHS/02 - Guillaume Mezler/Projet/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1271: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 8.
warnings.warn(
Sauvegarde des prédictions et paramètres des modèles¶
from pathlib import Path
from joblib import dump

# Persist every model's stored predictions and best parameters, one file per
# dataset. The target directory is created first so a fresh checkout without
# a results/ folder does not make dump() fail.
results_file = f"./../results/{dataset}.joblib"
Path(results_file).parent.mkdir(parents=True, exist_ok=True)
dump(MODELS, results_file)
# Captured output of the original run: ['./../results/wdbc.joblib']
# --- Section heading: model performance on the test data ---
from utils import plot_roc, plot_precision_recall, table_report
# Print the report and draw the ROC / precision-recall curves for each stored
# model. The loop variable is named `results` so it does not overwrite the
# module-level `model` estimator used by the training cells.
for model_name, results in MODELS.items():
    print(f"Model: {model_name}")
    y_true = results['y_true']
    # Column 1 of predict_proba - presumably the positive class; confirm
    # against the label encoding of the dataset.
    positive_scores = results['y_proba'][:, 1]
    table_report(y_true, results['y_pred'])
    plot_roc(y_true, positive_scores)
    plot_precision_recall(y_true, positive_scores)
# Notebook output placeholder from the export: "Loading..."