From 26f2c44509dd648ebc8f30adbe9e32a525c65937 Mon Sep 17 00:00:00 2001 From: Marko Djordjevic Date: Wed, 18 Feb 2026 23:58:24 +0100 Subject: [PATCH] Fix XGBoost label encoding and single-class guard --- services/ml/training/models/xgboost_model.py | 36 +++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/services/ml/training/models/xgboost_model.py b/services/ml/training/models/xgboost_model.py index d18104d..e472e59 100644 --- a/services/ml/training/models/xgboost_model.py +++ b/services/ml/training/models/xgboost_model.py @@ -10,6 +10,7 @@ from typing import Any, Dict, Optional import numpy as np from xgboost import XGBClassifier from sklearn.utils.class_weight import compute_class_weight +from sklearn.preprocessing import LabelEncoder class XGBoostModel: @@ -33,6 +34,7 @@ class XGBoostModel: self.hyperparameters = hyperparameters.copy() self.class_weights = class_weights self._sample_weights = None + self.label_encoder_ = None # XGBoost doesn't have built-in class_weight parameter like sklearn # We'll compute sample weights manually when class_weights is "balanced" @@ -51,25 +53,38 @@ class XGBoostModel: Returns: self """ + classes = np.unique(y) + if classes.size < 2: + raise ValueError( + f"XGBoost requires at least 2 classes for training; got {classes.size} ({classes})" + ) + + y_encoded = y + if not ( + np.issubdtype(np.asarray(y).dtype, np.integer) + and np.array_equal(np.sort(classes), np.arange(classes.size)) + ): + self.label_encoder_ = LabelEncoder() + y_encoded = self.label_encoder_.fit_transform(y) + # Compute sample weights if class weighting is enabled if self.class_weights == "balanced": # Compute class weights - classes = np.unique(y) class_weights = compute_class_weight( class_weight="balanced", - classes=classes, - y=y + classes=np.unique(y_encoded), + y=y_encoded ) # Map class weights to sample weights - class_weight_dict = dict(zip(classes, class_weights)) - sample_weights = np.array([class_weight_dict[label] for label in y]) + class_weight_dict = dict(zip(np.unique(y_encoded), class_weights)) + sample_weights = np.array([class_weight_dict[label] for label in y_encoded]) # Fit with sample weights - self.model.fit(X, y, sample_weight=sample_weights) + self.model.fit(X, y_encoded, sample_weight=sample_weights) else: # Fit without sample weights - self.model.fit(X, y) + self.model.fit(X, y_encoded) return self @@ -83,7 +98,10 @@ class XGBoostModel: Returns: Predicted labels (n_samples,) """ - return self.model.predict(X) + preds = self.model.predict(X) + if self.label_encoder_ is not None: + return self.label_encoder_.inverse_transform(preds.astype(int)) + return preds def predict_proba(self, X: np.ndarray) -> np.ndarray: """ @@ -100,6 +118,8 @@ class XGBoostModel: @property def classes_(self): """Get fitted class labels.""" + if self.label_encoder_ is not None: + return self.label_encoder_.classes_ return self.model.classes_ @property