Fix XGBoost label encoding and add single-class guard

2026-02-18 23:58:24 +01:00 · 2026-02-18 23:58:24 +01:00 · 26f2c44509
commit 26f2c44509
parent 73c10a4156
1 changed file with 28 additions and 8 deletions
--- a/services/ml/training/models/xgboost_model.py
+++ b/services/ml/training/models/xgboost_model.py
@@ -10,6 +10,7 @@ from typing import Any, Dict, Optional
 import numpy as np
 from xgboost import XGBClassifier
 from sklearn.utils.class_weight import compute_class_weight
+from sklearn.preprocessing import LabelEncoder


 class XGBoostModel:
@@ -33,6 +34,7 @@ class XGBoostModel:
        self.hyperparameters = hyperparameters.copy()
        self.class_weights = class_weights
        self._sample_weights = None
+        self.label_encoder_ = None
        
        # XGBoost doesn't have built-in class_weight parameter like sklearn
        # We'll compute sample weights manually when class_weights is "balanced"
@@ -51,25 +53,38 @@ class XGBoostModel:
        Returns:
            self
        """
+        classes = np.unique(y)
+        if classes.size < 2:
+            raise ValueError(
+                f"XGBoost requires at least 2 classes for training; got {classes.size} ({classes})"
+            )
+
+        y_encoded = y
+        if not (
+            np.issubdtype(np.asarray(y).dtype, np.integer)
+            and np.array_equal(np.sort(classes), np.arange(classes.size))
+        ):
+            self.label_encoder_ = LabelEncoder()
+            y_encoded = self.label_encoder_.fit_transform(y)
+
        # Compute sample weights if class weighting is enabled
        if self.class_weights == "balanced":
            # Compute class weights
-            classes = np.unique(y)
            class_weights = compute_class_weight(
                class_weight="balanced",
-                classes=classes,
-                y=y
+                classes=np.unique(y_encoded),
+                y=y_encoded
            )
            
            # Map class weights to sample weights
-            class_weight_dict = dict(zip(classes, class_weights))
-            sample_weights = np.array([class_weight_dict[label] for label in y])
+            class_weight_dict = dict(zip(np.unique(y_encoded), class_weights))
+            sample_weights = np.array([class_weight_dict[label] for label in y_encoded])
            
            # Fit with sample weights
-            self.model.fit(X, y, sample_weight=sample_weights)
+            self.model.fit(X, y_encoded, sample_weight=sample_weights)
        else:
            # Fit without sample weights
-            self.model.fit(X, y)
+            self.model.fit(X, y_encoded)
        
        return self
    
@@ -83,7 +98,10 @@ class XGBoostModel:
        Returns:
            Predicted labels (n_samples,)
        """
-        return self.model.predict(X)
+        preds = self.model.predict(X)
+        if self.label_encoder_ is not None:
+            return self.label_encoder_.inverse_transform(preds.astype(int))
+        return preds
    
    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """
@@ -100,6 +118,8 @@ class XGBoostModel:
    @property
    def classes_(self):
        """Get fitted class labels."""
+        if self.label_encoder_ is not None:
+            return self.label_encoder_.classes_
        return self.model.classes_
    
    @property