From 26f2c44509dd648ebc8f30adbe9e32a525c65937 Mon Sep 17 00:00:00 2001
From: Marko Djordjevic <marko.homoludens@gmail.com>
Date: Wed, 18 Feb 2026 23:58:24 +0100
Subject: [PATCH] Fix XGBoost label encoding and single-class guard

---
 services/ml/training/models/xgboost_model.py | 36 +++++++++++++++-----
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/services/ml/training/models/xgboost_model.py b/services/ml/training/models/xgboost_model.py
index d18104d..e472e59 100644
--- a/services/ml/training/models/xgboost_model.py
+++ b/services/ml/training/models/xgboost_model.py
@@ -10,6 +10,7 @@ from typing import Any, Dict, Optional
 import numpy as np
 from xgboost import XGBClassifier
 from sklearn.utils.class_weight import compute_class_weight
+from sklearn.preprocessing import LabelEncoder
 
 
 class XGBoostModel:
@@ -33,6 +34,7 @@ class XGBoostModel:
         self.hyperparameters = hyperparameters.copy()
         self.class_weights = class_weights
         self._sample_weights = None
+        self.label_encoder_ = None
         
         # XGBoost doesn't have built-in class_weight parameter like sklearn
         # We'll compute sample weights manually when class_weights is "balanced"
@@ -51,25 +53,38 @@ class XGBoostModel:
         Returns:
             self
         """
+        classes = np.unique(y)
+        if classes.size < 2:
+            raise ValueError(
+                f"XGBoost requires at least 2 classes for training; got {classes.size} ({classes})"
+            )
+
+        # Reset first: an encoder left by an earlier fit() would corrupt predict().
+        self.label_encoder_ = None
+        y_encoded = y
+        is_int = np.issubdtype(np.asarray(y).dtype, np.integer)
+        if not (is_int and np.array_equal(classes, np.arange(classes.size))):
+            self.label_encoder_ = LabelEncoder()  # map labels to 0..K-1
+            y_encoded = self.label_encoder_.fit_transform(y)
+
         # Compute sample weights if class weighting is enabled
         if self.class_weights == "balanced":
             # Compute class weights
-            classes = np.unique(y)
             class_weights = compute_class_weight(
                 class_weight="balanced",
-                classes=classes,
-                y=y
+                classes=np.unique(y_encoded),
+                y=y_encoded
             )
             
             # Map class weights to sample weights
-            class_weight_dict = dict(zip(classes, class_weights))
-            sample_weights = np.array([class_weight_dict[label] for label in y])
+            class_weight_dict = dict(zip(np.unique(y_encoded), class_weights))
+            sample_weights = np.array([class_weight_dict[label] for label in y_encoded])
             
             # Fit with sample weights
-            self.model.fit(X, y, sample_weight=sample_weights)
+            self.model.fit(X, y_encoded, sample_weight=sample_weights)
         else:
             # Fit without sample weights
-            self.model.fit(X, y)
+            self.model.fit(X, y_encoded)
         
         return self
     
@@ -83,7 +98,10 @@ class XGBoostModel:
         Returns:
             Predicted labels (n_samples,)
         """
-        return self.model.predict(X)
+        raw = self.model.predict(X)
+        if self.label_encoder_ is None:
+            return raw
+        return self.label_encoder_.inverse_transform(raw.astype(int))
     
     def predict_proba(self, X: np.ndarray) -> np.ndarray:
         """
@@ -100,6 +118,8 @@ class XGBoostModel:
     @property
     def classes_(self):
         """Get fitted class labels."""
+        if self.label_encoder_ is not None:
+            return self.label_encoder_.classes_
         return self.model.classes_
     
     @property