# candle-annotator/services/ml/training/models/xgboost_model.py

"""
XGBoost model wrapper for candlestick pattern classification.

Provides a wrapper around XGBoost's XGBClassifier with support for
class weight balancing.
"""

from typing import Any, Dict, Optional

import numpy as np
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder


class XGBoostModel:
    """
    XGBoost classifier wrapper for candlestick patterns.

    Adds two conveniences on top of ``XGBClassifier``:

    * optional "balanced" class weighting, implemented through per-sample
      weights because XGBoost has no sklearn-style ``class_weight`` parameter;
    * transparent label encoding, so labels that are not already the
      integers ``0..n_classes-1`` can be used for training and are returned
      in their original form by ``predict``.

    Attributes:
        model: The underlying XGBClassifier instance
        hyperparameters: Copy of the hyperparameters passed to the constructor
        class_weights: Weighting mode ("balanced" or None)
        label_encoder_: LabelEncoder fitted by the most recent ``fit`` call,
            or None when the labels were used as-is
        classes_: Fitted class labels
        feature_importances_: Feature importance scores (after fitting)
    """

    def __init__(self, hyperparameters: Dict[str, Any], class_weights: Optional[str] = None):
        """
        Initialize XGBoost model.

        Args:
            hyperparameters: Model hyperparameters from config; copied so the
                caller's dict is never mutated and forwarded verbatim to
                ``XGBClassifier``
            class_weights: "balanced" for inverse-frequency weighting, None for
                no weighting (any value other than "balanced" trains unweighted)
        """
        self.hyperparameters = hyperparameters.copy()
        self.class_weights = class_weights
        # Kept for backward compatibility; not read or written elsewhere in this class.
        self._sample_weights = None
        self.label_encoder_ = None

        # XGBoost doesn't have a built-in class_weight parameter like sklearn,
        # so "balanced" weighting is applied as sample weights inside fit().
        self.model = XGBClassifier(**self.hyperparameters)

    def fit(self, X: np.ndarray, y: np.ndarray) -> "XGBoostModel":
        """
        Train the XGBoost model.

        Labels that are not already the integers ``0..n_classes-1`` are
        encoded with a ``LabelEncoder`` before training; ``predict`` and
        ``classes_`` translate back to the original labels.

        Args:
            X: Training features (n_samples, n_features)
            y: Training labels (n_samples,)

        Returns:
            self

        Raises:
            ValueError: If ``y`` contains fewer than 2 distinct classes.
        """
        classes = np.unique(y)
        if classes.size < 2:
            raise ValueError(
                f"XGBoost requires at least 2 classes for training; got {classes.size} ({classes})"
            )

        # Drop any encoder left over from a previous fit. Without this, a refit
        # on already-canonical labels would keep the old encoder and predict()
        # would translate the new model's output through the wrong mapping.
        self.label_encoder_ = None

        # np.unique returns sorted values, so they can be compared to arange directly.
        labels_are_canonical = np.issubdtype(
            np.asarray(y).dtype, np.integer
        ) and np.array_equal(classes, np.arange(classes.size))

        y_encoded = y
        if not labels_are_canonical:
            self.label_encoder_ = LabelEncoder()
            y_encoded = self.label_encoder_.fit_transform(y)

        if self.class_weights != "balanced":
            # Fit without sample weights
            self.model.fit(X, y_encoded)
            return self

        # One weight per class, ordered like the sorted encoded classes.
        per_class_weights = compute_class_weight(
            class_weight="balanced",
            classes=np.unique(y_encoded),
            y=y_encoded,
        )

        # Encoded labels are exactly 0..n_classes-1 at this point, so each label
        # is also the index of its class weight.
        sample_weights = per_class_weights[np.asarray(y_encoded)]

        self.model.fit(X, y_encoded, sample_weight=sample_weights)
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Predict class labels.

        Args:
            X: Features (n_samples, n_features)

        Returns:
            Predicted labels (n_samples,), in the original label space when
            the labels were encoded during ``fit``
        """
        preds = self.model.predict(X)
        if self.label_encoder_ is None:
            return preds
        # The encoder expects integer class indices.
        return self.label_encoder_.inverse_transform(preds.astype(int))

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """
        Predict class probabilities.

        Args:
            X: Features (n_samples, n_features)

        Returns:
            Class probabilities (n_samples, n_classes), as returned by the
            underlying model
        """
        return self.model.predict_proba(X)

    @property
    def classes_(self):
        """Get fitted class labels (original labels when an encoder was used)."""
        if self.label_encoder_ is not None:
            return self.label_encoder_.classes_
        return self.model.classes_

    @property
    def feature_importances_(self):
        """Get feature importance scores from the underlying model."""
        return self.model.feature_importances_

    def get_params(self) -> Dict[str, Any]:
        """
        Get model parameters.

        Returns:
            Dictionary of model hyperparameters as reported by the underlying model
        """
        return self.model.get_params()

    def __repr__(self):
        return f"XGBoostModel(n_estimators={self.hyperparameters.get('n_estimators', 100)})"