feat(ml): implement training stage with MLflow tracking and model wrappers

- Create RandomForestModel and XGBoostModel wrappers with class weight support
- Implement temporal and random train/val/test splitting
- Add MLflow experiment tracking with full parameter and metric logging
- Create evaluation module for confusion matrix, feature importance, and classification reports
- Implement model training with sklearn/xgboost flavor logging and optional registry registration
- Store training run metadata in PostgreSQL
- Wire training stage into pipeline.py orchestrator
- Support both RandomForest and XGBoost models with configurable hyperparameters
2026-02-15 14:22:19 +01:00 · 2026-02-15 14:22:19 +01:00 · f4c0f9a836
commit f4c0f9a836
parent 16763b967e
8 changed files with 900 additions and 14 deletions
--- a/services/ml/training/models/init.py
+++ b/services/ml/training/models/init.py
--- a/services/ml/training/models/random_forest.py
+++ b/services/ml/training/models/random_forest.py
@ -0,0 +1,100 @@
+"""
+RandomForest model wrapper for candlestick pattern classification.
+
+Provides a wrapper around scikit-learn's RandomForestClassifier with
+support for class weight balancing.
+"""
+
+from typing import Any, Dict, Optional
+
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+
+
+class RandomForestModel:
+    """
+    RandomForest classifier wrapper for candlestick patterns.
+    
+    Attributes:
+        model: The underlying RandomForestClassifier instance
+        classes_: Fitted class labels
+        feature_importances_: Feature importance scores (after fitting)
+    """
+    
+    def __init__(self, hyperparameters: Dict[str, Any], class_weights: Optional[str] = None):
+        """
+        Initialize RandomForest model.
+        
+        Args:
+            hyperparameters: Model hyperparameters from config
+            class_weights: "balanced" for inverse-frequency weighting, None for no weighting
+        """
+        self.hyperparameters = hyperparameters.copy()
+        self.class_weights = class_weights
+        
+        # Set class_weight parameter
+        if class_weights == "balanced":
+            self.hyperparameters["class_weight"] = "balanced"
+        
+        # Initialize scikit-learn model
+        self.model = RandomForestClassifier(**self.hyperparameters)
+        
+    def fit(self, X: np.ndarray, y: np.ndarray):
+        """
+        Train the RandomForest model.
+        
+        Args:
+            X: Training features (n_samples, n_features)
+            y: Training labels (n_samples,)
+            
+        Returns:
+            self
+        """
+        self.model.fit(X, y)
+        return self
+    
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        """
+        Predict class labels.
+        
+        Args:
+            X: Features (n_samples, n_features)
+            
+        Returns:
+            Predicted labels (n_samples,)
+        """
+        return self.model.predict(X)
+    
+    def predict_proba(self, X: np.ndarray) -> np.ndarray:
+        """
+        Predict class probabilities.
+        
+        Args:
+            X: Features (n_samples, n_features)
+            
+        Returns:
+            Class probabilities (n_samples, n_classes)
+        """
+        return self.model.predict_proba(X)
+    
+    @property
+    def classes_(self):
+        """Get fitted class labels."""
+        return self.model.classes_
+    
+    @property
+    def feature_importances_(self):
+        """Get feature importance scores."""
+        return self.model.feature_importances_
+    
+    def get_params(self) -> Dict[str, Any]:
+        """
+        Get model parameters.
+        
+        Returns:
+            Dictionary of model hyperparameters
+        """
+        return self.model.get_params()
+    
+    def __repr__(self):
+        return f"RandomForestModel(n_estimators={self.hyperparameters.get('n_estimators', 100)})"
--- a/services/ml/training/models/xgboost_model.py
+++ b/services/ml/training/models/xgboost_model.py
@ -0,0 +1,120 @@
+"""
+XGBoost model wrapper for candlestick pattern classification.
+
+Provides a wrapper around XGBoost's XGBClassifier with support for
+class weight balancing.
+"""
+
+from typing import Any, Dict, Optional
+
+import numpy as np
+from xgboost import XGBClassifier
+from sklearn.utils.class_weight import compute_class_weight
+
+
+class XGBoostModel:
+    """
+    XGBoost classifier wrapper for candlestick patterns.
+    
+    Attributes:
+        model: The underlying XGBClassifier instance
+        classes_: Fitted class labels
+        feature_importances_: Feature importance scores (after fitting)
+    """
+    
+    def __init__(self, hyperparameters: Dict[str, Any], class_weights: Optional[str] = None):
+        """
+        Initialize XGBoost model.
+        
+        Args:
+            hyperparameters: Model hyperparameters from config
+            class_weights: "balanced" for inverse-frequency weighting, None for no weighting
+        """
+        self.hyperparameters = hyperparameters.copy()
+        self.class_weights = class_weights
+        self._sample_weights = None
+        
+        # XGBoost doesn't have built-in class_weight parameter like sklearn
+        # We'll compute sample weights manually when class_weights is "balanced"
+        
+        # Initialize XGBoost model
+        self.model = XGBClassifier(**self.hyperparameters)
+        
+    def fit(self, X: np.ndarray, y: np.ndarray):
+        """
+        Train the XGBoost model.
+        
+        Args:
+            X: Training features (n_samples, n_features)
+            y: Training labels (n_samples,)
+            
+        Returns:
+            self
+        """
+        # Compute sample weights if class weighting is enabled
+        if self.class_weights == "balanced":
+            # Compute class weights
+            classes = np.unique(y)
+            class_weights = compute_class_weight(
+                class_weight="balanced",
+                classes=classes,
+                y=y
+            )
+            
+            # Map class weights to sample weights
+            class_weight_dict = dict(zip(classes, class_weights))
+            sample_weights = np.array([class_weight_dict[label] for label in y])
+            
+            # Fit with sample weights
+            self.model.fit(X, y, sample_weight=sample_weights)
+        else:
+            # Fit without sample weights
+            self.model.fit(X, y)
+        
+        return self
+    
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        """
+        Predict class labels.
+        
+        Args:
+            X: Features (n_samples, n_features)
+            
+        Returns:
+            Predicted labels (n_samples,)
+        """
+        return self.model.predict(X)
+    
+    def predict_proba(self, X: np.ndarray) -> np.ndarray:
+        """
+        Predict class probabilities.
+        
+        Args:
+            X: Features (n_samples, n_features)
+            
+        Returns:
+            Class probabilities (n_samples, n_classes)
+        """
+        return self.model.predict_proba(X)
+    
+    @property
+    def classes_(self):
+        """Get fitted class labels."""
+        return self.model.classes_
+    
+    @property
+    def feature_importances_(self):
+        """Get feature importance scores."""
+        return self.model.feature_importances_
+    
+    def get_params(self) -> Dict[str, Any]:
+        """
+        Get model parameters.
+        
+        Returns:
+            Dictionary of model hyperparameters
+        """
+        return self.model.get_params()
+    
+    def __repr__(self):
+        return f"XGBoostModel(n_estimators={self.hyperparameters.get('n_estimators', 100)})"