feat: add ML service scaffolding with Python FastAPI, Docker, and MLflow setup

2026-02-15 11:58:31 +01:00 · 2026-02-15 11:58:31 +01:00 · 1a653c5866
commit 1a653c5866
parent 92abab5316
18 changed files with 1952 additions and 2593 deletions
--- a/services/ml/config/pipeline.yaml
+++ b/services/ml/config/pipeline.yaml
@ -0,0 +1,152 @@
+# ML Pipeline Configuration
+# Full config for feature engineering, annotation ingestion, training, and inference
+
+data:  # File locations used by the pipeline (all relative paths)
+  raw_path: "data/raw/OHLCV.csv"
+  enriched_path: "data/enriched/features.csv"  # presumably written by feature_engineering — confirm
+  labeled_path: "data/labeled/dataset.csv"  # presumably written by annotation_ingestion — confirm
+  annotations_path: "data/annotations/export.json"  # NOTE(review): JSON, unlike the CSV paths above
+
+stages:
+  feature_engineering:  # Stage settings: indicator list, candle features, custom features
+    enabled: true
+    
+    # TA-Lib technical indicators: each entry names an indicator and its params
+    talib_indicators:
+      - name: "RSI"
+        params:
+          timeperiod: 14
+      - name: "EMA"  # first of two EMA entries (timeperiod 20)
+        params:
+          timeperiod: 20
+      - name: "EMA"  # second EMA entry: same indicator, timeperiod 50
+        params:
+          timeperiod: 50
+      - name: "MACD"
+        params:
+          fastperiod: 12
+          slowperiod: 26
+          signalperiod: 9
+      - name: "BBANDS"
+        params:
+          timeperiod: 20
+          nbdevup: 2
+          nbdevdn: 2
+      - name: "ATR"
+        params:
+          timeperiod: 14
+      - name: "ADX"
+        params:
+          timeperiod: 14
+      - name: "CCI"
+        params:
+          timeperiod: 14
+      - name: "MFI"
+        params:
+          timeperiod: 14
+      - name: "STOCH"
+        params:
+          fastk_period: 14
+          slowk_period: 3
+          slowd_period: 3
+    
+    # Candle-derived features (boolean toggle; the feature set itself is not defined in this file)
+    candle_features: true
+    
+    # Custom feature functions (module paths); empty list = none configured
+    custom_features: []
+    
+  annotation_ingestion:  # Stage settings: label encoding, windowing, programmatic labels, merging
+    enabled: true
+    
+    # Label encoding: "window" or "bio"
+    label_encoding: "window"
+    
+    # For windowed classification (presumably only used when label_encoding is "window" — confirm)
+    window_size: 30  # unit presumably candles, as for context_padding — TODO confirm
+    
+    # Context padding (candles before/after)
+    context_padding: 20
+    
+    # Minimum confidence for human annotations (scale is not defined in this file — TODO confirm)
+    min_confidence: 1
+    
+    # Programmatic labels from TA-Lib candlestick pattern functions (CDL* names)
+    programmatic_labels:
+      enabled: true
+      talib_patterns:
+        - "CDLENGULFING"
+        - "CDLHAMMER"
+        - "CDLINVERTEDHAMMER"
+        - "CDLSHOOTINGSTAR"
+        - "CDLDOJI"
+        - "CDLDOJISTAR"
+        - "CDLMORNINGSTAR"
+        - "CDLEVENINGSTAR"
+        - "CDLHARAMI"
+        - "CDLPIERCING"
+        - "CDLDARKCLOUDCOVER"
+        - "CDLTHREEWHITESOLDIERS"
+        - "CDLTHREEBLACKCROWS"
+    
+    # Strategy for merging human and programmatic labels: "human_priority", "programmatic_priority", "both"
+    merge_strategy: "human_priority"
+  
+  training:  # Stage settings: model choice, data split, class balancing, hyperparameters, MLflow
+    enabled: true
+    
+    # Model type: "random_forest" or "xgboost"
+    model_type: "random_forest"
+    
+    # Train/test/validation split settings
+    split_method: "temporal"  # "temporal" or "random"
+    test_split: 0.2  # presumably a fraction of the data — confirm; same for validation_split
+    validation_split: 0.1
+    
+    # Class balancing
+    class_weights: "balanced"  # "balanced" or null
+    
+    # Hyperparameters (model-specific); only one model's keys may be active at a time
+    hyperparameters:
+      # RandomForest (active set: model_type above is "random_forest")
+      n_estimators: 200
+      max_depth: 15
+      min_samples_split: 5
+      min_samples_leaf: 2
+      random_state: 42
+      n_jobs: -1  # -1 presumably means "use all cores" (scikit-learn convention) — confirm
+      
+      # XGBoost (when model_type is "xgboost"): replace the RandomForest keys above to avoid duplicate keys
+      # n_estimators: 500
+      # max_depth: 6
+      # learning_rate: 0.01
+      # subsample: 0.8
+      # colsample_bytree: 0.8
+      # random_state: 42
+      # n_jobs: -1
+    
+    # MLflow settings
+    mlflow:
+      tracking_uri: "http://mlflow:5000"  # host "mlflow" is presumably a Docker service name — confirm
+      experiment_name: "candlestick_patterns"
+      log_artifacts: true
+      register_model: false  # Set to true to register in model registry
+
+  inference:  # Stage settings: where the model is loaded from, batching, preprocessing config
+    enabled: true
+    
+    # Model source: "mlflow" or "local"
+    model_source: "local"  # "local" is selected, so local_model_path below is the relevant setting
+    
+    # For MLflow source (presumably ignored while model_source is "local" — confirm)
+    mlflow_model_name: "candlestick_pattern_v1"
+    mlflow_model_stage: "Production"  # "Production", "Staging", "None"
+    
+    # For local source
+    local_model_path: "models/best_model.pkl"
+    
+    # Batch processing
+    batch_size: 1000
+    
+    # Preprocessing config: true = load it from the MLflow artifact, false = use the current config (TODO confirm)
+    use_training_config: true  # NOTE(review): model_source is "local"; confirm where the training config comes from then