# candle-annotator/services/ml/config/pipeline.yaml

# ML Pipeline Configuration
# Full config for feature engineering, annotation ingestion, training, and inference

# Dataset file locations, keyed by the role each file plays in the pipeline.
# NOTE(review): all paths are relative; presumably resolved against the
# service's working directory — confirm against the config loader.
data:
  raw_path: 'data/raw/OHLCV.csv'
  enriched_path: 'data/enriched/features.csv'
  labeled_path: 'data/labeled/dataset.csv'
  annotations_path: 'data/annotations/export.json'

stages:
  # Feature-engineering stage: indicator list plus candle/custom feature
  # settings.
  feature_engineering:
    enabled: true

    # TA-Lib technical indicators. Each entry pairs an indicator name with the
    # keyword parameters for it; a name may be listed more than once with
    # different parameters (EMA appears for periods 20 and 50).
    talib_indicators:
      - name: 'RSI'
        params: {timeperiod: 14}
      - name: 'EMA'
        params: {timeperiod: 20}
      - name: 'EMA'
        params: {timeperiod: 50}
      - name: 'MACD'
        params: {fastperiod: 12, slowperiod: 26, signalperiod: 9}
      - name: 'BBANDS'
        params: {timeperiod: 20, nbdevup: 2, nbdevdn: 2}
      - name: 'ATR'
        params: {timeperiod: 14}
      - name: 'ADX'
        params: {timeperiod: 14}
      - name: 'CCI'
        params: {timeperiod: 14}
      - name: 'MFI'
        params: {timeperiod: 14}
      - name: 'STOCH'
        params: {fastk_period: 14, slowk_period: 3, slowd_period: 3}

    # Toggle for candle-derived features
    candle_features: true

    # Module paths of custom feature functions (none configured)
    custom_features: []

  # Annotation-ingestion stage: how human annotations and programmatic
  # TA-Lib pattern labels are encoded and merged into one label set.
  annotation_ingestion:
    enabled: true

    # Label encoding: "window" or "bio"
    label_encoding: "window"

    # For windowed classification
    window_size: 30

    # Context padding (candles before/after)
    context_padding: 20

    # Minimum confidence for human annotations
    # NOTE(review): the confidence scale (e.g. 0-1 vs. integer levels) is not
    # visible in this file — confirm against the annotation export schema.
    min_confidence: 1

    # Programmatic TA-Lib pattern labels.
    # Entries must be exact TA-Lib function names. TA-Lib spells the
    # three-candle patterns with a digit (CDL3WHITESOLDIERS, CDL3BLACKCROWS);
    # the spelled-out "THREE" variants are not TA-Lib functions.
    programmatic_labels:
      enabled: true
      talib_patterns:
        - "CDLENGULFING"
        - "CDLHAMMER"
        - "CDLINVERTEDHAMMER"
        - "CDLSHOOTINGSTAR"
        - "CDLDOJI"
        - "CDLDOJISTAR"
        - "CDLMORNINGSTAR"
        - "CDLEVENINGSTAR"
        - "CDLHARAMI"
        - "CDLPIERCING"
        - "CDLDARKCLOUDCOVER"
        - "CDL3WHITESOLDIERS"
        - "CDL3BLACKCROWS"

    # Label merge strategy: "human_priority", "programmatic_priority", "both"
    merge_strategy: "human_priority"

  # Training stage: model choice, data split, class balancing,
  # hyperparameters, and MLflow tracking.
  training:
    enabled: true

    # Supported values: 'random_forest', 'xgboost'
    model_type: 'random_forest'

    # Data split; split_method is 'temporal' or 'random'
    split_method: 'temporal'
    test_split: 0.2
    validation_split: 0.1

    # Class balancing: 'balanced' or null
    class_weights: 'balanced'

    # Model-specific hyperparameters. The active values below are the
    # RandomForest set; an XGBoost set is kept commented out underneath.
    hyperparameters:
      n_estimators: 200
      max_depth: 15
      min_samples_split: 5
      min_samples_leaf: 2
      random_state: 42
      n_jobs: -1

      # XGBoost alternative (for model_type 'xgboost'):
      # n_estimators: 500
      # max_depth: 6
      # learning_rate: 0.01
      # subsample: 0.8
      # colsample_bytree: 0.8
      # random_state: 42
      # n_jobs: -1

    # MLflow tracking settings
    mlflow:
      tracking_uri: 'http://mlflow:5000'
      experiment_name: 'candlestick_patterns'
      log_artifacts: true
      # Set to true to register the model in the model registry
      register_model: false

  # Inference stage: where the model is loaded from and how batches are sized.
  inference:
    enabled: true

    # Where to load the model from: 'mlflow' or 'local'
    model_source: 'local'

    # Used with the 'mlflow' source.
    # mlflow_model_stage is one of 'Production', 'Staging', 'None'.
    mlflow_model_name: 'candlestick_pattern_v1'
    mlflow_model_stage: 'Production'

    # Used with the 'local' source
    local_model_path: 'models/best_model.pkl'

    # Batch processing
    batch_size: 1000

    # Preprocessing config: load it from the MLflow artifact, or use the
    # current config.
    # NOTE(review): presumably true selects the training-time config —
    # confirm against the inference loader.
    use_training_config: true