---
# ML Pipeline Configuration
# Full config for feature engineering, annotation ingestion, training, and inference
#
# NOTE(review): the block structure below was reconstructed from a
# whitespace-collapsed copy of this file. Keys, values and their order are
# unchanged; confirm the nesting against the code that loads this config.

data:
  raw_path: "data/raw/OHLCV.csv"
  enriched_path: "data/enriched/features.csv"
  labeled_path: "data/labeled/dataset.csv"
  annotations_path: "data/annotations/export.json"

stages:
  feature_engineering:
    enabled: true

    # TA-Lib technical indicators
    # Each entry names an indicator and the keyword parameters given for it.
    # The same indicator may be listed more than once with different
    # parameters (EMA appears with timeperiod 20 and 50).
    talib_indicators:
      - name: "RSI"
        params:
          timeperiod: 14
      - name: "EMA"
        params:
          timeperiod: 20
      - name: "EMA"
        params:
          timeperiod: 50
      - name: "MACD"
        params:
          fastperiod: 12
          slowperiod: 26
          signalperiod: 9
      - name: "BBANDS"
        params:
          timeperiod: 20
          nbdevup: 2
          nbdevdn: 2
      - name: "ATR"
        params:
          timeperiod: 14
      - name: "ADX"
        params:
          timeperiod: 14
      - name: "CCI"
        params:
          timeperiod: 14
      - name: "MFI"
        params:
          timeperiod: 14
      - name: "STOCH"
        params:
          fastk_period: 14
          slowk_period: 3
          slowd_period: 3

    # Candle-derived features
    candle_features: true

    # Custom feature functions (module paths)
    custom_features: []

  annotation_ingestion:
    enabled: true

    # Label encoding: "window" or "bio"
    label_encoding: "window"

    # For windowed classification
    window_size: 30

    # Context padding (candles before/after)
    context_padding: 20

    # Minimum confidence for human annotations
    # NOTE(review): the confidence scale is not defined in this file; confirm
    # that 1 is the intended threshold on the annotation tool's scale.
    min_confidence: 1

    # Programmatic TA-Lib pattern labels
    programmatic_labels:
      enabled: true
      talib_patterns:
        - "CDLENGULFING"
        - "CDLHAMMER"
        - "CDLINVERTEDHAMMER"
        - "CDLSHOOTINGSTAR"
        - "CDLDOJI"
        - "CDLDOJISTAR"
        - "CDLMORNINGSTAR"
        - "CDLEVENINGSTAR"
        - "CDLHARAMI"
        - "CDLPIERCING"
        - "CDLDARKCLOUDCOVER"
        - "CDLTHREEWHITESOLDIERS"
        - "CDLTHREEBLACKCROWS"

    # Label merge strategy: "human_priority", "programmatic_priority", "both"
    # NOTE(review): placed as a sibling of programmatic_labels; confirm it is
    # not meant to be nested inside it.
    merge_strategy: "human_priority"

  training:
    enabled: true

    # Model type: "random_forest", "xgboost"
    model_type: "random_forest"

    # Train/test split
    split_method: "temporal"  # "temporal" or "random"
    test_split: 0.2
    validation_split: 0.1

    # Class balancing
    class_weights: "balanced"  # "balanced" or null

    # Hyperparameters (model-specific)
    hyperparameters:
      # RandomForest
      n_estimators: 200
      max_depth: 15
      min_samples_split: 5
      min_samples_leaf: 2
      random_state: 42
      n_jobs: -1

      # XGBoost (when model_type is "xgboost")
      # n_estimators: 500
      # max_depth: 6
      # learning_rate: 0.01
      # subsample: 0.8
      # colsample_bytree: 0.8
      # random_state: 42
      # n_jobs: -1

    # MLflow settings
    # NOTE(review): placed under training; confirm the loader does not expect
    # this mapping at a different level.
    mlflow:
      tracking_uri: "http://mlflow:5000"
      experiment_name: "candlestick_patterns"
      log_artifacts: true
      register_model: false  # Set to true to register in model registry

  # NOTE(review): placed under stages alongside the other stages; confirm.
  inference:
    enabled: true

    # Model source: "mlflow" or "local"
    model_source: "local"

    # For MLflow source
    mlflow_model_name: "candlestick_pattern_v1"
    mlflow_model_stage: "Production"  # "Production", "Staging", "None"

    # For local source
    local_model_path: "models/best_model.pkl"

    # Batch processing
    batch_size: 1000

    # Preprocessing config loaded from MLflow artifact or use current config
    use_training_config: true