# candle-annotator/services/ml/config/pipeline.yaml
# (file-viewer metadata from the original listing: 152 lines, 3.6 KiB, YAML)

# ML Pipeline Configuration
# Full config for feature engineering, annotation ingestion, training, and inference
# Dataset and annotation file locations used by the pipeline.
# All four are relative paths; the base directory they resolve against is
# not defined in this file (presumably the service working dir — confirm).
data:
  raw_path: "data/raw/OHLCV.csv"
  enriched_path: "data/enriched/features.csv"
  labeled_path: "data/labeled/dataset.csv"
  annotations_path: "data/annotations/export.json"
# Per-stage settings. Each stage carries its own `enabled` flag.
# NOTE(review): the nesting below was reconstructed from a listing whose
# indentation had been lost; confirm the structure against the config loader.
stages:
  feature_engineering:
    enabled: true

    # TA-Lib technical indicators
    # Each entry is a TA-Lib function name plus its parameters (presumably
    # forwarded as keyword arguments — confirm against the feature code).
    # EMA is listed twice on purpose, with periods 20 and 50.
    talib_indicators:
      - name: "RSI"
        params:
          timeperiod: 14
      - name: "EMA"
        params:
          timeperiod: 20
      - name: "EMA"
        params:
          timeperiod: 50
      - name: "MACD"
        params:
          fastperiod: 12
          slowperiod: 26
          signalperiod: 9
      - name: "BBANDS"
        params:
          timeperiod: 20
          nbdevup: 2
          nbdevdn: 2
      - name: "ATR"
        params:
          timeperiod: 14
      - name: "ADX"
        params:
          timeperiod: 14
      - name: "CCI"
        params:
          timeperiod: 14
      - name: "MFI"
        params:
          timeperiod: 14
      - name: "STOCH"
        params:
          fastk_period: 14
          slowk_period: 3
          slowd_period: 3

    # Candle-derived features
    candle_features: true

    # Custom feature functions (module paths)
    custom_features: []

  annotation_ingestion:
    enabled: true

    # Label encoding: "window" or "bio"
    label_encoding: "window"

    # For windowed classification
    window_size: 30

    # Context padding (candles before/after)
    context_padding: 20

    # Minimum confidence for human annotations
    min_confidence: 1

    # Programmatic TA-Lib pattern labels
    programmatic_labels:
      enabled: true
      talib_patterns:
        - "CDLENGULFING"
        - "CDLHAMMER"
        - "CDLINVERTEDHAMMER"
        - "CDLSHOOTINGSTAR"
        - "CDLDOJI"
        - "CDLDOJISTAR"
        - "CDLMORNINGSTAR"
        - "CDLEVENINGSTAR"
        - "CDLHARAMI"
        - "CDLPIERCING"
        - "CDLDARKCLOUDCOVER"
        - "CDLTHREEWHITESOLDIERS"
        - "CDLTHREEBLACKCROWS"

    # Label merge strategy: "human_priority", "programmatic_priority", "both"
    # NOTE(review): placed at the annotation_ingestion level because it names
    # both label sources; it may instead belong under programmatic_labels —
    # confirm against the loader.
    merge_strategy: "human_priority"

  training:
    enabled: true

    # Model type: "random_forest", "xgboost"
    model_type: "random_forest"

    # Train/test split
    split_method: "temporal"  # "temporal" or "random"
    test_split: 0.2
    validation_split: 0.1

    # Class balancing
    class_weights: "balanced"  # "balanced" or null

    # Hyperparameters (model-specific)
    hyperparameters:
      # RandomForest
      n_estimators: 200
      max_depth: 15
      min_samples_split: 5
      min_samples_leaf: 2
      random_state: 42
      n_jobs: -1
      # XGBoost (when model_type is "xgboost")
      # n_estimators: 500
      # max_depth: 6
      # learning_rate: 0.01
      # subsample: 0.8
      # colsample_bytree: 0.8
      # random_state: 42
      # n_jobs: -1

    # MLflow settings
    # NOTE(review): nested under training because it sits between the
    # training and inference stages; confirm it is not a top-level section.
    mlflow:
      tracking_uri: "http://mlflow:5000"
      experiment_name: "candlestick_patterns"
      log_artifacts: true
      register_model: false  # Set to true to register in model registry

  inference:
    enabled: true

    # Model source: "mlflow" or "local"
    model_source: "local"

    # For MLflow source
    mlflow_model_name: "candlestick_pattern_v1"
    mlflow_model_stage: "Production"  # "Production", "Staging", "None"

    # For local source
    local_model_path: "models/best_model.pkl"

    # Batch processing
    batch_size: 1000

    # Preprocessing config loaded from MLflow artifact or use current config
    use_training_config: true