fix(ml): complete ML pipeline fixes and setup

- Fix CCI indicator to use HLC prices instead of close only
- Parse datetime column when loading enriched CSV
- Strip timezone from annotation timestamps
- Fix TA-Lib pattern names (CDL3WHITESOLDIERS, CDL3BLACKCROWS)
- Exclude programmatic label columns from training features
- Fix classification report to handle missing classes
- Update MLflow tracking to use localhost:5000
- Grant PostgreSQL permissions to ml_user

Pipeline now runs successfully end-to-end:
- Feature engineering: 2543 rows, 31 columns
- Annotation ingestion: 286 samples
- Training: 89.47% test accuracy with Random Forest
This commit is contained in:
Marko Djordjevic 2026-02-15 21:29:54 +01:00
parent ceb4103ec4
commit aa81d4f3d0
348 changed files with 1327 additions and 11 deletions

View file

@@ -0,0 +1,8 @@
                   precision    recall  f1-score   support

Bearish Engulfing     0.8065    1.0000    0.8929        25
Bullish Engulfing     1.0000    0.8125    0.8966        32

         accuracy                         0.8947        57
        macro avg     0.9032    0.9062    0.8947        57
     weighted avg     0.9151    0.8947    0.8949        57

View file

@@ -0,0 +1,97 @@
# Pipeline configuration: data file locations plus per-stage settings.
# NOTE(review): nesting was reconstructed from the alphabetical key order of
# the flattened original -- confirm against the schema the loader expects.
data:
  annotations_path: data/annotations/export.json
  enriched_path: data/enriched/features.csv
  labeled_path: data/labeled/dataset.csv
  raw_path: data/raw/OHLCV.csv

stages:
  annotation_ingestion:
    context_padding: 20
    enabled: true
    label_encoding: window
    merge_strategy: human_priority
    min_confidence: 1
    programmatic_labels:
      enabled: true
      # TA-Lib candlestick pattern function names.
      talib_patterns:
        - CDLENGULFING
        - CDLHAMMER
        - CDLINVERTEDHAMMER
        - CDLSHOOTINGSTAR
        - CDLDOJI
        - CDLDOJISTAR
        - CDLMORNINGSTAR
        - CDLEVENINGSTAR
        - CDLHARAMI
        - CDLPIERCING
        - CDLDARKCLOUDCOVER
        - CDL3WHITESOLDIERS
        - CDL3BLACKCROWS
    # NOTE(review): key order alone does not say whether window_size belongs
    # here or under programmatic_labels; placed beside label_encoding -- verify.
    window_size: 30

  feature_engineering:
    candle_features: true
    custom_features: []
    enabled: true
    # Each entry names one indicator and the parameters passed for it.
    # EMA appears twice on purpose, with different periods (20 and 50).
    talib_indicators:
      - name: RSI
        params:
          timeperiod: 14
      - name: EMA
        params:
          timeperiod: 20
      - name: EMA
        params:
          timeperiod: 50
      - name: MACD
        params:
          fastperiod: 12
          signalperiod: 9
          slowperiod: 26
      - name: BBANDS
        params:
          nbdevdn: 2
          nbdevup: 2
          timeperiod: 20
      - name: ATR
        params:
          timeperiod: 14
      - name: ADX
        params:
          timeperiod: 14
      - name: CCI
        params:
          timeperiod: 14
      - name: MFI
        params:
          timeperiod: 14
      - name: STOCH
        params:
          fastk_period: 14
          slowd_period: 3
          slowk_period: 3

  inference:
    batch_size: 1000
    enabled: true
    local_model_path: models/best_model.pkl
    mlflow_model_name: candlestick_pattern_v1
    mlflow_model_stage: Production
    model_source: local
    use_training_config: true

  training:
    class_weights: balanced
    enabled: true
    hyperparameters:
      max_depth: 15
      min_samples_leaf: 2
      min_samples_split: 5
      n_estimators: 200
      n_jobs: -1
      random_state: 42
    mlflow:
      experiment_name: candlestick_patterns
      log_artifacts: true
      register_model: false
      # Points at an MLflow server on the local machine.
      tracking_uri: http://localhost:5000
    model_type: random_forest
    split_method: temporal
    test_split: 0.2
    validation_split: 0.1