fix(ml): complete ML pipeline fixes and setup

- Fix CCI indicator to use high/low/close (HLC) prices instead of close only
- Parse datetime column when loading enriched CSV
- Strip timezone from annotation timestamps
- Fix TA-Lib pattern names (CDL3WHITESOLDIERS, CDL3BLACKCROWS)
- Exclude programmatic label columns from training features
- Fix classification report to handle missing classes
- Update MLflow tracking URI from http://mlflow:5000 to http://localhost:5000
- Grant PostgreSQL permissions to ml_user

Pipeline now runs successfully end-to-end:
- Feature engineering: 2543 rows, 31 columns
- Annotation ingestion: 286 samples
- Training: 89.47% test accuracy with Random Forest
This commit is contained in:
Marko Djordjevic 2026-02-15 21:29:54 +01:00
parent ceb4103ec4
commit aa81d4f3d0
348 changed files with 1327 additions and 11 deletions

View file

@ -15,6 +15,11 @@ from sqlalchemy.orm import sessionmaker, Session
from sqlalchemy.sql import func
# PostgreSQL setup, for reference (these statements are not executed by this module):
# CREATE DATABASE ml_service;
# CREATE USER ml_user WITH ENCRYPTED PASSWORD 'ml_password';
# GRANT ALL PRIVILEGES ON DATABASE ml_service TO ml_user;
# Database connection configuration from environment
DATABASE_URL = os.getenv(
"DATABASE_URL",
@ -43,9 +48,9 @@ Base = declarative_base()
# Training runs model
class TrainingRun(Base):
"""Model for tracking ML training runs."""
__tablename__ = "training_runs"
id = Column(Integer, primary_key=True, index=True)
run_id = Column(String(255), unique=True, nullable=False, index=True)
model_type = Column(String(100), nullable=False)
@ -56,7 +61,7 @@ class TrainingRun(Base):
status = Column(String(50), nullable=False, default="running", index=True)
created_at = Column(DateTime(timezone=True), server_default=func.now(), index=True)
completed_at = Column(DateTime(timezone=True))
def __repr__(self):
return f"<TrainingRun(run_id='{self.run_id}', status='{self.status}')>"
@ -73,14 +78,14 @@ def init_db():
def get_db() -> Generator[Session, None, None]:
"""
Context manager for database sessions.
Usage:
with get_db() as db:
# Use db session here
training_run = TrainingRun(run_id="123", ...)
db.add(training_run)
db.commit()
Yields:
Database session
"""
@ -94,12 +99,12 @@ def get_db() -> Generator[Session, None, None]:
def get_db_session() -> Session:
"""
Get a database session (for dependency injection).
Usage with FastAPI:
@app.get("/")
def endpoint(db: Session = Depends(get_db_session)):
# Use db here
Returns:
Database session (caller must close it)
"""

View file

@ -127,7 +127,7 @@ stages:
# MLflow settings
mlflow:
tracking_uri: "http://mlflow:5000"
tracking_uri: "http://localhost:5000"
experiment_name: "candlestick_patterns"
log_artifacts: true
register_model: false # Set to true to register in model registry

View file

@ -0,0 +1,97 @@
data:
annotations_path: data/annotations/export.json
enriched_path: data/enriched/features.csv
labeled_path: data/labeled/dataset.csv
raw_path: data/raw/OHLCV.csv
stages:
annotation_ingestion:
context_padding: 20
enabled: true
label_encoding: window
merge_strategy: human_priority
min_confidence: 1
programmatic_labels:
enabled: true
talib_patterns:
- CDLENGULFING
- CDLHAMMER
- CDLINVERTEDHAMMER
- CDLSHOOTINGSTAR
- CDLDOJI
- CDLDOJISTAR
- CDLMORNINGSTAR
- CDLEVENINGSTAR
- CDLHARAMI
- CDLPIERCING
- CDLDARKCLOUDCOVER
- CDL3WHITESOLDIERS
- CDL3BLACKCROWS
window_size: 30
feature_engineering:
candle_features: true
custom_features: []
enabled: true
talib_indicators:
- name: RSI
params:
timeperiod: 14
- name: EMA
params:
timeperiod: 20
- name: EMA
params:
timeperiod: 50
- name: MACD
params:
fastperiod: 12
signalperiod: 9
slowperiod: 26
- name: BBANDS
params:
nbdevdn: 2
nbdevup: 2
timeperiod: 20
- name: ATR
params:
timeperiod: 14
- name: ADX
params:
timeperiod: 14
- name: CCI
params:
timeperiod: 14
- name: MFI
params:
timeperiod: 14
- name: STOCH
params:
fastk_period: 14
slowd_period: 3
slowk_period: 3
inference:
batch_size: 1000
enabled: true
local_model_path: models/best_model.pkl
mlflow_model_name: candlestick_pattern_v1
mlflow_model_stage: Production
model_source: local
use_training_config: true
training:
class_weights: balanced
enabled: true
hyperparameters:
max_depth: 15
min_samples_leaf: 2
min_samples_split: 5
n_estimators: 200
n_jobs: -1
random_state: 42
mlflow:
experiment_name: candlestick_patterns
log_artifacts: true
register_model: false
tracking_uri: http://localhost:5000
model_type: random_forest
split_method: temporal
test_split: 0.2
validation_split: 0.1

View file

@ -0,0 +1,14 @@
artifact_uri: file:///home/homoludens/projekti/bitcon/candle_annotator/services/ml/mlruns/358560345319124639/0114b8ddfb2a45dabcb10e9836872de0/artifacts
end_time: 1771187325601
entry_point_name: ''
experiment_id: '358560345319124639'
lifecycle_stage: active
run_id: 0114b8ddfb2a45dabcb10e9836872de0
run_name: abundant-crane-122
source_name: ''
source_type: 4
source_version: ''
start_time: 1771187322888
status: 4
tags: []
user_id: homoludens

View file

@ -0,0 +1 @@
1771187324134 0.8947368421052632 0

View file

@ -0,0 +1 @@
1771187324195 0.8928571428571429 0

View file

@ -0,0 +1 @@
1771187324229 0.896551724137931 0

View file

@ -0,0 +1 @@
1771187324147 0.8947044334975369 0

View file

@ -0,0 +1 @@
1771187324158 0.8949312937516204 0

View file

@ -0,0 +1 @@
1771187324176 0.8064516129032258 0

View file

@ -0,0 +1 @@
ceb4103ec4eefe0f0b6444db2bab6efbe6526f5e

View file

@ -0,0 +1,97 @@
data:
annotations_path: data/annotations/export.json
enriched_path: data/enriched/features.csv
labeled_path: data/labeled/dataset.csv
raw_path: data/raw/OHLCV.csv
stages:
annotation_ingestion:
context_padding: 20
enabled: true
label_encoding: window
merge_strategy: human_priority
min_confidence: 1
programmatic_labels:
enabled: true
talib_patterns:
- CDLENGULFING
- CDLHAMMER
- CDLINVERTEDHAMMER
- CDLSHOOTINGSTAR
- CDLDOJI
- CDLDOJISTAR
- CDLMORNINGSTAR
- CDLEVENINGSTAR
- CDLHARAMI
- CDLPIERCING
- CDLDARKCLOUDCOVER
- CDL3WHITESOLDIERS
- CDL3BLACKCROWS
window_size: 30
feature_engineering:
candle_features: true
custom_features: []
enabled: true
talib_indicators:
- name: RSI
params:
timeperiod: 14
- name: EMA
params:
timeperiod: 20
- name: EMA
params:
timeperiod: 50
- name: MACD
params:
fastperiod: 12
signalperiod: 9
slowperiod: 26
- name: BBANDS
params:
nbdevdn: 2
nbdevup: 2
timeperiod: 20
- name: ATR
params:
timeperiod: 14
- name: ADX
params:
timeperiod: 14
- name: CCI
params:
timeperiod: 14
- name: MFI
params:
timeperiod: 14
- name: STOCH
params:
fastk_period: 14
slowd_period: 3
slowk_period: 3
inference:
batch_size: 1000
enabled: true
local_model_path: models/best_model.pkl
mlflow_model_name: candlestick_pattern_v1
mlflow_model_stage: Production
model_source: local
use_training_config: true
training:
class_weights: balanced
enabled: true
hyperparameters:
max_depth: 15
min_samples_leaf: 2
min_samples_split: 5
n_estimators: 200
n_jobs: -1
random_state: 42
mlflow:
experiment_name: candlestick_patterns
log_artifacts: true
register_model: false
tracking_uri: http://localhost:5000
model_type: random_forest
split_method: temporal
test_split: 0.2
validation_split: 0.1

View file

@ -0,0 +1,14 @@
artifact_uri: file:///home/homoludens/projekti/bitcon/candle_annotator/services/ml/mlruns/358560345319124639/26c245a4e6dc45e8b11e617e5702be1a/artifacts
end_time: 1771187348138
entry_point_name: ''
experiment_id: '358560345319124639'
lifecycle_stage: active
run_id: 26c245a4e6dc45e8b11e617e5702be1a
run_name: bald-bear-921
source_name: ''
source_type: 4
source_version: ''
start_time: 1771187345480
status: 4
tags: []
user_id: homoludens

View file

@ -0,0 +1 @@
1771187346671 0.8947368421052632 0

View file

@ -0,0 +1 @@
1771187346744 0.8928571428571429 0

View file

@ -0,0 +1 @@
1771187346777 0.896551724137931 0

View file

@ -0,0 +1 @@
1771187346686 0.8947044334975369 0

View file

@ -0,0 +1 @@
1771187346702 0.8949312937516204 0

View file

@ -0,0 +1 @@
1771187346722 0.8064516129032258 0

View file

@ -0,0 +1 @@
ceb4103ec4eefe0f0b6444db2bab6efbe6526f5e

View file

@ -0,0 +1,97 @@
data:
annotations_path: data/annotations/export.json
enriched_path: data/enriched/features.csv
labeled_path: data/labeled/dataset.csv
raw_path: data/raw/OHLCV.csv
stages:
annotation_ingestion:
context_padding: 20
enabled: true
label_encoding: window
merge_strategy: human_priority
min_confidence: 1
programmatic_labels:
enabled: true
talib_patterns:
- CDLENGULFING
- CDLHAMMER
- CDLINVERTEDHAMMER
- CDLSHOOTINGSTAR
- CDLDOJI
- CDLDOJISTAR
- CDLMORNINGSTAR
- CDLEVENINGSTAR
- CDLHARAMI
- CDLPIERCING
- CDLDARKCLOUDCOVER
- CDL3WHITESOLDIERS
- CDL3BLACKCROWS
window_size: 30
feature_engineering:
candle_features: true
custom_features: []
enabled: true
talib_indicators:
- name: RSI
params:
timeperiod: 14
- name: EMA
params:
timeperiod: 20
- name: EMA
params:
timeperiod: 50
- name: MACD
params:
fastperiod: 12
signalperiod: 9
slowperiod: 26
- name: BBANDS
params:
nbdevdn: 2
nbdevup: 2
timeperiod: 20
- name: ATR
params:
timeperiod: 14
- name: ADX
params:
timeperiod: 14
- name: CCI
params:
timeperiod: 14
- name: MFI
params:
timeperiod: 14
- name: STOCH
params:
fastk_period: 14
slowd_period: 3
slowk_period: 3
inference:
batch_size: 1000
enabled: true
local_model_path: models/best_model.pkl
mlflow_model_name: candlestick_pattern_v1
mlflow_model_stage: Production
model_source: local
use_training_config: true
training:
class_weights: balanced
enabled: true
hyperparameters:
max_depth: 15
min_samples_leaf: 2
min_samples_split: 5
n_estimators: 200
n_jobs: -1
random_state: 42
mlflow:
experiment_name: candlestick_patterns
log_artifacts: true
register_model: false
tracking_uri: http://localhost:5000
model_type: random_forest
split_method: temporal
test_split: 0.2
validation_split: 0.1

View file

@ -0,0 +1,14 @@
artifact_uri: file:///home/homoludens/projekti/bitcon/candle_annotator/services/ml/mlruns/358560345319124639/808c2e97d90c40feb8d35a7e348c8e4c/artifacts
end_time: 1771187283377
entry_point_name: ''
experiment_id: '358560345319124639'
lifecycle_stage: active
run_id: 808c2e97d90c40feb8d35a7e348c8e4c
run_name: carefree-sheep-418
source_name: ''
source_type: 4
source_version: ''
start_time: 1771187280389
status: 4
tags: []
user_id: homoludens

View file

@ -0,0 +1 @@
1771187281809 0.8947368421052632 0

View file

@ -0,0 +1 @@
1771187281869 0.8928571428571429 0

View file

@ -0,0 +1 @@
1771187281896 0.896551724137931 0

View file

@ -0,0 +1 @@
1771187281826 0.8947044334975369 0

View file

@ -0,0 +1 @@
1771187281836 0.8949312937516204 0

View file

@ -0,0 +1 @@
1771187281852 0.8064516129032258 0

Some files were not shown because too many files have changed in this diff Show more