fix(ml): complete ML pipeline fixes and setup

- Fix CCI indicator to use HLC prices instead of close only
- Parse datetime column when loading enriched CSV
- Strip timezone from annotation timestamps
- Fix TA-Lib pattern names (CDL3WHITESOLDIERS, CDL3BLACKCROWS)
- Exclude programmatic label columns from training features
- Fix classification report to handle missing classes
- Update MLflow tracking to use localhost:5000
- Grant PostgreSQL permissions to ml_user

Pipeline now runs successfully end-to-end:
- Feature engineering: 2543 rows, 31 columns
- Annotation ingestion: 286 samples
- Training: 89.47% test accuracy with Random Forest
2026-02-15 21:29:54 +01:00 · 2026-02-15 21:29:54 +01:00 · aa81d4f3d0
commit aa81d4f3d0
parent ceb4103ec4
348 changed files with 1327 additions and 11 deletions
--- a/services/ml/training/__pycache__/evaluation.cpython-313.pyc
+++ b/services/ml/training/__pycache__/evaluation.cpython-313.pyc
--- a/services/ml/training/__pycache__/train.cpython-313.pyc
+++ b/services/ml/training/__pycache__/train.cpython-313.pyc
--- a/services/ml/training/evaluation.py
+++ b/services/ml/training/evaluation.py
@@ -133,11 +133,28 @@ def generate_classification_report_text(
    Returns:
        Classification report as string
    """
+    # Get unique labels present in y_true and y_pred
+    present_labels = np.unique(np.concatenate([y_true, y_pred]))
+    
+    # If labels provided, use them as target names for the present labels
+    if labels is not None:
+        # If labels are strings, filter to only present ones
+        if isinstance(labels[0], str):
+            target_names = [label for label in labels if label in present_labels]
+        else:
+            # If labels are indices, map them
+            target_names = [labels[i] if i < len(labels) else str(i) 
+                           for i in present_labels]
+    else:
+        target_names = None
+    
    return classification_report(
        y_true,
        y_pred,
-        target_names=labels,
-        digits=4
+        labels=present_labels if labels is None or isinstance(labels[0], str) else None,
+        target_names=target_names,
+        digits=4,
+        zero_division=0
    )


--- a/services/ml/training/train.py
+++ b/services/ml/training/train.py
@@ -208,7 +208,10 @@ def train(
        raise ValueError("Labeled dataset must have 'label' column")
    
    label_col = 'label'
-    feature_cols = [col for col in df.columns if col not in ['label', 'time', 'timestamp']]
+    # Exclude label columns, time columns, and programmatic label columns (which contain string values)
+    feature_cols = [col for col in df.columns 
+                    if col not in ['label', 'time', 'timestamp'] 
+                    and not col.startswith('label_programmatic_')]
    
    X = df[feature_cols].values
    y = df[label_col].values