fix(ml): complete ML pipeline fixes and setup

- Fix CCI indicator to use high/low/close (HLC) prices instead of close only
- Parse datetime column when loading enriched CSV
- Strip timezone from annotation timestamps
- Fix TA-Lib pattern names (CDL3WHITESOLDIERS, CDL3BLACKCROWS)
- Exclude programmatic label columns from training features
- Fix classification report to handle missing classes
- Update MLflow tracking to use localhost:5000
- Grant PostgreSQL permissions to ml_user

Pipeline now runs successfully end-to-end:
- Feature engineering: 2543 rows, 31 columns
- Annotation ingestion: 286 samples
- Training: 89.47% test accuracy with Random Forest
This commit is contained in:
Marko Djordjevic 2026-02-15 21:29:54 +01:00
parent ceb4103ec4
commit aa81d4f3d0
348 changed files with 1327 additions and 11 deletions

View file

@@ -133,11 +133,28 @@ def generate_classification_report_text(
Returns:
Classification report as string
"""
# Get unique labels present in y_true and y_pred
present_labels = np.unique(np.concatenate([y_true, y_pred]))
# If labels provided, use them as target names for the present labels
if labels is not None:
# If labels are strings, filter to only present ones
if isinstance(labels[0], str):
target_names = [label for label in labels if label in present_labels]
else:
# If labels are indices, map them
target_names = [labels[i] if i < len(labels) else str(i)
for i in present_labels]
else:
target_names = None
return classification_report(
y_true,
y_pred,
target_names=labels,
digits=4
labels=present_labels if labels is None or isinstance(labels[0], str) else None,
target_names=target_names,
digits=4,
zero_division=0
)

View file

@@ -208,7 +208,10 @@ def train(
raise ValueError("Labeled dataset must have 'label' column")
label_col = 'label'
feature_cols = [col for col in df.columns if col not in ['label', 'time', 'timestamp']]
# Exclude label columns, time columns, and programmatic label columns (which contain string values)
feature_cols = [col for col in df.columns
if col not in ['label', 'time', 'timestamp']
and not col.startswith('label_programmatic_')]
X = df[feature_cols].values
y = df[label_col].values