fix(ml): add windowed feature flattening for inference parity

The model was trained on 94-candle sliding windows, each flattened to 2820 features (94 candles x 30 features). Inference was sending raw per-candle features (27 columns) instead.

Changes:
- Rewrite preprocessing to return an (X, window_times) tuple
- Add sliding-window creation with the correct feature ordering
- Fill missing columns (average, barCount) with 0 for feature parity
- Fill NaN values from indicator warmup with 0 instead of dropping rows
- Always compute all indicators (including MFI) for feature parity
- Update the predict and batch predict endpoints for the new signature
2026-02-15 22:07:06 +01:00 · 2026-02-15 22:07:06 +01:00 · 40d6d1739e
commit 40d6d1739e
parent 4c7b3f2676
2 changed files with 111 additions and 63 deletions
--- a/services/ml/app/main.py
+++ b/services/ml/app/main.py
@ -524,14 +524,8 @@ async def predict(request: PredictRequest):
        # Convert candles to list of dicts
        candles_data = [candle.model_dump() for candle in request.candles]
        
-        # Preprocess candles (feature engineering)
-        df_preprocessed = preprocess_candles(candles_data, state.pipeline_config)
-        
-        # Keep times for results mapping
-        times = df_preprocessed['time'].values
-        
-        # Extract feature columns (exclude 'time')
-        X = extract_feature_columns(df_preprocessed)
+        # Preprocess candles (feature engineering + windowing)
+        X, window_times = preprocess_candles(candles_data, state.pipeline_config)
        
        # Get predictions and probabilities
        if hasattr(state.model, 'predict_proba'):
@ -547,20 +541,18 @@ async def predict(request: PredictRequest):
        
        # Get label names (handle both string and int predictions)
        if state.label_encoder is not None:
-            # Model predicts integers, map to labels
            labels = [state.label_encoder.get(int(pred), f"unknown_{pred}") for pred in y_pred]
        else:
-            # Model predicts strings directly
            labels = [str(pred) for pred in y_pred]
        
-        # Build per-candle predictions
+        # Build per-window predictions (each window maps to its last candle time)
        predictions = [
            PredictionResult(
                time=int(time),
                label=label,
                confidence=float(conf)
            )
-            for time, label, conf in zip(times, labels, confidences)
+            for time, label, conf in zip(window_times, labels, confidences)
        ]
        
        # Group into spans
@ -577,7 +569,7 @@ async def predict(request: PredictRequest):
        )
        
        logger.info(
-            f"Prediction complete: {len(predictions)} candles, "
+            f"Prediction complete: {len(predictions)} windows, "
            f"{len(spans)} spans, {len([p for p in predictions if p.label != 'O'])} patterns"
        )
        
@ -675,14 +667,8 @@ async def predict_batch(request: BatchPredictRequest):
            # Convert batch to candles format
            batch_candles = batch_df.to_dict('records')
            
-            # Preprocess
-            df_preprocessed = preprocess_candles(batch_candles, state.pipeline_config)
-            
-            # Keep times
-            times = df_preprocessed['time'].values
-            
-            # Extract features
-            X = extract_feature_columns(df_preprocessed)
+            # Preprocess (feature engineering + windowing)
+            X, window_times = preprocess_candles(batch_candles, state.pipeline_config)
            
            # Predict
            if hasattr(state.model, 'predict_proba'):
@ -706,7 +692,7 @@ async def predict_batch(request: BatchPredictRequest):
                    label=label,
                    confidence=float(conf)
                )
-                for time, label, conf in zip(times, labels, confidences)
+                for time, label, conf in zip(window_times, labels, confidences)
            ]
            
            all_predictions.extend(batch_predictions)