Fix inference feature mismatch with training metadata
This commit is contained in:
parent
328476a581
commit
73c10a4156
3 changed files with 137 additions and 10 deletions
|
|
@ -6,7 +6,8 @@ between training and inference.
|
|||
"""
|
||||
|
||||
import logging
|
||||
from typing import List, Tuple
|
||||
import re
|
||||
from typing import List, Tuple, Optional
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
|
@ -34,9 +35,51 @@ TRAINING_FEATURE_ORDER = [
|
|||
]
|
||||
|
||||
|
||||
def _parse_training_feature_columns(
|
||||
feature_columns: List[str]
|
||||
) -> Tuple[int, List[str]]:
|
||||
"""
|
||||
Derive window size and per-candle feature order from flattened training columns.
|
||||
|
||||
Expected column format: "<feature>_<index>" (e.g., "open_0", "rsi_14_12").
|
||||
"""
|
||||
if not feature_columns:
|
||||
raise ValueError("Training feature columns are empty")
|
||||
|
||||
feature_order: List[str] = []
|
||||
max_idx = -1
|
||||
idx_set = set()
|
||||
|
||||
for col in feature_columns:
|
||||
match = re.match(r"^(.*)_([0-9]+)$", col)
|
||||
if not match:
|
||||
raise ValueError(f"Invalid training feature column format: {col}")
|
||||
base = match.group(1)
|
||||
idx = int(match.group(2))
|
||||
if idx == 0:
|
||||
feature_order.append(base)
|
||||
if idx > max_idx:
|
||||
max_idx = idx
|
||||
idx_set.add(idx)
|
||||
|
||||
window_size = max_idx + 1
|
||||
if window_size <= 0:
|
||||
raise ValueError("Could not derive window size from training feature columns")
|
||||
|
||||
missing_idx = set(range(window_size)) - idx_set
|
||||
if missing_idx:
|
||||
raise ValueError(f"Missing window indices in training feature columns: {sorted(missing_idx)[:5]}")
|
||||
|
||||
if not feature_order:
|
||||
raise ValueError("Could not derive per-candle feature order from training feature columns")
|
||||
|
||||
return window_size, feature_order
|
||||
|
||||
|
||||
def preprocess_candles(
|
||||
candles: List[dict],
|
||||
pipeline_config: PipelineConfig
|
||||
pipeline_config: PipelineConfig,
|
||||
training_feature_columns: Optional[List[str]] = None
|
||||
) -> Tuple[pd.DataFrame, np.ndarray]:
|
||||
"""
|
||||
Preprocess candle data for inference.
|
||||
|
|
@ -124,16 +167,24 @@ def preprocess_candles(
|
|||
logger.info(f"Filling NaN values in {len(nan_cols)} columns (indicator warmup + missing data)")
|
||||
df = df.fillna(0.0)
|
||||
|
||||
# Determine expected feature order and window size
|
||||
if training_feature_columns:
|
||||
window_size, feature_order = _parse_training_feature_columns(training_feature_columns)
|
||||
logger.info(f"Using training feature columns: {len(feature_order)} features, window_size={window_size}")
|
||||
else:
|
||||
window_size = TRAINING_WINDOW_SIZE
|
||||
feature_order = TRAINING_FEATURE_ORDER
|
||||
|
||||
# Ensure all expected per-candle features exist
|
||||
for col in TRAINING_FEATURE_ORDER:
|
||||
for col in feature_order:
|
||||
if col not in df.columns:
|
||||
logger.warning(f"Missing expected feature column '{col}', filling with 0")
|
||||
df[col] = 0.0
|
||||
|
||||
logger.info(f"Preprocessing complete: {len(df)} candles with {len(TRAINING_FEATURE_ORDER)} features each")
|
||||
logger.info(f"Preprocessing complete: {len(df)} candles with {len(feature_order)} features each")
|
||||
|
||||
# Create sliding windows and flatten
|
||||
X, window_times = create_sliding_windows(df, TRAINING_WINDOW_SIZE, TRAINING_FEATURE_ORDER)
|
||||
X, window_times = create_sliding_windows(df, window_size, feature_order)
|
||||
|
||||
return X, window_times
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue