candle-annotator/services/ml/features/candle_features.py
Marko Djordjevic fd29ab91e0 feat(ml): implement feature engineering pipeline
- Create pipeline.py with CLI argument parsing for running stages
- Implement TA-Lib indicator computation with multi-output support
- Add candle feature extraction (body_size, wicks, ratios, etc.)
- Create custom feature loader with dynamic module import
- Wire all feature engineering stages with NaN handling
- Tasks completed: 2.2, 2.3, 3.1, 3.2, 3.3, 3.4, 3.5
2026-02-15 12:22:59 +01:00

134 lines
4.1 KiB
Python

"""
Candle-derived feature extraction.
Computes geometric and structural features from OHLCV candlestick data.
Public functions:
- compute_candle_features: returns a copy of the input frame with per-candle
  feature columns (body, wicks, ratios, range, gap) appended.
- validate_candle_data: logs warnings about rows whose OHLC values are
  mutually inconsistent (e.g. high < low).
"""
import logging
import pandas as pd
import numpy as np
# Module-scoped logger; handlers and levels are left to the application.
logger = logging.getLogger(__name__)
def compute_candle_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute derived candle features for each row.

    Features computed (columns are appended in this order):
    - body_size: abs(close - open) — size of the candle body
    - body_direction: 1 if close >= open (bullish), -1 otherwise (bearish)
    - upper_wick: high - max(open, close) — upper shadow length
    - lower_wick: min(open, close) - low — lower shadow length
    - wick_ratio: upper_wick / lower_wick (0 if lower_wick is 0)
    - range: high - low — total candle range
    - body_to_range: body_size / range — body as fraction of total range
      (0 if range is 0)
    - gap: open - previous close; 0 for the first candle and for any row
      where the difference is undefined (NaN open or NaN previous close)

    Args:
        df: DataFrame with OHLCV columns (open, high, low, close). Rows are
            taken in their existing order; "previous" means the preceding row.

    Returns:
        A copy of ``df`` with the candle feature columns added. The input
        frame is not modified.

    Raises:
        ValueError: If required OHLC columns are missing
    """
    # Validate required columns
    required_cols = ['open', 'high', 'low', 'close']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required OHLC columns: {missing_cols}")

    logger.info("Computing candle features")

    # Work on a copy so the caller's frame is left untouched
    result_df = df.copy()
    open_ = result_df['open']
    high = result_df['high']
    low = result_df['low']
    close = result_df['close']

    # Body size and direction (a doji with close == open counts as bullish)
    result_df['body_size'] = np.abs(close - open_)
    result_df['body_direction'] = np.where(close >= open_, 1, -1)

    # Wicks are measured from the body edges to the candle extremes
    upper_wick = high - np.maximum(open_, close)
    lower_wick = np.minimum(open_, close) - low
    result_df['upper_wick'] = upper_wick
    result_df['lower_wick'] = lower_wick

    # Wick ratio: np.where evaluates both branches, so the division still
    # yields inf/NaN for zero denominators; those entries are replaced by 0.0
    result_df['wick_ratio'] = np.where(
        lower_wick != 0,
        upper_wick / lower_wick,
        0.0
    )

    # Range
    candle_range = high - low
    result_df['range'] = candle_range

    # Body to range ratio (flat candles with zero range map to 0.0)
    result_df['body_to_range'] = np.where(
        candle_range != 0,
        result_df['body_size'] / candle_range,
        0.0
    )

    # Gap (open - previous close). shift(1) leaves NaN in the first row.
    # Assign the filled Series back instead of calling fillna(inplace=True)
    # on a column selection: under pandas Copy-on-Write that in-place call
    # only touches a temporary and the NaN would remain in result_df.
    result_df['gap'] = (open_ - close.shift(1)).fillna(0.0)

    logger.info("Computed 8 candle features: body_size, body_direction, upper_wick, "
                "lower_wick, wick_ratio, body_to_range, gap, range")
    return result_df
def validate_candle_data(df: pd.DataFrame) -> None:
    """
    Validate OHLC data consistency.

    Checks, per row:
    - high >= low
    - high >= open
    - high >= close
    - low <= open
    - low <= close

    Violations are reported as log warnings only; inconsistent data does not
    raise, so callers can continue processing. Comparisons involving NaN
    evaluate to False, so rows with missing values are never flagged.

    Args:
        df: DataFrame with OHLC columns (open, high, low, close)

    Raises:
        ValueError: If required OHLC columns are missing
    """
    required_cols = ['open', 'high', 'low', 'close']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required OHLC columns: {missing_cols}")

    # One boolean mask per category of violation
    high_below_low = df['high'] < df['low']
    high_below_body = (df['high'] < df['open']) | (df['high'] < df['close'])
    low_above_body = (df['low'] > df['open']) | (df['low'] > df['close'])

    hl_count = int(high_below_low.sum())
    if hl_count:
        logger.warning("Found %d rows where high < low", hl_count)

    h_count = int(high_below_body.sum())
    if h_count:
        logger.warning("Found %d rows where high < open or high < close", h_count)

    l_count = int(low_above_body.sum())
    if l_count:
        logger.warning("Found %d rows where low > open or low > close", l_count)

    # A single bad candle usually trips several checks at once (high < low
    # always implies one of the other two), so count the union of the masks
    # rather than summing the per-check counts; otherwise the total can
    # exceed the number of rows.
    total_invalid = int((high_below_low | high_below_body | low_above_body).sum())
    if total_invalid > 0:
        logger.warning("Total invalid candles: %d out of %d", total_invalid, len(df))