# candle-annotator/services/ml/features/candle_features.py

"""
Candle-derived feature extraction.

Computes geometric and structural features from OHLCV candlestick data.
"""

import logging
import pandas as pd
import numpy as np


logger = logging.getLogger(__name__)


def compute_candle_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute derived candle features for each row.

    Features computed (added as columns, in this order):
    - body_size: abs(close - open) — size of the candle body
    - body_direction: 1 if close >= open (bullish), -1 otherwise (bearish)
    - upper_wick: high - max(open, close) — upper shadow length
    - lower_wick: min(open, close) - low — lower shadow length
    - wick_ratio: upper_wick / lower_wick (0 if lower_wick is 0)
    - range: high - low — total candle range
    - body_to_range: body_size / range — body as fraction of total range (0 if range is 0)
    - gap: open - previous close (0 for the first candle, and for any row
      where the difference is NaN because open or the previous close is missing)

    The input frame is not modified; a copy is returned. Rows are assumed to
    be in chronological order, since ``gap`` uses the preceding row's close.

    Args:
        df: DataFrame with at least the columns open, high, low, close
            (a volume column is not required)

    Returns:
        DataFrame with original columns + candle feature columns

    Raises:
        ValueError: If required OHLC columns are missing
    """
    # Validate required columns
    required_cols = ['open', 'high', 'low', 'close']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required OHLC columns: {missing_cols}")

    logger.info("Computing candle features")

    # Work on a copy so the caller's frame is left untouched
    result_df = df.copy()

    # Body size
    result_df['body_size'] = np.abs(result_df['close'] - result_df['open'])

    # Body direction (a doji with close == open counts as bullish)
    result_df['body_direction'] = np.where(
        result_df['close'] >= result_df['open'],
        1,  # Bullish
        -1  # Bearish
    )

    # Upper wick
    result_df['upper_wick'] = result_df['high'] - np.maximum(
        result_df['open'],
        result_df['close']
    )

    # Lower wick
    result_df['lower_wick'] = np.minimum(
        result_df['open'],
        result_df['close']
    ) - result_df['low']

    # Wick ratio. The division is evaluated for every row (yielding inf/NaN
    # where lower_wick is 0); np.where then substitutes 0.0 for those rows.
    result_df['wick_ratio'] = np.where(
        result_df['lower_wick'] != 0,
        result_df['upper_wick'] / result_df['lower_wick'],
        0.0
    )

    # Range
    result_df['range'] = result_df['high'] - result_df['low']

    # Body to range ratio, with the same zero-denominator handling as above
    result_df['body_to_range'] = np.where(
        result_df['range'] != 0,
        result_df['body_size'] / result_df['range'],
        0.0
    )

    # Gap (open - previous close). shift(1) leaves NaN in the first row, which
    # is replaced by 0. The filled Series is assigned back in one statement:
    # calling fillna(inplace=True) on a selected column is chained assignment,
    # which does not update the frame under pandas Copy-on-Write.
    result_df['gap'] = (
        result_df['open'] - result_df['close'].shift(1)
    ).fillna(0.0)

    logger.info("Computed 8 candle features: body_size, body_direction, upper_wick, "
                "lower_wick, wick_ratio, body_to_range, gap, range")

    return result_df


def validate_candle_data(df: pd.DataFrame) -> None:
    """
    Check OHLC data for internal consistency and log any violations.

    Checks:
    - high >= low
    - high >= open
    - high >= close
    - low <= open
    - low <= close

    Inconsistent rows are reported through warnings on the module logger;
    the function does not raise for them and does not modify ``df``. Each
    check logs its own count, followed by the number of distinct rows that
    fail at least one check. Comparisons involving NaN evaluate to False,
    so rows with missing prices are not flagged.

    Args:
        df: DataFrame with OHLC columns (open, high, low, close)

    Raises:
        ValueError: If required OHLC columns are missing
    """
    # Fail with the same error as compute_candle_features rather than a
    # bare KeyError from the column lookups below
    required_cols = ['open', 'high', 'low', 'close']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required OHLC columns: {missing_cols}")

    # One boolean mask per family of checks
    high_below_low = df['high'] < df['low']
    high_below_body = (df['high'] < df['open']) | (df['high'] < df['close'])
    low_above_body = (df['low'] > df['open']) | (df['low'] > df['close'])

    checks = (
        (high_below_low, "high < low"),
        (high_below_body, "high < open or high < close"),
        (low_above_body, "low > open or low > close"),
    )
    for mask, description in checks:
        count = int(mask.sum())
        if count:
            logger.warning("Found %d rows where %s", count, description)

    # A single bad candle usually trips several checks at once, so count the
    # union of the masks; summing the per-check counts would count such rows
    # repeatedly and could report more invalid candles than there are rows.
    total_invalid = int((high_below_low | high_below_body | low_above_body).sum())
    if total_invalid > 0:
        logger.warning("Total invalid candles: %d out of %d", total_invalid, len(df))