""" Candle-derived feature extraction. Computes geometric and structural features from OHLCV candlestick data. """ import logging import pandas as pd import numpy as np logger = logging.getLogger(__name__) def compute_candle_features(df: pd.DataFrame) -> pd.DataFrame: """ Compute derived candle features for each row. Features computed: - body_size: abs(close - open) — size of the candle body - body_direction: 1 if close >= open (bullish), -1 otherwise (bearish) - upper_wick: high - max(open, close) — upper shadow length - lower_wick: min(open, close) - low — lower shadow length - wick_ratio: upper_wick / lower_wick (0 if lower_wick is 0) - body_to_range: body_size / (high - low) — body as fraction of total range (0 if range is 0) - gap: open - previous close (0 for first candle) - range: high - low — total candle range Args: df: DataFrame with OHLCV columns (open, high, low, close) Returns: DataFrame with original columns + candle feature columns Raises: ValueError: If required OHLCV columns are missing """ # Validate required columns required_cols = ['open', 'high', 'low', 'close'] missing_cols = [col for col in required_cols if col not in df.columns] if missing_cols: raise ValueError(f"Missing required OHLC columns: {missing_cols}") logger.info("Computing candle features") # Make a copy to avoid modifying the original result_df = df.copy() # Body size result_df['body_size'] = np.abs(result_df['close'] - result_df['open']) # Body direction result_df['body_direction'] = np.where( result_df['close'] >= result_df['open'], 1, # Bullish -1 # Bearish ) # Upper wick result_df['upper_wick'] = result_df['high'] - np.maximum( result_df['open'], result_df['close'] ) # Lower wick result_df['lower_wick'] = np.minimum( result_df['open'], result_df['close'] ) - result_df['low'] # Wick ratio (handle division by zero) result_df['wick_ratio'] = np.where( result_df['lower_wick'] != 0, result_df['upper_wick'] / result_df['lower_wick'], 0.0 ) # Range result_df['range'] = result_df['high'] - result_df['low'] # Body to range ratio (handle division by zero) result_df['body_to_range'] = np.where( result_df['range'] != 0, result_df['body_size'] / result_df['range'], 0.0 ) # Gap (open - previous close) # For the first candle, gap is 0 result_df['gap'] = result_df['open'] - result_df['close'].shift(1) result_df['gap'] = result_df['gap'].fillna(0.0) logger.info("Computed 8 candle features: body_size, body_direction, upper_wick, " "lower_wick, wick_ratio, body_to_range, gap, range") return result_df def validate_candle_data(df: pd.DataFrame) -> None: """ Validate OHLC data consistency. Checks: - high >= low - high >= open - high >= close - low <= open - low <= close Args: df: DataFrame with OHLC columns Raises: ValueError: If data validation fails """ # Check high >= low invalid_hl = df[df['high'] < df['low']] if not invalid_hl.empty: logger.warning(f"Found {len(invalid_hl)} rows where high < low") # Check high >= open and high >= close invalid_h = df[(df['high'] < df['open']) | (df['high'] < df['close'])] if not invalid_h.empty: logger.warning(f"Found {len(invalid_h)} rows where high < open or high < close") # Check low <= open and low <= close invalid_l = df[(df['low'] > df['open']) | (df['low'] > df['close'])] if not invalid_l.empty: logger.warning(f"Found {len(invalid_l)} rows where low > open or low > close") # If there are many invalid rows, this could indicate a data quality issue total_invalid = len(invalid_hl) + len(invalid_h) + len(invalid_l) if total_invalid > 0: logger.warning(f"Total invalid candles: {total_invalid} out of {len(df)}")