- Create pipeline.py with CLI argument parsing for running stages - Implement TA-Lib indicator computation with multi-output support - Add candle feature extraction (body_size, wicks, ratios, etc.) - Create custom feature loader with dynamic module import - Wire all feature engineering stages with NaN handling - Tasks completed: 2.2, 2.3, 3.1, 3.2, 3.3, 3.4, 3.5
134 lines
4.1 KiB
Python
134 lines
4.1 KiB
Python
"""
|
|
Candle-derived feature extraction.
|
|
|
|
Computes geometric and structural features from OHLCV candlestick data.
|
|
"""
|
|
|
|
import logging
|
|
import pandas as pd
|
|
import numpy as np
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def compute_candle_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute derived candle features for each row.

    Features computed (added as new columns, in this order):
    - body_size: abs(close - open) — size of the candle body
    - body_direction: 1 if close >= open (bullish), -1 otherwise (bearish)
    - upper_wick: high - max(open, close) — upper shadow length
    - lower_wick: min(open, close) - low — lower shadow length
    - wick_ratio: upper_wick / lower_wick (0 if lower_wick is 0)
    - range: high - low — total candle range
    - body_to_range: body_size / range — body as fraction of total range
      (0 if range is 0)
    - gap: open - previous close; any undefined gap is set to 0 (this covers
      the first candle, and also rows where open or the previous close is NaN)

    Args:
        df: DataFrame with OHLC columns (open, high, low, close). It is not
            modified; the features are added to a copy.

    Returns:
        DataFrame with original columns + candle feature columns

    Raises:
        ValueError: If any of the required OHLC columns is missing
    """
    # Validate required columns
    required_cols = ['open', 'high', 'low', 'close']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required OHLC columns: {missing_cols}")

    logger.info("Computing candle features")

    # Make a copy to avoid modifying the original
    result_df = df.copy()

    # Body size
    result_df['body_size'] = np.abs(result_df['close'] - result_df['open'])

    # Body direction (a doji, close == open, counts as bullish)
    result_df['body_direction'] = np.where(
        result_df['close'] >= result_df['open'],
        1,   # Bullish
        -1,  # Bearish
    )

    # Upper wick
    result_df['upper_wick'] = result_df['high'] - np.maximum(
        result_df['open'],
        result_df['close'],
    )

    # Lower wick
    result_df['lower_wick'] = np.minimum(
        result_df['open'],
        result_df['close'],
    ) - result_df['low']

    # Wick ratio. The division is evaluated for every row; rows with a zero
    # denominator are then overwritten with 0.0 by np.where.
    result_df['wick_ratio'] = np.where(
        result_df['lower_wick'] != 0,
        result_df['upper_wick'] / result_df['lower_wick'],
        0.0,
    )

    # Range
    result_df['range'] = result_df['high'] - result_df['low']

    # Body to range ratio (same zero-denominator handling as wick_ratio)
    result_df['body_to_range'] = np.where(
        result_df['range'] != 0,
        result_df['body_size'] / result_df['range'],
        0.0,
    )

    # Gap (open - previous close). The first candle has no previous close, so
    # shift(1) yields NaN there. Fill on the Series and assign the result back:
    # an in-place fillna on a column selected from the frame is a chained
    # assignment, which does not update the frame under pandas Copy-on-Write.
    gap = result_df['open'] - result_df['close'].shift(1)
    result_df['gap'] = gap.fillna(0.0)

    logger.info("Computed 8 candle features: body_size, body_direction, upper_wick, "
                "lower_wick, wick_ratio, body_to_range, gap, range")

    return result_df
|
|
|
|
|
|
def validate_candle_data(df: pd.DataFrame) -> None:
    """
    Check OHLC data for internal consistency and log what is found.

    Checks:
    - high >= low
    - high >= open
    - high >= close
    - low <= open
    - low <= close

    Violations are reported through logger warnings only; inconsistent data
    never raises. Comparisons involving NaN evaluate to False, so rows with
    missing prices are not flagged.

    Args:
        df: DataFrame with OHLC columns (open, high, low, close)

    Returns:
        None

    Raises:
        KeyError: If one of the OHLC columns is absent (raised by the column
            lookup itself; this function does no explicit column check)
    """
    # Check high >= low
    high_below_low = df['high'] < df['low']
    hl_count = int(high_below_low.sum())
    if hl_count:
        logger.warning("Found %d rows where high < low", hl_count)

    # Check high >= open and high >= close
    high_too_low = (df['high'] < df['open']) | (df['high'] < df['close'])
    h_count = int(high_too_low.sum())
    if h_count:
        logger.warning("Found %d rows where high < open or high < close", h_count)

    # Check low <= open and low <= close
    low_too_high = (df['low'] > df['open']) | (df['low'] > df['close'])
    l_count = int(low_too_high.sum())
    if l_count:
        logger.warning("Found %d rows where low > open or low > close", l_count)

    # Count each offending row once. The three checks overlap (a row with
    # high < low always fails a high or low bound check as well), so adding
    # the per-check counts would overstate the total and could exceed len(df).
    total_invalid = int((high_below_low | high_too_low | low_too_high).sum())
    if total_invalid > 0:
        logger.warning("Total invalid candles: %d out of %d", total_invalid, len(df))
|