feat(ml): implement feature engineering pipeline

- Create pipeline.py with CLI argument parsing for running stages
- Implement TA-Lib indicator computation with multi-output support
- Add candle feature extraction (body_size, wicks, ratios, etc.)
- Create custom feature loader with dynamic module import
- Wire all feature engineering stages with NaN handling
- Tasks completed: 2.2, 2.3, 3.1, 3.2, 3.3, 3.4, 3.5
This commit is contained in:
Marko Djordjevic 2026-02-15 12:22:59 +01:00
parent ea339a54a7
commit fd29ab91e0
6 changed files with 889 additions and 7 deletions

207
services/ml/pipeline.py Normal file
View file

@ -0,0 +1,207 @@
"""
ML Pipeline orchestrator.
Runs the feature engineering, annotation ingestion, and training stages
based on configuration. (No inference stage is wired into this module.)
"""
import argparse
import logging
from pathlib import Path
from typing import Optional
from app.config import load_config, PipelineConfig
# Root logging setup: INFO threshold with timestamped
# "time - logger name - level - message" records.
_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-scoped logger used by every stage runner in this file.
logger = logging.getLogger(__name__)
def run_feature_engineering(config: PipelineConfig) -> None:
    """
    Execute the feature engineering stage when it is enabled.

    A banner is always logged. If ``config.stages.feature_engineering.enabled``
    is falsy, a skip message is logged and nothing else happens. Otherwise
    ``features.engineer.run_feature_engineering_stage`` is imported and called
    with ``config``, with the raw and enriched data paths logged around the call.

    Args:
        config: Pipeline configuration
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("FEATURE ENGINEERING STAGE")
    logger.info(banner)

    if config.stages.feature_engineering.enabled:
        # Deferred import: kept inside the function to avoid circular
        # dependencies, and only performed when the stage actually runs.
        from features.engineer import run_feature_engineering_stage as execute_stage

        logger.info(f"Reading raw data from: {config.data.raw_path}")
        execute_stage(config)
        logger.info(f"Enriched data written to: {config.data.enriched_path}")
        logger.info("Feature engineering stage completed successfully")
    else:
        logger.info("Feature engineering disabled in config, skipping")
def run_annotation_ingestion(config: PipelineConfig) -> None:
    """
    Execute the annotation ingestion stage when it is enabled.

    A banner is always logged. If ``config.stages.annotation_ingestion.enabled``
    is falsy, a skip message is logged and nothing else happens. Otherwise
    ``app.annotation_ingestion.run_annotation_ingestion_stage`` is imported and
    called with ``config``; the enriched, annotations and labeled data paths
    are logged around the call.

    Args:
        config: Pipeline configuration
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("ANNOTATION INGESTION STAGE")
    logger.info(banner)

    if config.stages.annotation_ingestion.enabled:
        # Deferred import: kept inside the function to avoid circular
        # dependencies, and only performed when the stage actually runs.
        from app.annotation_ingestion import (
            run_annotation_ingestion_stage as execute_stage,
        )

        logger.info(f"Reading enriched data from: {config.data.enriched_path}")
        logger.info(f"Reading annotations from: {config.data.annotations_path}")
        execute_stage(config)
        logger.info(f"Labeled data written to: {config.data.labeled_path}")
        logger.info("Annotation ingestion stage completed successfully")
    else:
        logger.info("Annotation ingestion disabled in config, skipping")
def run_training(config: PipelineConfig) -> None:
    """
    Execute the training stage when it is enabled.

    A banner is always logged. If ``config.stages.training.enabled`` is falsy,
    a skip message is logged and nothing else happens. Otherwise
    ``training.train.run_training_stage`` is imported and called with
    ``config``, after logging the labeled data path.

    Args:
        config: Pipeline configuration
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("TRAINING STAGE")
    logger.info(banner)

    if config.stages.training.enabled:
        # Deferred import: kept inside the function to avoid circular
        # dependencies, and only performed when the stage actually runs.
        from training.train import run_training_stage as execute_stage

        logger.info(f"Reading labeled data from: {config.data.labeled_path}")
        execute_stage(config)
        logger.info("Training stage completed successfully")
    else:
        logger.info("Training disabled in config, skipping")
def run_pipeline(
    config: PipelineConfig,
    stage: Optional[str] = None
) -> None:
    """
    Run the full pipeline or a specific stage.

    Each stage runner checks its own ``enabled`` flag, so a stage that is
    disabled in the config is skipped even when it is requested explicitly.

    Args:
        config: Pipeline configuration
        stage: Optional stage name to run. If None (or empty), runs all
            stages in order: feature_engineering, annotation_ingestion,
            training. Valid values: "feature_engineering",
            "annotation_ingestion", "training"

    Raises:
        ValueError: If ``stage`` is given but is not one of the valid names.
    """
    # Single source of truth for stage names and execution order; dicts keep
    # insertion order, so iterating the values runs the stages in sequence.
    stage_runners = {
        "feature_engineering": run_feature_engineering,
        "annotation_ingestion": run_annotation_ingestion,
        "training": run_training,
    }

    logger.info("Starting ML pipeline")
    # `config` is the loaded configuration object, not a file path, so label
    # it as such instead of the previous misleading "Config loaded from:".
    logger.info(f"Pipeline configuration: {config}")

    if stage:
        logger.info(f"Running single stage: {stage}")
        runner = stage_runners.get(stage)
        if runner is None:
            raise ValueError(
                f"Invalid stage: {stage}. "
                f"Valid stages: {', '.join(stage_runners)}"
            )
        runner(config)
    else:
        logger.info("Running all enabled stages")
        for runner in stage_runners.values():
            runner(config)

    logger.info("=" * 60)
    logger.info("PIPELINE COMPLETED")
    logger.info("=" * 60)
def main() -> int:
    """
    Main entry point for the pipeline CLI.

    Parses ``--config``, ``--stage`` and ``--verbose``/``-v``, loads the
    configuration, and runs the requested stage(s).

    Returns:
        0 on success, 1 if loading the configuration or running the
        pipeline raised an exception.
    """
    parser = argparse.ArgumentParser(
        description="ML Pipeline for candlestick pattern recognition",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Run full pipeline with default config
python pipeline.py
# Run with custom config
python pipeline.py --config config/custom_pipeline.yaml
# Run only feature engineering stage
python pipeline.py --stage feature_engineering
# Run only training stage with custom config
python pipeline.py --config config/pipeline.yaml --stage training
"""
    )
    parser.add_argument(
        "--config",
        type=str,
        default="config/pipeline.yaml",
        help="Path to pipeline configuration YAML file (default: config/pipeline.yaml)"
    )
    parser.add_argument(
        "--stage",
        type=str,
        choices=["feature_engineering", "annotation_ingestion", "training"],
        default=None,
        help="Run a specific stage only. If not specified, runs all enabled stages."
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose debug logging"
    )
    args = parser.parse_args()

    # Raise verbosity on the root logger so every module's logger is affected.
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        logger.debug("Verbose logging enabled")

    # Load the config in its own try block: only failures raised here are
    # genuinely configuration problems. Previously one try wrapped the whole
    # run, so a FileNotFoundError/ValueError from inside a stage (e.g. a
    # missing data file) was misreported as a config error with no traceback.
    logger.info(f"Loading configuration from: {args.config}")
    try:
        config = load_config(args.config)
    except FileNotFoundError as e:
        logger.error(f"Configuration file not found: {e}")
        return 1
    except ValueError as e:
        logger.error(f"Configuration validation error: {e}")
        return 1
    except Exception as e:
        logger.error(f"Failed to load configuration: {e}", exc_info=True)
        return 1

    # Top-level boundary: report any stage failure with its traceback and
    # convert it into a non-zero exit status.
    try:
        run_pipeline(config, stage=args.stage)
    except Exception as e:
        logger.error(f"Pipeline failed with error: {e}", exc_info=True)
        return 1

    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. SystemExit
    # is raised directly because the bare `exit()` helper is installed by the
    # `site` module for interactive use and is missing under `python -S`.
    raise SystemExit(main())