feat(ml): implement feature engineering pipeline
- Create pipeline.py with CLI argument parsing for running stages
- Implement TA-Lib indicator computation with multi-output support
- Add candle feature extraction (body_size, wicks, ratios, etc.)
- Create custom feature loader with dynamic module import
- Wire all feature engineering stages with NaN handling
- Tasks completed: 2.2, 2.3, 3.1, 3.2, 3.3, 3.4, 3.5
This commit is contained in:
parent
ea339a54a7
commit
fd29ab91e0
6 changed files with 889 additions and 7 deletions
207
services/ml/pipeline.py
Normal file
207
services/ml/pipeline.py
Normal file
|
|
@ -0,0 +1,207 @@
|
|||
"""
|
||||
ML Pipeline orchestrator.
|
||||
|
||||
Runs feature engineering, annotation ingestion, training, and inference stages
|
||||
based on configuration.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from app.config import load_config, PipelineConfig
|
||||
|
||||
|
||||
# Root logging setup, applied at import time: INFO threshold and a
# "timestamp - logger name - level - message" record layout.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-scoped logger used by the stage runners and the CLI below.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def run_feature_engineering(config: PipelineConfig) -> None:
    """
    Execute the feature engineering stage.

    The stage banner is always logged. If the stage is disabled in the
    configuration, a skip notice is logged and nothing else is done.

    Args:
        config: Pipeline configuration
    """
    banner = "=" * 60
    for line in (banner, "FEATURE ENGINEERING STAGE", banner):
        logger.info(line)

    if config.stages.feature_engineering.enabled:
        # Deferred import to avoid circular dependencies; only performed
        # when the stage actually runs.
        from features.engineer import run_feature_engineering_stage

        logger.info(f"Reading raw data from: {config.data.raw_path}")
        run_feature_engineering_stage(config)
        logger.info(f"Enriched data written to: {config.data.enriched_path}")
        logger.info("Feature engineering stage completed successfully")
    else:
        logger.info("Feature engineering disabled in config, skipping")
|
||||
|
||||
|
||||
def run_annotation_ingestion(config: PipelineConfig) -> None:
    """
    Execute the annotation ingestion stage.

    The stage banner is always logged. If the stage is disabled in the
    configuration, a skip notice is logged and nothing else is done.

    Args:
        config: Pipeline configuration
    """
    banner = "=" * 60
    for line in (banner, "ANNOTATION INGESTION STAGE", banner):
        logger.info(line)

    if config.stages.annotation_ingestion.enabled:
        # Deferred import to avoid circular dependencies; only performed
        # when the stage actually runs.
        from app.annotation_ingestion import run_annotation_ingestion_stage

        logger.info(f"Reading enriched data from: {config.data.enriched_path}")
        logger.info(f"Reading annotations from: {config.data.annotations_path}")
        run_annotation_ingestion_stage(config)
        logger.info(f"Labeled data written to: {config.data.labeled_path}")
        logger.info("Annotation ingestion stage completed successfully")
    else:
        logger.info("Annotation ingestion disabled in config, skipping")
|
||||
|
||||
|
||||
def run_training(config: PipelineConfig) -> None:
    """
    Execute the training stage.

    The stage banner is always logged. If the stage is disabled in the
    configuration, a skip notice is logged and nothing else is done.

    Args:
        config: Pipeline configuration
    """
    banner = "=" * 60
    for line in (banner, "TRAINING STAGE", banner):
        logger.info(line)

    if config.stages.training.enabled:
        # Deferred import to avoid circular dependencies; only performed
        # when the stage actually runs.
        from training.train import run_training_stage

        logger.info(f"Reading labeled data from: {config.data.labeled_path}")
        run_training_stage(config)
        logger.info("Training stage completed successfully")
    else:
        logger.info("Training disabled in config, skipping")
|
||||
|
||||
|
||||
def run_pipeline(
    config: PipelineConfig,
    stage: Optional[str] = None
) -> None:
    """
    Run the full pipeline or a specific stage.

    Args:
        config: Pipeline configuration
        stage: Optional stage name to run. If None (or any other falsy
            value, such as an empty string), every stage runner is invoked
            in order; each runner skips itself when disabled in the config.
            Valid values: "feature_engineering", "annotation_ingestion", "training"

    Raises:
        ValueError: If ``stage`` is given but is not a valid stage name.
    """
    # Single source of truth for stage names. Insertion order is the
    # execution order of a full run and the order shown in the error message.
    stage_runners = {
        "feature_engineering": run_feature_engineering,
        "annotation_ingestion": run_annotation_ingestion,
        "training": run_training,
    }

    logger.info("Starting ML pipeline")
    # This logs the configuration object itself, not a file path, so the
    # label says so (the path is not available inside this function).
    logger.info(f"Pipeline configuration: {config}")

    if stage:
        logger.info(f"Running single stage: {stage}")
        runner = stage_runners.get(stage)
        if runner is None:
            raise ValueError(
                f"Invalid stage: {stage}. "
                f"Valid stages: {', '.join(stage_runners)}"
            )
        runner(config)
    else:
        logger.info("Running all enabled stages")
        for runner in stage_runners.values():
            runner(config)

    banner = "=" * 60
    logger.info(banner)
    logger.info("PIPELINE COMPLETED")
    logger.info(banner)
|
||||
|
||||
|
||||
def main(argv: Optional[list] = None) -> int:
    """Main entry point for the pipeline CLI.

    Args:
        argv: Command-line arguments to parse, without the program name.
            When None (the default), argparse reads them from ``sys.argv``.

    Returns:
        Process exit code: 0 on success, 1 if loading the configuration or
        running the pipeline failed. Invalid arguments and ``--help`` are
        handled by argparse, which exits the process on its own.
    """
    parser = argparse.ArgumentParser(
        description="ML Pipeline for candlestick pattern recognition",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Run full pipeline with default config
python pipeline.py

# Run with custom config
python pipeline.py --config config/custom_pipeline.yaml

# Run only feature engineering stage
python pipeline.py --stage feature_engineering

# Run only training stage with custom config
python pipeline.py --config config/pipeline.yaml --stage training
"""
    )

    parser.add_argument(
        "--config",
        type=str,
        default="config/pipeline.yaml",
        help="Path to pipeline configuration YAML file (default: config/pipeline.yaml)"
    )

    parser.add_argument(
        "--stage",
        type=str,
        choices=["feature_engineering", "annotation_ingestion", "training"],
        default=None,
        help="Run a specific stage only. If not specified, runs all enabled stages."
    )

    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose debug logging"
    )

    args = parser.parse_args(argv)

    # Set logging level on the root logger so every module becomes verbose
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        logger.debug("Verbose logging enabled")

    # Load and validate config. This is kept separate from pipeline execution
    # so that a FileNotFoundError/ValueError raised inside a stage is not
    # misreported as a configuration problem.
    try:
        logger.info(f"Loading configuration from: {args.config}")
        config = load_config(args.config)
    except FileNotFoundError as e:
        logger.error(f"Configuration file not found: {e}")
        return 1
    except ValueError as e:
        logger.error(f"Configuration validation error: {e}")
        return 1
    except Exception as e:
        logger.error(f"Failed to load configuration: {e}", exc_info=True)
        return 1

    # Run pipeline. Top-level boundary: log the traceback and convert any
    # failure into a non-zero exit code.
    try:
        run_pipeline(config, stage=args.stage)
    except Exception as e:
        logger.error(f"Pipeline failed with error: {e}", exc_info=True)
        return 1

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Raise SystemExit directly instead of calling ``exit``: that helper is
    # added by the ``site`` module for interactive use and is not available
    # when the interpreter is started without it (e.g. ``python -S``).
    raise SystemExit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue