feat(ml): add database schema, config parser, and DVC setup

- Initialize DVC with local storage backend (task 1.6) - Create PostgreSQL schema for training_runs table (task 1.7) - Add SQLAlchemy database connection setup (task 1.8) - Create Pydantic config models for pipeline.yaml (task 2.1) - Add migration runner for database setup - Fix pyproject.toml package discovery config
2026-02-15 12:08:53 +01:00 · 2026-02-15 12:08:53 +01:00 · ea339a54a7
commit ea339a54a7
parent 1a653c5866
15 changed files with 412 additions and 4 deletions
--- a/services/ml/app/init.py
+++ b/services/ml/app/init.py
@ -0,0 +1 @@
+"""ML service application package."""
--- a/services/ml/app/config.py
+++ b/services/ml/app/config.py
@ -0,0 +1,147 @@
+"""
+Pipeline configuration module.
+
+Pydantic models for validating and loading the pipeline.yaml configuration.
+"""
+
+from typing import List, Dict, Any, Optional, Literal
+from pathlib import Path
+
+import yaml
+from pydantic import BaseModel, Field, field_validator
+
+
+class TALibIndicator(BaseModel):
+    """Configuration for a single TA-Lib indicator."""
+    name: str
+    params: Dict[str, Any] = Field(default_factory=dict)
+
+
+class FeatureEngineeringConfig(BaseModel):
+    """Feature engineering stage configuration."""
+    enabled: bool = True
+    talib_indicators: List[TALibIndicator] = Field(default_factory=list)
+    candle_features: bool = True
+    custom_features: List[str] = Field(default_factory=list)
+
+
+class ProgrammaticLabelsConfig(BaseModel):
+    """Configuration for programmatic TA-Lib pattern labels."""
+    enabled: bool = True
+    talib_patterns: List[str] = Field(default_factory=list)
+
+
+class AnnotationIngestionConfig(BaseModel):
+    """Annotation ingestion stage configuration."""
+    enabled: bool = True
+    label_encoding: Literal["window", "bio"] = "window"
+    window_size: int = 30
+    context_padding: int = 20
+    min_confidence: int = 1
+    programmatic_labels: ProgrammaticLabelsConfig = Field(
+        default_factory=ProgrammaticLabelsConfig
+    )
+    merge_strategy: Literal["human_priority", "programmatic_priority", "both"] = "human_priority"
+
+
+class MLflowConfig(BaseModel):
+    """MLflow experiment tracking configuration."""
+    tracking_uri: str = "http://mlflow:5000"
+    experiment_name: str = "candlestick_patterns"
+    log_artifacts: bool = True
+    register_model: bool = False
+
+
+class TrainingConfig(BaseModel):
+    """Training stage configuration."""
+    enabled: bool = True
+    model_type: Literal["random_forest", "xgboost"] = "random_forest"
+    split_method: Literal["temporal", "random"] = "temporal"
+    test_split: float = Field(0.2, ge=0.0, le=1.0)
+    validation_split: float = Field(0.1, ge=0.0, le=1.0)
+    class_weights: Optional[Literal["balanced"]] = "balanced"
+    hyperparameters: Dict[str, Any] = Field(default_factory=dict)
+    mlflow: MLflowConfig = Field(default_factory=MLflowConfig)
+    
+    @field_validator("test_split", "validation_split")
+    @classmethod
+    def validate_split(cls, v):
+        if not 0.0 <= v <= 1.0:
+            raise ValueError("Split must be between 0.0 and 1.0")
+        return v
+
+
+class InferenceConfig(BaseModel):
+    """Inference stage configuration."""
+    enabled: bool = True
+    model_source: Literal["mlflow", "local"] = "local"
+    mlflow_model_name: Optional[str] = "candlestick_pattern_v1"
+    mlflow_model_stage: Literal["Production", "Staging", "None"] = "Production"
+    local_model_path: str = "models/best_model.pkl"
+    batch_size: int = 1000
+    use_training_config: bool = True
+
+
+class DataConfig(BaseModel):
+    """Data paths configuration."""
+    raw_path: str = "data/raw/OHLCV.csv"
+    enriched_path: str = "data/enriched/features.csv"
+    labeled_path: str = "data/labeled/dataset.csv"
+    annotations_path: str = "data/annotations/export.json"
+
+
+class StagesConfig(BaseModel):
+    """All pipeline stages configuration."""
+    feature_engineering: FeatureEngineeringConfig = Field(
+        default_factory=FeatureEngineeringConfig
+    )
+    annotation_ingestion: AnnotationIngestionConfig = Field(
+        default_factory=AnnotationIngestionConfig
+    )
+    training: TrainingConfig = Field(default_factory=TrainingConfig)
+    inference: InferenceConfig = Field(default_factory=InferenceConfig)
+
+
+class PipelineConfig(BaseModel):
+    """Root pipeline configuration."""
+    data: DataConfig = Field(default_factory=DataConfig)
+    stages: StagesConfig = Field(default_factory=StagesConfig)
+
+
+def load_config(config_path: str | Path) -> PipelineConfig:
+    """
+    Load and validate pipeline configuration from YAML file.
+    
+    Args:
+        config_path: Path to pipeline.yaml file
+        
+    Returns:
+        Validated PipelineConfig object
+        
+    Raises:
+        FileNotFoundError: If config file doesn't exist
+        ValueError: If config validation fails
+        yaml.YAMLError: If YAML parsing fails
+    """
+    config_path = Path(config_path)
+    
+    if not config_path.exists():
+        raise FileNotFoundError(f"Config file not found: {config_path}")
+    
+    with open(config_path, 'r') as f:
+        config_dict = yaml.safe_load(f)
+    
+    try:
+        return PipelineConfig(**config_dict)
+    except Exception as e:
+        raise ValueError(f"Config validation failed: {e}")
+
+
+def get_default_config() -> PipelineConfig:
+    """
+    Get default pipeline configuration.
+    
+    Returns:
+        PipelineConfig with default values
+    """
+    return PipelineConfig()
--- a/services/ml/app/db.py
+++ b/services/ml/app/db.py
@ -0,0 +1,106 @@
+"""
+Database connection and session management for the ML service.
+
+This module provides SQLAlchemy engine and session setup for PostgreSQL.
+Environment variables control the connection parameters.
+"""
+
+import os
+from contextlib import contextmanager
+from typing import Generator
+
+from sqlalchemy import create_engine, Column, Integer, String, DateTime, JSON
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import sessionmaker, Session
+from sqlalchemy.sql import func
+
+
+# Database connection configuration from environment
+DATABASE_URL = os.getenv(
+    "DATABASE_URL",
+    f"postgresql://{os.getenv('POSTGRES_USER', 'ml_user')}:"
+    f"{os.getenv('POSTGRES_PASSWORD', 'ml_password')}@"
+    f"{os.getenv('POSTGRES_HOST', 'localhost')}:"
+    f"{os.getenv('POSTGRES_PORT', '5432')}/"
+    f"{os.getenv('POSTGRES_DB', 'ml_service')}"
+)
+
+# Create SQLAlchemy engine
+engine = create_engine(
+    DATABASE_URL,
+    pool_pre_ping=True,  # Verify connections before using them
+    pool_size=5,         # Number of connections to maintain
+    max_overflow=10,     # Max connections beyond pool_size
+)
+
+# Create session factory
+SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+
+# Base class for declarative models
+Base = declarative_base()
+
+
+# Training runs model
+class TrainingRun(Base):
+    """Model for tracking ML training runs."""
+    
+    __tablename__ = "training_runs"
+    
+    id = Column(Integer, primary_key=True, index=True)
+    run_id = Column(String(255), unique=True, nullable=False, index=True)
+    model_type = Column(String(100), nullable=False)
+    experiment_name = Column(String(255), nullable=False, index=True)
+    pipeline_config_hash = Column(String(64), nullable=False)
+    dataset_version = Column(String(100))
+    metrics_summary = Column(JSON)
+    status = Column(String(50), nullable=False, default="running", index=True)
+    created_at = Column(DateTime(timezone=True), server_default=func.now(), index=True)
+    completed_at = Column(DateTime(timezone=True))
+    
+    def __repr__(self):
+        return f"<TrainingRun(run_id='{self.run_id}', status='{self.status}')>"
+
+
+def init_db():
+    """
+    Initialize the database schema.
+    Creates all tables defined by Base.metadata.
+    """
+    Base.metadata.create_all(bind=engine)
+
+
+@contextmanager
+def get_db() -> Generator[Session, None, None]:
+    """
+    Context manager for database sessions.
+    
+    Usage:
+        with get_db() as db:
+            # Use db session here
+            training_run = TrainingRun(run_id="123", ...)
+            db.add(training_run)
+            db.commit()
+    
+    Yields:
+        Database session
+    """
+    db = SessionLocal()
+    try:
+        yield db
+    finally:
+        db.close()
+
+
+def get_db_session() -> Session:
+    """
+    Get a database session (for dependency injection).
+    
+    Usage with FastAPI:
+        @app.get("/")
+        def endpoint(db: Session = Depends(get_db_session)):
+            # Use db here
+    
+    Returns:
+        Database session (caller must close it)
+    """
+    return SessionLocal()