feat(ml): add database schema, config parser, and DVC setup
- Initialize DVC with local storage backend (task 1.6) - Create PostgreSQL schema for training_runs table (task 1.7) - Add SQLAlchemy database connection setup (task 1.8) - Create Pydantic config models for pipeline.yaml (task 2.1) - Add migration runner for database setup - Fix pyproject.toml package discovery config
This commit is contained in:
parent
1a653c5866
commit
ea339a54a7
15 changed files with 412 additions and 4 deletions
1
services/ml/app/__init__.py
Normal file
1
services/ml/app/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
"""ML service application package."""
|
||||
147
services/ml/app/config.py
Normal file
147
services/ml/app/config.py
Normal file
|
|
@ -0,0 +1,147 @@
|
|||
"""
|
||||
Pipeline configuration module.
|
||||
|
||||
Pydantic models for validating and loading the pipeline.yaml configuration.
|
||||
"""
|
||||
|
||||
from typing import List, Dict, Any, Optional, Literal
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
|
||||
|
||||
class TALibIndicator(BaseModel):
|
||||
"""Configuration for a single TA-Lib indicator."""
|
||||
name: str
|
||||
params: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class FeatureEngineeringConfig(BaseModel):
|
||||
"""Feature engineering stage configuration."""
|
||||
enabled: bool = True
|
||||
talib_indicators: List[TALibIndicator] = Field(default_factory=list)
|
||||
candle_features: bool = True
|
||||
custom_features: List[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class ProgrammaticLabelsConfig(BaseModel):
|
||||
"""Configuration for programmatic TA-Lib pattern labels."""
|
||||
enabled: bool = True
|
||||
talib_patterns: List[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class AnnotationIngestionConfig(BaseModel):
|
||||
"""Annotation ingestion stage configuration."""
|
||||
enabled: bool = True
|
||||
label_encoding: Literal["window", "bio"] = "window"
|
||||
window_size: int = 30
|
||||
context_padding: int = 20
|
||||
min_confidence: int = 1
|
||||
programmatic_labels: ProgrammaticLabelsConfig = Field(
|
||||
default_factory=ProgrammaticLabelsConfig
|
||||
)
|
||||
merge_strategy: Literal["human_priority", "programmatic_priority", "both"] = "human_priority"
|
||||
|
||||
|
||||
class MLflowConfig(BaseModel):
|
||||
"""MLflow experiment tracking configuration."""
|
||||
tracking_uri: str = "http://mlflow:5000"
|
||||
experiment_name: str = "candlestick_patterns"
|
||||
log_artifacts: bool = True
|
||||
register_model: bool = False
|
||||
|
||||
|
||||
class TrainingConfig(BaseModel):
|
||||
"""Training stage configuration."""
|
||||
enabled: bool = True
|
||||
model_type: Literal["random_forest", "xgboost"] = "random_forest"
|
||||
split_method: Literal["temporal", "random"] = "temporal"
|
||||
test_split: float = Field(0.2, ge=0.0, le=1.0)
|
||||
validation_split: float = Field(0.1, ge=0.0, le=1.0)
|
||||
class_weights: Optional[Literal["balanced"]] = "balanced"
|
||||
hyperparameters: Dict[str, Any] = Field(default_factory=dict)
|
||||
mlflow: MLflowConfig = Field(default_factory=MLflowConfig)
|
||||
|
||||
@field_validator("test_split", "validation_split")
|
||||
@classmethod
|
||||
def validate_split(cls, v):
|
||||
if not 0.0 <= v <= 1.0:
|
||||
raise ValueError("Split must be between 0.0 and 1.0")
|
||||
return v
|
||||
|
||||
|
||||
class InferenceConfig(BaseModel):
|
||||
"""Inference stage configuration."""
|
||||
enabled: bool = True
|
||||
model_source: Literal["mlflow", "local"] = "local"
|
||||
mlflow_model_name: Optional[str] = "candlestick_pattern_v1"
|
||||
mlflow_model_stage: Literal["Production", "Staging", "None"] = "Production"
|
||||
local_model_path: str = "models/best_model.pkl"
|
||||
batch_size: int = 1000
|
||||
use_training_config: bool = True
|
||||
|
||||
|
||||
class DataConfig(BaseModel):
|
||||
"""Data paths configuration."""
|
||||
raw_path: str = "data/raw/OHLCV.csv"
|
||||
enriched_path: str = "data/enriched/features.csv"
|
||||
labeled_path: str = "data/labeled/dataset.csv"
|
||||
annotations_path: str = "data/annotations/export.json"
|
||||
|
||||
|
||||
class StagesConfig(BaseModel):
|
||||
"""All pipeline stages configuration."""
|
||||
feature_engineering: FeatureEngineeringConfig = Field(
|
||||
default_factory=FeatureEngineeringConfig
|
||||
)
|
||||
annotation_ingestion: AnnotationIngestionConfig = Field(
|
||||
default_factory=AnnotationIngestionConfig
|
||||
)
|
||||
training: TrainingConfig = Field(default_factory=TrainingConfig)
|
||||
inference: InferenceConfig = Field(default_factory=InferenceConfig)
|
||||
|
||||
|
||||
class PipelineConfig(BaseModel):
|
||||
"""Root pipeline configuration."""
|
||||
data: DataConfig = Field(default_factory=DataConfig)
|
||||
stages: StagesConfig = Field(default_factory=StagesConfig)
|
||||
|
||||
|
||||
def load_config(config_path: str | Path) -> PipelineConfig:
|
||||
"""
|
||||
Load and validate pipeline configuration from YAML file.
|
||||
|
||||
Args:
|
||||
config_path: Path to pipeline.yaml file
|
||||
|
||||
Returns:
|
||||
Validated PipelineConfig object
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If config file doesn't exist
|
||||
ValueError: If config validation fails
|
||||
yaml.YAMLError: If YAML parsing fails
|
||||
"""
|
||||
config_path = Path(config_path)
|
||||
|
||||
if not config_path.exists():
|
||||
raise FileNotFoundError(f"Config file not found: {config_path}")
|
||||
|
||||
with open(config_path, 'r') as f:
|
||||
config_dict = yaml.safe_load(f)
|
||||
|
||||
try:
|
||||
return PipelineConfig(**config_dict)
|
||||
except Exception as e:
|
||||
raise ValueError(f"Config validation failed: {e}")
|
||||
|
||||
|
||||
def get_default_config() -> PipelineConfig:
|
||||
"""
|
||||
Get default pipeline configuration.
|
||||
|
||||
Returns:
|
||||
PipelineConfig with default values
|
||||
"""
|
||||
return PipelineConfig()
|
||||
106
services/ml/app/db.py
Normal file
106
services/ml/app/db.py
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
"""
|
||||
Database connection and session management for the ML service.
|
||||
|
||||
This module provides SQLAlchemy engine and session setup for PostgreSQL.
|
||||
Environment variables control the connection parameters.
|
||||
"""
|
||||
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
from typing import Generator
|
||||
|
||||
from sqlalchemy import create_engine, Column, Integer, String, DateTime, JSON
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import sessionmaker, Session
|
||||
from sqlalchemy.sql import func
|
||||
|
||||
|
||||
# Database connection configuration from environment
|
||||
DATABASE_URL = os.getenv(
|
||||
"DATABASE_URL",
|
||||
f"postgresql://{os.getenv('POSTGRES_USER', 'ml_user')}:"
|
||||
f"{os.getenv('POSTGRES_PASSWORD', 'ml_password')}@"
|
||||
f"{os.getenv('POSTGRES_HOST', 'localhost')}:"
|
||||
f"{os.getenv('POSTGRES_PORT', '5432')}/"
|
||||
f"{os.getenv('POSTGRES_DB', 'ml_service')}"
|
||||
)
|
||||
|
||||
# Create SQLAlchemy engine
|
||||
engine = create_engine(
|
||||
DATABASE_URL,
|
||||
pool_pre_ping=True, # Verify connections before using them
|
||||
pool_size=5, # Number of connections to maintain
|
||||
max_overflow=10, # Max connections beyond pool_size
|
||||
)
|
||||
|
||||
# Create session factory
|
||||
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
||||
|
||||
# Base class for declarative models
|
||||
Base = declarative_base()
|
||||
|
||||
|
||||
# Training runs model
|
||||
class TrainingRun(Base):
|
||||
"""Model for tracking ML training runs."""
|
||||
|
||||
__tablename__ = "training_runs"
|
||||
|
||||
id = Column(Integer, primary_key=True, index=True)
|
||||
run_id = Column(String(255), unique=True, nullable=False, index=True)
|
||||
model_type = Column(String(100), nullable=False)
|
||||
experiment_name = Column(String(255), nullable=False, index=True)
|
||||
pipeline_config_hash = Column(String(64), nullable=False)
|
||||
dataset_version = Column(String(100))
|
||||
metrics_summary = Column(JSON)
|
||||
status = Column(String(50), nullable=False, default="running", index=True)
|
||||
created_at = Column(DateTime(timezone=True), server_default=func.now(), index=True)
|
||||
completed_at = Column(DateTime(timezone=True))
|
||||
|
||||
def __repr__(self):
|
||||
return f"<TrainingRun(run_id='{self.run_id}', status='{self.status}')>"
|
||||
|
||||
|
||||
def init_db():
|
||||
"""
|
||||
Initialize the database schema.
|
||||
Creates all tables defined by Base.metadata.
|
||||
"""
|
||||
Base.metadata.create_all(bind=engine)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def get_db() -> Generator[Session, None, None]:
|
||||
"""
|
||||
Context manager for database sessions.
|
||||
|
||||
Usage:
|
||||
with get_db() as db:
|
||||
# Use db session here
|
||||
training_run = TrainingRun(run_id="123", ...)
|
||||
db.add(training_run)
|
||||
db.commit()
|
||||
|
||||
Yields:
|
||||
Database session
|
||||
"""
|
||||
db = SessionLocal()
|
||||
try:
|
||||
yield db
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
def get_db_session() -> Session:
|
||||
"""
|
||||
Get a database session (for dependency injection).
|
||||
|
||||
Usage with FastAPI:
|
||||
@app.get("/")
|
||||
def endpoint(db: Session = Depends(get_db_session)):
|
||||
# Use db here
|
||||
|
||||
Returns:
|
||||
Database session (caller must close it)
|
||||
"""
|
||||
return SessionLocal()
|
||||
Loading…
Add table
Add a link
Reference in a new issue