feat(ml): add database schema, config parser, and DVC setup

- Initialize DVC with local storage backend (task 1.6)
- Create PostgreSQL schema for training_runs table (task 1.7)
- Add SQLAlchemy database connection setup (task 1.8)
- Create Pydantic config models for pipeline.yaml (task 2.1)
- Add migration runner for database setup
- Fix pyproject.toml package discovery config
2026-02-15 12:08:53 +01:00 · 2026-02-15 12:08:53 +01:00 · ea339a54a7
commit ea339a54a7
parent 1a653c5866
15 changed files with 412 additions and 4 deletions
--- a/services/ml/migrations/001_create_training_runs.sql
+++ b/services/ml/migrations/001_create_training_runs.sql
@ -0,0 +1,27 @@
+-- Create training_runs table for tracking ML training runs.
+-- Every statement uses IF NOT EXISTS so this file can be executed repeatedly:
+-- run_migrations.py re-runs all *.sql files on each invocation.
+CREATE TABLE IF NOT EXISTS training_runs (
+    id SERIAL PRIMARY KEY,
+    run_id VARCHAR(255) NOT NULL UNIQUE,
+    model_type VARCHAR(100) NOT NULL,
+    experiment_name VARCHAR(255) NOT NULL,
+    pipeline_config_hash VARCHAR(64) NOT NULL,
+    dataset_version VARCHAR(100),
+    metrics_summary JSONB,
+    status VARCHAR(50) NOT NULL DEFAULT 'running',
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    completed_at TIMESTAMP WITH TIME ZONE,
+
+    CONSTRAINT valid_status CHECK (status IN ('running', 'completed', 'failed', 'cancelled'))
+);
+
+-- Index on run_id for faster lookups.
+-- NOTE(review): the UNIQUE constraint on run_id already creates an index, so
+-- this one looks redundant; kept so the resulting schema is unchanged.
+CREATE INDEX IF NOT EXISTS idx_training_runs_run_id ON training_runs(run_id);
+
+-- Index on experiment_name for filtering by experiment
+CREATE INDEX IF NOT EXISTS idx_training_runs_experiment ON training_runs(experiment_name);
+
+-- Index on status for filtering active runs
+CREATE INDEX IF NOT EXISTS idx_training_runs_status ON training_runs(status);
+
+-- Index on created_at (descending) for chronological queries
+CREATE INDEX IF NOT EXISTS idx_training_runs_created_at ON training_runs(created_at DESC);
--- a/services/ml/migrations/run_migrations.py
+++ b/services/ml/migrations/run_migrations.py
@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+"""
+Simple database migration runner for the ML service.
+Runs all SQL files in the migrations directory in order.
+"""
+
+import os
+import sys
+from pathlib import Path
+
+import psycopg2
+from psycopg2 import sql
+
+
+def get_db_connection():
+    """Get database connection from environment variables."""
+    return psycopg2.connect(
+        host=os.getenv("POSTGRES_HOST", "localhost"),
+        port=os.getenv("POSTGRES_PORT", "5432"),
+        database=os.getenv("POSTGRES_DB", "ml_service"),
+        user=os.getenv("POSTGRES_USER", "ml_user"),
+        password=os.getenv("POSTGRES_PASSWORD", "ml_password")
+    )
+
+
+def run_migrations():
+    """Run all migration files in order."""
+    migrations_dir = Path(__file__).parent
+    migration_files = sorted(migrations_dir.glob("*.sql"))
+    
+    if not migration_files:
+        print("No migration files found")
+        return
+    
+    print(f"Found {len(migration_files)} migration file(s)")
+    
+    conn = get_db_connection()
+    cur = conn.cursor()
+    
+    try:
+        for migration_file in migration_files:
+            print(f"Running migration: {migration_file.name}")
+            
+            with open(migration_file, 'r') as f:
+                migration_sql = f.read()
+            
+            cur.execute(migration_sql)
+            conn.commit()
+            
+            print(f"  ✓ {migration_file.name} completed")
+        
+        print("\nAll migrations completed successfully")
+        
+    except Exception as e:
+        conn.rollback()
+        print(f"\n✗ Migration failed: {e}", file=sys.stderr)
+        sys.exit(1)
+        
+    finally:
+        cur.close()
+        conn.close()
+
+
+# Run the migrations only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    run_migrations()