feat(ml): add database schema, config parser, and DVC setup

- Initialize DVC with local storage backend (task 1.6)
- Create PostgreSQL schema for training_runs table (task 1.7)
- Add SQLAlchemy database connection setup (task 1.8)
- Create Pydantic config models for pipeline.yaml (task 2.1)
- Add migration runner for database setup
- Fix pyproject.toml package discovery config
This commit is contained in:
Marko Djordjevic 2026-02-15 12:08:53 +01:00
parent 1a653c5866
commit ea339a54a7
15 changed files with 412 additions and 4 deletions

View file

@ -0,0 +1,27 @@
-- Create training_runs table for tracking ML training runs.
-- Every statement in this file uses IF NOT EXISTS so that executing the
-- migration against an already-migrated database is a no-op, not an error.
CREATE TABLE IF NOT EXISTS training_runs (
    id                   SERIAL PRIMARY KEY,
    -- The UNIQUE constraint makes PostgreSQL build a unique index on run_id,
    -- so no separate lookup index is declared for this column below.
    run_id               VARCHAR(255) NOT NULL UNIQUE,
    model_type           VARCHAR(100) NOT NULL,
    experiment_name      VARCHAR(255) NOT NULL,
    -- NOTE(review): 64 characters, presumably a hex SHA-256 digest; confirm with the writer.
    pipeline_config_hash VARCHAR(64) NOT NULL,
    dataset_version      VARCHAR(100),
    metrics_summary      JSONB,
    status               VARCHAR(50) NOT NULL DEFAULT 'running',
    created_at           TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    -- NULL until a completion time is recorded.
    completed_at         TIMESTAMP WITH TIME ZONE,
    CONSTRAINT valid_status CHECK (status IN ('running', 'completed', 'failed', 'cancelled'))
);

-- Index on experiment_name for filtering by experiment
CREATE INDEX IF NOT EXISTS idx_training_runs_experiment
    ON training_runs (experiment_name);

-- Index on status for filtering active runs
CREATE INDEX IF NOT EXISTS idx_training_runs_status
    ON training_runs (status);

-- Index on created_at for chronological (newest-first) queries
CREATE INDEX IF NOT EXISTS idx_training_runs_created_at
    ON training_runs (created_at DESC);

View file

@ -0,0 +1,65 @@
#!/usr/bin/env python3
"""
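Simple database migration runner for the ML service.
Runs every *.sql file found in the same directory as this script, in sorted
filename order. All files are re-executed on each invocation, so migrations
must be idempotent.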
"""
import os
import sys
from pathlib import Path
import psycopg2
from psycopg2 import sql
def get_db_connection():
    """Open a new PostgreSQL connection configured from the environment.

    Each connection parameter is read from its POSTGRES_* environment
    variable; when a variable is unset the default listed below is used.

    Returns:
        The connection object produced by ``psycopg2.connect``.
    """
    # (connect keyword, environment variable, fallback value)
    settings = (
        ("host", "POSTGRES_HOST", "localhost"),
        ("port", "POSTGRES_PORT", "5432"),
        ("database", "POSTGRES_DB", "ml_service"),
        ("user", "POSTGRES_USER", "ml_user"),
        ("password", "POSTGRES_PASSWORD", "ml_password"),
    )
    connect_kwargs = {
        keyword: os.getenv(env_name, fallback)
        for keyword, env_name, fallback in settings
    }
    return psycopg2.connect(**connect_kwargs)
def run_migrations():
    """Run all migration files in order.

    Collects every ``*.sql`` file in the directory containing this script,
    sorts them by path, and executes each one on a single connection,
    committing after each file. No record is kept of which files have
    already been applied, so every file is executed on every call.

    Returns:
        None. Returns early (without connecting) when no ``*.sql`` files
        are found.

    Raises:
        SystemExit: with exit code 1 when executing a migration fails; the
            open transaction is rolled back first and the error is printed
            to stderr.
    """
    migrations_dir = Path(__file__).parent
    migration_files = sorted(migrations_dir.glob("*.sql"))

    if not migration_files:
        print("No migration files found")
        return

    print(f"Found {len(migration_files)} migration file(s)")

    conn = get_db_connection()
    cur = None
    try:
        # Acquire the cursor inside the try block so the connection is still
        # closed by the finally clause if cursor creation itself fails.
        cur = conn.cursor()
        for migration_file in migration_files:
            print(f"Running migration: {migration_file.name}")
            # Read as UTF-8 explicitly rather than relying on the platform's
            # default encoding, which differs between operating systems.
            migration_sql = migration_file.read_text(encoding="utf-8")
            cur.execute(migration_sql)
            # Commit per file so earlier migrations persist if a later one fails.
            conn.commit()
            print(f"{migration_file.name} completed")
        print("\nAll migrations completed successfully")
    except Exception as e:
        conn.rollback()
        print(f"\n✗ Migration failed: {e}", file=sys.stderr)
        sys.exit(1)
    finally:
        if cur is not None:
            cur.close()
        conn.close()
# Run the migrations only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_migrations()