feat(ml): add database schema, config parser, and DVC setup
- Initialize DVC with local storage backend (task 1.6)
- Create PostgreSQL schema for training_runs table (task 1.7)
- Add SQLAlchemy database connection setup (task 1.8)
- Create Pydantic config models for pipeline.yaml (task 2.1)
- Add migration runner for database setup
- Fix pyproject.toml package discovery config
This commit is contained in:
parent
1a653c5866
commit
ea339a54a7
15 changed files with 412 additions and 4 deletions
27
services/ml/migrations/001_create_training_runs.sql
Normal file
27
services/ml/migrations/001_create_training_runs.sql
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
-- training_runs: one row per ML training run.
-- Safe to re-run: the table is only created when it does not already exist.
CREATE TABLE IF NOT EXISTS training_runs (
    id                   SERIAL        PRIMARY KEY,                 -- surrogate key
    run_id               VARCHAR(255)  NOT NULL UNIQUE,             -- run identifier; at most one row per value
    model_type           VARCHAR(100)  NOT NULL,
    experiment_name      VARCHAR(255)  NOT NULL,
    pipeline_config_hash VARCHAR(64)   NOT NULL,                    -- presumably a hex digest of pipeline.yaml; TODO confirm
    dataset_version      VARCHAR(100),                              -- optional
    metrics_summary      JSONB,                                     -- optional JSON document
    status               VARCHAR(50)   NOT NULL DEFAULT 'running',  -- restricted by valid_status below
    created_at           TIMESTAMPTZ   DEFAULT CURRENT_TIMESTAMP,   -- NOTE(review): nullable; confirm that is intended
    completed_at         TIMESTAMPTZ,                               -- optional

    -- Only these four lifecycle states may be stored in status.
    CONSTRAINT valid_status
        CHECK (status IN ('running', 'completed', 'failed', 'cancelled'))
);
|
||||
|
||||
-- Secondary indexes on training_runs.
-- Every statement uses IF NOT EXISTS (PostgreSQL 9.5+) so this file stays
-- re-runnable, the same way the table itself is created with
-- CREATE TABLE IF NOT EXISTS. Without it, executing this migration a second
-- time aborts with "relation ... already exists".

-- Index on run_id for faster lookups.
-- NOTE(review): run_id is declared UNIQUE, and PostgreSQL already backs a
-- UNIQUE constraint with its own index, so this one looks redundant.
-- Confirm nothing depends on the index name before dropping it.
CREATE INDEX IF NOT EXISTS idx_training_runs_run_id
    ON training_runs (run_id);

-- Index on experiment_name for filtering by experiment.
CREATE INDEX IF NOT EXISTS idx_training_runs_experiment
    ON training_runs (experiment_name);

-- Index on status for filtering active runs.
CREATE INDEX IF NOT EXISTS idx_training_runs_status
    ON training_runs (status);

-- Index on created_at (newest first) for chronological queries.
CREATE INDEX IF NOT EXISTS idx_training_runs_created_at
    ON training_runs (created_at DESC);
|
||||
65
services/ml/migrations/run_migrations.py
Executable file
65
services/ml/migrations/run_migrations.py
Executable file
|
|
@ -0,0 +1,65 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simple database migration runner for the ML service.
|
||||
Runs all SQL files in the migrations directory in order.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import psycopg2
|
||||
from psycopg2 import sql
|
||||
|
||||
|
||||
def get_db_connection():
    """Open a psycopg2 connection configured from POSTGRES_* environment variables.

    Each setting is read with ``os.getenv`` and falls back to the hard-coded
    default listed below when its variable is unset.

    Returns:
        The connection object returned by ``psycopg2.connect``.
    """
    # connect() keyword -> (environment variable, fallback value)
    settings = {
        "host": ("POSTGRES_HOST", "localhost"),
        "port": ("POSTGRES_PORT", "5432"),
        "database": ("POSTGRES_DB", "ml_service"),
        "user": ("POSTGRES_USER", "ml_user"),
        "password": ("POSTGRES_PASSWORD", "ml_password"),
    }
    connect_kwargs = {
        keyword: os.getenv(env_name, fallback)
        for keyword, (env_name, fallback) in settings.items()
    }
    return psycopg2.connect(**connect_kwargs)
|
||||
|
||||
|
||||
def run_migrations():
    """Run every ``*.sql`` file next to this script, in sorted filename order.

    Each file is sent to the server as one batch and committed on its own,
    so files that completed before a failure stay applied. Files that are
    empty or contain only whitespace are skipped, because psycopg2 refuses
    to execute an empty query.

    There is no record of which files have already been applied: every file
    is executed on every invocation, so migrations must be safe to re-run.

    Exits the process with status 1 (via ``sys.exit``) if any migration
    fails, after rolling back the failed file's transaction and printing
    the error to stderr. The cursor and connection are always closed.
    """
    migrations_dir = Path(__file__).parent
    # Lexicographic sort: ordering relies on the zero-padded numeric prefix.
    migration_files = sorted(migrations_dir.glob("*.sql"))

    if not migration_files:
        print("No migration files found")
        return

    print(f"Found {len(migration_files)} migration file(s)")

    conn = get_db_connection()
    cur = conn.cursor()

    try:
        for migration_file in migration_files:
            print(f"Running migration: {migration_file.name}")

            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's default locale encoding.
            migration_sql = migration_file.read_text(encoding="utf-8")

            # An empty statement would raise "can't execute an empty query"
            # and abort the whole run; treat such files as no-ops instead.
            if not migration_sql.strip():
                print(f" - {migration_file.name} skipped (empty file)")
                continue

            cur.execute(migration_sql)
            conn.commit()

            print(f" ✓ {migration_file.name} completed")

        print("\nAll migrations completed successfully")

    except Exception as e:
        # Undo whatever the failing file did; earlier files are already committed.
        conn.rollback()
        print(f"\n✗ Migration failed: {e}", file=sys.stderr)
        sys.exit(1)

    finally:
        # Runs even when sys.exit() is raised above.
        cur.close()
        conn.close()
|
||||
|
||||
|
||||
# Script entry point: apply the migrations only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    run_migrations()
|
||||
Loading…
Add table
Add a link
Reference in a new issue