fix: training panel stuck button, stale runs on startup, add delete model

This commit is contained in:
Marko Djordjevic 2026-02-17 22:23:50 +01:00
parent d34dc9d729
commit 6ef102cf21
4 changed files with 240 additions and 24 deletions

View file

@ -310,6 +310,28 @@ async def startup_event():
Load model and pipeline config on startup.
"""
logger.info("Starting inference service...")
# Mark any stale "running" records as failed — they belong to a previous
# process and will never complete.
try:
with get_db() as db:
stmt = (
sa_update(TrainingRun)
.where(TrainingRun.status == "running")
.values(
status="failed",
completed_at=datetime.utcnow(),
metrics_summary={"error": "Service restarted while training was in progress"},
)
)
result = db.execute(stmt)
db.commit()
if result.rowcount:
logger.warning(
f"Marked {result.rowcount} stale training run(s) as failed on startup"
)
except Exception as exc:
logger.error(f"Failed to clean up stale training runs: {exc}")
# Load pipeline config
config_path = Path("config/pipeline.yaml")
@ -1115,6 +1137,81 @@ async def training_runs():
return TrainingRunsResponse(runs=runs)
class ActiveTrainingResponse(BaseModel):
    """Payload returned by GET /training/active."""

    # Whether a training run id is currently registered as active.
    active: bool
    # Identifier of the active run; left as None when no run is active.
    run_id: Optional[str] = None
@app.get("/training/active", response_model=ActiveTrainingResponse)
async def training_active():
    """
    Report the currently active training run, if any.

    Returns:
        ActiveTrainingResponse whose ``active`` flag is True exactly when
        ``state.active_training_run_id`` is set, together with that run_id.
    """
    # Take a snapshot under the lock so the flag and the id reported below
    # come from the same read.
    with state.training_lock:
        current_run_id = state.active_training_run_id

    is_active = current_run_id is not None
    return ActiveTrainingResponse(active=is_active, run_id=current_run_id)
class DeleteRunResponse(BaseModel):
    """Payload returned by DELETE /training/runs/{run_id}."""

    # Identifier of the run the delete request targeted.
    run_id: str
    # Outcome flag for the deletion.
    deleted: bool
@app.delete("/training/runs/{run_id}", response_model=DeleteRunResponse)
async def delete_training_run(run_id: str):
    """
    Delete a training run record and its model artifact.

    The database row is removed first. The artifact at
    ``models/<run_id>.pkl`` is then removed on a best-effort basis: a failure
    to delete the file is logged as a warning and does not fail the request.

    Args:
        run_id: Identifier of the training run to delete.

    Returns:
        DeleteRunResponse with ``deleted=True`` once the record is removed.

    Raises:
        HTTPException: 409 if the run is currently active, 404 if the
            run_id doesn't exist, 500 if the database operation fails.
    """
    from sqlalchemy import select, delete as sa_delete

    # Reject deletion of the active run
    with state.training_lock:
        if state.active_training_run_id == run_id:
            raise HTTPException(
                status_code=status.HTTP_409_CONFLICT,
                detail="Cannot delete an active training run",
            )

    try:
        with get_db() as db:
            stmt = select(TrainingRun).where(TrainingRun.run_id == run_id)
            row = db.execute(stmt).scalar_one_or_none()
            if row is None:
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail=f"Training run not found: {run_id}",
                )
            db.execute(sa_delete(TrainingRun).where(TrainingRun.run_id == run_id))
            db.commit()
    except HTTPException:
        # The 404 raised above must reach the client unchanged rather than
        # being rewritten into a 500 by the generic handler below.
        raise
    except Exception as exc:
        # logger.exception records the traceback; chaining keeps the root
        # cause attached to the HTTP error.
        logger.exception(f"Failed to delete training run {run_id}: {exc}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to delete training run: {exc}",
        ) from exc

    # Remove model artifact if it exists. Attempt the unlink directly instead
    # of checking exists() first, so a file that disappears concurrently is
    # treated as "nothing to delete" rather than producing a warning.
    # NOTE(review): run_id comes from the URL and is interpolated into the
    # path; this relies on only ids present in the database reaching here.
    model_path = Path("models") / f"{run_id}.pkl"
    try:
        model_path.unlink()
    except FileNotFoundError:
        pass  # No artifact was stored for this run.
    except OSError as exc:
        logger.warning(f"Could not delete model artifact {model_path}: {exc}")
    else:
        logger.info(f"Deleted model artifact: {model_path}")

    logger.info(f"Deleted training run: {run_id}")
    return DeleteRunResponse(run_id=run_id, deleted=True)
@app.get("/training/dataset-info", response_model=DatasetInfoResponse)
async def training_dataset_info():
"""