fix: training panel stuck button, stale runs on startup, add delete model
This commit is contained in:
parent
d34dc9d729
commit
6ef102cf21
4 changed files with 240 additions and 24 deletions
|
|
@ -310,6 +310,28 @@ async def startup_event():
|
|||
Load model and pipeline config on startup.
|
||||
"""
|
||||
logger.info("Starting inference service...")
|
||||
|
||||
# Mark any stale "running" records as failed — they belong to a previous
|
||||
# process and will never complete.
|
||||
try:
|
||||
with get_db() as db:
|
||||
stmt = (
|
||||
sa_update(TrainingRun)
|
||||
.where(TrainingRun.status == "running")
|
||||
.values(
|
||||
status="failed",
|
||||
completed_at=datetime.utcnow(),
|
||||
metrics_summary={"error": "Service restarted while training was in progress"},
|
||||
)
|
||||
)
|
||||
result = db.execute(stmt)
|
||||
db.commit()
|
||||
if result.rowcount:
|
||||
logger.warning(
|
||||
f"Marked {result.rowcount} stale training run(s) as failed on startup"
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.error(f"Failed to clean up stale training runs: {exc}")
|
||||
|
||||
# Load pipeline config
|
||||
config_path = Path("config/pipeline.yaml")
|
||||
|
|
@ -1115,6 +1137,81 @@ async def training_runs():
|
|||
return TrainingRunsResponse(runs=runs)
|
||||
|
||||
|
||||
class ActiveTrainingResponse(BaseModel):
    """Response model for GET /training/active."""

    # Whether a training run is currently in progress.
    active: bool

    # Identifier of the in-progress run; defaults to None.
    run_id: Optional[str] = None
|
||||
|
||||
|
||||
@app.get("/training/active", response_model=ActiveTrainingResponse)
async def training_active():
    """
    Return whether a training run is currently active and its run_id.

    The active run id is read from shared state while holding the training
    lock; ``active`` is True exactly when that id is not None.
    """
    # Snapshot the id under the lock so the two response fields agree.
    with state.training_lock:
        current_run_id = state.active_training_run_id

    is_active = current_run_id is not None
    return ActiveTrainingResponse(active=is_active, run_id=current_run_id)
|
||||
|
||||
|
||||
class DeleteRunResponse(BaseModel):
    """Response model for DELETE /training/runs/{run_id}."""

    # Identifier of the run the delete request targeted.
    run_id: str

    # Whether the run record was deleted.
    deleted: bool
|
||||
|
||||
|
||||
@app.delete("/training/runs/{run_id}", response_model=DeleteRunResponse)
async def delete_training_run(run_id: str):
    """
    Delete a training run record and its model artifact.

    The database record is removed first. The artifact at
    ``models/<run_id>.pkl`` is then removed on a best-effort basis: a missing
    file is ignored, and any other failure is logged as a warning without
    failing the request.

    Args:
        run_id: Identifier of the training run, taken from the URL path.

    Returns:
        DeleteRunResponse with ``deleted=True`` once the record is removed.

    Raises:
        HTTPException: 409 if the run is currently active, 404 if the run_id
            doesn't exist, 500 if the database operation fails.
    """
    from sqlalchemy import select, delete as sa_delete

    # Reject deletion of the active run
    with state.training_lock:
        if state.active_training_run_id == run_id:
            raise HTTPException(
                status_code=status.HTTP_409_CONFLICT,
                detail="Cannot delete an active training run",
            )

    try:
        with get_db() as db:
            stmt = select(TrainingRun).where(TrainingRun.run_id == run_id)
            row = db.execute(stmt).scalar_one_or_none()

            if row is None:
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail=f"Training run not found: {run_id}",
                )

            db.execute(sa_delete(TrainingRun).where(TrainingRun.run_id == run_id))
            db.commit()
    except HTTPException:
        # Let the deliberate 404 above pass through untouched.
        raise
    except Exception as exc:
        # logger.exception records the traceback alongside the message.
        logger.exception(f"Failed to delete training run {run_id}: {exc}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to delete training run: {exc}",
        ) from exc

    # Remove model artifact if it exists
    artifact_name = f"{run_id}.pkl"
    model_path = Path("models") / artifact_name

    if Path(artifact_name).name != artifact_name:
        # run_id comes from the URL; never unlink a path that would resolve
        # outside the models directory.
        logger.warning(
            f"Skipping model artifact removal for unsafe run_id: {run_id!r}"
        )
    else:
        try:
            # Unlink directly instead of exists()+unlink() to avoid a
            # check-then-act race with concurrent deletions.
            model_path.unlink()
        except FileNotFoundError:
            # No artifact was ever written for this run; nothing to clean up.
            pass
        except Exception as exc:
            # Best-effort cleanup: the record is already gone, so only warn.
            logger.warning(f"Could not delete model artifact {model_path}: {exc}")
        else:
            logger.info(f"Deleted model artifact: {model_path}")

    logger.info(f"Deleted training run: {run_id}")
    return DeleteRunResponse(run_id=run_id, deleted=True)
|
||||
|
||||
|
||||
@app.get("/training/dataset-info", response_model=DatasetInfoResponse)
|
||||
async def training_dataset_info():
|
||||
"""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue