security: replace exception details with generic error in ML service HTTP responses

Replace all instances of `detail=str(e)`, `detail=f"...{exc}"`, and similar
patterns that exposed internal exception messages to HTTP clients in
services/ml/app/main.py. All exception details are now logged server-side
only via logger.error(), while clients receive a generic "Internal server error"
message. Fixes 9 handlers across predict, batch predict, pattern detection,
training start, training runs fetch, training run delete, dataset info,
build dataset, and model load endpoints.

Mark task 5.1 as completed in tasks.md.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Marko Djordjevic 2026-02-18 11:23:41 +01:00
parent 4c53ef7cae
commit b7f9b2e04d
2 changed files with 14 additions and 13 deletions

View file

@ -41,7 +41,7 @@
## 5. ML Service Hardening (Python)
- [ ] 5.1 `[sonnet]` Replace `error.message` / traceback details with generic `"Internal server error"` in FastAPI exception handlers at lines 640, 778, 1091, 1134, 1199, 1296 of `services/ml/app/main.py`
- [x] 5.1 `[sonnet]` Replace `error.message` / traceback details with generic `"Internal server error"` in FastAPI exception handlers at lines 640, 778, 1091, 1134, 1199, 1296 of `services/ml/app/main.py`
- [ ] 5.2 `[opus]` Add SHA256 model integrity check: create `models/checksums.sha256` manifest, verify hash before `joblib.load()` in `services/ml/app/main.py:266`
- [ ] 5.3 `[sonnet]` Add `_model_swap_lock` to prediction reads (not just writes) in `services/ml/app/main.py` for thread-safe model access
- [ ] 5.4 `[sonnet]` Add date range validation (max 1 year) to `POST /predict/batch` in `services/ml/app/main.py`

View file

@ -651,13 +651,13 @@ async def predict(request: PredictRequest):
logger.error(f"Prediction validation error: {e}")
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=str(e)
detail="Internal server error"
)
except Exception as e:
logger.error(f"Prediction failed: {e}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Prediction failed: {str(e)}"
detail="Internal server error"
)
@ -795,7 +795,7 @@ async def predict_batch(request: BatchPredictRequest):
logger.error(f"Batch prediction failed: {e}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Batch prediction failed: {str(e)}"
detail="Internal server error"
)
@ -865,9 +865,10 @@ async def patterns_detect(request: DetectPatternsRequest):
try:
raw_annotations = detect_patterns(candles_data, request.patterns or None)
except RuntimeError as exc:
logger.error(f"Pattern detection runtime error: {exc}")
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail=str(exc),
detail="Internal server error",
)
annotations = [SpanAnnotation(**ann) for ann in raw_annotations]
@ -1185,7 +1186,7 @@ async def training_start(request: TrainingStartRequest):
logger.error(f"Failed to insert training run record: {exc}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to create training run record: {exc}",
detail="Internal server error",
)
# Launch background thread (daemon so it doesn't block process exit)
@ -1228,7 +1229,7 @@ async def training_runs():
logger.error(f"Failed to fetch training runs: {exc}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to fetch training runs: {exc}",
detail="Internal server error",
)
return TrainingRunsResponse(runs=runs)
@ -1301,7 +1302,7 @@ async def delete_training_run(run_id: str):
logger.error(f"Failed to delete training run {run_id}: {exc}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to delete training run: {exc}",
detail="Internal server error",
)
# Remove model artifact if it exists
@ -1360,7 +1361,7 @@ async def training_dataset_info():
logger.error(f"Failed to get dataset info: {exc}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to get dataset info: {exc}",
detail="Internal server error",
)
@ -1387,12 +1388,12 @@ async def training_build_dataset():
result = build_dataset_from_db(config)
return BuildDatasetResponse(**result)
except ValueError as exc:
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(exc))
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Internal server error")
except Exception as exc:
logger.error(f"Failed to build dataset: {exc}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to build dataset: {exc}",
detail="Internal server error",
)
@ -1443,7 +1444,7 @@ async def model_load(request: ModelLoadRequest):
logger.error(f"DB lookup failed for run_id={request.run_id}: {exc}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Database error: {exc}",
detail="Internal server error",
)
if row is None:
@ -1479,7 +1480,7 @@ async def model_load(request: ModelLoadRequest):
logger.error(f"Failed to load model for run_id={request.run_id}: {exc}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to load model: {exc}",
detail="Internal server error",
)
# 4. Thread-safe model swap (3.2 brief lock)