feat: auto-build training dataset from DB annotations before training

- Add build_dataset_from_db() that exports candles from the DB, runs feature engineering, and ingests span annotations into a labeled CSV
- Call it automatically in _run_training_background before training starts
- Add POST /training/build-dataset endpoint for standalone use
- Add Next.js proxy route /api/training/build-dataset
- Update TrainingPanel: remove the dataset-missing block on Start Training and show an informational message that the dataset builds automatically
2026-02-18 00:24:39 +01:00 · 2026-02-18 00:24:39 +01:00 · d3dcfcea7d
commit d3dcfcea7d
parent b4956f3fb9
4 changed files with 148 additions and 5 deletions
--- a/RAZNO/annotations.csv
+++ b/RAZNO/annotations.csv
@ -0,0 +1 @@
 timestamp,label_type,price
--- a/services/ml/app/main.py
+++ b/services/ml/app/main.py
@ -907,9 +907,82 @@ class DatasetInfoResponse(BaseModel):
    row_count: Optional[int] = None
 def build_dataset_from_db(config: PipelineConfig) -> dict:
    """
    Build the labeled training dataset directly from the database.

    Steps:
    1. Export candles from PostgreSQL to raw CSV
    2. Run feature engineering (TA-Lib indicators, candle features)
    3. Run annotation ingestion from DB (span_annotations -> labeled CSV)

    Side effects:
        Overwrites the CSV at ``config.data.raw_path`` and the CSV at
        ``config.data.labeled_path`` (parent directories are created when
        missing). The enriched CSV at ``config.data.enriched_path`` is read
        back after the feature engineering stage has run.

    Args:
        config: Pipeline configuration providing ``data.raw_path``,
            ``data.enriched_path``, ``data.labeled_path`` and
            ``stages.annotation_ingestion``.

    Returns:
        dict with keys: chart_name, n_candles, n_samples, n_features,
        labeled_path

    Raises:
        ValueError: If the database has no charts, the selected chart has no
            candles, or annotation ingestion yields no labeled samples.
    """
    from app.data_access import DataAccess
    from app.annotation_ingestion import AnnotationIngestion
    from features.engineer import run_feature_engineering_stage

    data_access = DataAccess()

    # Find all charts, use the first one (single-chart app)
    charts_df = data_access.get_all_charts()
    if charts_df.empty:
        raise ValueError("No charts found in database. Upload candle data first.")
    chart = charts_df.iloc[0]
    chart_name = chart["name"]
    chart_id = int(chart["id"])
    logger.info(f"Building dataset for chart: {chart_name} (id={chart_id})")

    # Step 1: Export candles to raw CSV
    candles_df = data_access.get_candles(chart_id)
    if candles_df.empty:
        raise ValueError(f"No candles found for chart: {chart_name}")
    raw_path = Path(config.data.raw_path)
    raw_path.parent.mkdir(parents=True, exist_ok=True)
    # Only the time + OHLC columns are exported; any other candle columns
    # returned by the database are dropped from the raw CSV.
    export_df = candles_df[["time", "open", "high", "low", "close"]]
    export_df.to_csv(raw_path, index=False)
    logger.info(f"Exported {len(export_df)} candles to {raw_path}")

    # Step 2: Run feature engineering
    # NOTE(review): presumably this stage reads config.data.raw_path and
    # writes config.data.enriched_path -- confirm against features.engineer.
    run_feature_engineering_stage(config)
    enriched_path = Path(config.data.enriched_path)
    logger.info(f"Feature engineering complete: {enriched_path}")

    # Step 3: Run annotation ingestion from database
    enriched_df = pd.read_csv(enriched_path, parse_dates=["time"])
    ingestion = AnnotationIngestion(config.stages.annotation_ingestion)
    labeled_df = ingestion.process_from_db(enriched_df, chart_name, source="human")
    if labeled_df.empty:
        raise ValueError(
            f"No labeled samples produced. "
            f"Ensure you have span annotations on chart '{chart_name}'."
        )

    # Write labeled dataset
    labeled_path = Path(config.data.labeled_path)
    labeled_path.parent.mkdir(parents=True, exist_ok=True)
    labeled_df.to_csv(labeled_path, index=False)

    result = {
        "chart_name": chart_name,
        "n_candles": len(export_df),
        "n_samples": len(labeled_df),
        # Every column except the target "label" is counted as a feature.
        "n_features": sum(1 for c in labeled_df.columns if c != "label"),
        "labeled_path": str(labeled_path),
    }
    logger.info(f"Dataset built: {result}")
    return result
 def _run_training_background(run_id: str, model_type: str, config: PipelineConfig) -> None:
    """
-    Background thread target: train a model, update DB on completion or failure.
+    Background thread target: build dataset then train a model.
    Uses the pre-inserted TrainingRun record identified by ``run_id``.
    """
@ -920,6 +993,10 @@ def _run_training_background(run_id: str, model_type: str, config: PipelineConfi
        from training.train import create_model, temporal_split
        from sklearn.metrics import accuracy_score, f1_score
        # Build dataset from database (feature engineering + annotation ingestion)
        logger.info("Building dataset from database...")
        build_dataset_from_db(config)
        labeled_path = Path(config.data.labeled_path)
        if not labeled_path.exists():
            raise FileNotFoundError(f"Labeled dataset not found: {labeled_path}")
@ -1256,6 +1333,38 @@ async def training_dataset_info():
        )
 class BuildDatasetResponse(BaseModel):
    """Response model for POST /training/build-dataset."""
    # Fields mirror the keys of the dict returned by build_dataset_from_db().
    # Name of the chart the dataset was built from.
    chart_name: str
    # Number of candle rows exported to the raw CSV.
    n_candles: int
    # Number of rows in the labeled dataset.
    n_samples: int
    # Number of labeled-dataset columns other than "label".
    n_features: int
    # Path of the labeled CSV that was written, as a string.
    labeled_path: str
@app.post("/training/build-dataset", response_model=BuildDatasetResponse)
async def training_build_dataset():
    """
    Build the labeled training dataset from database annotations.

    Exports candles, runs feature engineering, and ingests span annotations
    into a labeled CSV ready for training.

    Returns:
        BuildDatasetResponse populated from the dict returned by
        build_dataset_from_db().

    Raises:
        HTTPException: 400 when the build raises ValueError, 500 for any
            other failure. The original exception is chained as the cause.
    """
    # Fall back to the default pipeline config when none has been loaded.
    config = state.pipeline_config or get_default_config()
    try:
        # NOTE(review): build_dataset_from_db() is synchronous and is called
        # directly inside this coroutine, so the event loop is blocked for
        # the duration of the build.
        result = build_dataset_from_db(config)
        return BuildDatasetResponse(**result)
    except ValueError as exc:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST, detail=str(exc)
        ) from exc
    except Exception as exc:
        logger.error(f"Failed to build dataset: {exc}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to build dataset: {exc}",
        ) from exc
 # ---------------------------------------------------------------------------
 # Model Loading Endpoint
 # ---------------------------------------------------------------------------
--- a/src/app/api/training/build-dataset/route.ts
+++ b/src/app/api/training/build-dataset/route.ts
@ -0,0 +1,34 @@
 import { NextResponse } from 'next/server';
 // Base URL of the inference service this route proxies to.
 const INFERENCE_API_URL = process.env.INFERENCE_API_URL || 'http://localhost:8001';
 // Timeout (ms) used when INFERENCE_API_TIMEOUT is unset or unusable.
 const DEFAULT_INFERENCE_API_TIMEOUT_MS = 120000;
 const parsedInferenceTimeout = Number.parseInt(process.env.INFERENCE_API_TIMEOUT || '', 10);
 // A non-numeric or non-positive value would make the abort timer fire almost
 // immediately, so fall back to the default instead of passing NaN/0 to setTimeout.
 const INFERENCE_API_TIMEOUT =
  Number.isFinite(parsedInferenceTimeout) && parsedInferenceTimeout > 0
    ? parsedInferenceTimeout
    : DEFAULT_INFERENCE_API_TIMEOUT_MS;
 /**
  * Proxy POST /api/training/build-dataset to the inference service's
  * /training/build-dataset endpoint.
  *
  * Responses:
  * - upstream success: the upstream JSON body is returned as-is
  * - upstream error: `{ error }` with the upstream status code
  * - 502 when a successful upstream response has no parseable JSON body
  * - 504 when the request is aborted by the timeout
  * - 503 when the inference service cannot be reached
  * - 500 for any other failure
  */
 export async function POST() {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), INFERENCE_API_TIMEOUT);
  try {
    const response = await fetch(`${INFERENCE_API_URL}/training/build-dataset`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      signal: controller.signal,
    });
    // Read the body while the abort timer is still armed so a stalled body
    // is covered by the same timeout as the request itself.
    const rawBody = await response.text();
    let data: any = null;
    try {
      data = rawBody ? JSON.parse(rawBody) : null;
    } catch {
      // Non-JSON body (e.g. an HTML error page); handled below.
      data = null;
    }
    if (!response.ok) {
      // Keep the upstream status even when the error body is not JSON.
      return NextResponse.json({ error: data?.detail || 'Failed to build dataset' }, { status: response.status });
    }
    if (data === null) {
      return NextResponse.json({ error: 'Invalid response from inference service' }, { status: 502 });
    }
    return NextResponse.json(data);
  } catch (error: any) {
    if (error.name === 'AbortError') {
      return NextResponse.json({ error: 'Dataset build timed out' }, { status: 504 });
    }
    if (error.cause?.code === 'ECONNREFUSED' || error.message?.includes('fetch failed')) {
      return NextResponse.json({ error: 'Inference service unavailable' }, { status: 503 });
    }
    console.error('training/build-dataset proxy error:', error);
    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
  } finally {
    // Single cleanup point for the timer on every exit path.
    clearTimeout(timeoutId);
  }
 }
--- a/src/components/TrainingPanel.tsx
+++ b/src/components/TrainingPanel.tsx
@ -228,8 +228,7 @@ export default function TrainingPanel() {
    }
  };
-  const datasetMissing = datasetInfo !== null && !datasetInfo.exists;
+  const canTrain = !isTraining;
  const canTrain = !isTraining && !datasetMissing && datasetInfo !== null;
  return (
    <div>
@ -272,8 +271,8 @@ export default function TrainingPanel() {
                )}
              </>
            ) : (
-              <p className="text-orange-500">
+              <p className="text-muted-foreground">
-                No training dataset found. Export annotations first.
+                No cached dataset. It will be built automatically from your annotations when training starts.
              </p>
            )}
          </div>