def build_dataset_from_db(config: PipelineConfig) -> dict:
    """
    Build the labeled training dataset directly from the database.

    Steps:
    1. Export candles from PostgreSQL to raw CSV
    2. Run feature engineering (TA-Lib indicators, candle features)
    3. Run annotation ingestion from DB (span_annotations -> labeled CSV)

    Args:
        config: Pipeline configuration. This function reads
            ``config.data.raw_path``, ``config.data.enriched_path``,
            ``config.data.labeled_path`` and
            ``config.stages.annotation_ingestion``.

    Returns:
        dict with keys: chart_name, n_candles, n_samples, n_features,
        labeled_path

    Raises:
        ValueError: If the database has no charts, the selected chart has no
            candles, or annotation ingestion yields no labeled samples.
    """
    # Imported lazily, as in the rest of this module's pipeline entry points.
    from app.data_access import DataAccess
    from app.annotation_ingestion import AnnotationIngestion
    from features.engineer import run_feature_engineering_stage

    data_access = DataAccess()

    # Find all charts, use the first one (single-chart app)
    charts_df = data_access.get_all_charts()
    if charts_df.empty:
        raise ValueError("No charts found in database. Upload candle data first.")
    if len(charts_df) > 1:
        # The single-chart assumption is silent otherwise; make it visible.
        logger.warning(
            "Found %d charts; building dataset from the first one only",
            len(charts_df),
        )

    chart = charts_df.iloc[0]
    chart_name = chart["name"]
    chart_id = int(chart["id"])
    logger.info("Building dataset for chart: %s (id=%s)", chart_name, chart_id)

    # Step 1: Export candles to raw CSV
    candles_df = data_access.get_candles(chart_id)
    if candles_df.empty:
        raise ValueError(f"No candles found for chart: {chart_name}")

    raw_path = Path(config.data.raw_path)
    raw_path.parent.mkdir(parents=True, exist_ok=True)

    # Export only the columns written to the raw CSV; the values themselves
    # are passed through unchanged.
    export_df = candles_df[["time", "open", "high", "low", "close"]].copy()
    export_df.to_csv(raw_path, index=False)
    logger.info("Exported %d candles to %s", len(export_df), raw_path)

    # Step 2: Run feature engineering
    # NOTE(review): presumably this stage reads raw_path and writes
    # enriched_path based on ``config`` -- confirm in features.engineer.
    run_feature_engineering_stage(config)
    enriched_path = Path(config.data.enriched_path)
    logger.info("Feature engineering complete: %s", enriched_path)

    # Step 3: Run annotation ingestion from database
    enriched_df = pd.read_csv(enriched_path, parse_dates=["time"])
    ingestion = AnnotationIngestion(config.stages.annotation_ingestion)
    labeled_df = ingestion.process_from_db(enriched_df, chart_name, source="human")

    if labeled_df.empty:
        raise ValueError(
            "No labeled samples produced. "
            f"Ensure you have span annotations on chart '{chart_name}'."
        )

    # Write labeled dataset
    labeled_path = Path(config.data.labeled_path)
    labeled_path.parent.mkdir(parents=True, exist_ok=True)
    labeled_df.to_csv(labeled_path, index=False)

    # NOTE(review): n_features counts every column except "label"; if the
    # labeled frame still carries non-feature columns (e.g. "time") they are
    # included in the count -- confirm against AnnotationIngestion output.
    n_features = sum(1 for column in labeled_df.columns if column != "label")

    result = {
        "chart_name": chart_name,
        "n_candles": len(export_df),
        "n_samples": len(labeled_df),
        "n_features": n_features,
        "labeled_path": str(labeled_path),
    }
    logger.info("Dataset built: %s", result)
    return result
class BuildDatasetResponse(BaseModel):
    """Response model for POST /training/build-dataset."""

    # Fields mirror the keys of the dict returned by build_dataset_from_db.
    chart_name: str
    n_candles: int
    n_samples: int
    n_features: int
    labeled_path: str


@app.post("/training/build-dataset", response_model=BuildDatasetResponse)
async def training_build_dataset():
    """
    Build the labeled training dataset from database annotations.

    Exports candles, runs feature engineering, and ingests span annotations
    into a labeled CSV ready for training.

    The build is synchronous, blocking work (database reads, CSV I/O,
    feature engineering), so it is run in the event loop's default executor
    rather than directly inside this coroutine; otherwise every other
    request would stall until the build finished.

    Returns:
        BuildDatasetResponse describing the dataset that was written.

    Raises:
        HTTPException: 400 when the build raises ``ValueError`` (e.g. no
            charts, no candles, no labeled samples); 500 for any other
            failure.
    """
    import asyncio

    config = state.pipeline_config or get_default_config()

    try:
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, build_dataset_from_db, config)
        return BuildDatasetResponse(**result)
    except ValueError as exc:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=str(exc),
        ) from exc
    except Exception as exc:
        # Top-level boundary: log with traceback, then surface as a 500.
        logger.exception("Failed to build dataset: %s", exc)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to build dataset: {exc}",
        ) from exc
error: 'Inference service unavailable' }, { status: 503 }); + } + console.error('training/build-dataset proxy error:', error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } +} diff --git a/src/components/TrainingPanel.tsx b/src/components/TrainingPanel.tsx index 0afeac8..11bca65 100644 --- a/src/components/TrainingPanel.tsx +++ b/src/components/TrainingPanel.tsx @@ -228,8 +228,7 @@ export default function TrainingPanel() { } }; - const datasetMissing = datasetInfo !== null && !datasetInfo.exists; - const canTrain = !isTraining && !datasetMissing && datasetInfo !== null; + const canTrain = !isTraining; return (
@@ -272,8 +271,8 @@ export default function TrainingPanel() { )} ) : ( -

- No training dataset found. Export annotations first. +

+ No cached dataset. It will be built automatically from your annotations when training starts.

)}