feat: auto-build training dataset from DB annotations before training
- Add build_dataset_from_db(), which exports candles from the DB, runs feature engineering, and ingests span annotations into a labeled CSV
- Call it automatically in _run_training_background before training starts
- Add POST /training/build-dataset endpoint for standalone use
- Add Next.js proxy route /api/training/build-dataset
- Update TrainingPanel: remove the dataset-missing block on Start Training and show an informational message that the dataset is built automatically
This commit is contained in:
parent
b4956f3fb9
commit
d3dcfcea7d
4 changed files with 148 additions and 5 deletions
1
RAZNO/annotations.csv
Normal file
1
RAZNO/annotations.csv
Normal file
|
|
@ -0,0 +1 @@
|
|||
timestamp,label_type,price
|
||||
|
|
|
@ -907,9 +907,82 @@ class DatasetInfoResponse(BaseModel):
|
|||
row_count: Optional[int] = None
|
||||
|
||||
|
||||
def build_dataset_from_db(config: PipelineConfig) -> dict:
    """
    Build the labeled training dataset directly from the database.

    Steps:
        1. Export the candles of the first chart returned by
           ``DataAccess.get_all_charts()`` to the raw CSV (``config.data.raw_path``)
        2. Run feature engineering via ``run_feature_engineering_stage(config)``
        3. Run annotation ingestion from the DB on the enriched CSV and write
           the result to ``config.data.labeled_path``

    Args:
        config: Pipeline configuration. ``config.data.raw_path``,
            ``config.data.enriched_path``, ``config.data.labeled_path`` and
            ``config.stages.annotation_ingestion`` are read here.

    Returns:
        dict with keys: chart_name, n_candles, n_samples, n_features, labeled_path

    Raises:
        ValueError: If the database has no charts, the selected chart has no
            candles, or annotation ingestion produces no labeled samples.
    """
    from app.data_access import DataAccess
    from app.annotation_ingestion import AnnotationIngestion
    from features.engineer import run_feature_engineering_stage

    data_access = DataAccess()

    # Find all charts, use the first one (single-chart app)
    charts_df = data_access.get_all_charts()
    if charts_df.empty:
        raise ValueError("No charts found in database. Upload candle data first.")

    chart = charts_df.iloc[0]
    chart_name = chart["name"]
    chart_id = int(chart["id"])
    logger.info(f"Building dataset for chart: {chart_name} (id={chart_id})")

    # Step 1: Export candles to raw CSV
    candles_df = data_access.get_candles(chart_id)
    if candles_df.empty:
        raise ValueError(f"No candles found for chart: {chart_name}")

    raw_path = Path(config.data.raw_path)
    raw_path.parent.mkdir(parents=True, exist_ok=True)

    # Only the timestamp and OHLC columns are exported; any other candle
    # columns are dropped. No conversion is applied to 'time' here.
    export_df = candles_df[["time", "open", "high", "low", "close"]].copy()
    export_df.to_csv(raw_path, index=False)
    logger.info(f"Exported {len(export_df)} candles to {raw_path}")

    # Step 2: Run feature engineering
    # NOTE(review): presumably this stage reads the raw CSV written above and
    # writes config.data.enriched_path -- confirm against features.engineer.
    run_feature_engineering_stage(config)
    enriched_path = Path(config.data.enriched_path)
    logger.info(f"Feature engineering complete: {enriched_path}")

    # Step 3: Run annotation ingestion from database
    enriched_df = pd.read_csv(enriched_path, parse_dates=["time"])
    ingestion = AnnotationIngestion(config.stages.annotation_ingestion)
    labeled_df = ingestion.process_from_db(enriched_df, chart_name, source="human")

    if labeled_df.empty:
        raise ValueError(
            "No labeled samples produced. "
            f"Ensure you have span annotations on chart '{chart_name}'."
        )

    # Write labeled dataset
    labeled_path = Path(config.data.labeled_path)
    labeled_path.parent.mkdir(parents=True, exist_ok=True)
    labeled_df.to_csv(labeled_path, index=False)

    result = {
        "chart_name": chart_name,
        "n_candles": len(export_df),
        "n_samples": len(labeled_df),
        # Every column except "label" is counted.
        # NOTE(review): this may include non-feature columns such as "time" -- confirm.
        "n_features": len([c for c in labeled_df.columns if c != "label"]),
        "labeled_path": str(labeled_path),
    }
    logger.info(f"Dataset built: {result}")
    return result
|
||||
|
||||
|
||||
def _run_training_background(run_id: str, model_type: str, config: PipelineConfig) -> None:
|
||||
"""
|
||||
Background thread target: train a model, update DB on completion or failure.
|
||||
Background thread target: build dataset then train a model.
|
||||
|
||||
Uses the pre-inserted TrainingRun record identified by ``run_id``.
|
||||
"""
|
||||
|
|
@ -920,6 +993,10 @@ def _run_training_background(run_id: str, model_type: str, config: PipelineConfi
|
|||
from training.train import create_model, temporal_split
|
||||
from sklearn.metrics import accuracy_score, f1_score
|
||||
|
||||
# Build dataset from database (feature engineering + annotation ingestion)
|
||||
logger.info("Building dataset from database...")
|
||||
build_dataset_from_db(config)
|
||||
|
||||
labeled_path = Path(config.data.labeled_path)
|
||||
if not labeled_path.exists():
|
||||
raise FileNotFoundError(f"Labeled dataset not found: {labeled_path}")
|
||||
|
|
@ -1256,6 +1333,38 @@ async def training_dataset_info():
|
|||
)
|
||||
|
||||
|
||||
class BuildDatasetResponse(BaseModel):
    """Response model for POST /training/build-dataset.

    The fields mirror the keys of the dict returned by
    ``build_dataset_from_db``; the endpoint constructs this model with
    ``BuildDatasetResponse(**result)``.
    """

    # Name of the chart the dataset was built from.
    chart_name: str
    # Number of candle rows exported to the raw CSV.
    n_candles: int
    # Number of rows in the labeled dataset.
    n_samples: int
    # Number of labeled-dataset columns other than "label".
    n_features: int
    # Path of the labeled CSV that was written, as a string.
    labeled_path: str
|
||||
|
||||
|
||||
@app.post("/training/build-dataset", response_model=BuildDatasetResponse)
async def training_build_dataset():
    """
    Build the labeled training dataset from database annotations.

    Exports candles, runs feature engineering, and ingests span annotations
    into a labeled CSV ready for training.

    Returns:
        BuildDatasetResponse populated from the dict returned by
        ``build_dataset_from_db``.

    Raises:
        HTTPException: 400 when the build raises ``ValueError`` (for example
            no charts, no candles, or no labeled samples); 500 for any other
            failure.
    """
    import asyncio

    config = state.pipeline_config or get_default_config()

    try:
        # build_dataset_from_db is synchronous (DB reads, CSV I/O, feature
        # engineering). Run it in the default executor so this coroutine does
        # not block the event loop while the dataset is being built.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, build_dataset_from_db, config)
        return BuildDatasetResponse(**result)
    except ValueError as exc:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST, detail=str(exc)
        ) from exc
    except Exception as exc:
        logger.error(f"Failed to build dataset: {exc}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to build dataset: {exc}",
        ) from exc
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Model Loading Endpoint
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
34
src/app/api/training/build-dataset/route.ts
Normal file
34
src/app/api/training/build-dataset/route.ts
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
import { NextResponse } from 'next/server';
|
||||
|
||||
// Base URL of the inference service this route proxies to.
const INFERENCE_API_URL = process.env.INFERENCE_API_URL || 'http://localhost:8001';

// Fallback timeout (ms) used when INFERENCE_API_TIMEOUT is unset or invalid.
const DEFAULT_INFERENCE_API_TIMEOUT_MS = 120000;
const parsedInferenceApiTimeout = Number.parseInt(process.env.INFERENCE_API_TIMEOUT ?? '', 10);
// parseInt yields NaN for malformed input, and setTimeout treats NaN as a
// ~1 ms delay, which would abort every proxied request immediately. Fall back
// to the default unless the value is a positive finite number.
const INFERENCE_API_TIMEOUT =
  Number.isFinite(parsedInferenceApiTimeout) && parsedInferenceApiTimeout > 0
    ? parsedInferenceApiTimeout
    : DEFAULT_INFERENCE_API_TIMEOUT_MS;
|
||||
|
||||
/**
 * Proxy POST /api/training/build-dataset to the inference service.
 *
 * Forwards the request to `${INFERENCE_API_URL}/training/build-dataset` and
 * relays the JSON result. Error mapping:
 *   - upstream non-2xx  -> same status, `{ error: detail }`
 *   - upstream 2xx with an unparseable body -> 502
 *   - request aborted by the timeout -> 504
 *   - connection refused / fetch failed -> 503
 *   - anything else -> 500
 */
export async function POST() {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), INFERENCE_API_TIMEOUT);

  try {
    const response = await fetch(`${INFERENCE_API_URL}/training/build-dataset`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      signal: controller.signal,
    });

    // The upstream body may not be JSON (e.g. an HTML error page from a
    // gateway). Keep the upstream status in that case instead of collapsing
    // into a generic 500, but let a timeout abort propagate to the handler below.
    let data: any = null;
    try {
      data = await response.json();
    } catch (parseError: any) {
      if (parseError?.name === 'AbortError') {
        throw parseError;
      }
    }

    if (!response.ok) {
      return NextResponse.json({ error: data?.detail || 'Failed to build dataset' }, { status: response.status });
    }
    if (data === null) {
      return NextResponse.json({ error: 'Invalid response from inference service' }, { status: 502 });
    }
    return NextResponse.json(data);
  } catch (error: any) {
    if (error.name === 'AbortError') {
      return NextResponse.json({ error: 'Dataset build timed out' }, { status: 504 });
    }
    if (error.cause?.code === 'ECONNREFUSED' || error.message?.includes('fetch failed')) {
      return NextResponse.json({ error: 'Inference service unavailable' }, { status: 503 });
    }
    console.error('training/build-dataset proxy error:', error);
    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
  } finally {
    // Cleared only once the body has been read (or the request failed), so the
    // timeout also bounds a stalled response body, not just the headers.
    clearTimeout(timeoutId);
  }
}
|
||||
|
|
@ -228,8 +228,7 @@ export default function TrainingPanel() {
|
|||
}
|
||||
};
|
||||
|
||||
const datasetMissing = datasetInfo !== null && !datasetInfo.exists;
|
||||
const canTrain = !isTraining && !datasetMissing && datasetInfo !== null;
|
||||
const canTrain = !isTraining;
|
||||
|
||||
return (
|
||||
<div>
|
||||
|
|
@ -272,8 +271,8 @@ export default function TrainingPanel() {
|
|||
)}
|
||||
</>
|
||||
) : (
|
||||
<p className="text-orange-500">
|
||||
No training dataset found. Export annotations first.
|
||||
<p className="text-muted-foreground">
|
||||
No cached dataset. It will be built automatically from your annotations when training starts.
|
||||
</p>
|
||||
)}
|
||||
</div>
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue