feat: add ML service scaffolding with Python FastAPI, Docker, and MLflow setup

2026-02-15 11:58:31 +01:00 · 2026-02-15 11:58:31 +01:00 · 1a653c5866
commit 1a653c5866
parent 92abab5316
18 changed files with 1952 additions and 2593 deletions
--- a/EURUSD.csv
+++ b/EURUSD.csv
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -5,9 +5,74 @@ services:
      - "3000:3000"
    volumes:
      - candle-data:/app/data
+      - ml-data:/app/ml-data
    environment:
      - NODE_ENV=production
+      - INFERENCE_API_URL=http://ml-service:8001
+      - INFERENCE_API_TIMEOUT=30000
+      - INFERENCE_BATCH_TIMEOUT=120000
+      - NEXT_PUBLIC_PREDICTIONS_ENABLED=true
    restart: unless-stopped
+    depends_on:
+      - ml-service
+  
+  # Python ML service built from ./services/ml; published on host port 8001.
+  ml-service:
+    build:
+      context: ./services/ml
+    restart: unless-stopped
+    ports:
+      - "8001:8001"
+    environment:
+      # Tracking goes to the mlflow service defined in this file.
+      MLFLOW_TRACKING_URI: "http://mlflow:5000"
+      # User, password and database must match the POSTGRES_* values
+      # configured on the postgres service.
+      DATABASE_URL: "postgresql://ml_user:ml_password@postgres:5432/ml_db"
+      PYTHONUNBUFFERED: "1"
+    volumes:
+      - ml-data:/app/data
+      - mlflow-data:/app/mlruns
+    depends_on:
+      # Wait for postgres to pass its healthcheck; mlflow only has to be started.
+      postgres:
+        condition: service_healthy
+      mlflow:
+        condition: service_started
+    healthcheck:
+      # NOTE(review): requires curl inside the image built from ./services/ml
+      # and a /health route on port 8001 — confirm both exist.
+      test:
+        - CMD
+        - curl
+        - "-f"
+        - "http://localhost:8001/health"
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
+  
+  # MLflow tracking server; metadata and artifacts both live on the
+  # mlflow-data volume mounted at /mlflow.
+  mlflow:
+    image: ghcr.io/mlflow/mlflow:v2.10.0
+    ports:
+      - "5000:5000"
+    volumes:
+      - mlflow-data:/mlflow
+    # Artifacts are proxied through the server (--serve-artifacts) and written
+    # to --artifacts-destination. A plain local --default-artifact-root such as
+    # /mlflow/artifacts is handed to clients verbatim, so a client in another
+    # container (ml-service mounts this volume at /app/mlruns, not /mlflow)
+    # would write artifacts to its own filesystem instead of this volume.
+    # Only experiments created after this setting is in place are affected.
+    command: >
+      mlflow server
+      --backend-store-uri /mlflow
+      --serve-artifacts
+      --artifacts-destination /mlflow/artifacts
+      --host 0.0.0.0
+      --port 5000
+    restart: unless-stopped
+  
+  # PostgreSQL 16 (Alpine); data persisted on the postgres-data volume.
+  postgres:
+    image: postgres:16-alpine
+    restart: unless-stopped
+    environment:
+      # These values must stay in sync with DATABASE_URL on ml-service.
+      POSTGRES_USER: ml_user
+      POSTGRES_PASSWORD: ml_password
+      POSTGRES_DB: ml_db
+    ports:
+      - "5432:5432"
+    volumes:
+      - postgres-data:/var/lib/postgresql/data
+    healthcheck:
+      # Services that depend on postgres with `condition: service_healthy`
+      # wait for this probe to pass.
+      test:
+        - CMD-SHELL
+        - pg_isready -U ml_user -d ml_db
+      interval: 10s
+      timeout: 5s
+      retries: 5

 volumes:
  candle-data:
+  # Mounted at /app/ml-data in the Next.js app service and at /app/data in ml-service.
+  ml-data:
+  # Mounted at /app/mlruns in ml-service and at /mlflow in the mlflow service.
+  mlflow-data:
+  # PostgreSQL data directory (/var/lib/postgresql/data) for the postgres service.
+  postgres-data:
--- a/inference-ui-prompt.md
+++ b/inference-ui-prompt.md
@ -0,0 +1,427 @@
+# Inference Integration: ML Predictions → Next.js Annotation Tool
+
+## Context
+
+We have an existing Next.js frontend app that uses TradingView Lightweight Charts for candlestick rendering and span annotation of forex patterns. We also have a Python ML pipeline that trains pattern recognition models and serves predictions via a REST API (FastAPI on port 8001).
+
+This prompt describes how to connect the inference API to the frontend so users can see model predictions overlaid on their charts alongside their own human annotations.
+
+---
+
+## Architecture
+
+```
+┌─────────────────────────────────────────────────────┐
+│  Next.js Frontend                                   │
+│                                                     │
+│  ┌───────────────────────────────────────────────┐  │
+│  │  Lightweight Charts                           │  │
+│  │                                               │  │
+│  │  [Candles] ← raw OHLCV                        │  │
+│  │  [Human Annotations] ← solid colored overlays │  │
+│  │  [Model Predictions] ← dashed/hatched overlays│  │
+│  │  [Disagreements] ← highlighted borders        │  │
+│  └───────────────────────────────────────────────┘  │
+│                                                     │
+│  ┌──────────────┐  ┌──────────────┐                 │
+│  │ Annotation   │  │ Prediction   │                 │
+│  │ Panel        │  │ Panel        │                 │
+│  └──────────────┘  └──────────────┘                 │
+└──────────────────┬──────────────────────────────────┘
+                   │
+          Next.js API routes
+          /api/predict
+          /api/predict/batch
+                   │
+       ┌───────────▼───────────┐
+       │  Python Inference API │
+       │  FastAPI :8001        │
+       │                       │
+       │  /predict             │
+       │  /predict/batch       │
+       │  /model/info          │
+       │  /model/labels        │
+       └───────────────────────┘
+```
+
+---
+
+## 1. Next.js API Routes (Proxy to Python Backend)
+
+Create Next.js API routes that proxy requests to the Python inference server. This avoids CORS issues and lets you add auth/rate-limiting on the Next.js side.
+
+### `/api/predict` — Predict patterns for visible candles
+
+```typescript
+// app/api/predict/route.ts
+
+// Request body:
+interface PredictRequest {
+  pair: string;              // e.g. "EURUSD"
+  timeframe: string;         // e.g. "1H"
+  candles: {
+    time: number;            // unix timestamp
+    open: number;
+    high: number;
+    low: number;
+    close: number;
+    volume: number;
+  }[];
+}
+
+// Response body from Python API:
+interface PredictResponse {
+  predictions: {
+    time: number;
+    label: string;           // "bull_flag", "head_and_shoulders", "O" (no pattern)
+    confidence: number;      // 0.0 - 1.0
+  }[];
+  spans: {                   // predictions grouped into continuous spans
+    start_time: number;
+    end_time: number;
+    label: string;
+    avg_confidence: number;
+  }[];
+  model_info: {
+    model_name: string;
+    model_version: string;
+    trained_at: string;
+    dataset_version: string;
+  };
+}
+```
+
+### `/api/predict/batch` — Predict on full historical dataset
+
+Same interface but accepts a date range instead of raw candles. The Python backend loads the data from its own OHLCV store and returns predictions. Useful for backfill — "show me all patterns the model finds in the last 6 months."
+
+```typescript
+interface BatchPredictRequest {
+  pair: string;
+  timeframe: string;
+  start_date: string;        // ISO 8601
+  end_date: string;
+}
+```
+
+### `/api/model/info` — Get current model metadata
+
+```typescript
+interface ModelInfoResponse {
+  model_name: string;
+  model_version: string;
+  model_type: string;        // "xgboost", "cnn_1d", etc.
+  trained_at: string;
+  dataset_version: string;
+  feature_engineering: boolean;
+  labels: string[];          // all pattern labels the model knows
+  per_class_metrics: {
+    [label: string]: {
+      precision: number;
+      recall: number;
+      f1: number;
+      training_samples: number;
+    };
+  };
+}
+```
+
+---
+
+## 2. Prediction State Management
+
+Add a prediction layer to the existing app state. Keep it separate from annotation state — they are independent data sources that render on the same chart.
+
+```typescript
+// types/predictions.ts
+
+interface PredictionSpan {
+  id: string;                  // generated from start_time + label
+  startTime: number;
+  endTime: number;
+  label: string;
+  avgConfidence: number;
+  source: "model";             // always "model", vs "human" for annotations
+}
+
+interface PredictionState {
+  spans: PredictionSpan[];
+  isLoading: boolean;
+  error: string | null;
+  modelInfo: ModelInfoResponse | null;
+  visible: boolean;            // toggle prediction layer on/off
+  confidenceThreshold: number; // filter: only show predictions above this
+  selectedLabels: string[];    // filter: which pattern types to show
+  autoPredict: boolean;        // auto-run predictions when chart scrolls
+}
+```
+
+### When to fetch predictions
+
+Predictions should be fetched:
+- **On demand:** User clicks a "Run Model" button
+- **On chart scroll/zoom (if `autoPredict` is on):** When the visible candle range changes, debounce 500ms, then send the visible candles to `/api/predict`. Only send candles that haven't been predicted yet (cache previous results by time range).
+- **On batch backfill:** User selects a date range and clicks "Predict All"
+
+### Caching
+
+Cache predictions in a Map keyed by `${pair}_${timeframe}_${startTime}_${endTime}_${modelVersion}`. Invalidate cache when the model version changes. This prevents re-predicting the same candles on every scroll.
+
+---
+
+## 3. Rendering Predictions on Lightweight Charts
+
+Predictions render as a separate visual layer on the same chart instance as human annotations. They must be visually distinct from human annotations.
+
+### Visual Distinction
+
+| Property | Human Annotations | Model Predictions |
+|---|---|---|
+| Background fill | Solid color, 20% opacity | Diagonal hatched pattern or 10% opacity |
+| Border | Solid 2px | Dashed 2px |
+| Label tag | Solid background | Outlined/hollow background |
+| Label text | "bull_flag" | "bull_flag (87%)" with confidence |
+| Position | Above candles | Below candles (avoid overlap) |
+
+### Implementation with Lightweight Charts
+
+Lightweight Charts doesn't have a native "span highlight" feature, so use one of these approaches:
+
+**Option A: Custom Series Markers + Box Plugin**
+
+Use the `createBox` or a custom plugin to draw rectangles behind candle ranges. Lightweight Charts v4+ supports plugins that can draw on the canvas.
+
+```typescript
+// Pseudo-code for rendering a prediction span
+
+function renderPredictionSpan(chart, span: PredictionSpan, labelConfig: LabelConfig) {
+  // Use a box/rectangle primitive
+  // Position: from span.startTime to span.endTime on X axis
+  // Full price range of candles in that span on Y axis
+  // Style: dashed border, hatched or low-opacity fill
+  // Color: from labelConfig based on span.label
+
+  // Add a marker at the first candle of the span with label text
+  series.setMarkers([
+    ...existingMarkers,
+    {
+      time: span.startTime,
+      position: 'belowBar',       // below for predictions, above for human
+      color: labelConfig.color,
+      shape: 'square',
+      text: `${span.label} (${Math.round(span.avgConfidence * 100)}%)`,
+    }
+  ]);
+}
+```
+
+**Option B: Background color per candle using a secondary series**
+
+Create a histogram series behind the candles that uses color to indicate predictions. Each bar's color maps to a pattern label. Simpler but less visually rich.
+
+```typescript
+const predictionSeries = chart.addHistogramSeries({
+  priceScaleId: '',          // '' = overlay series on its own price scale, NOT the main candle scale — NOTE(review): confirm against the Lightweight Charts price-scale docs
+  color: 'transparent',
+  lastValueVisible: false,
+  priceLineVisible: false,
+});
+
+// Set data with per-bar colors
+predictionSeries.setData(
+  predictedCandles.map(c => ({
+    time: c.time,
+    value: c.high,             // height of the bar
+    color: c.label !== 'O'
+      ? `${labelColorMap[c.label]}33`  // 20% opacity hex
+      : 'transparent',
+  }))
+);
+```
+
+**Option C: Custom drawing on the chart canvas (most control)**
+
+Use the Lightweight Charts plugin API to draw directly on the canvas. This gives full control over hatching, dashed borders, etc.
+
+```typescript
+import { createChart, IChartApi } from 'lightweight-charts';
+
+// After chart is created, access the canvas and draw overlays
+// Use requestAnimationFrame to sync with chart rendering
+// Listen to chart.timeScale().subscribeVisibleLogicalRangeChange()
+// to redraw when the user scrolls
+```
+
+**Recommended approach:** Start with Option B (histogram series) for a quick implementation, then upgrade to Option C (canvas plugin) for the polished version. Option A works if you find or build a good box plugin.
+
+---
+
+## 4. Prediction Controls Panel
+
+Add a panel (sidebar or floating) next to the annotation panel with these controls:
+
+### Controls
+
+```
+┌─────────────────────────────────────────┐
+│ Model Predictions                  [ON] │ ← master toggle
+├─────────────────────────────────────────┤
+│ Model: candlestick_v1 (v3)             │
+│ Type: XGBoost                           │
+│ Trained: 2024-03-20                     │
+│ Dataset: v12 (847 annotations)          │
+├─────────────────────────────────────────┤
+│ [Run on Visible] [Predict All]          │ ← action buttons
+│ Auto-predict on scroll: [OFF]           │
+├─────────────────────────────────────────┤
+│ Confidence threshold: ━━━━━━●━━ 0.70    │ ← slider, filters
+│                                         │
+│ Show patterns:                          │
+│  ☑ bull_flag      (P:0.89 R:0.76)     │ ← per-class metrics
+│  ☑ bear_flag      (P:0.82 R:0.71)     │
+│  ☑ head_shoulders (P:0.74 R:0.65)     │
+│  ☐ double_bottom  (P:0.68 R:0.52)     │ ← low recall, user may hide
+│  ☑ wedge_up       (P:0.85 R:0.79)     │
+├─────────────────────────────────────────┤
+│ Predictions visible: 12                 │
+│ Agreements with human: 8/12             │
+│ Disagreements: 4                        │
+│  → [Show only disagreements]            │
+└─────────────────────────────────────────┘
+```
+
+### Per-class metrics display
+
+Fetch from `/api/model/info` on load. Show precision (P) and recall (R) next to each label checkbox so the user knows which patterns the model is reliable for. Low-metric patterns can be hidden by default.
+
+---
+
+## 5. Disagreement Detection
+
+The most valuable feature: comparing human annotations with model predictions.
+
+### Logic
+
+For each time range, compare human annotation spans with prediction spans:
+
+```typescript
+interface Disagreement {
+  time_range: { start: number; end: number };
+  human_label: string | null;      // null if model predicted but human didn't annotate
+  model_label: string | null;      // null if human annotated but model missed
+  model_confidence: number;
+  type: "missed_by_model"          // human annotated, model said "O"
+      | "missed_by_human"          // model predicted pattern, human didn't annotate
+      | "label_mismatch";          // both see a pattern but disagree on which
+}
+```
+
+### How to compute disagreements
+
+1. Get all human annotation spans for the visible range.
+2. Get all model prediction spans for the visible range.
+3. For each human span, check if any prediction span overlaps (>50% time overlap):
+   - No overlap → `missed_by_model`
+   - Overlap but different label → `label_mismatch`
+   - Overlap and same label → agreement
+4. For each prediction span not matched to a human span → `missed_by_human`
+
+### Rendering disagreements
+
+- `missed_by_model`: Red dashed border around the human annotation (model couldn't see this)
+- `missed_by_human`: Yellow pulsing/blinking highlight (model found something you didn't label — review it!)
+- `label_mismatch`: Orange border with both labels shown
+
+`missed_by_human` is especially valuable — the model may be finding patterns you haven't annotated yet. Add a quick action: clicking a `missed_by_human` prediction opens the annotation dialog pre-filled with the model's suggested label so you can confirm or correct it with one click.
+
+---
+
+## 6. Feedback Loop: Predictions → New Annotations
+
+This is what makes the system improve over time.
+
+### Flow
+
+```
+1. Model predicts a pattern the user didn't annotate (missed_by_human)
+2. User sees it highlighted on the chart
+3. User clicks it → annotation dialog opens pre-filled:
+   - Start/end time from prediction span
+   - Label from prediction
+   - Confidence shown
+4. User either:
+   a. Confirms → saves as a new human annotation
+   b. Corrects label → saves with corrected label
+   c. Dismisses → optionally mark as "not a pattern" (negative example)
+5. New annotation is exported → fed into next training cycle
+```
+
+### "Not a pattern" negative examples
+
+When the model predicts something and the user explicitly says "this is not a pattern," save this as a negative annotation:
+
+```json
+{
+  "start_time": "...",
+  "end_time": "...",
+  "label": "O",
+  "source": "human_correction",
+  "model_predicted": "bull_flag",
+  "model_confidence": 0.72
+}
+```
+
+These negative examples are extremely valuable for training — they teach the model to stop making specific false positive mistakes.
+
+---
+
+## 7. API Error Handling & Loading States
+
+### When inference API is unavailable
+
+- Show a subtle banner: "Model server offline — predictions unavailable"
+- All prediction UI controls become disabled/greyed out
+- Human annotation continues to work normally (never block annotation on inference)
+- Poll `/api/model/info` every 30 seconds, auto-reconnect when available
+
+### Loading states
+
+- While predictions are loading, show a skeleton/shimmer overlay on the chart area
+- For batch predictions on large date ranges, show a progress indicator
+- Allow the user to cancel a long-running batch prediction
+
+### Stale predictions
+
+- When the user scrolls to a range that has cached predictions from an older model version, show a subtle indicator: "Predictions from model v2 — click to refresh with v3"
+- When a new model is deployed, invalidate all cached predictions and show "New model available — rerun predictions?"
+
+---
+
+## 8. Environment Configuration
+
+```env
+# .env.local
+
+# Inference API
+INFERENCE_API_URL=http://localhost:8001
+INFERENCE_API_TIMEOUT=10000       # ms, for single prediction requests
+INFERENCE_BATCH_TIMEOUT=120000    # ms, for batch predictions
+
+# Feature flags
+NEXT_PUBLIC_PREDICTIONS_ENABLED=true
+NEXT_PUBLIC_AUTO_PREDICT_DEFAULT=false
+NEXT_PUBLIC_DEFAULT_CONFIDENCE_THRESHOLD=0.70
+```
+
+---
+
+## Summary of Integration Points
+
+1. **Next.js API routes** proxy to Python FastAPI inference server
+2. **Prediction state** lives alongside annotation state, fetched on demand or auto-scroll
+3. **Rendering** uses Lightweight Charts histogram series (quick) or canvas plugin (polished) with visual distinction from human annotations
+4. **Controls panel** lets user toggle predictions, filter by confidence/label, view model metrics
+5. **Disagreement detection** compares human vs model and highlights mismatches
+6. **Feedback loop** lets user confirm/correct/dismiss model predictions as new annotations
+7. **Negative examples** from dismissed predictions feed back into training
+8. **Error handling** ensures annotation tool works independently of inference availability
--- a/ml-pipeline-prompt.md
+++ b/ml-pipeline-prompt.md
@ -0,0 +1,416 @@
+# ML Pipeline for Candlestick Pattern Recognition
+
+Build a modular ML pipeline for forex candlestick pattern recognition. The pipeline has distinct stages that can be enabled/disabled independently via config. Each stage reads from and writes to well-defined file formats so stages can run standalone.
+
+## Architecture Overview
+
+```
+[Stage 1: Feature Engineering]  ← optional, uses TA-Lib
+        ↓ outputs enriched OHLCV CSV
+[Stage 2: Annotation Ingestion]  ← optional, merges human labels
+        ↓ outputs labeled dataset CSV
+[Stage 3: Training]              ← always runs
+        ↓ logs to MLflow, saves model
+[Stage 4: Inference]             ← loads model, returns predictions
+        ↓ serves predictions via API or direct call
+```
+
+## Pipeline Config
+
+A single YAML config controls the entire pipeline:
+
+```yaml
+pipeline:
+  name: "candlestick_pattern_v1"
+  pair: "EURUSD"
+  timeframe: "1H"
+
+data:
+  raw_ohlcv_path: "data/raw/EURUSD_1H.csv"    # input: raw OHLCV data
+  enriched_path: "data/enriched/EURUSD_1H.csv" # output of stage 1
+  labeled_path: "data/labeled/EURUSD_1H.csv"   # output of stage 2
+  annotations_path: "data/annotations/annotations.json"  # from annotation tool
+
+stages:
+  feature_engineering:
+    enabled: true          # set false to skip, use raw OHLCV only
+    talib_indicators:
+      - name: "RSI"
+        params: { timeperiod: 14 }
+      - name: "ATR"
+        params: { timeperiod: 14 }
+      - name: "EMA"
+        params: { timeperiod: 20 }
+      - name: "MACD"
+        params: { fastperiod: 12, slowperiod: 26, signalperiod: 9 }
+      - name: "BBANDS"
+        params: { timeperiod: 20 }
+    candle_features: true  # compute body_size, wick_ratio, body_direction, gap, etc.
+    custom_features:       # user-defined feature functions
+      - "features.custom.trend_slope"
+      - "features.custom.volume_zscore"
+
+  annotation_ingestion:
+    enabled: true           # set false to skip, use programmatic labels only
+    source: "span"          # "span" (from annotation tool) or "point" (single candle marks)
+    context_padding: 20     # include N candles before/after each span
+    label_encoding: "bio"   # "bio" for sequence models, "window" for classification
+    window_size: 30         # fixed window size for classification (pad/truncate spans)
+    min_confidence: 3       # discard annotations below this confidence score
+    programmatic_labels:
+      enabled: true         # also generate labels from TA-Lib pattern functions
+      talib_patterns:       # exact TA-Lib function names: "CDL" prefix, no underscore
+        - "CDLENGULFING"
+        - "CDLMORNINGSTAR"
+        - "CDLEVENINGSTAR"
+        - "CDLHAMMER"
+        - "CDLSHOOTINGSTAR"
+        - "CDLDOJI"
+      merge_strategy: "human_priority"  # when human and programmatic disagree:
+                                         # "human_priority" = keep human label
+                                         # "programmatic_priority" = keep TA-Lib label
+                                         # "both" = keep both as separate label columns
+
+  training:
+    enabled: true
+    model_type: "xgboost"   # "xgboost", "lightgbm", "cnn_1d", "lstm", "transformer"
+    task: "classification"  # "classification" or "sequence_labeling"
+    target_column: "label"  # or "bio_tag" for sequence models
+    test_split: 0.2
+    validation_split: 0.1
+    split_method: "temporal" # "temporal" (time-based) or "random"
+                              # IMPORTANT: always use temporal for financial data
+    class_weights: "balanced" # handle imbalanced pattern classes
+    hyperparameters:          # model-specific, these are xgboost defaults
+      n_estimators: 500
+      max_depth: 6
+      learning_rate: 0.01
+      subsample: 0.8
+    mlflow:
+      tracking_uri: "http://localhost:5000"
+      experiment_name: "candlestick_patterns"
+      log_artifacts: true     # log feature importance plots, confusion matrices
+      register_model: true    # auto-register if metric beats current best
+
+  inference:
+    model_source: "mlflow"    # "mlflow" (load from registry) or "local" (load from file)
+    mlflow_model_name: "candlestick_pattern_v1"
+    mlflow_model_stage: "Production"
+    local_model_path: "models/best_model.pkl"
+    serve_mode: "api"         # "api" (REST endpoint) or "library" (direct Python call)
+    api_port: 8001
+    batch_size: 64            # for batch inference on historical data
+```
+
+---
+
+## Stage 1: Feature Engineering (Optional)
+
+**Input:** Raw OHLCV CSV (`data.raw_ohlcv_path`)
+**Output:** Enriched CSV (`data.enriched_path`) with additional feature columns
+**Skip condition:** If `stages.feature_engineering.enabled` is false, Stage 2/3 reads raw OHLCV directly.
+
+### What this stage does
+
+1. Load raw OHLCV data (columns: `time, open, high, low, close, volume`)
+2. If `talib_indicators` is configured, compute each indicator using TA-Lib and append as new columns. Column naming: lowercase indicator name, e.g. `rsi_14`, `atr_14`, `ema_20`, `macd`, `macd_signal`, `macd_hist`, `bbands_upper`, `bbands_middle`, `bbands_lower`
+3. If `candle_features` is true, compute these derived features for each candle:
+   - `body_size` = abs(close - open)
+   - `body_direction` = 1 if close >= open, else -1
+   - `upper_wick` = high - max(open, close)
+   - `lower_wick` = min(open, close) - low
+   - `wick_ratio` = upper_wick / lower_wick (handle div by zero)
+   - `body_to_range` = body_size / (high - low) (handle div by zero)
+   - `gap` = open - previous close
+   - `range` = high - low
+4. If `custom_features` is configured, import and call each function. Each custom feature function receives the full DataFrame and returns a Series.
+5. Handle NaN values from indicator warmup periods (drop or forward-fill based on config).
+6. Save enriched CSV.
+
+### TA-Lib installation note
+
+TA-Lib requires a C library installation before the Python wrapper works:
+- macOS: `brew install ta-lib`
+- Ubuntu: `apt-get install libta-lib-dev`
+- Windows: download from ta-lib.org
+- Python wrapper: `pip install TA-Lib`
+
+If TA-Lib is not installed and stage is enabled, fail with a clear error message and instructions, do not silently skip.
+
+---
+
+## Stage 2: Annotation Ingestion (Optional)
+
+**Input:** Enriched CSV (or raw OHLCV if Stage 1 skipped) + annotations JSON from the annotation tool
+**Output:** Labeled dataset CSV (`data.labeled_path`)
+**Skip condition:** If `stages.annotation_ingestion.enabled` is false, a target column must already exist in the input data (e.g., from a previous run or manual CSV column).
+
+### What this stage does
+
+1. Load the enriched/raw OHLCV CSV.
+2. Load annotations JSON (exported from the span annotation tool). Expected format:
+
+```json
+{
+  "annotations": [
+    {
+      "id": "uuid",
+      "start_time": "2024-03-15T09:00:00Z",
+      "end_time": "2024-03-15T16:00:00Z",
+      "label": "bull_flag",
+      "confidence": 4,
+      "outcome": "win",
+      "sub_spans": [...]
+    }
+  ]
+}
+```
+
+3. **If `label_encoding` is "bio":** For each candle in the dataset, assign a BIO tag based on annotations:
+   - First candle of an annotation span → `B-{label}`
+   - Subsequent candles in the span → `I-{label}`
+   - Candles outside any span → `O`
+   - For overlapping annotations, create multiple tag columns (`bio_tag_1`, `bio_tag_2`)
+
+4. **If `label_encoding` is "window":** For each annotation, extract a fixed-size window of `window_size` candles centered on the annotation span. If the span is shorter than `window_size`, pad it with surrounding context candles. If it is longer, truncate it to `window_size` candles (the `window_size` setting pads/truncates spans) so that every window has the same length. Each window becomes one row in the output with flattened OHLCV + feature columns.
+
+5. Filter annotations below `min_confidence`.
+
+6. **If `programmatic_labels.enabled`:** Also run TA-Lib's CDL* functions on the OHLC data. These return +100/-100/0 for bullish/bearish/no-pattern. Convert to label names. Merge with human annotations using the configured `merge_strategy`.
+
+7. Add `context_padding` candles before and after each annotation span in the output (for models that need trend context).
+
+8. Log dataset statistics:
+   - Total annotations by label
+   - Class distribution
+   - Average span length per label
+   - Agreement rate between human and programmatic labels (if both enabled)
+
+9. Save labeled dataset CSV.
+
+---
+
+## Stage 3: Training
+
+**Input:** Labeled dataset CSV (`data.labeled_path`)
+**Output:** Trained model (logged to MLflow and/or saved locally)
+**This stage always runs when the pipeline is executed.**
+
+### What this stage does
+
+1. Load labeled dataset.
+2. Split into train/validation/test using `split_method`:
+   - `temporal`: sort by time, first N% train, next M% validation, last K% test. **Never shuffle financial time series.**
+   - `random`: standard sklearn split (not recommended for financial data, but available).
+3. Separate features (X) from target (y) using `target_column`.
+4. Apply `class_weights` to handle imbalanced labels (common: you'll have way more "O" / no-pattern than any specific pattern).
+
+5. **Start an MLflow run:**
+
+```python
+import mlflow
+
+mlflow.set_tracking_uri(config.training.mlflow.tracking_uri)
+mlflow.set_experiment(config.training.mlflow.experiment_name)
+
+with mlflow.start_run():
+    # Log the full pipeline config
+    mlflow.log_dict(config, "pipeline_config.yaml")
+
+    # Log dataset info
+    mlflow.log_param("dataset_version", dvc_version_hash)  # if using DVC
+    mlflow.log_param("total_samples", len(X_train) + len(X_test))
+    mlflow.log_param("num_classes", len(unique_labels))
+    mlflow.log_param("model_type", config.training.model_type)
+    mlflow.log_param("window_size", config.annotation_ingestion.window_size)
+    mlflow.log_param("feature_engineering_enabled", config.stages.feature_engineering.enabled)
+    mlflow.log_param("annotations_enabled", config.stages.annotation_ingestion.enabled)
+
+    # Log per-class sample counts
+    for label, count in label_counts.items():
+        mlflow.log_param(f"samples_{label}", count)
+
+    # Log all hyperparameters
+    mlflow.log_params(config.training.hyperparameters)
+
+    # Train model
+    model = train(config.training.model_type, X_train, y_train, config.training.hyperparameters)
+
+    # Evaluate
+    y_pred = model.predict(X_test)
+
+    # Log overall metrics
+    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))
+    mlflow.log_metric("f1_macro", f1_score(y_test, y_pred, average='macro'))
+    mlflow.log_metric("f1_weighted", f1_score(y_test, y_pred, average='weighted'))
+
+    # Log PER-CLASS metrics (critical for imbalanced pattern data)
+    report = classification_report(y_test, y_pred, output_dict=True)
+    for label, metrics in report.items():
+        if isinstance(metrics, dict):
+            mlflow.log_metric(f"precision_{label}", metrics['precision'])
+            mlflow.log_metric(f"recall_{label}", metrics['recall'])
+            mlflow.log_metric(f"f1_{label}", metrics['f1-score'])
+
+    # Log artifacts
+    if config.training.mlflow.log_artifacts:
+        # Confusion matrix plot
+        fig = plot_confusion_matrix(y_test, y_pred, labels=unique_labels)
+        mlflow.log_figure(fig, "confusion_matrix.png")
+
+        # Feature importance (for tree models)
+        if hasattr(model, 'feature_importances_'):
+            fig = plot_feature_importance(model, feature_names)
+            mlflow.log_figure(fig, "feature_importance.png")
+
+        # Classification report as text
+        mlflow.log_text(classification_report(y_test, y_pred), "classification_report.txt")
+
+    # Log model
+    mlflow.sklearn.log_model(model, "model")  # or mlflow.xgboost, mlflow.pytorch etc.
+
+    # Register model if configured
+    if config.training.mlflow.register_model:
+        mlflow.register_model(f"runs:/{run.info.run_id}/model", config.inference.mlflow_model_name)
+```
+
+6. **Model-specific training logic:**
+
+   - **xgboost / lightgbm:** Direct fit on tabular features. Works with windowed classification format.
+   - **cnn_1d:** Reshape windowed data into (samples, timesteps, features) tensor. Use Conv1D → MaxPool → Dense layers.
+   - **lstm:** Same reshape as CNN. Use LSTM → Dense layers. Consider bidirectional LSTM for pattern detection.
+   - **transformer:** Use a lightweight transformer encoder. Positional encoding is important for candle sequence order.
+
+   For sequence labeling task (`task: "sequence_labeling"`):
+   - **BiLSTM-CRF:** Use BIO-tagged data. Input is full candle sequence, output is tag per candle.
+
+---
+
+## Stage 4: Inference
+
+**Input:** New OHLCV data (with features if Stage 1 is enabled) + trained model
+**Output:** Pattern predictions for each candle or window
+
+### What this stage does
+
+1. **Load model:**
+   - If `model_source` is "mlflow": load from MLflow model registry using model name + stage
+   - If `model_source` is "local": load from file path
+
+```python
+if config.inference.model_source == "mlflow":
+    import mlflow.pyfunc
+    model = mlflow.pyfunc.load_model(
+        model_uri=f"models:/{config.inference.mlflow_model_name}/{config.inference.mlflow_model_stage}"
+    )
+else:
+    import joblib
+    model = joblib.load(config.inference.local_model_path)
+```
+
+2. **Preprocessing must match training exactly.** If Stage 1 was enabled during training, it must also run during inference with the same config. Same indicators, same candle features, same custom features, same parameter values. Log the pipeline config as an MLflow artifact during training so inference can replicate it.
+
+3. **If `serve_mode` is "api":** Start a REST endpoint (Flask/FastAPI) on `api_port`:
+
+```
+POST /predict
+Content-Type: application/json
+
+{
+  "candles": [
+    {"time": "2024-03-15T09:00:00Z", "open": 1.0921, "high": 1.0935, "low": 1.0918, "close": 1.0933, "volume": 1200},
+    ...
+  ]
+}
+
+Response:
+{
+  "predictions": [
+    {"time": "2024-03-15T09:00:00Z", "label": "bull_flag", "confidence": 0.87},
+    {"time": "2024-03-15T10:00:00Z", "label": "bull_flag", "confidence": 0.82},
+    {"time": "2024-03-15T11:00:00Z", "label": "O", "confidence": 0.95},
+    ...
+  ],
+  "model_version": "v3",
+  "pipeline_config_hash": "abc123"
+}
+```
+
+   The API must:
+   - Accept raw OHLCV candles
+   - Run Stage 1 (feature engineering) internally if it was used during training
+   - Run the model
+   - Return predictions with confidence scores
+   - Return model version for traceability
+
+4. **If `serve_mode` is "library":** Expose a Python function that the annotation tool can call directly:
+
+```python
+from pipeline import predict
+
+results = predict(candles_df)  # returns DataFrame with label + confidence columns
+```
+
+5. **Batch inference:** For historical data, process in chunks of `batch_size` to avoid memory issues. Output a full CSV with predictions alongside the original OHLCV data.
+
+6. **Integration with annotation tool:** The annotation tool can call the inference API to show predicted patterns as a separate visual layer (e.g., dashed outline vs. solid for human annotations). This enables the user to compare their annotations against the model's predictions and find disagreements.
+
+---
+
+## Directory Structure
+
+```
+project/
+├── config/
+│   └── pipeline.yaml
+├── data/
+│   ├── raw/              # raw OHLCV CSVs
+│   ├── enriched/         # after feature engineering
+│   ├── labeled/          # after annotation ingestion
+│   └── annotations/      # JSON exports from annotation tool
+├── features/
+│   ├── talib_features.py
+│   ├── candle_features.py
+│   └── custom/
+│       ├── trend_slope.py
+│       └── volume_zscore.py
+├── training/
+│   ├── train.py          # main training entry point
+│   ├── models/           # model architecture definitions
+│   │   ├── xgb.py
+│   │   ├── cnn.py
+│   │   ├── lstm.py
+│   │   └── transformer.py
+│   └── evaluation.py     # metrics, plots, reports
+├── inference/
+│   ├── serve.py          # REST API server
+│   ├── predict.py        # library-mode prediction
+│   └── preprocess.py     # must mirror training preprocessing exactly
+├── pipeline.py           # orchestrates all stages
+├── requirements.txt
+└── README.md
+```
+
+## Running the Pipeline
+
+```bash
+# Run full pipeline
+python pipeline.py --config config/pipeline.yaml
+
+# Run individual stages
+python pipeline.py --config config/pipeline.yaml --stage feature_engineering
+python pipeline.py --config config/pipeline.yaml --stage annotation_ingestion
+python pipeline.py --config config/pipeline.yaml --stage training
+python pipeline.py --config config/pipeline.yaml --stage inference
+
+# Start inference API
+python inference/serve.py --config config/pipeline.yaml
+```
+
+## Key Principles
+
+1. **Every stage is optional except training.** The pipeline should work with raw OHLCV + no annotations (using only programmatic TA-Lib labels), with annotations but no TA-Lib features, or with everything enabled.
+2. **Preprocessing parity.** Whatever transformations run during training MUST run identically during inference. Log the full config as an MLflow artifact.
+3. **Temporal splits only.** Never randomly shuffle financial time series data. Future data must never leak into training.
+4. **Per-class metrics matter more than overall accuracy.** A model that predicts "no pattern" for everything will have high accuracy but is useless. Track precision/recall for each pattern class.
+5. **MLflow is the observer, not the engine.** It logs, stores, and serves — your code does all computation.
--- a/openspec/changes/candle-backend/.openspec.yaml
+++ b/openspec/changes/candle-backend/.openspec.yaml
@ -0,0 +1,2 @@
+schema: spec-driven
+created: 2026-02-15
--- a/openspec/changes/candle-backend/design.md
+++ b/openspec/changes/candle-backend/design.md
@ -0,0 +1,133 @@
+## Context
+
+The Candle Annotator is a Next.js app with SQLite storage that lets users annotate candlestick charts with pattern labels. It currently has no ML capabilities — annotations are created manually and exported as CSV/JSON, but there's no way to train models or get predictions back into the UI.
+
+The existing stack is: Next.js 16 (App Router), React 19, lightweight-charts v4, SQLite via Drizzle ORM, Docker deployment. The app runs as a single container on port 3000.
+
+We need to add a Python ML service that sits alongside the Next.js app, connected via HTTP. The Python ecosystem (scikit-learn, XGBoost, TA-Lib, MLflow) is the right tool for this job — there's no viable way to do this in Node.js.
+
+## Goals / Non-Goals
+
+**Goals:**
+
+- Stand up a Python FastAPI service at `services/ml/` that handles feature engineering, annotation ingestion, training, and inference
+- Use TA-Lib for programmatic candlestick pattern detection (CDL* functions)
+- Train tree-based models (RandomForest, XGBoost) with MLflow tracking
+- Serve predictions via REST API on port 8001
+- Proxy inference requests through Next.js API routes to avoid CORS
+- Render model predictions on the chart as a distinct visual layer
+- Version datasets with DVC
+
+**Non-Goals:**
+
+- Deep learning models (LSTM, GRU, transformer) — architecture should accommodate them later, but not implemented now
+- Multi-user or multi-tenant support
+- Real-time streaming predictions (batch/on-demand only)
+- Automated retraining pipelines or CI/CD for model deployment
+- GPU inference or training optimization
+
+## Decisions
+
+### 1. Separate Python service vs. embedded in Next.js
+
+**Decision**: Standalone Python FastAPI service in `services/ml/`, communicating via HTTP.
+
+**Alternatives considered**:
+- Python subprocess spawned by Next.js — fragile process management, no independent scaling
+- Python WASM in browser — TA-Lib and scikit-learn don't work in WASM
+- Shared SQLite access from Python — SQLite doesn't handle concurrent writers well
+
+**Rationale**: Clean separation of concerns. The Next.js app owns the UI and annotation data; the Python service owns ML. They communicate through well-defined REST APIs. Each can be developed, tested, and deployed independently.
+
+### 2. Directory structure: `services/ml/` in the monorepo
+
+**Decision**: Place the Python service at `services/ml/` within the existing repo.
+
+**Alternatives considered**:
+- Separate repository — adds overhead for a single-developer project
+- Top-level `ml/` directory — `services/` namespace leaves room for future services
+
+**Rationale**: Monorepo keeps everything together. The `services/` prefix signals it's a separate deployable unit, not part of the Next.js app.
+
+### 3. Pipeline config via YAML
+
+**Decision**: Single `config/pipeline.yaml` controls all pipeline stages (feature engineering, annotation ingestion, training, inference). Each stage has an `enabled` flag.
+
+**Rationale**: Makes experiments reproducible — the full config is logged as an MLflow artifact with each training run. Stages can be toggled independently (e.g., skip feature engineering, use only programmatic labels).
+
+### 4. MLflow for experiment tracking, DVC for data versioning
+
+**Decision**: MLflow tracks experiments, metrics, models. DVC versions datasets.
+
+**Alternatives considered**:
+- Weights & Biases — heavier, cloud-dependent
+- Plain file logging — loses queryability and model registry
+- Git LFS for data — doesn't handle dataset lineage
+
+**Rationale**: MLflow runs locally (no cloud dependency), provides a model registry, and has native integrations with scikit-learn and XGBoost. DVC handles data versioning without bloating the git repo.
+
+### 5. Annotation export format: JSON from existing API
+
+**Decision**: The Python pipeline reads annotation data by calling the existing Next.js API endpoints (`GET /api/annotations`, span annotation exports) or from exported JSON/CSV files in `data/annotations/`.
+
+**Alternatives considered**:
+- Direct SQLite read from Python — concurrent access issues
+- Shared PostgreSQL — overkill for single-user tool
+
+**Rationale**: Using the existing API or file exports keeps the services decoupled. The annotation tool already has export functionality. For training, batch export to `data/annotations/` is sufficient.
+
+### 6. Label encoding: windowed classification first, BIO later
+
+**Decision**: Start with fixed-window classification (each annotation span → one training sample of N candles). BIO sequence labeling is designed for but not implemented in v1.
+
+**Rationale**: Window classification works with tree-based models (RandomForest, XGBoost) which are the initial model types. BIO encoding is needed for sequence models (BiLSTM-CRF) which are a non-goal for now.
+
+### 7. Next.js proxy routes for inference
+
+**Decision**: Next.js API routes at `/api/predict`, `/api/predict/batch`, `/api/model/info` proxy to the Python service.
+
+**Rationale**: Avoids CORS configuration. Lets us add auth or rate-limiting on the Next.js side later. The frontend only talks to one origin.
+
+### 8. Prediction rendering: histogram series overlay
+
+**Decision**: Use a lightweight-charts histogram series to render predictions as colored bars behind candles. Each bar's color maps to a predicted pattern label.
+
+**Alternatives considered**:
+- Custom canvas plugin — more control but significantly more code
+- Series markers only — no area highlighting, just point markers
+
+**Rationale**: Histogram series is the simplest approach that gives visual area coverage. Can upgrade to a canvas plugin later for hatched/dashed styling. Markers are added for label text with confidence scores.
+
+### 9. Docker: multi-container with docker-compose
+
+**Decision**: Add an `ml-service` container to the existing docker-compose. Add an `mlflow` container for the tracking server and a `postgres` container for ML service state. Shared volume for `data/`.
+
+```
+services:
+  candle-annotator:  # existing
+  ml-service:        # new - FastAPI on 8001
+  mlflow:            # new - tracking server on 5000
+  postgres:          # new - PostgreSQL for ML service state
+```
+
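+**Rationale**: Each service runs in its own container with isolated dependencies (`ml-service` is built from its own Dockerfile under `services/ml/`; `mlflow` and `postgres` run from published images). The shared `data/` volume lets the Next.js app and the ML service access the same OHLCV and annotation files.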
+
+## Risks / Trade-offs
+
+**[TA-Lib C library dependency]** → TA-Lib requires installing a system-level C library before the Python wrapper works. Mitigated by pinning it in the Dockerfile (`apt-get install libta-lib-dev`) and providing clear setup instructions for local development.
+
+**[MLflow storage growth]** → MLflow artifacts (models, plots, configs) accumulate over time. Mitigated by using a local `mlruns/` directory with periodic manual cleanup. Not a concern at single-user scale.
+
+**[Preprocessing parity]** → Feature engineering during inference must exactly match training. If the pipeline config changes between training and inference, predictions are invalid. Mitigated by logging the full pipeline config as an MLflow artifact and loading it during inference to replicate preprocessing.
+
+**[Class imbalance]** → Pattern classes will be heavily imbalanced (mostly "no pattern"). Mitigated by using `class_weights: balanced` and tracking per-class precision/recall, not just accuracy.
+
+**[SQLite concurrent access]** → If both the Next.js app and Python service try to access the SQLite DB simultaneously, writes can fail. Mitigated by keeping Python read-only on annotation data (via API calls or file exports), never writing to the Next.js SQLite DB directly.
+
+**[Temporal data leakage]** → Random train/test splits on time series data leak future information. Mitigated by defaulting to temporal splits; random splitting remains available through config as an explicit opt-in and logs a warning when used.
+
+## Resolved Questions
+
+- **Python service database**: PostgreSQL — the Python service uses its own Postgres instance for storing training run references, pipeline configs, and any service-specific state. Added to docker-compose.
+- **DVC remote storage**: Local backend — datasets versioned on the local filesystem, simplest setup for single-developer workflow.
+- **Prediction persistence**: Ephemeral — predictions are fetched on demand from the inference API, not persisted in any database. The frontend caches them in memory keyed by time range + model version.
--- a/openspec/changes/candle-backend/proposal.md
+++ b/openspec/changes/candle-backend/proposal.md
@ -0,0 +1,39 @@
+## Why
+
+The annotation tool currently creates labeled datasets but has no way to train models on them or get predictions back. Adding a Python ML backend closes the loop: annotations become training data, models produce predictions, and predictions guide further annotation — creating an active learning cycle for candlestick pattern recognition.
+
+## What Changes
+
+- Add a Python service (`services/ml/`) alongside the existing Next.js app, using FastAPI for the REST API
+- Implement TA-Lib-based candlestick pattern recognition to auto-generate annotations programmatically
+- Build a configurable ML training pipeline (feature engineering → annotation ingestion → training → evaluation) with MLflow tracking and DVC for data versioning
+- Support multiple model types: RandomForest and XGBoost initially, with architecture ready for LSTM/GRU and transformer-based models later
+- Serve trained models via a FastAPI inference API that accepts OHLCV candles and returns pattern predictions with confidence scores
+- Add Next.js API proxy routes (`/api/predict`, `/api/predict/batch`, `/api/model/info`) to connect the frontend to the Python backend
+- Add prediction visualization layer on the chart (distinct from human annotations) with confidence filtering and disagreement detection
+- Add a prediction controls panel for toggling predictions, filtering by label/confidence, and viewing per-class model metrics
+- Implement a feedback loop: users can confirm, correct, or dismiss model predictions as new annotations
+
+## Capabilities
+
+### New Capabilities
+
+- `feature-engineering`: TA-Lib indicator computation and candle feature extraction from raw OHLCV data, producing enriched datasets for training and inference
+- `annotation-ingestion`: Converting span annotations (human and programmatic) into labeled training datasets with BIO or windowed encoding, including TA-Lib CDL* pattern auto-labeling
+- `ml-training`: Configurable model training pipeline with temporal splits, class balancing, MLflow experiment tracking, artifact logging, and model registry integration
+- `ml-inference`: REST API serving trained models — accepts OHLCV candles, runs preprocessing, returns predictions with confidence scores and model metadata
+- `prediction-ui`: Frontend prediction layer with chart visualization, controls panel, confidence filtering, disagreement detection, and feedback loop for active learning
+
+### Modified Capabilities
+
+- `backend-api`: New proxy routes (`/api/predict`, `/api/predict/batch`, `/api/model/info`) added to forward requests to the Python inference service
+- `span-annotation`: Span export format consumed by the ML pipeline for training; prediction-confirmed spans can be saved as new annotations
+
+## Impact
+
+- **New dependencies**: Python 3.11+, FastAPI, uvicorn, scikit-learn, XGBoost, TA-Lib (C library + Python wrapper), MLflow, DVC, pandas, numpy, joblib
+- **New service**: Python FastAPI service running on port 8001, needs to be added to docker-compose
+- **Data flow**: Annotation JSON/CSV exports feed into Python pipeline; inference results flow back to the frontend via Next.js proxy routes
+- **Infrastructure**: MLflow tracking server (port 5000), PostgreSQL (port 5432) for ML service state, DVC remote storage for dataset versioning
+- **Existing code changes**: New API routes in Next.js, new React components for prediction panel, chart overlay modifications for prediction rendering
+- **Config**: Pipeline YAML config (`config/pipeline.yaml`) controls all ML stages; env vars for inference API URL and feature flags
--- a/openspec/changes/candle-backend/specs/annotation-ingestion/spec.md
+++ b/openspec/changes/candle-backend/specs/annotation-ingestion/spec.md
@ -0,0 +1,85 @@
+## ADDED Requirements
+
+### Requirement: Load annotations from JSON export
+The system SHALL load annotation data from JSON files exported by the annotation tool, located at `data.annotations_path`. The expected format is a JSON object with an `annotations` array where each annotation has: `id`, `start_time`, `end_time`, `label`, `confidence` (nullable), `outcome` (nullable), and `sub_spans` (nullable).
+
+#### Scenario: Load valid annotations JSON
+- **WHEN** `data.annotations_path` points to a valid JSON file with annotations
+- **THEN** the system loads all annotation objects into memory for processing
+
+#### Scenario: Missing annotations file
+- **WHEN** `data.annotations_path` points to a file that does not exist and annotation ingestion is enabled
+- **THEN** the system SHALL fail with an error message identifying the missing file path
+
+#### Scenario: Filter by confidence
+- **WHEN** `stages.annotation_ingestion.min_confidence` is set to 3
+- **THEN** annotations with confidence below 3 SHALL be excluded from the labeled dataset
+
+### Requirement: Windowed classification encoding
+When `stages.annotation_ingestion.label_encoding` is "window", the system SHALL convert each annotation span into a fixed-size window of candles. The window size is defined by `stages.annotation_ingestion.window_size`. If the annotation span is shorter than window_size, the system SHALL pad with context candles (centered on the span). If the span is longer, the system SHALL use the full span. Each window becomes one row in the output with flattened OHLCV + feature columns.
+
+#### Scenario: Span shorter than window
+- **WHEN** an annotation spans 10 candles and window_size is 30
+- **THEN** the system extracts 30 candles centered on the annotation (10 before, 10 span, 10 after) and flattens them into a single row
+
+#### Scenario: Span longer than window
+- **WHEN** an annotation spans 50 candles and window_size is 30
+- **THEN** the system uses all 50 candles and flattens them into a single row
+
+#### Scenario: Span near dataset boundary
+- **WHEN** an annotation is near the start of the dataset and there aren't enough candles for padding
+- **THEN** the system SHALL pad with as many candles as available (no error), filling missing positions with NaN
+
+### Requirement: BIO sequence labeling encoding
+When `stages.annotation_ingestion.label_encoding` is "bio", the system SHALL assign a BIO tag to each candle in the dataset based on annotations. The first candle of an annotation span gets `B-{label}`, subsequent candles in the span get `I-{label}`, and candles outside any annotation get `O`.
+
+#### Scenario: Single annotation BIO tags
+- **WHEN** a "bull_flag" annotation spans candles at times T5 through T8
+- **THEN** candle T5 gets tag `B-bull_flag`, candles T6-T8 get `I-bull_flag`, all other candles get `O`
+
+#### Scenario: Overlapping annotations
+- **WHEN** two annotations overlap in time range
+- **THEN** the system SHALL create multiple tag columns (`bio_tag_1`, `bio_tag_2`) to represent both annotations
+
+### Requirement: Programmatic TA-Lib pattern labels
+When `stages.annotation_ingestion.programmatic_labels.enabled` is true, the system SHALL run TA-Lib CDL* pattern recognition functions listed in `talib_patterns` on the OHLC data. Each CDL function returns a positive integer (typically +100) for a bullish pattern, a negative integer (typically -100) for a bearish pattern, or 0 (no pattern). The system SHALL convert non-zero results to label names based on their sign (e.g., `CDL_ENGULFING` with +100 → `bullish_engulfing`).
+
+#### Scenario: Detect engulfing pattern
+- **WHEN** `CDL_ENGULFING` is in the talib_patterns list and the OHLC data contains an engulfing pattern
+- **THEN** the system generates a label `bullish_engulfing` or `bearish_engulfing` for the corresponding candle
+
+#### Scenario: No pattern detected
+- **WHEN** a CDL function returns 0 for a candle
+- **THEN** no programmatic label is assigned to that candle
+
+### Requirement: Human and programmatic label merge
+When both human annotations and programmatic labels exist for the same candle, the system SHALL merge them using the strategy in `stages.annotation_ingestion.merge_strategy`: "human_priority" keeps the human label, "programmatic_priority" keeps the TA-Lib label, "both" keeps both as separate label columns.
+
+#### Scenario: Human priority merge
+- **WHEN** merge_strategy is "human_priority" and a candle has human label "bull_flag" and programmatic label "bullish_engulfing"
+- **THEN** the output label for that candle is "bull_flag"
+
+#### Scenario: Both labels merge
+- **WHEN** merge_strategy is "both" and a candle has both human and programmatic labels
+- **THEN** the output has two separate label columns: `label_human` and `label_programmatic`
+
+### Requirement: Context padding
+The system SHALL include `stages.annotation_ingestion.context_padding` candles before and after each annotation span in the labeled output. This provides trend context for models.
+
+#### Scenario: Add padding candles
+- **WHEN** context_padding is 20 and an annotation spans candles T10 to T15
+- **THEN** the output includes candles from T-10 (or dataset start) through T35 (or dataset end) associated with that annotation
+
+### Requirement: Dataset statistics logging
+After annotation ingestion completes, the system SHALL log: total annotations by label, class distribution percentages, average span length per label, and agreement rate between human and programmatic labels (when both are enabled).
+
+#### Scenario: Log class distribution
+- **WHEN** annotation ingestion completes with 50 "bull_flag", 30 "bear_flag", and 200 "O" labels
+- **THEN** the system logs the counts and percentages for each class
+
+### Requirement: Labeled CSV output
+The system SHALL write the labeled dataset to `data.labeled_path` in CSV format. The output SHALL contain all feature columns plus the target label column(s).
+
+#### Scenario: Write labeled CSV
+- **WHEN** annotation ingestion completes successfully
+- **THEN** the labeled CSV is written to `data.labeled_path` with all feature and label columns
--- a/openspec/changes/candle-backend/specs/backend-api/spec.md
+++ b/openspec/changes/candle-backend/specs/backend-api/spec.md
@ -0,0 +1,38 @@
+## ADDED Requirements
+
+### Requirement: Predict proxy endpoint
+The system SHALL provide a `POST /api/predict` Next.js API route that proxies requests to the Python inference service at `${INFERENCE_API_URL}/predict`. The route SHALL forward the request body (pair, timeframe, candles array) and return the Python service's response. If the inference service is unreachable, the route SHALL return HTTP 503 with `{ "error": "Inference service unavailable" }`.
+
+#### Scenario: Successful prediction proxy
+- **WHEN** POST /api/predict is called with valid candle data and the Python service is running
+- **THEN** the route forwards the request to the inference service and returns the prediction response with HTTP 200
+
+#### Scenario: Inference service down
+- **WHEN** POST /api/predict is called but the Python inference service is unreachable
+- **THEN** the route returns HTTP 503 with `{ "error": "Inference service unavailable" }`
+
+#### Scenario: Inference service error
+- **WHEN** the Python inference service returns an error status (4xx or 5xx)
+- **THEN** the route forwards the error status and message to the client
+
+### Requirement: Batch predict proxy endpoint
+The system SHALL provide a `POST /api/predict/batch` Next.js API route that proxies batch prediction requests to `${INFERENCE_API_URL}/predict/batch`. The route SHALL forward pair, timeframe, start_date, and end_date.
+
+#### Scenario: Successful batch prediction
+- **WHEN** POST /api/predict/batch is called with valid parameters
+- **THEN** the route forwards to the inference service and returns the batch prediction response
+
+#### Scenario: Timeout on large batch
+- **WHEN** the batch prediction takes longer than INFERENCE_BATCH_TIMEOUT
+- **THEN** the route returns HTTP 504 with `{ "error": "Batch prediction timed out" }`
+
+### Requirement: Model info proxy endpoint
+The system SHALL provide a `GET /api/model/info` Next.js API route that proxies to `${INFERENCE_API_URL}/model/info`. This endpoint returns model metadata and per-class metrics.
+
+#### Scenario: Successful model info
+- **WHEN** GET /api/model/info is called and the inference service is running
+- **THEN** the route returns the model metadata JSON
+
+#### Scenario: No model available
+- **WHEN** GET /api/model/info is called and the inference service returns 503
+- **THEN** the route returns HTTP 503 with `{ "error": "No model available" }`
--- a/openspec/changes/candle-backend/specs/feature-engineering/spec.md
+++ b/openspec/changes/candle-backend/specs/feature-engineering/spec.md
@ -0,0 +1,60 @@
+## ADDED Requirements
+
+### Requirement: TA-Lib indicator computation
+The system SHALL compute technical indicators from raw OHLCV data using TA-Lib. The pipeline config's `stages.feature_engineering.talib_indicators` list defines which indicators to compute. Each indicator entry specifies a `name` (TA-Lib function name) and `params` (dictionary of function parameters). Computed indicators SHALL be appended as new columns to the output CSV using lowercase naming: `{indicator}_{param}` (e.g., `rsi_14`, `ema_20`, `macd`, `macd_signal`, `macd_hist`, `bbands_upper`, `bbands_middle`, `bbands_lower`).
+
+#### Scenario: Compute RSI indicator
+- **WHEN** the config includes `{ name: "RSI", params: { timeperiod: 14 } }` and feature engineering is enabled
+- **THEN** the system computes RSI with period 14 and appends a `rsi_14` column to the enriched CSV
+
+#### Scenario: Compute multi-output indicator
+- **WHEN** the config includes `{ name: "MACD", params: { fastperiod: 12, slowperiod: 26, signalperiod: 9 } }`
+- **THEN** the system appends `macd`, `macd_signal`, and `macd_hist` columns to the enriched CSV
+
+#### Scenario: TA-Lib not installed
+- **WHEN** feature engineering is enabled but the TA-Lib C library is not installed on the system
+- **THEN** the system SHALL fail with a clear error message including installation instructions for the user's platform, and SHALL NOT silently skip the stage
+
+#### Scenario: Feature engineering disabled
+- **WHEN** `stages.feature_engineering.enabled` is false
+- **THEN** the system SHALL skip indicator computation entirely and pass raw OHLCV data to the next stage
+
+### Requirement: Candle feature extraction
+When `stages.feature_engineering.candle_features` is true, the system SHALL compute derived candle features for each row: `body_size` (abs(close - open)), `body_direction` (1 if close >= open, else -1), `upper_wick` (high - max(open, close)), `lower_wick` (min(open, close) - low), `wick_ratio` (upper_wick / lower_wick), `body_to_range` (body_size / (high - low)), `gap` (open - previous close), and `range` (high - low).
+
+#### Scenario: Compute candle features
+- **WHEN** `candle_features` is true and feature engineering is enabled
+- **THEN** the system appends columns `body_size`, `body_direction`, `upper_wick`, `lower_wick`, `wick_ratio`, `body_to_range`, `gap`, `range` to the enriched CSV
+
+#### Scenario: Division by zero handling
+- **WHEN** a candle has `lower_wick` equal to 0 (for `wick_ratio`) or `high` equal to `low` (for `body_to_range`)
+- **THEN** the system SHALL set the result to 0.0 instead of raising an error
+
+#### Scenario: Gap for first candle
+- **WHEN** computing `gap` for the first candle in the dataset (no previous close)
+- **THEN** the system SHALL set gap to 0.0
+
+### Requirement: Custom feature functions
+When `stages.feature_engineering.custom_features` is configured, the system SHALL dynamically import each listed dotted Python path (module path plus function name, e.g. `features.custom.trend_slope`) and call the resolved function. Each custom feature function SHALL accept a pandas DataFrame (the full OHLCV + computed features so far) and return a pandas Series. The returned Series SHALL be appended as a new column named after the function.
+
+#### Scenario: Load custom feature
+- **WHEN** the config includes `custom_features: ["features.custom.trend_slope"]`
+- **THEN** the system imports `features.custom.trend_slope`, calls it with the DataFrame, and appends the result as a `trend_slope` column
+
+#### Scenario: Custom feature import error
+- **WHEN** a custom feature module path cannot be imported
+- **THEN** the system SHALL fail with an error message naming the unresolvable module path
+
+### Requirement: NaN handling for warmup periods
+After computing all indicators, the system SHALL handle NaN values introduced by indicator warmup periods. Rows with NaN values in indicator columns SHALL be dropped from the output. The system SHALL log how many rows were dropped.
+
+#### Scenario: Drop warmup rows
+- **WHEN** RSI with period 14 produces NaN for the first 14 rows
+- **THEN** those rows are dropped from the enriched CSV and a log message reports "Dropped 14 rows due to indicator warmup"
+
+### Requirement: Enriched CSV output
+The system SHALL write the enriched dataset (original OHLCV columns + all computed feature columns) to the path specified by `data.enriched_path` in CSV format. The output SHALL preserve the original column order with new feature columns appended.
+
+#### Scenario: Write enriched CSV
+- **WHEN** feature engineering completes successfully
+- **THEN** the enriched CSV is written to `data.enriched_path` with all original and computed columns
--- a/openspec/changes/candle-backend/specs/ml-inference/spec.md
+++ b/openspec/changes/candle-backend/specs/ml-inference/spec.md
@ -0,0 +1,107 @@
+## ADDED Requirements
+
+### Requirement: Model loading from MLflow registry
+When `stages.inference.model_source` is "mlflow", the system SHALL load the model from the MLflow model registry using the model name (`stages.inference.mlflow_model_name`) and stage (`stages.inference.mlflow_model_stage`).
+
+#### Scenario: Load production model
+- **WHEN** model_source is "mlflow", model name is "candlestick_pattern_v1", and stage is "Production"
+- **THEN** the system loads the model registered as "candlestick_pattern_v1" at the "Production" stage from MLflow
+
+#### Scenario: Model not found in registry
+- **WHEN** the specified model name or stage does not exist in the MLflow registry
+- **THEN** the system SHALL return a clear error indicating the model was not found
+
+### Requirement: Model loading from local file
+When `stages.inference.model_source` is "local", the system SHALL load the model from the file path specified by `stages.inference.local_model_path` using joblib.
+
+#### Scenario: Load local model
+- **WHEN** model_source is "local" and local_model_path is "models/best_model.pkl"
+- **THEN** the system loads the model from that file path
+
+#### Scenario: Local model file missing
+- **WHEN** the local_model_path does not exist
+- **THEN** the system SHALL return an error indicating the model file was not found
+
+### Requirement: Preprocessing parity
+The inference service SHALL replicate the exact preprocessing (feature engineering) used during training. The system SHALL load the pipeline config artifact from the MLflow run that produced the model and apply the same feature engineering steps (TA-Lib indicators, candle features) with the same parameters.
+
+#### Scenario: Matching preprocessing
+- **WHEN** the model was trained with RSI(14) and EMA(20) features
+- **THEN** inference SHALL compute RSI(14) and EMA(20) on the input candles before running the model
+
+#### Scenario: Config mismatch warning
+- **WHEN** the current pipeline config differs from the config stored with the model
+- **THEN** the system SHALL log a warning about the mismatch
+
+### Requirement: Predict endpoint
+The system SHALL provide a `POST /predict` endpoint on the FastAPI service (port 8001). The endpoint SHALL accept a JSON body with `pair` (string), `timeframe` (string), and `candles` (array of objects with `time`, `open`, `high`, `low`, `close`, `volume`). It SHALL return predictions with per-candle labels and confidence scores, prediction spans (grouped continuous predictions), and model metadata.
+
+#### Scenario: Successful prediction
+- **WHEN** POST /predict is called with 100 valid candle objects
+- **THEN** the system returns a JSON response with `predictions` array (one entry per candle with `time`, `label`, `confidence`), `spans` array (continuous same-label predictions grouped with `start_time`, `end_time`, `label`, `avg_confidence`), and `model_info` object
+
+#### Scenario: Empty candles array
+- **WHEN** POST /predict is called with an empty candles array
+- **THEN** the system returns HTTP 400 with an error message
+
+#### Scenario: Invalid candle data
+- **WHEN** POST /predict is called with candle objects missing required fields
+- **THEN** the system returns HTTP 422 with validation error details
+
+### Requirement: Batch predict endpoint
+The system SHALL provide a `POST /predict/batch` endpoint that accepts `pair`, `timeframe`, `start_date`, and `end_date`. The system SHALL load OHLCV data from its own data store for the specified range, process in chunks of `stages.inference.batch_size`, and return predictions for the full range.
+
+#### Scenario: Batch prediction
+- **WHEN** POST /predict/batch is called with pair "EURUSD", timeframe "1H", start_date and end_date spanning 6 months
+- **THEN** the system loads the data, processes in batches, and returns predictions for the full range
+
+#### Scenario: No data for range
+- **WHEN** the requested date range has no OHLCV data available
+- **THEN** the system returns HTTP 404 with a message indicating no data found for the range
+
+### Requirement: Model info endpoint
+The system SHALL provide a `GET /model/info` endpoint that returns metadata about the currently loaded model: model_name, model_version, model_type, trained_at, dataset_version, feature_engineering enabled status, list of all labels the model knows, and per-class metrics (precision, recall, F1, training sample count for each label).
+
+#### Scenario: Get model info
+- **WHEN** GET /model/info is called and a model is loaded
+- **THEN** the system returns JSON with model metadata and per-class metrics
+
+#### Scenario: No model loaded
+- **WHEN** GET /model/info is called and no model has been loaded
+- **THEN** the system returns HTTP 503 with a message indicating no model is available
+
+### Requirement: Model labels endpoint
+The system SHALL provide a `GET /model/labels` endpoint that returns the list of all pattern labels the current model can predict, along with their display colors.
+
+#### Scenario: Get model labels
+- **WHEN** GET /model/labels is called
+- **THEN** the system returns a JSON array of label objects with `name` and `color` fields
+
+### Requirement: Health check endpoint
+The system SHALL provide a `GET /health` endpoint that returns the service status including whether a model is loaded, the MLflow connection status, and the PostgreSQL connection status.
+
+#### Scenario: Healthy service
+- **WHEN** GET /health is called and all dependencies are available
+- **THEN** the system returns HTTP 200 with `{ "status": "healthy", "model_loaded": true, "mlflow": "connected", "database": "connected" }`
+
+#### Scenario: Degraded service
+- **WHEN** GET /health is called but the MLflow server is unreachable
+- **THEN** the system returns HTTP 200 with `{ "status": "degraded", "model_loaded": true, "mlflow": "disconnected", "database": "connected" }`
+
+### Requirement: Prediction confidence scores
+Each prediction SHALL include a confidence score between 0.0 and 1.0 derived from the model's probability output. For tree-based models, this is the max class probability from `predict_proba()`.
+
+#### Scenario: Confidence from predict_proba
+- **WHEN** the model predicts class "bull_flag" with probability 0.87
+- **THEN** the prediction confidence for that candle is 0.87
+
+### Requirement: Prediction span grouping
+The system SHALL group consecutive candle predictions with the same non-"O" label into prediction spans. Each span SHALL have `start_time`, `end_time`, `label`, and `avg_confidence` (mean confidence of candles in the span).
+
+#### Scenario: Group consecutive predictions
+- **WHEN** candles at T1, T2, T3 are all predicted as "bull_flag" with confidences 0.85, 0.90, 0.80
+- **THEN** the system creates one span: `{ start_time: T1, end_time: T3, label: "bull_flag", avg_confidence: 0.85 }`
+
+#### Scenario: Break on label change
+- **WHEN** candle T1 is "bull_flag" and candle T2 is "bear_flag"
+- **THEN** the system creates two separate spans
--- a/openspec/changes/candle-backend/specs/ml-training/spec.md
+++ b/openspec/changes/candle-backend/specs/ml-training/spec.md
@ -0,0 +1,92 @@
+## ADDED Requirements
+
+### Requirement: Temporal train/test splitting
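+By default (`stages.training.split_method` is "temporal"), the system SHALL split the labeled dataset into train, validation, and test sets using temporal ordering. Data SHALL be sorted by time; the earliest portion is the training set, the middle portion is the validation set, and the latest portion is the test set. The test and validation fractions are defined by `stages.training.test_split` and `stages.training.validation_split`, and the training set receives the remainder. The system SHALL NOT shuffle financial time series data unless `split_method` is explicitly set to "random".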
+
+#### Scenario: Temporal split
+- **WHEN** test_split is 0.2, validation_split is 0.1, and the dataset has 1000 rows sorted by time
+- **THEN** the first 700 rows are training, next 100 are validation, last 200 are test
+
+#### Scenario: Random split option
+- **WHEN** split_method is "random"
+- **THEN** the system uses standard random splitting (sklearn train_test_split) but logs a warning that this is not recommended for financial data
+
+### Requirement: Class weight balancing
+The system SHALL apply class weighting to handle imbalanced pattern labels. When `stages.training.class_weights` is "balanced", the system SHALL compute inverse-frequency weights so rare pattern classes receive higher training weight.
+
+#### Scenario: Balanced weights
+- **WHEN** class_weights is "balanced" and the dataset has 500 "O" labels and 50 "bull_flag" labels
+- **THEN** the model trains with class weights inversely proportional to class frequency
+
+### Requirement: Model training dispatch
+The system SHALL train the model type specified in `stages.training.model_type` using the hyperparameters in `stages.training.hyperparameters`. Supported model types for v1: "random_forest" (scikit-learn RandomForestClassifier) and "xgboost" (XGBClassifier).
+
+#### Scenario: Train XGBoost model
+- **WHEN** model_type is "xgboost" with hyperparameters n_estimators=500, max_depth=6, learning_rate=0.01
+- **THEN** the system trains an XGBClassifier with those parameters on the training set
+
+#### Scenario: Train RandomForest model
+- **WHEN** model_type is "random_forest"
+- **THEN** the system trains a RandomForestClassifier with the configured hyperparameters
+
+#### Scenario: Unsupported model type
+- **WHEN** model_type is a value not supported in v1 (e.g., "lstm", "transformer")
+- **THEN** the system SHALL fail with an error message listing the supported model types
+
+### Requirement: MLflow experiment tracking
+The system SHALL log all training runs to MLflow. Each run SHALL log: the full pipeline YAML config as an artifact, dataset version (DVC hash if available), total samples, number of classes, model type, window size, per-class sample counts, and all hyperparameters.
+
+#### Scenario: Log training run
+- **WHEN** a training run starts
+- **THEN** the system creates an MLflow run under the experiment name from `stages.training.mlflow.experiment_name` and logs all parameters
+
+#### Scenario: MLflow server unavailable
+- **WHEN** the MLflow tracking URI is unreachable
+- **THEN** the system SHALL fail with an error message indicating the MLflow server cannot be reached at the configured URI
+
+### Requirement: Training metrics logging
+After training, the system SHALL evaluate the model on the test set and log metrics to MLflow: overall accuracy, macro F1, weighted F1, and per-class precision, recall, and F1 for each label.
+
+#### Scenario: Log overall metrics
+- **WHEN** model evaluation completes
+- **THEN** the system logs accuracy, f1_macro, and f1_weighted to MLflow
+
+#### Scenario: Log per-class metrics
+- **WHEN** model evaluation completes with labels "bull_flag", "bear_flag", and "O"
+- **THEN** the system logs precision_bull_flag, recall_bull_flag, f1_bull_flag (and same for each other label) to MLflow
+
+### Requirement: Training artifact logging
+When `stages.training.mlflow.log_artifacts` is true, the system SHALL log to MLflow: a confusion matrix plot (PNG), a feature importance plot (PNG, for tree-based models), and a classification report (text).
+
+#### Scenario: Log confusion matrix
+- **WHEN** log_artifacts is true and training completes
+- **THEN** the system generates and logs a confusion matrix plot as "confusion_matrix.png" to MLflow
+
+#### Scenario: Log feature importance
+- **WHEN** log_artifacts is true and the model has `feature_importances_` attribute
+- **THEN** the system generates and logs a feature importance plot as "feature_importance.png" to MLflow
+
+### Requirement: Model registration
+When `stages.training.mlflow.register_model` is true, the system SHALL register the trained model in the MLflow model registry under the name specified by `stages.inference.mlflow_model_name`.
+
+#### Scenario: Register model
+- **WHEN** register_model is true and training completes
+- **THEN** the system registers the model in MLflow registry with the configured model name
+
+### Requirement: PostgreSQL training metadata storage
+The system SHALL store training run metadata in the PostgreSQL database. Each training run record SHALL include: run_id (MLflow run ID), model_type, experiment_name, pipeline_config_hash, dataset_version, metrics summary (JSON), status, and timestamps (created_at, completed_at).
+
+#### Scenario: Store training run record
+- **WHEN** a training run completes successfully
+- **THEN** the system inserts a record into the PostgreSQL `training_runs` table with the run metadata
+
+#### Scenario: Query training history
+- **WHEN** the system queries training runs
+- **THEN** it returns records from PostgreSQL ordered by created_at descending
+
+### Requirement: Pipeline config logging
+The system SHALL log the full pipeline YAML config as an MLflow artifact with each training run. This config SHALL be used during inference to replicate the exact preprocessing steps.
+
+#### Scenario: Config artifact logged
+- **WHEN** a training run starts
+- **THEN** the full pipeline.yaml content is logged as "pipeline_config.yaml" artifact in the MLflow run
--- a/openspec/changes/candle-backend/specs/prediction-ui/spec.md
+++ b/openspec/changes/candle-backend/specs/prediction-ui/spec.md
@ -0,0 +1,130 @@
+## ADDED Requirements
+
+### Requirement: Prediction state management
+The system SHALL maintain a separate prediction state alongside the existing annotation state. The prediction state SHALL include: spans (array of prediction spans), isLoading, error, modelInfo, visible (toggle), confidenceThreshold (filter), selectedLabels (filter), and autoPredict (toggle). Prediction state SHALL be independent from annotation state.
+
+#### Scenario: Initial prediction state
+- **WHEN** the app loads
+- **THEN** `spans` is empty, `visible` is true, `confidenceThreshold` defaults to 0.70, `autoPredict` is false, and `selectedLabels` includes all labels
+
+### Requirement: On-demand prediction fetching
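+The system SHALL fetch predictions on demand: when the user clicks "Run on Visible", the system SHALL send the currently visible candles to `/api/predict`, and when the user clicks "Predict All", it SHALL send a batch request to `/api/predict/batch`. In both cases the system SHALL update the prediction state with the results. Predictions are ephemeral: they are never persisted and are re-fetched on demand.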
+
+#### Scenario: Run on visible candles
+- **WHEN** user clicks "Run on Visible" button
+- **THEN** the system sends the visible candle range to /api/predict, shows a loading state, and renders returned predictions on the chart
+
+#### Scenario: Batch predict all
+- **WHEN** user clicks "Predict All" button
+- **THEN** the system sends a batch request to /api/predict/batch for the full dataset and renders all returned predictions
+
+### Requirement: Prediction caching
+The system SHALL cache predictions in memory keyed by `${pair}_${timeframe}_${startTime}_${endTime}_${modelVersion}`. When the user scrolls to a range with cached predictions, the system SHALL use the cache instead of re-fetching. Cache SHALL be invalidated when the model version changes.
+
+#### Scenario: Cache hit
+- **WHEN** user scrolls back to a previously predicted range with the same model version
+- **THEN** the system renders cached predictions without making an API call
+
+#### Scenario: Cache invalidation on model change
+- **WHEN** the model version changes (detected via /api/model/info)
+- **THEN** all cached predictions are cleared
+
+### Requirement: Prediction rendering on chart
+The system SHALL render model predictions as a visual layer on the lightweight-charts instance, visually distinct from human annotations. Predictions SHALL use a histogram series with per-bar colors mapped to predicted pattern labels at reduced opacity (10-20%). Series markers SHALL be added at the start of each prediction span showing `{label} ({confidence}%)` positioned below bars.
+
+#### Scenario: Render prediction spans
+- **WHEN** predictions are loaded and visible is true
+- **THEN** colored histogram bars appear behind candles for predicted patterns, with markers showing labels and confidence
+
+#### Scenario: Predictions hidden
+- **WHEN** the user toggles predictions off (visible = false)
+- **THEN** the prediction histogram series and markers are removed from the chart
+
+#### Scenario: Visual distinction from annotations
+- **WHEN** both human annotations and model predictions exist for the same range
+- **THEN** human annotations render as solid colored rectangles (above bars) and predictions render as low-opacity histogram bars (below bars) — they are visually distinguishable
+
+### Requirement: Confidence threshold filter
+The system SHALL filter displayed predictions by confidence. Only predictions with confidence >= `confidenceThreshold` SHALL be rendered. The threshold is adjustable via a slider in the controls panel (range 0.0 to 1.0).
+
+#### Scenario: Filter low confidence
+- **WHEN** confidenceThreshold is 0.70 and a prediction has confidence 0.55
+- **THEN** that prediction is not rendered on the chart
+
+#### Scenario: Adjust threshold
+- **WHEN** user moves the confidence slider from 0.70 to 0.50
+- **THEN** previously hidden predictions with confidence between 0.50 and 0.70 become visible
+
+### Requirement: Label type filter
+The system SHALL allow users to toggle visibility of individual pattern labels via checkboxes in the controls panel. Only predictions for checked labels are rendered.
+
+#### Scenario: Hide specific label
+- **WHEN** user unchecks "double_bottom" in the label filter
+- **THEN** all "double_bottom" predictions are hidden from the chart
+
+### Requirement: Prediction controls panel
+The system SHALL display a prediction controls panel in the sidebar with: master on/off toggle, model info (name, version, type, training date), action buttons ("Run on Visible", "Predict All"), auto-predict toggle, confidence threshold slider, label checkboxes with per-class precision/recall metrics, prediction count, agreement count, and a "Show only disagreements" filter.
+
+#### Scenario: Display model info
+- **WHEN** the prediction panel loads and the inference API is available
+- **THEN** the panel fetches /api/model/info and displays model name, version, type, and training date
+
+#### Scenario: Inference API unavailable
+- **WHEN** the prediction panel loads and /api/model/info returns an error
+- **THEN** the panel shows "Model server offline — predictions unavailable" and all controls are disabled
+
+#### Scenario: Per-class metrics display
+- **WHEN** model info includes per-class metrics
+- **THEN** each label checkbox shows precision and recall values (e.g., "bull_flag (P:0.89 R:0.76)")
+
+### Requirement: Disagreement detection
+The system SHALL compare human annotation spans with model prediction spans to identify disagreements. For each human annotation, the system SHALL check whether any prediction span overlaps it (>50% time overlap). The disagreement types are: "missed_by_model" (a human annotation with no overlapping prediction span, i.e., the model predicted "O"), "missed_by_human" (a predicted pattern with no overlapping human annotation), and "label_mismatch" (an overlapping annotation and prediction whose labels differ).
+
+#### Scenario: Missed by model
+- **WHEN** a human annotation exists at T10-T20 but no prediction span overlaps it
+- **THEN** the system identifies this as "missed_by_model"
+
+#### Scenario: Missed by human
+- **WHEN** a prediction span exists at T30-T40 with no overlapping human annotation
+- **THEN** the system identifies this as "missed_by_human"
+
+#### Scenario: Label mismatch
+- **WHEN** a human annotation labels T10-T20 as "bull_flag" and the prediction labels the same range as "wedge_up"
+- **THEN** the system identifies this as "label_mismatch"
+
+### Requirement: Disagreement rendering
+The system SHALL render disagreements with distinct visual styles: "missed_by_model" shows a red dashed border around the human annotation, "missed_by_human" shows a yellow highlight around the prediction, "label_mismatch" shows an orange border with both labels displayed.
+
+#### Scenario: Render missed_by_human highlight
+- **WHEN** a "missed_by_human" disagreement is detected and disagreement rendering is enabled
+- **THEN** the prediction span is highlighted with a yellow border/glow to draw attention
+
+#### Scenario: Show only disagreements
+- **WHEN** user clicks "Show only disagreements" filter
+- **THEN** only prediction spans involved in disagreements are rendered, hiding agreement spans
+
+### Requirement: Prediction-to-annotation feedback
+When a user clicks on a "missed_by_human" prediction, the system SHALL open the span annotation dialog pre-filled with the prediction's start_time, end_time, and label. The user can confirm (save as new annotation), correct (change label, then save), or dismiss.
+
+#### Scenario: Confirm prediction as annotation
+- **WHEN** user clicks a "missed_by_human" prediction and clicks Save in the pre-filled dialog
+- **THEN** the system creates a new span annotation with the model's suggested label and timestamps
+
+#### Scenario: Correct and save
+- **WHEN** user clicks a "missed_by_human" prediction, changes the label in the dialog, and clicks Save
+- **THEN** the system creates a new span annotation with the corrected label
+
+#### Scenario: Dismiss as not-a-pattern
+- **WHEN** user clicks a "missed_by_human" prediction and clicks "Not a pattern"
+- **THEN** the system saves a negative annotation with label "O", source "human_correction", and records the model's original prediction and confidence
+
+### Requirement: Inference API connection monitoring
+The system SHALL poll `/api/model/info` every 30 seconds when the inference API is unavailable. When the API becomes available, the system SHALL auto-reconnect and enable prediction controls. Human annotation SHALL never be blocked by inference API availability.
+
+#### Scenario: Auto-reconnect
+- **WHEN** the inference API was unavailable and becomes reachable
+- **THEN** the prediction panel re-enables controls and shows "Model server online"
+
+#### Scenario: Annotation independence
+- **WHEN** the inference API is unavailable
+- **THEN** all human annotation tools continue to work normally
--- a/openspec/changes/candle-backend/specs/span-annotation/spec.md
+++ b/openspec/changes/candle-backend/specs/span-annotation/spec.md
@ -0,0 +1,34 @@
+## ADDED Requirements
+
+### Requirement: Span annotation JSON export for ML pipeline
+The system SHALL provide a `GET /api/span-annotations/export` endpoint that exports all span annotations for a given chart as JSON in the format expected by the ML pipeline. The output SHALL be a JSON object with an `annotations` array where each entry has: `id`, `start_time` (Unix timestamp), `end_time` (Unix timestamp), `label`, `confidence` (nullable), `outcome` (nullable), and `sub_spans` (nullable). The endpoint SHALL accept an optional `chartId` query parameter; when it is omitted, the endpoint SHALL export the span annotations of the most recently created chart.
+
+#### Scenario: Export span annotations as JSON
+- **WHEN** GET /api/span-annotations/export?chartId=3 is called
+- **THEN** the system returns a JSON object with all span annotations for chart 3 in the ML pipeline format
+
+#### Scenario: Export without chartId
+- **WHEN** GET /api/span-annotations/export is called without chartId
+- **THEN** the system exports span annotations for the most recently created chart
+
+### Requirement: Prediction-sourced span annotation creation
+The system SHALL support creating span annotations with a `source` field indicating whether the annotation was created by a human ("human"), confirmed from a model prediction ("model_confirmed"), corrected from a model prediction ("model_corrected"), or recorded when a prediction is dismissed as not a pattern ("human_correction"). The existing POST endpoint for span annotations SHALL accept an optional `source` field (default: "human") and an optional `model_prediction` field (an object holding the `label` and `confidence` of the original prediction).
+
+#### Scenario: Create human annotation
+- **WHEN** a span annotation is created without a source field
+- **THEN** the source defaults to "human"
+
+#### Scenario: Confirm model prediction
+- **WHEN** a user confirms a model prediction as an annotation
+- **THEN** the span annotation is created with source "model_confirmed" and model_prediction containing the original predicted label and confidence
+
+#### Scenario: Correct model prediction
+- **WHEN** a user changes the label of a model prediction before saving
+- **THEN** the span annotation is created with source "model_corrected" and model_prediction containing the original predicted label and confidence
+
+### Requirement: Negative annotation for dismissed predictions
+The system SHALL support saving negative annotations when a user dismisses a model prediction as "not a pattern". A negative annotation SHALL have label "O", source "human_correction", and a `model_prediction` field recording what the model originally predicted.
+
+#### Scenario: Save negative annotation
+- **WHEN** user dismisses a "bull_flag" prediction with confidence 0.72
+- **THEN** the system creates a span annotation with label "O", source "human_correction", and model_prediction `{ "label": "bull_flag", "confidence": 0.72 }`
--- a/openspec/changes/candle-backend/tasks.md
+++ b/openspec/changes/candle-backend/tasks.md
@ -0,0 +1,116 @@
+## 1. Project Scaffolding & Infrastructure
+
+- [x] 1.1 Create `services/ml/` directory structure: `config/`, `features/`, `features/custom/`, `training/`, `training/models/`, `inference/`, `data/raw/`, `data/enriched/`, `data/labeled/`, `data/annotations/`
+- [x] 1.2 Create `services/ml/pyproject.toml` (or `requirements.txt`) with dependencies: fastapi, uvicorn, scikit-learn, xgboost, pandas, numpy, joblib, mlflow, pyyaml, ta-lib, dvc, sqlalchemy, psycopg2-binary, pydantic
+- [x] 1.3 Create `services/ml/Dockerfile` with Python 3.11, TA-Lib C library installation (`libta-lib-dev`), and pip install of dependencies
+- [x] 1.4 Create `config/pipeline.yaml` with the full pipeline configuration (all stages, default hyperparameters, MLflow/DVC settings)
+- [x] 1.5 Add PostgreSQL, ml-service, and mlflow containers to `docker-compose.yml` with shared data volume
+- [ ] 1.6 Initialize DVC in `services/ml/` with local remote storage backend
+- [ ] 1.7 Create PostgreSQL database schema: `training_runs` table (run_id, model_type, experiment_name, pipeline_config_hash, dataset_version, metrics_summary JSON, status, created_at, completed_at)
+- [ ] 1.8 Create `services/ml/app/db.py` — SQLAlchemy engine and session setup for PostgreSQL connection
+
+## 2. Pipeline Config & Entry Point
+
+- [ ] 2.1 Create `services/ml/app/config.py` — Pydantic model for pipeline YAML config with validation (stages, data paths, hyperparameters)
+- [ ] 2.2 Create `services/ml/pipeline.py` — main orchestrator that reads config and runs enabled stages in sequence
+- [ ] 2.3 Add CLI argument parsing: `--config`, `--stage` (run individual stage), support for `python pipeline.py --config config/pipeline.yaml`
+
+## 3. Feature Engineering Stage
+
+- [ ] 3.1 Create `services/ml/features/talib_features.py` — compute TA-Lib indicators from config list, append columns with `{indicator}_{param}` naming, fail with clear error if TA-Lib not installed
+- [ ] 3.2 Create `services/ml/features/candle_features.py` — compute body_size, body_direction, upper_wick, lower_wick, wick_ratio, body_to_range, gap, range with division-by-zero handling
+- [ ] 3.3 Create `services/ml/features/custom_loader.py` — dynamic import of custom feature functions from config paths, call with DataFrame, append result as column
+- [ ] 3.4 Implement NaN warmup row handling — drop rows with NaN in indicator columns, log count of dropped rows
+- [ ] 3.5 Wire feature engineering into `pipeline.py` — read raw OHLCV CSV, run enabled feature steps, write enriched CSV to `data.enriched_path`
+
+## 4. Annotation Ingestion Stage
+
+- [ ] 4.1 Create `services/ml/app/annotation_ingestion.py` — load annotations JSON from `data.annotations_path`, filter by min_confidence
+- [ ] 4.2 Implement windowed classification encoding — extract fixed-size windows centered on each annotation span, flatten into single rows, handle boundary padding
+- [ ] 4.3 Implement BIO sequence labeling encoding — assign B-{label}/I-{label}/O tags per candle, handle overlapping annotations with multiple tag columns
+- [ ] 4.4 Implement TA-Lib CDL* programmatic labeling — run configured CDL functions, convert +100/-100 to label names (bullish_/bearish_ prefix)
+- [ ] 4.5 Implement human/programmatic label merge strategies — human_priority, programmatic_priority, both (separate columns)
+- [ ] 4.6 Implement context padding — include N candles before/after each annotation span
+- [ ] 4.7 Add dataset statistics logging — counts per label, class distribution %, avg span length, human/programmatic agreement rate
+- [ ] 4.8 Wire annotation ingestion into `pipeline.py` — read enriched CSV + annotations JSON, run encoding, write labeled CSV to `data.labeled_path`
+
+## 5. Training Stage
+
+- [ ] 5.1 Create `services/ml/training/train.py` — main training entry point: load labeled CSV, split, train, evaluate, log to MLflow
+- [ ] 5.2 Implement temporal train/validation/test splitting with configurable ratios, warn on random split
+- [ ] 5.3 Create `services/ml/training/models/random_forest.py` — RandomForestClassifier wrapper with class_weights support
+- [ ] 5.4 Create `services/ml/training/models/xgboost_model.py` — XGBClassifier wrapper with class_weights support
+- [ ] 5.5 Implement model dispatch — select model class based on `model_type` config, fail with supported types list for unknown types
+- [ ] 5.6 Implement MLflow experiment tracking — create run, log config artifact, dataset params, per-class sample counts, all hyperparameters
+- [ ] 5.7 Implement metrics logging — accuracy, f1_macro, f1_weighted, per-class precision/recall/F1
+- [ ] 5.8 Create `services/ml/training/evaluation.py` — generate confusion matrix plot, feature importance plot, classification report text
+- [ ] 5.9 Implement MLflow artifact logging — log confusion_matrix.png, feature_importance.png, classification_report.txt, pipeline_config.yaml
+- [ ] 5.10 Implement MLflow model registration — log model with sklearn/xgboost flavor, register in registry if configured
+- [ ] 5.11 Store training run metadata in PostgreSQL `training_runs` table
+- [ ] 5.12 Wire training into `pipeline.py`
+
+## 6. Inference Service (FastAPI)
+
+- [ ] 6.1 Create `services/ml/app/main.py` — FastAPI app with CORS, startup event to load model
+- [ ] 6.2 Implement model loading — from MLflow registry (by name + stage) or from local .pkl file via joblib
+- [ ] 6.3 Implement preprocessing parity — load pipeline config from MLflow artifact, apply same feature engineering as training
+- [ ] 6.4 Create `POST /predict` endpoint — accept candles array, run preprocessing, predict, return per-candle labels + confidence + spans + model_info
+- [ ] 6.5 Implement prediction span grouping — group consecutive same-label non-"O" predictions into spans with avg_confidence
+- [ ] 6.6 Create `POST /predict/batch` endpoint — accept pair/timeframe/date range, load data, process in batch_size chunks, return predictions
+- [ ] 6.7 Create `GET /model/info` endpoint — return model metadata, per-class metrics from MLflow
+- [ ] 6.8 Create `GET /model/labels` endpoint — return label names and colors
+- [ ] 6.9 Create `GET /health` endpoint — check model loaded status, MLflow connection, PostgreSQL connection
+- [ ] 6.10 Add Pydantic request/response models for all endpoints (PredictRequest, PredictResponse, BatchPredictRequest, ModelInfoResponse)
+
+## 7. Next.js API Proxy Routes
+
+- [ ] 7.1 Create `src/app/api/predict/route.ts` — POST proxy to `${INFERENCE_API_URL}/predict` with timeout handling
+- [ ] 7.2 Create `src/app/api/predict/batch/route.ts` — POST proxy to `${INFERENCE_API_URL}/predict/batch` with INFERENCE_BATCH_TIMEOUT
+- [ ] 7.3 Create `src/app/api/model/info/route.ts` — GET proxy to `${INFERENCE_API_URL}/model/info`
+- [ ] 7.4 Add environment variables to `.env.local`: INFERENCE_API_URL, INFERENCE_API_TIMEOUT, INFERENCE_BATCH_TIMEOUT, NEXT_PUBLIC_PREDICTIONS_ENABLED
+
+## 8. Span Annotation Export & Feedback
+
+- [ ] 8.1 Create `src/app/api/span-annotations/export/route.ts` — GET endpoint exporting span annotations as JSON in ML pipeline format
+- [ ] 8.2 Add `source` and `model_prediction` fields to span annotation schema (Drizzle migration) — source defaults to "human", model_prediction is nullable JSON
+- [ ] 8.3 Update span annotation POST endpoint to accept optional `source` and `model_prediction` fields
+- [ ] 8.4 Support negative annotations — span with label "O", source "human_correction", and model_prediction metadata
+
+## 9. Prediction UI — State & Controls
+
+- [ ] 9.1 Create `src/types/predictions.ts` — PredictionSpan, PredictionState, ModelInfoResponse interfaces
+- [ ] 9.2 Create prediction state management in page.tsx (or dedicated context) — spans, isLoading, error, modelInfo, visible, confidenceThreshold, selectedLabels, autoPredict
+- [ ] 9.3 Create `src/components/PredictionPanel.tsx` — controls panel with master toggle, model info display, action buttons, confidence slider, label checkboxes with metrics
+- [ ] 9.4 Implement on-demand prediction fetching — "Run on Visible" sends visible candles to /api/predict, "Predict All" sends batch request
+- [ ] 9.5 Implement prediction caching — Map keyed by pair_timeframe_range_modelVersion, invalidate on model version change
+
+## 10. Prediction UI — Chart Rendering
+
+- [ ] 10.1 Add histogram series to CandleChart for prediction rendering — per-bar colors from label config at 10-20% opacity
+- [ ] 10.2 Add series markers for prediction span labels — show `{label} ({confidence}%)` below bars at span start
+- [ ] 10.3 Implement confidence threshold filtering — only render predictions above threshold
+- [ ] 10.4 Implement label type filtering — toggle visibility per label from PredictionPanel checkboxes
+- [ ] 10.5 Implement prediction layer visibility toggle — show/hide histogram series and markers
+
+## 11. Prediction UI — Disagreements & Feedback
+
+- [ ] 11.1 Implement disagreement detection — compare human spans vs prediction spans with >50% overlap, classify as missed_by_model, missed_by_human, label_mismatch
+- [ ] 11.2 Render disagreement highlights — red dashed border (missed_by_model), yellow highlight (missed_by_human), orange border (label_mismatch)
+- [ ] 11.3 Add "Show only disagreements" filter toggle in PredictionPanel
+- [ ] 11.4 Implement prediction-to-annotation feedback — click missed_by_human prediction opens span annotation dialog pre-filled with predicted label/times
+- [ ] 11.5 Add "Not a pattern" dismiss action — saves negative annotation with label "O" and model_prediction metadata
+- [ ] 11.6 Display prediction summary in PredictionPanel — prediction count, agreement count, disagreement count
+
+## 12. Inference API Connection & Error Handling
+
+- [ ] 12.1 Implement inference API health polling — poll /api/model/info every 30 seconds when API unavailable, auto-reconnect
+- [ ] 12.2 Show "Model server offline" banner when inference API unavailable, disable prediction controls
+- [ ] 12.3 Ensure annotation tools work independently — prediction API errors never block human annotation
+- [ ] 12.4 Add loading states for prediction fetching — skeleton/shimmer overlay during prediction requests
+
+## 13. Documentation & Deployment
+
+- [ ] 13.1 Update docker-compose.yml with all service environment variables and health checks
+- [ ] 13.2 Update DEPLOYMENT.md with Python service setup instructions, TA-Lib installation, MLflow server, PostgreSQL, DVC init
+- [ ] 13.3 Update README.md with ML pipeline overview, architecture diagram, and usage instructions
+- [ ] 13.4 Update CLAUDE_DESCRIPTION.md with new ML service capabilities and file structure
--- a/services/ml/Dockerfile
+++ b/services/ml/Dockerfile
@ -0,0 +1,50 @
+FROM python:3.11-slim
+
+# TA-Lib C library release to build from source. The Debian repositories used
+# by python:3.11-slim do not provide a libta-lib-dev package to install instead.
+ARG TA_LIB_VERSION=0.6.4
+
+# Install system dependencies:
+#   build-essential  - compiler toolchain for the TA-Lib C library and wrapper
+#   ca-certificates  - lets wget verify the HTTPS download below
+#   curl             - required by the docker-compose healthcheck (curl -f)
+#   libpq-dev        - PostgreSQL client development files
+#   wget             - fetches the TA-Lib source tarball
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    curl \
+    libpq-dev \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Build and install the TA-Lib C library under /usr so the Python wrapper
+# finds its headers and shared library without extra include/library paths.
+RUN wget -q "https://github.com/ta-lib/ta-lib/releases/download/v${TA_LIB_VERSION}/ta-lib-${TA_LIB_VERSION}-src.tar.gz" \
+        -O /tmp/ta-lib.tar.gz \
+    && tar -xzf /tmp/ta-lib.tar.gz -C /tmp \
+    && cd "/tmp/ta-lib-${TA_LIB_VERSION}" \
+    && ./configure --prefix=/usr \
+    && make \
+    && make install \
+    && rm -rf /tmp/ta-lib.tar.gz "/tmp/ta-lib-${TA_LIB_VERSION}"
+
+# Set working directory
+WORKDIR /app
+
+# Copy only the dependency manifest first so the dependency layer stays cached
+# until pyproject.toml changes.
+COPY pyproject.toml ./
+
+# Install Python dependencies
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir .
+
+# Copy application code
+COPY . .
+
+# Expose port for FastAPI
+EXPOSE 8001
+
+# Run the inference server by default
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8001"]
--- a/services/ml/config/pipeline.yaml
+++ b/services/ml/config/pipeline.yaml
@ -0,0 +1,152 @@
+# Pipeline configuration for the candlestick-pattern ML service.
+# Covers feature engineering, annotation ingestion, training, and inference.
+
+data:
+  raw_path: "data/raw/OHLCV.csv"
+  enriched_path: "data/enriched/features.csv"
+  labeled_path: "data/labeled/dataset.csv"
+  annotations_path: "data/annotations/export.json"
+
+stages:
+  feature_engineering:
+    enabled: true
+
+    # TA-Lib indicators to compute (function name plus keyword params)
+    talib_indicators:
+      - name: "RSI"
+        params:
+          timeperiod: 14
+      - name: "EMA"
+        params:
+          timeperiod: 20
+      - name: "EMA"
+        params:
+          timeperiod: 50
+      - name: "MACD"
+        params:
+          fastperiod: 12
+          slowperiod: 26
+          signalperiod: 9
+      - name: "BBANDS"
+        params:
+          timeperiod: 20
+          nbdevup: 2
+          nbdevdn: 2
+      - name: "ATR"
+        params:
+          timeperiod: 14
+      - name: "ADX"
+        params:
+          timeperiod: 14
+      - name: "CCI"
+        params:
+          timeperiod: 14
+      - name: "MFI"
+        params:
+          timeperiod: 14
+      - name: "STOCH"
+        params:
+          fastk_period: 14
+          slowk_period: 3
+          slowd_period: 3
+
+    # Enable candle-derived features
+    candle_features: true
+
+    # Module paths of custom feature functions (none by default)
+    custom_features: []
+
+  annotation_ingestion:
+    enabled: true
+
+    # How labels are encoded: "window" or "bio"
+    label_encoding: "window"
+
+    # Window size for windowed classification
+    window_size: 30
+
+    # Context padding (candles included before/after)
+    context_padding: 20
+
+    # Minimum confidence for human annotations
+    min_confidence: 1
+
+    # Programmatic labels; entries must be exact TA-Lib CDL* function names
+    programmatic_labels:
+      enabled: true
+      talib_patterns:
+        - "CDLENGULFING"
+        - "CDLHAMMER"
+        - "CDLINVERTEDHAMMER"
+        - "CDLSHOOTINGSTAR"
+        - "CDLDOJI"
+        - "CDLDOJISTAR"
+        - "CDLMORNINGSTAR"
+        - "CDLEVENINGSTAR"
+        - "CDLHARAMI"
+        - "CDLPIERCING"
+        - "CDLDARKCLOUDCOVER"
+        - "CDL3WHITESOLDIERS"
+        - "CDL3BLACKCROWS"
+
+    # Label merge strategy: "human_priority", "programmatic_priority", "both"
+    merge_strategy: "human_priority"
+
+  training:
+    enabled: true
+
+    # Supported model types: "random_forest", "xgboost"
+    model_type: "random_forest"
+
+    # Split strategy and hold-out fractions
+    split_method: "temporal"  # "temporal" or "random"
+    test_split: 0.2
+    validation_split: 0.1
+
+    # Class balancing
+    class_weights: "balanced"  # "balanced" or null
+
+    # Model-specific hyperparameters
+    hyperparameters:
+      # RandomForest
+      n_estimators: 200
+      max_depth: 15
+      min_samples_split: 5
+      min_samples_leaf: 2
+      random_state: 42
+      n_jobs: -1
+
+      # XGBoost (when model_type is "xgboost")
+      # n_estimators: 500
+      # max_depth: 6
+      # learning_rate: 0.01
+      # subsample: 0.8
+      # colsample_bytree: 0.8
+      # random_state: 42
+      # n_jobs: -1
+
+    # MLflow settings
+    mlflow:
+      tracking_uri: "http://mlflow:5000"
+      experiment_name: "candlestick_patterns"
+      log_artifacts: true
+      register_model: false  # Set to true to register in model registry
+
+  inference:
+    enabled: true
+
+    # Where the model is loaded from: "mlflow" or "local"
+    model_source: "local"
+
+    # Used when model_source is "mlflow"
+    mlflow_model_name: "candlestick_pattern_v1"
+    mlflow_model_stage: "Production"  # "Production", "Staging", "None"
+
+    # Used when model_source is "local"
+    local_model_path: "models/best_model.pkl"
+
+    # Batch processing
+    batch_size: 1000
+
+    # Preprocessing config loaded from MLflow artifact or use current config
+    use_training_config: true
--- a/services/ml/pyproject.toml
+++ b/services/ml/pyproject.toml
@ -0,0 +1,37 @
+[project]
+name = "candle-ml"
+version = "0.1.0"
+description = "ML service for candlestick pattern recognition"
+requires-python = ">=3.11"
+dependencies = [
+    "fastapi>=0.109.0",
+    "uvicorn[standard]>=0.27.0",
+    "scikit-learn>=1.4.0",
+    "xgboost>=2.0.3",
+    "pandas>=2.2.0",
+    "numpy>=1.26.0",
+    "joblib>=1.3.2",
+    # Keep the client on the same major version as the tracking server image
+    # (ghcr.io/mlflow/mlflow:v2.10.0 in docker-compose.yml).
+    "mlflow>=2.10.0,<3",
+    "pyyaml>=6.0.1",
+    "TA-Lib>=0.4.28",
+    "dvc>=3.40.0",
+    "sqlalchemy>=2.0.25",
+    "psycopg2-binary>=2.9.9",
+    "pydantic>=2.5.0",
+    "pydantic-settings>=2.1.0",
+    "matplotlib>=3.8.2",
+    "seaborn>=0.13.1",
+]
+
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools]
+# This project is installed only to pull in its dependencies; the service code
+# (app/, features/, training/, ...) is copied into the image and run in place.
+# An empty module list disables setuptools' automatic package discovery, which
+# otherwise aborts on a flat layout with several top-level directories.
+py-modules = []