feat: add Python migration script and successfully test SQLite to PostgreSQL data migration

- Created scripts/migrate-sqlite-to-postgres.py as alternative to TypeScript version - Handles all type conversions: timestamps, booleans, and JSONB fields - Successfully migrated all 2,836 rows from SQLite to PostgreSQL - Verified data integrity: all 6 tables migrated correctly - Charts: 1, Candles: 2,592, Annotations: 4, Span annotations: 223
2026-02-17 14:01:21 +01:00 · 2026-02-17 14:01:21 +01:00 · bfe437857b
commit bfe437857b
parent 5f70f13da3
9 changed files with 1080 additions and 20 deletions
--- a/services/ml/app/annotation_ingestion.py
+++ b/services/ml/app/annotation_ingestion.py
@ -16,6 +16,7 @@ import pandas as pd
 import numpy as np

 from app.config import AnnotationIngestionConfig
+from app.data_access import DataAccess

 logger = logging.getLogger(__name__)

@ -95,6 +96,61 @@ class AnnotationIngestion:
        
        return annotations
    
+    def load_annotations_from_db(
+        self, 
+        chart_name: str,
+        source: str = "human"
+    ) -> List[Dict[str, Any]]:
+        """
+        Load annotations directly from PostgreSQL database.
+        
+        This method replaces JSON file exports by querying the database directly.
+        
+        Args:
+            chart_name: Name of the chart to load annotations for
+            source: Filter by annotation source ('human', 'model', 'hybrid')
+            
+        Returns:
+            List of annotation dictionaries compatible with existing processing
+        """
+        logger.info(f"Loading annotations from database for chart: {chart_name}")
+        
+        data_access = DataAccess()
+        
+        # Get span annotations from database
+        chart = data_access.get_chart_by_name(chart_name)
+        if not chart:
+            raise ValueError(f"Chart not found: {chart_name}")
+        
+        annotations_df = data_access.get_span_annotations(
+            chart_id=chart['id'],
+            source=source,
+            min_confidence=self.config.min_confidence if self.config.min_confidence > 1 else None
+        )
+        
+        if annotations_df.empty:
+            logger.warning(f"No annotations found for chart: {chart_name}")
+            return []
+        
+        # Convert DataFrame to list of dictionaries compatible with existing code
+        annotations = []
+        for _, row in annotations_df.iterrows():
+            ann = {
+                'id': row['id'],
+                'label': row['label'],
+                'start_time': row['start_time'].isoformat() if pd.notna(row['start_time']) else None,
+                'end_time': row['end_time'].isoformat() if pd.notna(row['end_time']) else None,
+                'confidence': row.get('confidence'),
+                'outcome': row.get('outcome'),
+                'notes': row.get('notes'),
+                'source': row['source'],
+            }
+            annotations.append(ann)
+        
+        logger.info(f"Loaded {len(annotations)} annotations from database")
+        
+        return annotations
+    
    def get_programmatic_labels(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Generate programmatic labels using TA-Lib CDL* pattern functions.
@ -484,6 +540,55 @@ class AnnotationIngestion:
        logger.info("Annotation ingestion complete")
        
        return result_df
+    
+    def process_from_db(
+        self,
+        enriched_df: pd.DataFrame,
+        chart_name: str,
+        source: str = "human"
+    ) -> pd.DataFrame:
+        """
+        Main processing pipeline using direct database access.
+        
+        This method replaces JSON file exports by querying PostgreSQL directly.
+        
+        Args:
+            enriched_df: DataFrame with engineered features
+            chart_name: Name of the chart to load annotations for
+            source: Filter by annotation source ('human', 'model', 'hybrid')
+            
+        Returns:
+            Labeled DataFrame ready for training
+        """
+        logger.info(f"Starting annotation ingestion from database for chart: {chart_name}")
+        
+        # Load annotations from database
+        annotations = self.load_annotations_from_db(chart_name, source)
+        
+        if not annotations:
+            logger.warning("No annotations found, returning empty DataFrame")
+            return pd.DataFrame()
+        
+        # Add programmatic labels if enabled
+        df = self.get_programmatic_labels(enriched_df)
+        
+        # Apply label encoding
+        if self.config.label_encoding == "window":
+            result_df = self.create_windowed_dataset(df, annotations)
+        elif self.config.label_encoding == "bio":
+            result_df = self.create_bio_dataset(df, annotations)
+            # For BIO, also merge human/programmatic if enabled
+            if self.config.programmatic_labels.enabled:
+                result_df = self.merge_labels(result_df, annotations)
+        else:
+            raise ValueError(f"Unknown label encoding: {self.config.label_encoding}")
+        
+        # Log statistics
+        self.log_statistics(result_df, annotations)
+        
+        logger.info("Annotation ingestion complete")
+        
+        return result_df


 def run_annotation_ingestion(
--- a/services/ml/app/data_access.py
+++ b/services/ml/app/data_access.py
@ -0,0 +1,249 @@
+"""
+Direct database access for reading candle and annotation data.
+
+This module provides read-only access to frontend tables (candles, annotations, 
+span_annotations, charts) using SQLAlchemy table reflections or raw queries.
+"""
+
+import logging
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+
+import pandas as pd
+from sqlalchemy import Table, MetaData, select, and_
+from sqlalchemy.orm import Session
+
+from app.db import engine, get_db
+
+logger = logging.getLogger(__name__)
+
+
+class DataAccess:
+    """
+    Provides read-only access to frontend database tables.
+    
+    Uses SQLAlchemy table reflection to access tables managed by Drizzle ORM
+    without creating duplicate model definitions.
+    """
+    
+    def __init__(self):
+        """Initialize data access with table reflections."""
+        self.metadata = MetaData()
+        
+        # Reflect frontend tables
+        try:
+            self.charts = Table('charts', self.metadata, autoload_with=engine)
+            self.candles = Table('candles', self.metadata, autoload_with=engine)
+            self.annotations = Table('annotations', self.metadata, autoload_with=engine)
+            self.span_annotations = Table('span_annotations', self.metadata, autoload_with=engine)
+            self.span_label_types = Table('span_label_types', self.metadata, autoload_with=engine)
+            logger.info("Successfully reflected frontend tables")
+        except Exception as e:
+            logger.error(f"Error reflecting frontend tables: {e}")
+            raise
+    
+    def get_chart_by_name(self, chart_name: str) -> Optional[Dict[str, Any]]:
+        """
+        Get chart by name.
+        
+        Args:
+            chart_name: Name of the chart
+            
+        Returns:
+            Chart dictionary or None if not found
+        """
+        with get_db() as db:
+            stmt = select(self.charts).where(self.charts.c.name == chart_name)
+            result = db.execute(stmt).fetchone()
+            
+            if result:
+                return dict(result._mapping)
+            return None
+    
+    def get_candles(
+        self, 
+        chart_id: int,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None
+    ) -> pd.DataFrame:
+        """
+        Get candle data for a chart.
+        
+        Args:
+            chart_id: Chart ID
+            start_time: Optional start time filter
+            end_time: Optional end time filter
+            
+        Returns:
+            DataFrame with columns: id, chart_id, time, open, high, low, close
+        """
+        with get_db() as db:
+            # Build query
+            stmt = select(self.candles).where(self.candles.c.chart_id == chart_id)
+            
+            if start_time:
+                stmt = stmt.where(self.candles.c.time >= start_time)
+            if end_time:
+                stmt = stmt.where(self.candles.c.time <= end_time)
+            
+            stmt = stmt.order_by(self.candles.c.time)
+            
+            result = db.execute(stmt).fetchall()
+            
+            if not result:
+                logger.warning(f"No candles found for chart_id={chart_id}")
+                return pd.DataFrame()
+            
+            # Convert to DataFrame
+            df = pd.DataFrame([dict(row._mapping) for row in result])
+            logger.info(f"Loaded {len(df)} candles for chart_id={chart_id}")
+            
+            return df
+    
+    def get_span_annotations(
+        self,
+        chart_id: int,
+        source: Optional[str] = None,
+        min_confidence: Optional[int] = None
+    ) -> pd.DataFrame:
+        """
+        Get span annotations for a chart.
+        
+        Args:
+            chart_id: Chart ID
+            source: Optional filter by source ('human', 'model', 'hybrid')
+            min_confidence: Optional minimum confidence filter
+            
+        Returns:
+            DataFrame with span annotations
+        """
+        with get_db() as db:
+            # Build query
+            stmt = select(self.span_annotations).where(
+                self.span_annotations.c.chart_id == chart_id
+            )
+            
+            if source:
+                stmt = stmt.where(self.span_annotations.c.source == source)
+            
+            if min_confidence is not None:
+                stmt = stmt.where(
+                    and_(
+                        self.span_annotations.c.confidence.isnot(None),
+                        self.span_annotations.c.confidence >= min_confidence
+                    )
+                )
+            
+            stmt = stmt.order_by(self.span_annotations.c.start_time)
+            
+            result = db.execute(stmt).fetchall()
+            
+            if not result:
+                logger.warning(f"No span annotations found for chart_id={chart_id}")
+                return pd.DataFrame()
+            
+            # Convert to DataFrame
+            df = pd.DataFrame([dict(row._mapping) for row in result])
+            logger.info(f"Loaded {len(df)} span annotations for chart_id={chart_id}")
+            
+            return df
+    
+    def get_point_annotations(
+        self,
+        chart_id: int,
+        label_type: Optional[str] = None
+    ) -> pd.DataFrame:
+        """
+        Get point annotations for a chart.
+        
+        Args:
+            chart_id: Chart ID
+            label_type: Optional filter by label type
+            
+        Returns:
+            DataFrame with point annotations
+        """
+        with get_db() as db:
+            # Build query
+            stmt = select(self.annotations).where(
+                self.annotations.c.chart_id == chart_id
+            )
+            
+            if label_type:
+                stmt = stmt.where(self.annotations.c.label_type == label_type)
+            
+            stmt = stmt.order_by(self.annotations.c.timestamp)
+            
+            result = db.execute(stmt).fetchall()
+            
+            if not result:
+                logger.warning(f"No point annotations found for chart_id={chart_id}")
+                return pd.DataFrame()
+            
+            # Convert to DataFrame
+            df = pd.DataFrame([dict(row._mapping) for row in result])
+            logger.info(f"Loaded {len(df)} point annotations for chart_id={chart_id}")
+            
+            return df
+    
+    def get_training_data(
+        self,
+        chart_name: str,
+        start_time: Optional[datetime] = None,
+        end_time: Optional[datetime] = None,
+        annotation_source: str = "human",
+        min_confidence: Optional[int] = None
+    ) -> tuple[pd.DataFrame, pd.DataFrame]:
+        """
+        Get complete training data for a chart (candles + span annotations).
+        
+        This is a convenience method that combines candle and annotation queries.
+        
+        Args:
+            chart_name: Name of the chart
+            start_time: Optional start time filter
+            end_time: Optional end time filter
+            annotation_source: Filter annotations by source (default: 'human')
+            min_confidence: Optional minimum confidence filter
+            
+        Returns:
+            Tuple of (candles_df, annotations_df)
+        """
+        # Get chart
+        chart = self.get_chart_by_name(chart_name)
+        if not chart:
+            raise ValueError(f"Chart not found: {chart_name}")
+        
+        chart_id = chart['id']
+        logger.info(f"Loading training data for chart: {chart_name} (id={chart_id})")
+        
+        # Get candles
+        candles_df = self.get_candles(chart_id, start_time, end_time)
+        
+        # Get annotations
+        annotations_df = self.get_span_annotations(
+            chart_id, 
+            source=annotation_source,
+            min_confidence=min_confidence
+        )
+        
+        return candles_df, annotations_df
+    
+    def get_all_charts(self) -> pd.DataFrame:
+        """
+        Get all available charts.
+        
+        Returns:
+            DataFrame with all charts
+        """
+        with get_db() as db:
+            stmt = select(self.charts).order_by(self.charts.c.created_at)
+            result = db.execute(stmt).fetchall()
+            
+            if not result:
+                return pd.DataFrame()
+            
+            df = pd.DataFrame([dict(row._mapping) for row in result])
+            logger.info(f"Found {len(df)} charts")
+            
+            return df