feat: add Python migration script and successfully test SQLite to PostgreSQL data migration

- Created scripts/migrate-sqlite-to-postgres.py as an alternative to the TypeScript version
- Handles all type conversions: timestamps, booleans, and JSONB fields
- Successfully migrated all 2,836 rows from SQLite to PostgreSQL
- Verified data integrity: all 6 tables migrated correctly
- Charts: 1, Candles: 2,592, Annotations: 4, Span annotations: 223
2026-02-17 14:01:21 +01:00 · 2026-02-17 14:01:21 +01:00 · bfe437857b
commit bfe437857b
parent 5f70f13da3
9 changed files with 1080 additions and 20 deletions
--- a/services/ml/app/annotation_ingestion.py
+++ b/services/ml/app/annotation_ingestion.py
@ -16,6 +16,7 @@ import pandas as pd
 import numpy as np

 from app.config import AnnotationIngestionConfig
+from app.data_access import DataAccess

 logger = logging.getLogger(__name__)

@ -95,6 +96,61 @@ class AnnotationIngestion:
        
        return annotations
    
+    def load_annotations_from_db(
+        self, 
+        chart_name: str,
+        source: str = "human"
+    ) -> List[Dict[str, Any]]:
+        """
+        Load annotations directly from PostgreSQL database.
+        
+        This method replaces JSON file exports by querying the database directly.
+        
+        Args:
+            chart_name: Name of the chart to load annotations for
+            source: Filter by annotation source ('human', 'model', 'hybrid')
+            
+        Returns:
+            List of annotation dictionaries compatible with existing processing
+        """
+        logger.info(f"Loading annotations from database for chart: {chart_name}")
+        
+        data_access = DataAccess()
+        
+        # Get span annotations from database
+        chart = data_access.get_chart_by_name(chart_name)
+        if not chart:
+            raise ValueError(f"Chart not found: {chart_name}")
+        
+        annotations_df = data_access.get_span_annotations(
+            chart_id=chart['id'],
+            source=source,
+            min_confidence=self.config.min_confidence if self.config.min_confidence > 1 else None
+        )
+        
+        if annotations_df.empty:
+            logger.warning(f"No annotations found for chart: {chart_name}")
+            return []
+        
+        # Convert DataFrame to list of dictionaries compatible with existing code
+        annotations = []
+        for _, row in annotations_df.iterrows():
+            ann = {
+                'id': row['id'],
+                'label': row['label'],
+                'start_time': row['start_time'].isoformat() if pd.notna(row['start_time']) else None,
+                'end_time': row['end_time'].isoformat() if pd.notna(row['end_time']) else None,
+                'confidence': row.get('confidence'),
+                'outcome': row.get('outcome'),
+                'notes': row.get('notes'),
+                'source': row['source'],
+            }
+            annotations.append(ann)
+        
+        logger.info(f"Loaded {len(annotations)} annotations from database")
+        
+        return annotations
+    
    def get_programmatic_labels(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Generate programmatic labels using TA-Lib CDL* pattern functions.
@ -484,6 +540,55 @@ class AnnotationIngestion:
        logger.info("Annotation ingestion complete")
        
        return result_df
+    
+    def process_from_db(
+        self,
+        enriched_df: pd.DataFrame,
+        chart_name: str,
+        source: str = "human"
+    ) -> pd.DataFrame:
+        """
+        Main processing pipeline using direct database access.
+        
+        This method replaces JSON file exports by querying PostgreSQL directly.
+        
+        Args:
+            enriched_df: DataFrame with engineered features
+            chart_name: Name of the chart to load annotations for
+            source: Filter by annotation source ('human', 'model', 'hybrid')
+            
+        Returns:
+            Labeled DataFrame ready for training
+        """
+        logger.info(f"Starting annotation ingestion from database for chart: {chart_name}")
+        
+        # Load annotations from database
+        annotations = self.load_annotations_from_db(chart_name, source)
+        
+        if not annotations:
+            logger.warning("No annotations found, returning empty DataFrame")
+            return pd.DataFrame()
+        
+        # Add programmatic labels if enabled
+        df = self.get_programmatic_labels(enriched_df)
+        
+        # Apply label encoding
+        if self.config.label_encoding == "window":
+            result_df = self.create_windowed_dataset(df, annotations)
+        elif self.config.label_encoding == "bio":
+            result_df = self.create_bio_dataset(df, annotations)
+            # For BIO, also merge human/programmatic if enabled
+            if self.config.programmatic_labels.enabled:
+                result_df = self.merge_labels(result_df, annotations)
+        else:
+            raise ValueError(f"Unknown label encoding: {self.config.label_encoding}")
+        
+        # Log statistics
+        self.log_statistics(result_df, annotations)
+        
+        logger.info("Annotation ingestion complete")
+        
+        return result_df


 def run_annotation_ingestion(