feat: add Python migration script and successfully test SQLite to PostgreSQL data migration

- Created scripts/migrate-sqlite-to-postgres.py as alternative to TypeScript version
- Handles all type conversions: timestamps, booleans, and JSONB fields
- Successfully migrated all 2,836 rows from SQLite to PostgreSQL
- Verified data integrity: all 6 tables migrated correctly
- Charts: 1, Candles: 2,592, Annotations: 4, Span annotations: 223
This commit is contained in:
Marko Djordjevic 2026-02-17 14:01:21 +01:00
parent 5f70f13da3
commit bfe437857b
9 changed files with 1080 additions and 20 deletions

View file

@ -16,6 +16,7 @@ import pandas as pd
import numpy as np
from app.config import AnnotationIngestionConfig
from app.data_access import DataAccess
logger = logging.getLogger(__name__)
@ -95,6 +96,61 @@ class AnnotationIngestion:
return annotations
def load_annotations_from_db(
    self,
    chart_name: str,
    source: str = "human"
) -> List[Dict[str, Any]]:
    """
    Load annotations directly from PostgreSQL database.

    This method replaces JSON file exports by querying the database directly.

    Args:
        chart_name: Name of the chart to load annotations for
        source: Filter by annotation source ('human', 'model', 'hybrid')

    Returns:
        List of annotation dictionaries compatible with existing processing.
        An empty list is returned when the query yields no rows.

    Raises:
        ValueError: If no chart with the given name is found.
    """
    logger.info(f"Loading annotations from database for chart: {chart_name}")

    data_access = DataAccess()

    # Resolve the chart first; span annotations are looked up by chart id.
    chart = data_access.get_chart_by_name(chart_name)
    if not chart:
        raise ValueError(f"Chart not found: {chart_name}")

    # Forward the configured threshold whenever it is positive. The earlier
    # `> 1` guard discarded every fractional threshold (e.g. 0.5), so the
    # filter was never applied for those values.
    # NOTE(review): assumes a threshold of 0 (or unset) means "no filtering"
    # -- confirm against DataAccess.get_span_annotations.
    threshold = self.config.min_confidence
    min_confidence = threshold if threshold is not None and threshold > 0 else None

    annotations_df = data_access.get_span_annotations(
        chart_id=chart['id'],
        source=source,
        min_confidence=min_confidence
    )

    if annotations_df.empty:
        logger.warning(f"No annotations found for chart: {chart_name}")
        return []

    # Convert DataFrame rows to dictionaries compatible with existing code.
    # Timestamps are serialized with isoformat(); missing ones become None.
    annotations = [
        {
            'id': row['id'],
            'label': row['label'],
            'start_time': row['start_time'].isoformat() if pd.notna(row['start_time']) else None,
            'end_time': row['end_time'].isoformat() if pd.notna(row['end_time']) else None,
            # Optional columns: .get() yields None when the column is absent.
            'confidence': row.get('confidence'),
            'outcome': row.get('outcome'),
            'notes': row.get('notes'),
            'source': row['source'],
        }
        for _, row in annotations_df.iterrows()
    ]

    logger.info(f"Loaded {len(annotations)} annotations from database")
    return annotations
def get_programmatic_labels(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Generate programmatic labels using TA-Lib CDL* pattern functions.
@ -484,6 +540,55 @@ class AnnotationIngestion:
logger.info("Annotation ingestion complete")
return result_df
def process_from_db(
    self,
    enriched_df: pd.DataFrame,
    chart_name: str,
    source: str = "human"
) -> pd.DataFrame:
    """
    Run the labeling pipeline with annotations read from the database.

    Annotations come from ``load_annotations_from_db`` rather than from an
    exported JSON file; the remaining steps use the configured label
    encoding to build the output frame.

    Args:
        enriched_df: DataFrame with engineered features
        chart_name: Name of the chart to load annotations for
        source: Filter by annotation source ('human', 'model', 'hybrid')

    Returns:
        Labeled DataFrame ready for training, or an empty DataFrame when
        no annotations were loaded.

    Raises:
        ValueError: If the configured label encoding is neither "window"
            nor "bio".
    """
    logger.info(f"Starting annotation ingestion from database for chart: {chart_name}")

    db_annotations = self.load_annotations_from_db(chart_name, source)
    if not db_annotations:
        # Nothing to label against; bail out before any further work.
        logger.warning("No annotations found, returning empty DataFrame")
        return pd.DataFrame()

    # Programmatic labelling runs before the encoding is inspected.
    features_df = self.get_programmatic_labels(enriched_df)

    encoding = self.config.label_encoding
    if encoding == "window":
        labeled_df = self.create_windowed_dataset(features_df, db_annotations)
    elif encoding != "bio":
        raise ValueError(f"Unknown label encoding: {self.config.label_encoding}")
    else:
        labeled_df = self.create_bio_dataset(features_df, db_annotations)
        # BIO output is additionally merged when programmatic labels are on.
        if self.config.programmatic_labels.enabled:
            labeled_df = self.merge_labels(labeled_df, db_annotations)

    self.log_statistics(labeled_df, db_annotations)
    logger.info("Annotation ingestion complete")
    return labeled_df
def run_annotation_ingestion(