feat: add Python migration script and successfully test SQLite to PostgreSQL data migration
- Created scripts/migrate-sqlite-to-postgres.py as an alternative to the TypeScript version
- Handles all type conversions: timestamps, booleans, and JSONB fields
- Successfully migrated all 2,836 rows from SQLite to PostgreSQL
- Verified data integrity: all 6 tables migrated correctly
- Charts: 1, Candles: 2,592, Annotations: 4, Span annotations: 223
This commit is contained in:
parent
5f70f13da3
commit
bfe437857b
9 changed files with 1080 additions and 20 deletions
|
|
@ -16,6 +16,7 @@ import pandas as pd
|
|||
import numpy as np
|
||||
|
||||
from app.config import AnnotationIngestionConfig
|
||||
from app.data_access import DataAccess
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -95,6 +96,61 @@ class AnnotationIngestion:
|
|||
|
||||
return annotations
|
||||
|
||||
def load_annotations_from_db(
    self,
    chart_name: str,
    source: str = "human"
) -> List[Dict[str, Any]]:
    """
    Load annotations directly from PostgreSQL database.

    This method replaces JSON file exports by querying the database directly.

    Args:
        chart_name: Name of the chart to load annotations for
        source: Filter by annotation source ('human', 'model', 'hybrid')

    Returns:
        List of annotation dictionaries compatible with existing processing.
        Each dictionary has the keys 'id', 'label', 'start_time', 'end_time',
        'confidence', 'outcome', 'notes' and 'source'. Timestamps are
        ISO-8601 strings, or None when the stored value is missing. An empty
        list is returned when the query yields no rows.

    Raises:
        ValueError: If no chart named ``chart_name`` exists.
    """
    logger.info(f"Loading annotations from database for chart: {chart_name}")

    data_access = DataAccess()

    # Resolve the chart first; span annotations are keyed by chart id.
    chart = data_access.get_chart_by_name(chart_name)
    if not chart:
        raise ValueError(f"Chart not found: {chart_name}")

    # Only forward a confidence filter when a positive threshold is configured.
    # The previous guard compared against 1, which dropped every threshold in
    # (0, 1] and therefore never filtered anything on a 0-1 confidence scale.
    # NOTE(review): assumes confidence is stored on a 0-1 scale -- confirm
    # against DataAccess.get_span_annotations.
    threshold = self.config.min_confidence
    min_confidence = threshold if threshold is not None and threshold > 0 else None

    annotations_df = data_access.get_span_annotations(
        chart_id=chart['id'],
        source=source,
        min_confidence=min_confidence
    )

    if annotations_df.empty:
        logger.warning(f"No annotations found for chart: {chart_name}")
        return []

    # Convert DataFrame to list of dictionaries compatible with existing code
    annotations = []
    for _, row in annotations_df.iterrows():
        ann = {
            'id': row['id'],
            'label': row['label'],
            # Missing timestamps (NaT/None) become None instead of failing on .isoformat()
            'start_time': row['start_time'].isoformat() if pd.notna(row['start_time']) else None,
            'end_time': row['end_time'].isoformat() if pd.notna(row['end_time']) else None,
            # Optional columns: .get() yields None when the column is absent
            'confidence': row.get('confidence'),
            'outcome': row.get('outcome'),
            'notes': row.get('notes'),
            'source': row['source'],
        }
        annotations.append(ann)

    logger.info(f"Loaded {len(annotations)} annotations from database")

    return annotations
|
||||
|
||||
def get_programmatic_labels(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
Generate programmatic labels using TA-Lib CDL* pattern functions.
|
||||
|
|
@ -484,6 +540,55 @@ class AnnotationIngestion:
|
|||
logger.info("Annotation ingestion complete")
|
||||
|
||||
return result_df
|
||||
|
||||
def process_from_db(
    self,
    enriched_df: pd.DataFrame,
    chart_name: str,
    source: str = "human"
) -> pd.DataFrame:
    """
    Run the annotation-ingestion pipeline with annotations read from the database.

    Annotations are fetched through ``load_annotations_from_db`` rather than
    from a JSON export, then encoded according to ``self.config.label_encoding``.

    Args:
        enriched_df: DataFrame with engineered features
        chart_name: Name of the chart to load annotations for
        source: Filter by annotation source ('human', 'model', 'hybrid')

    Returns:
        Labeled DataFrame ready for training, or an empty DataFrame when no
        annotations were found for the chart.

    Raises:
        ValueError: If ``self.config.label_encoding`` is neither "window"
            nor "bio".
    """
    logger.info(f"Starting annotation ingestion from database for chart: {chart_name}")

    db_annotations = self.load_annotations_from_db(chart_name, source)
    if not db_annotations:
        # Nothing to label: hand back an empty frame.
        logger.warning("No annotations found, returning empty DataFrame")
        return pd.DataFrame()

    # Called unconditionally here; any "enabled" gating for programmatic
    # labels is presumably done inside get_programmatic_labels -- TODO confirm.
    features_df = self.get_programmatic_labels(enriched_df)

    encoding = self.config.label_encoding
    if encoding not in ("window", "bio"):
        raise ValueError(f"Unknown label encoding: {self.config.label_encoding}")

    if encoding == "window":
        labeled_df = self.create_windowed_dataset(features_df, db_annotations)
    else:
        labeled_df = self.create_bio_dataset(features_df, db_annotations)
        # BIO encoding additionally merges human/programmatic labels when enabled
        if self.config.programmatic_labels.enabled:
            labeled_df = self.merge_labels(labeled_df, db_annotations)

    self.log_statistics(labeled_df, db_annotations)
    logger.info("Annotation ingestion complete")

    return labeled_df
|
||||
|
||||
|
||||
def run_annotation_ingestion(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue