# candle-annotator/services/ml/app/annotation_ingestion.py

"""
Annotation ingestion module.

Loads annotations from JSON exports and converts them into labeled training datasets
with various encoding strategies (windowed classification, BIO sequence labeling).
Supports programmatic TA-Lib pattern labels and merge strategies.
"""

import json
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
from collections import defaultdict

import pandas as pd
import numpy as np

from app.config import AnnotationIngestionConfig
from app.data_access import DataAccess

logger = logging.getLogger(__name__)


class AnnotationIngestion:
    """
    Handles loading and processing of annotations into labeled datasets.

    Supports:
    - Loading annotations from JSON exports
    - Windowed classification encoding
    - BIO sequence labeling encoding
    - TA-Lib CDL* programmatic pattern detection
    - Human/programmatic label merging
    - Context padding
    - Dataset statistics logging
    """

    def __init__(self, config: AnnotationIngestionConfig):
        """
        Initialize annotation ingestion.

        Args:
            config: Annotation ingestion configuration
        """
        self.config = config

    def load_annotations(self, annotations_path: str) -> List[Dict[str, Any]]:
        """
        Load annotations from JSON export file.

        Args:
            annotations_path: Path to annotations JSON file

        Returns:
            List of annotation dictionaries

        Raises:
            FileNotFoundError: If annotations file doesn't exist
            ValueError: If JSON format is invalid
        """
        path = Path(annotations_path)

        if not path.exists():
            raise FileNotFoundError(
                f"Annotations file not found: {annotations_path}. "
                "Please export annotations from the UI first."
            )

        logger.info(f"Loading annotations from {annotations_path}")

        with open(path, 'r') as f:
            data = json.load(f)

        if 'annotations' not in data:
            raise ValueError(
                f"Invalid annotations JSON format: missing 'annotations' key"
            )

        annotations = data['annotations']
        logger.info(f"Loaded {len(annotations)} annotations")

        # Filter by confidence
        if self.config.min_confidence > 1:
            original_count = len(annotations)
            annotations = [
                ann for ann in annotations
                if ann.get('confidence') is not None
                and ann['confidence'] >= self.config.min_confidence
            ]
            filtered_count = original_count - len(annotations)
            if filtered_count > 0:
                logger.info(
                    f"Filtered {filtered_count} annotations below "
                    f"min_confidence={self.config.min_confidence}"
                )

        return annotations

    def load_annotations_from_db(
        self,
        chart_name: str,
        source: str = "human"
    ) -> List[Dict[str, Any]]:
        """
        Fetch span annotations for one chart through ``DataAccess``.

        The chart is looked up by name and its span annotations are returned
        as plain dictionaries with the keys ``id``, ``label``, ``start_time``,
        ``end_time``, ``confidence``, ``outcome``, ``notes`` and ``source``,
        i.e. the shape the encoders in this class consume. Start/end times
        are emitted as ISO-8601 strings, or ``None`` when missing.

        Args:
            chart_name: Name of the chart to load annotations for
            source: Annotation source forwarded to the query
                ('human', 'model', 'hybrid')

        Returns:
            List of annotation dictionaries; empty when the query returns
            no rows

        Raises:
            ValueError: If no chart with the given name is found
        """
        logger.info(f"Loading annotations from database for chart: {chart_name}")

        data_access = DataAccess()

        chart = data_access.get_chart_by_name(chart_name)
        if not chart:
            raise ValueError(f"Chart not found: {chart_name}")

        # A threshold of 1 or less is forwarded as "no confidence filter".
        annotations_df = data_access.get_span_annotations(
            chart_id=chart['id'],
            source=source,
            min_confidence=(
                self.config.min_confidence
                if self.config.min_confidence > 1
                else None
            )
        )

        if annotations_df.empty:
            logger.warning(f"No annotations found for chart: {chart_name}")
            return []

        def _iso_or_none(value):
            # Missing timestamps (NaT/None) are reported as None.
            return value.isoformat() if pd.notna(value) else None

        # One dictionary per row, mirroring the JSON export layout.
        annotations = [
            {
                'id': row['id'],
                'label': row['label'],
                'start_time': _iso_or_none(row['start_time']),
                'end_time': _iso_or_none(row['end_time']),
                'confidence': row.get('confidence'),
                'outcome': row.get('outcome'),
                'notes': row.get('notes'),
                'source': row['source'],
            }
            for _, row in annotations_df.iterrows()
        ]

        logger.info(f"Loaded {len(annotations)} annotations from database")

        return annotations

    def get_programmatic_labels(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Generate programmatic labels using TA-Lib CDL* pattern functions.

        Args:
            df: DataFrame with OHLC columns

        Returns:
            DataFrame with programmatic label column added
        """
        if not self.config.programmatic_labels.enabled:
            return df

        if not self.config.programmatic_labels.talib_patterns:
            logger.warning("Programmatic labels enabled but no patterns configured")
            return df

        try:
            import talib
        except ImportError:
            raise ImportError(
                "TA-Lib is required for programmatic labels. "
                "Install with: pip install TA-Lib (requires libta-lib-dev)"
            )

        logger.info("Computing programmatic TA-Lib pattern labels")

        # Collect all pattern detections
        pattern_labels = pd.Series(['O'] * len(df), index=df.index)

        for pattern_name in self.config.programmatic_labels.talib_patterns:
            if not hasattr(talib, pattern_name):
                logger.warning(f"Unknown TA-Lib pattern: {pattern_name}")
                continue

            pattern_func = getattr(talib, pattern_name)

            try:
                result = pattern_func(
                    df['open'].values,
                    df['high'].values,
                    df['low'].values,
                    df['close'].values
                )

                # Convert +100/-100 to label names
                for i, value in enumerate(result):
                    if value > 0:
                        label = f"bullish_{pattern_name.lower().replace('cdl', '').replace('_', '')}"
                        pattern_labels.iloc[i] = label
                    elif value < 0:
                        label = f"bearish_{pattern_name.lower().replace('cdl', '').replace('_', '')}"
                        pattern_labels.iloc[i] = label

            except Exception as e:
                logger.error(f"Error computing {pattern_name}: {e}")
                continue

        df['label_programmatic'] = pattern_labels

        # Log statistics
        prog_counts = pattern_labels.value_counts()
        logger.info(f"Programmatic labels: {prog_counts.to_dict()}")

        return df

    def create_windowed_dataset(
        self,
        df: pd.DataFrame,
        annotations: List[Dict[str, Any]]
    ) -> pd.DataFrame:
        """
        Create windowed classification dataset.

        Each annotation span is converted to a fixed-size window of candles,
        flattened into a single row with the annotation label as target.

        Args:
            df: DataFrame with features and time column
            annotations: List of annotation dictionaries

        Returns:
            DataFrame with windowed samples
        """
        logger.info("Creating windowed classification dataset")

        window_size = self.config.window_size
        context_padding = self.config.context_padding

        samples = []

        for ann in annotations:
            label = ann['label']
            start_time = pd.Timestamp(ann['start_time']).tz_localize(None)
            end_time = pd.Timestamp(ann['end_time']).tz_localize(None)

            # Find candles in span
            span_mask = (df['time'] >= start_time) & (df['time'] <= end_time)
            span_indices = df.index[span_mask].tolist()

            if not span_indices:
                logger.warning(f"No candles found for annotation {ann.get('id')}")
                continue

            span_start_idx = span_indices[0]
            span_end_idx = span_indices[-1]
            span_length = len(span_indices)

            # Determine window boundaries
            if span_length < window_size:
                # Pad with context candles (centered)
                padding_needed = window_size - span_length
                left_padding = padding_needed // 2
                right_padding = padding_needed - left_padding

                window_start = max(0, span_start_idx - left_padding - context_padding)
                window_end = min(len(df) - 1, span_end_idx + right_padding + context_padding)
            else:
                # Use full span + context padding
                window_start = max(0, span_start_idx - context_padding)
                window_end = min(len(df) - 1, span_end_idx + context_padding)

            # Extract window
            window = df.iloc[window_start:window_end + 1].copy()

            # Flatten window into single row
            # Create feature names: feature_0, feature_1, etc.
            feature_cols = [c for c in df.columns if c != 'time']
            flattened = {}

            for i, (_, row) in enumerate(window.iterrows()):
                for col in feature_cols:
                    flattened[f"{col}_{i}"] = row[col]

            # Pad with NaN if window is smaller than expected
            expected_size = window_size
            actual_size = len(window)
            if actual_size < expected_size:
                for i in range(actual_size, expected_size):
                    for col in feature_cols:
                        flattened[f"{col}_{i}"] = np.nan

            flattened['label'] = label
            samples.append(flattened)

        result_df = pd.DataFrame(samples)
        logger.info(f"Created {len(result_df)} windowed samples")

        return result_df

    def create_bio_dataset(
        self,
        df: pd.DataFrame,
        annotations: List[Dict[str, Any]]
    ) -> pd.DataFrame:
        """
        Create BIO sequence labeling dataset.

        Each candle gets a BIO tag: B-{label} for span start, I-{label} for
        continuation, O for outside annotations.

        Args:
            df: DataFrame with features and time column
            annotations: List of annotation dictionaries

        Returns:
            DataFrame with BIO tags column(s)
        """
        logger.info("Creating BIO sequence labeling dataset")

        # Initialize with 'O' tags
        bio_tags = pd.Series(['O'] * len(df), index=df.index)

        # Track overlapping annotations
        overlaps_detected = False
        bio_columns = {'bio_tag': bio_tags}

        for ann in annotations:
            label = ann['label']
            start_time = pd.Timestamp(ann['start_time'])
            end_time = pd.Timestamp(ann['end_time'])

            # Find candles in span
            span_mask = (df['time'] >= start_time) & (df['time'] <= end_time)
            span_indices = df.index[span_mask].tolist()

            if not span_indices:
                continue

            # Check for overlaps
            first_idx = span_indices[0]
            if bio_tags.iloc[first_idx] != 'O':
                # Overlap detected, create additional column
                overlaps_detected = True
                col_num = 2
                while f'bio_tag_{col_num}' in bio_columns:
                    col_num += 1

                # Create new column if needed
                if f'bio_tag_{col_num}' not in bio_columns:
                    bio_columns[f'bio_tag_{col_num}'] = pd.Series(
                        ['O'] * len(df),
                        index=df.index
                    )

                # Use this column for the annotation
                target_col = f'bio_tag_{col_num}'
            else:
                target_col = 'bio_tag'

            # Assign BIO tags
            bio_columns[target_col].iloc[span_indices[0]] = f"B-{label}"
            for idx in span_indices[1:]:
                bio_columns[target_col].iloc[idx] = f"I-{label}"

        if overlaps_detected:
            logger.info("Overlapping annotations detected, created multiple BIO tag columns")

        # Add BIO columns to dataframe
        result_df = df.copy()
        for col_name, col_data in bio_columns.items():
            result_df[col_name] = col_data

        return result_df

    def merge_labels(
        self,
        df: pd.DataFrame,
        annotations: List[Dict[str, Any]]
    ) -> pd.DataFrame:
        """
        Merge human annotations and programmatic labels based on merge strategy.

        Args:
            df: DataFrame with programmatic labels (if enabled)
            annotations: List of human annotation dictionaries

        Returns:
            DataFrame with merged labels
        """
        if not self.config.programmatic_labels.enabled:
            # No programmatic labels, just use human annotations
            return df

        strategy = self.config.merge_strategy
        logger.info(f"Merging labels with strategy: {strategy}")

        # Create human labels column
        human_labels = pd.Series(['O'] * len(df), index=df.index)

        for ann in annotations:
            label = ann['label']
            start_time = pd.Timestamp(ann['start_time'])
            end_time = pd.Timestamp(ann['end_time'])

            span_mask = (df['time'] >= start_time) & (df['time'] <= end_time)
            span_indices = df.index[span_mask].tolist()

            for idx in span_indices:
                human_labels.iloc[idx] = label

        df['label_human'] = human_labels

        # Apply merge strategy
        if strategy == "human_priority":
            # Use human label if present, else programmatic
            df['label'] = df.apply(
                lambda row: row['label_human'] if row['label_human'] != 'O'
                else row.get('label_programmatic', 'O'),
                axis=1
            )
            # Drop temporary columns
            df = df.drop(columns=['label_human', 'label_programmatic'], errors='ignore')

        elif strategy == "programmatic_priority":
            # Use programmatic label if present, else human
            df['label'] = df.apply(
                lambda row: row.get('label_programmatic', 'O') if row.get('label_programmatic', 'O') != 'O'
                else row['label_human'],
                axis=1
            )
            # Drop temporary columns
            df = df.drop(columns=['label_human', 'label_programmatic'], errors='ignore')

        elif strategy == "both":
            # Keep both columns
            pass

        return df

    def log_statistics(self, df: pd.DataFrame, annotations: List[Dict[str, Any]]):
        """
        Log dataset statistics.

        Args:
            df: Labeled DataFrame
            annotations: List of annotations
        """
        logger.info("=" * 60)
        logger.info("Dataset Statistics")
        logger.info("=" * 60)

        # Label distribution
        if 'label' in df.columns:
            label_counts = df['label'].value_counts()
            total = len(df)

            logger.info("\nLabel Distribution:")
            for label, count in label_counts.items():
                pct = (count / total) * 100
                logger.info(f"  {label}: {count} ({pct:.2f}%)")

        # If both human and programmatic labels exist
        if 'label_human' in df.columns and 'label_programmatic' in df.columns:
            # Agreement rate (excluding O labels)
            non_o_mask = (df['label_human'] != 'O') | (df['label_programmatic'] != 'O')
            non_o_df = df[non_o_mask]

            if len(non_o_df) > 0:
                agreement = (
                    non_o_df['label_human'] == non_o_df['label_programmatic']
                ).sum()
                agreement_rate = (agreement / len(non_o_df)) * 100
                logger.info(f"\nHuman/Programmatic Agreement Rate: {agreement_rate:.2f}%")

        # Average span length per label
        span_lengths = defaultdict(list)
        for ann in annotations:
            label = ann['label']
            start_time = pd.Timestamp(ann['start_time'])
            end_time = pd.Timestamp(ann['end_time'])

            if 'time' in df.columns:
                span_mask = (df['time'] >= start_time) & (df['time'] <= end_time)
                span_length = span_mask.sum()
                span_lengths[label].append(span_length)

        if span_lengths:
            logger.info("\nAverage Span Length (candles):")
            for label, lengths in span_lengths.items():
                avg_length = np.mean(lengths)
                logger.info(f"  {label}: {avg_length:.2f}")

        logger.info("=" * 60)

    def process(
        self,
        enriched_df: pd.DataFrame,
        annotations_path: str
    ) -> pd.DataFrame:
        """
        Main processing pipeline for annotation ingestion.

        Args:
            enriched_df: DataFrame with engineered features
            annotations_path: Path to annotations JSON file

        Returns:
            Labeled DataFrame ready for training
        """
        logger.info("Starting annotation ingestion")

        # Load annotations
        annotations = self.load_annotations(annotations_path)

        if not annotations:
            logger.warning("No annotations found, returning empty DataFrame")
            return pd.DataFrame()

        # Add programmatic labels if enabled
        df = self.get_programmatic_labels(enriched_df)

        # Apply label encoding
        if self.config.label_encoding == "window":
            result_df = self.create_windowed_dataset(df, annotations)
        elif self.config.label_encoding == "bio":
            result_df = self.create_bio_dataset(df, annotations)
            # For BIO, also merge human/programmatic if enabled
            if self.config.programmatic_labels.enabled:
                result_df = self.merge_labels(result_df, annotations)
        else:
            raise ValueError(f"Unknown label encoding: {self.config.label_encoding}")

        # Log statistics
        self.log_statistics(result_df, annotations)

        logger.info("Annotation ingestion complete")

        return result_df

    def process_from_db(
        self,
        enriched_df: pd.DataFrame,
        chart_name: str,
        source: str = "human"
    ) -> pd.DataFrame:
        """
        Run the full ingestion pipeline on annotations read from the database.

        Steps: load the chart's annotations via ``load_annotations_from_db``,
        add programmatic labels when enabled, encode according to
        ``config.label_encoding`` (``"window"`` or ``"bio"``; BIO output is
        additionally merged with programmatic labels when those are enabled)
        and log dataset statistics.

        Args:
            enriched_df: DataFrame with engineered features
            chart_name: Name of the chart to load annotations for
            source: Filter by annotation source ('human', 'model', 'hybrid')

        Returns:
            Labeled DataFrame ready for training, or an empty DataFrame when
            no annotations were loaded

        Raises:
            ValueError: If ``config.label_encoding`` is neither "window"
                nor "bio"
        """
        logger.info(f"Starting annotation ingestion from database for chart: {chart_name}")

        annotations = self.load_annotations_from_db(chart_name, source)
        if not annotations:
            logger.warning("No annotations found, returning empty DataFrame")
            return pd.DataFrame()

        # Programmatic labels come first so both encoders see them.
        df = self.get_programmatic_labels(enriched_df)

        encoding = self.config.label_encoding
        if encoding not in ("window", "bio"):
            raise ValueError(f"Unknown label encoding: {self.config.label_encoding}")

        if encoding == "window":
            result_df = self.create_windowed_dataset(df, annotations)
        else:
            result_df = self.create_bio_dataset(df, annotations)
            # Only the BIO encoding is merged with programmatic labels.
            if self.config.programmatic_labels.enabled:
                result_df = self.merge_labels(result_df, annotations)

        self.log_statistics(result_df, annotations)

        logger.info("Annotation ingestion complete")

        return result_df


def run_annotation_ingestion(
    config: AnnotationIngestionConfig,
    enriched_path: str,
    annotations_path: str,
    output_path: str
):
    """
    Execute the annotation ingestion stage end to end.

    Reads the enriched feature CSV (parsing ``time`` as dates), labels it
    with ``AnnotationIngestion.process`` and writes the result as CSV without
    the index, creating parent directories of the output file as needed.
    When labeling yields an empty DataFrame an error is logged and nothing
    is written.

    Args:
        config: Annotation ingestion configuration
        enriched_path: Path to enriched features CSV
        annotations_path: Path to annotations JSON
        output_path: Path to write labeled dataset CSV
    """
    logger.info("Running annotation ingestion stage")

    features = pd.read_csv(enriched_path, parse_dates=['time'])
    logger.info(f"Loaded enriched data: {features.shape}")

    labeled = AnnotationIngestion(config).process(features, annotations_path)
    if labeled.empty:
        logger.error("No labeled data produced")
        return

    # Make sure the target directory exists before writing.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    labeled.to_csv(destination, index=False)

    logger.info(f"Labeled dataset written to {destination}")
    logger.info(f"Final dataset shape: {labeled.shape}")