# BE-LightningReport/src/utils.py

import json
import pandas as pd
import numpy as np
from datetime import datetime
from typing import Union, Dict, Any, Optional, List
from zoneinfo import ZoneInfo
import logging
import re

# Set up logging
# basicConfig runs at import time and configures the root logger at INFO level with a
# "timestamp - logger name - level - message" line format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

def format_date_ddmmyyyy(dt: datetime) -> str:
    """Render ``dt`` as a zero-padded ``DD-MM-YYYY`` date string."""
    date_pattern = '%d-%m-%Y'
    return dt.strftime(date_pattern)

def format_datetime_ddmmyyyy_hhmmss(dt: datetime) -> str:
    """Render ``dt`` as ``DD-MM-YYYY HH:MM:SS`` (24-hour clock, zero-padded)."""
    timestamp_pattern = '%d-%m-%Y %H:%M:%S'
    return dt.strftime(timestamp_pattern)

def format_datetime_ddmmyyyy_hhmm(dt: datetime) -> str:
    """Render ``dt`` as ``DD-MM-YYYY HH:MM`` (24-hour clock, seconds omitted)."""
    minute_pattern = '%d-%m-%Y %H:%M'
    return dt.strftime(minute_pattern)

def get_utc_offset_label(timezone_name: Optional[str]) -> Optional[str]:
    """
    Build a short ``UTC±H[:MM]`` label for a time zone's current UTC offset.

    The offset is evaluated at the current instant, so zones with daylight
    saving time yield different labels depending on when this is called.

    Args:
        timezone_name: IANA time zone key (e.g. ``'Europe/Berlin'``).

    Returns:
        A label such as ``'UTC+2'``, ``'UTC-5'`` or ``'UTC+5:30'``. Whole-hour
        offsets carry no minute part. ``None`` is returned when the name is
        empty, cannot be resolved, or the zone reports no offset.
    """
    if not timezone_name:
        return None
    try:
        offset = datetime.now(ZoneInfo(timezone_name)).utcoffset()
    except Exception:
        # Best-effort lookup: unknown or malformed zone names simply yield no label.
        return None
    if offset is None:
        return None

    total_seconds = int(offset.total_seconds())
    sign = '-' if total_seconds < 0 else '+'
    # Split the magnitude, not the signed value: floor division on a negative
    # offset would turn -9:30 into -10 and silently drop the minute part.
    hours, leftover_seconds = divmod(abs(total_seconds), 3600)
    minutes = leftover_seconds // 60

    label = f"UTC{sign}{hours}"
    if minutes:
        label += f":{minutes:02d}"
    return label

def now_in_timezone(timezone_name: Optional[str]) -> datetime:
    """
    Return the current time in the given zone.

    Args:
        timezone_name: IANA time zone key; may be empty or ``None``.

    Returns:
        A timezone-aware ``datetime`` when the zone resolves; otherwise the
        naive ``datetime.now()`` of the host system.
    """
    if timezone_name:
        try:
            zone = ZoneInfo(timezone_name)
            return datetime.now(zone)
        except Exception:
            # Fall through to the naive system clock below.
            pass
    return datetime.now()

def format_datetime_to_local_display(value: Optional[str], timezone_name: Optional[str] = None) -> str:
    """
    Format a timestamp string as ``DD-MM-YYYY HH:MM`` for display.

    The input is parsed with ``pd.to_datetime(..., utc=True)``. When
    ``timezone_name`` is given the instant is converted to that zone before
    formatting; otherwise the UTC wall-clock time is used.

    Args:
        value: Timestamp text; falsy, blank or ``'N/A'`` (any case) counts as missing.
        timezone_name: Optional IANA time zone key for the displayed time.

    Returns:
        The formatted string, ``'N/A'`` for missing input, or the first 19
        characters of the stripped input when parsing/conversion fails.
    """
    if not value:
        return 'N/A'
    text = str(value).strip()
    if text == '' or text.upper() == 'N/A':
        return 'N/A'

    try:
        stamp = pd.to_datetime(text, utc=True)
        if timezone_name:
            zone = ZoneInfo(timezone_name)
            naive = stamp.tz_convert(zone).tz_localize(None).to_pydatetime()
        else:
            naive = stamp.to_pydatetime().replace(tzinfo=None)
        return naive.strftime('%d-%m-%Y %H:%M')
    except Exception:
        # Unparseable input (or an unknown zone): echo the raw text, capped at 19 chars.
        return text if len(text) < 19 else text[:19]

def parse_period_string_to_datetime(value: Optional[str]) -> Optional[datetime]:
    """
    Parse a report-period string into a naive ``datetime``.

    Accepted inputs, tried in this order:
      * ``DD-MM-YYYY``
      * ``DD-MM-YYYY HH:MM:SS`` / ``DD-MM-YYYY HH:MM`` (extra trailing text is cut off)
      * anything ``pd.to_datetime`` can parse; timezone-aware results are
        converted to UTC and returned without tzinfo.

    Args:
        value: Period boundary text, or ``None``.

    Returns:
        The parsed ``datetime``, or ``None`` for missing/blank/unparseable input.
    """
    if value is None:
        return None
    text = str(value).strip()
    if not text:
        return None

    try:
        if re.fullmatch(r"\d{2}-\d{2}-\d{4}", text):
            return datetime.strptime(text, '%d-%m-%Y')

        if re.match(r"\d{2}-\d{2}-\d{4}\s+\d", text):
            # Try the longer (with seconds) layout first, then the minute-only one;
            # if neither fits, fall through to the pandas parser below.
            day_first_layouts = ((19, '%d-%m-%Y %H:%M:%S'), (16, '%d-%m-%Y %H:%M'))
            for width, layout in day_first_layouts:
                try:
                    return datetime.strptime(text[:width], layout)
                except ValueError:
                    continue

        parsed = pd.to_datetime(text, errors='raise')
        if not isinstance(parsed, pd.Timestamp):
            return None
        if parsed.tzinfo is not None:
            parsed = parsed.tz_convert('UTC').tz_localize(None)
        return parsed.to_pydatetime()
    except Exception as e:
        logger.debug(f"parse_period_string_to_datetime failed for {text}: {e}")
        return None

def normalize_local_time_to_timezone(
    df: pd.DataFrame,
    column: str,
    timezone_name: Optional[str],
) -> pd.DataFrame:
    """
    Convert a timestamp column to naive wall-clock time in the given zone.

    Values are parsed with ``pd.to_datetime(..., utc=True, errors='coerce')``,
    rows whose value cannot be parsed are dropped, and the remaining instants
    are converted to ``timezone_name`` with the tz information removed.

    Args:
        df: Input frame; it is not modified.
        column: Name of the timestamp column.
        timezone_name: IANA time zone key. When empty/``None`` the frame is
            returned untouched.

    Returns:
        The same ``df`` object when it is empty or no zone is given; otherwise
        a new frame with the converted column.
    """
    if len(df) == 0 or not timezone_name:
        return df

    target_zone = ZoneInfo(timezone_name)
    result = df.copy()
    result[column] = pd.to_datetime(result[column], utc=True, errors='coerce')

    # Keep only rows whose timestamp parsed successfully.
    result = result[result[column].notna()]
    if len(result) == 0:
        return result

    localized = result[column].dt.tz_convert(target_zone)
    result[column] = localized.dt.tz_localize(None)
    return result

def format_period_display_for_report(start_value: Optional[str], end_value: Optional[str]) -> tuple[str, str]:
    """
    Produce display strings for the start and end of a report period.

    Each value is rendered independently:
      * missing/blank -> ``""``
      * ``DD-MM-YYYY`` -> returned unchanged
      * text containing ``"T"`` or ``"Z"`` -> parsed as UTC and shown in the
        host system's local time zone as ``DD-MM-YYYY HH:MM``
      * any other text pandas can parse -> ``DD-MM-YYYY HH:MM`` using the
        value's own wall-clock time (tz info, if any, is discarded)
      * unparseable text -> returned stripped, otherwise unchanged

    Args:
        start_value: Period start text, or ``None``.
        end_value: Period end text, or ``None``.

    Returns:
        ``(start_display, end_display)``.
    """
    display_pattern = '%d-%m-%Y %H:%M'

    def _render(raw: Optional[str]) -> str:
        if not raw:
            return ""
        text = str(raw).strip()
        if not text:
            return ""
        try:
            if re.fullmatch(r"\d{2}-\d{2}-\d{4}", text):
                return text

            if any(marker in text for marker in ("T", "Z")):
                utc_stamp = pd.to_datetime(text, utc=True)
                # astimezone(None) converts to the local zone of the machine running this.
                return utc_stamp.to_pydatetime().astimezone(None).strftime(display_pattern)

            stamp = pd.to_datetime(text, errors='raise')
            if not isinstance(stamp, pd.Timestamp):
                return text
            if stamp.tzinfo is not None:
                stamp = stamp.tz_localize(None)
            return stamp.to_pydatetime().strftime(display_pattern)
        except Exception:
            return text

    return _render(start_value), _render(end_value)

def get_analysis_radius_m() -> int:
    """
    Resolve the analysis radius from the project configuration.

    Reads ``config.distance_rings`` and ``config.analysis_boundary_m``.

    Returns:
        * the smaller of the boundary and the largest ring when both are positive,
        * the boundary alone when no positive ring exists,
        * the largest ring (or ``0`` when there are no rings) when the boundary
          is not a positive number.
    """
    from .config import config

    ring_distances = config.distance_rings or []
    largest_ring = 0
    if ring_distances:
        largest_ring = int(max(ring_distances))

    configured_boundary = config.analysis_boundary_m
    boundary_is_usable = isinstance(configured_boundary, (int, float)) and configured_boundary > 0
    if not boundary_is_usable:
        return largest_ring

    boundary_value = int(configured_boundary)
    # The boundary caps the analysis area, but never extends it past the outermost ring.
    return min(boundary_value, largest_ring) if largest_ring > 0 else boundary_value


def get_storm_monitoring_radius_km() -> float:
    """
    Outer radius used when filtering thunderstorm cells for the report.

    Prefers ``config.analysis_boundary_m`` when it is a positive number, then
    the largest entry of ``config.distance_rings``; both are divided by 1000.
    Falls back to ``50.0`` when neither is available.
    """
    from .config import config

    configured_boundary = config.analysis_boundary_m
    if isinstance(configured_boundary, (int, float)) and configured_boundary > 0:
        radius_source = float(configured_boundary)
    else:
        ring_distances = config.distance_rings or []
        if not ring_distances:
            return 50.0
        radius_source = float(max(ring_distances))
    return radius_source / 1000.0

def get_turbine_color_by_fixed_intervals(risk_log_value: float) -> str:
    """
    Map a risk score onto a fixed colour scale.

    The same thresholds are applied everywhere so colours stay comparable
    across groups and tables.

    Args:
        risk_log_value: Log-transformed risk score

    Returns:
        Hex colour string of the first band whose exclusive upper bound lies
        above the score; scores of 1.4 and higher (and NaN) get the darkest red.
    """
    # (exclusive upper bound, colour), ordered from lowest to highest risk.
    color_bands = (
        (0.1, '#577590'),
        (0.2, '#43AA8B'),
        (0.4, '#90BE6D'),
        (0.6, '#F9C74F'),
        (0.8, '#F8961E'),
        (1.0, '#F3722C'),
        (1.2, '#F94144'),
        (1.4, '#D32F2F'),
    )
    for upper_bound, band_color in color_bands:
        if risk_log_value < upper_bound:
            return band_color
    return '#B71C1C'

def get_risk_definition_by_fixed_intervals(
    risk_log_value: float,
    language: str | None = None,
) -> str:
    """
    Look up the risk definition text for a risk score.

    Delegates to ``src.reporting.strings.get_risk_definition``.

    Args:
        risk_log_value: Risk score passed through to the lookup.
        language: Language to use; when ``None`` the value returned by
            ``get_report_language()`` is used instead.

    Returns:
        Whatever ``get_risk_definition`` returns for the score and language.
    """
    from src.reporting.strings import get_risk_definition, get_report_language

    if language is None:
        language = get_report_language()
    return get_risk_definition(risk_log_value, language)

def get_turbine_colors_by_fixed_intervals(risk_log_values: List[float]) -> List[str]:
    """
    Colour a whole series of risk scores using the fixed-interval scale.

    Args:
        risk_log_values: List of log-transformed risk scores

    Returns:
        One colour string per input score, in the same order.
    """
    colors: List[str] = []
    for score in risk_log_values:
        colors.append(get_turbine_color_by_fixed_intervals(score))
    return colors

def safe_datetime_conversion(time_str: Union[str, datetime]) -> Optional[datetime]:
    """
    Safely convert a value to a datetime with error handling.

    Strings are matched against a fixed list of ``YYYY-MM-DD`` based layouts
    using only their first 19 characters, so anything after the seconds
    (fractional seconds, a trailing ``Z`` or UTC offset) is ignored on that
    path. Text that matches none of the layouts is handed to
    ``pd.to_datetime`` as a fallback.

    Args:
        time_str: String representation of a datetime. Values that already are
            ``datetime`` / ``pd.Timestamp`` objects are accepted and passed through.

    Returns:
        datetime object or None if the value is missing or conversion fails
    """
    if not time_str or pd.isna(time_str):
        return None

    # Already a datetime: return it before the string slicing below, which
    # would raise TypeError on a non-string value.
    if isinstance(time_str, pd.Timestamp):
        return time_str.to_pydatetime()
    if isinstance(time_str, datetime):
        return time_str

    # Try different datetime formats
    formats = (
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%d',
    )
    head = time_str[:19]
    for fmt in formats:
        try:
            return datetime.strptime(head, fmt)
        except ValueError:
            continue

    # Try pandas parsing as fallback
    try:
        parsed = pd.to_datetime(time_str, errors='coerce')
    except Exception:
        parsed = None
    if isinstance(parsed, pd.Timestamp) and not pd.isna(parsed):
        return parsed.to_pydatetime()

    logger.error(f"Failed to convert datetime: {time_str}")
    return None

def load_json_data(file_path: str) -> Dict[str, Any]:
    """
    Read and decode a UTF-8 JSON file, logging the outcome.

    Args:
        file_path: Path to JSON file

    Returns:
        The decoded JSON content

    Raises:
        FileNotFoundError: If file doesn't exist
        json.JSONDecodeError: If JSON is invalid
        Exception: Any other error is logged and re-raised unchanged
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
        logger.info(f"Successfully loaded JSON data from {file_path}")
        return payload
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
        raise
    except json.JSONDecodeError as decode_error:
        logger.error(f"Invalid JSON in {file_path}: {decode_error}")
        raise
    except Exception as unexpected:
        logger.error(f"Unexpected error loading {file_path}: {unexpected}")
        raise

def filter_lightning_data_by_date_range(lightning_df: pd.DataFrame, start_date: Optional[str] = None, end_date: Optional[str] = None) -> pd.DataFrame:
    """
    Filter lightning data by date range (both bounds inclusive).

    Accepted bound formats:
      * ``DD-MM-YYYY`` - as an end bound it covers the whole day,
      * ``DD-MM-YYYY HH:MM`` - as an end bound it covers the whole minute,
      * any other string ``pd.to_datetime`` can parse (e.g. ISO 8601);
        timezone-aware values are converted to UTC and made naive.

    Args:
        lightning_df: DataFrame containing lightning data with 'local_time' column
        start_date: Start of the range, or None for no lower bound
        end_date: End of the range, or None for no upper bound

    Returns:
        Filtered DataFrame containing only lightning data within the specified
        date range. The input frame itself is returned unchanged when no bound
        is given or when a supplied bound cannot be parsed (an error is logged).
    """
    if start_date is None and end_date is None:
        return lightning_df

    def _parse_flexible_datetime(value: Optional[str], is_end: bool = False) -> Optional[datetime]:
        """Parse one bound; end bounds are widened to the last microsecond of their span."""
        if value is None:
            return None

        value_str = str(value).strip()
        if not value_str:
            return None

        try:
            if re.fullmatch(r"\d{2}-\d{2}-\d{4} \d{2}:\d{2}", value_str):
                dt = datetime.strptime(value_str, '%d-%m-%Y %H:%M')
                if is_end:
                    # Include strikes with fractional seconds inside the final second.
                    dt = dt.replace(second=59, microsecond=999999)
                return dt
            if re.fullmatch(r"\d{2}-\d{2}-\d{4}", value_str):
                dt = datetime.strptime(value_str, '%d-%m-%Y')
                if is_end:
                    dt = dt.replace(hour=23, minute=59, second=59, microsecond=999999)
                return dt

            ts = pd.to_datetime(value_str, errors='raise')
            if isinstance(ts, pd.Timestamp):
                if ts.tzinfo is not None:
                    # NOTE(review): aware bounds become UTC-naive while the data below is
                    # made naive in config.timezone - confirm callers pass local-time bounds.
                    ts = ts.tz_convert('UTC').tz_localize(None)
                return ts.to_pydatetime()
        except Exception as e:
            logger.error(f"Invalid datetime value: {value_str}. Error: {e}")
            return None

        return None

    df = lightning_df.copy()
    # Parse textual timestamps (classic object dtype as well as pandas string dtypes).
    if df['local_time'].dtype == 'object' or pd.api.types.is_string_dtype(df['local_time']):
        df['local_time'] = pd.to_datetime(df['local_time'])

    if df['local_time'].dt.tz is not None:
        from src.config import config

        # Compare in naive wall-clock time of the configured zone (UTC when unset).
        tz_name = getattr(config, 'timezone', None) or 'UTC'
        df['local_time'] = df['local_time'].dt.tz_convert(tz_name).dt.tz_localize(None)

    start_dt = _parse_flexible_datetime(start_date, is_end=False)
    end_dt = _parse_flexible_datetime(end_date, is_end=True)

    if start_date and start_dt is None:
        logger.error(f"Invalid start_date value: {start_date}. Expected 'DD-MM-YYYY' or ISO datetime string.")
        return lightning_df

    if end_date and end_dt is None:
        logger.error(f"Invalid end_date value: {end_date}. Expected 'DD-MM-YYYY' or ISO datetime string.")
        return lightning_df

    # Apply date filtering
    if start_dt is None and end_dt is None:
        # Only blank bounds were supplied: nothing to filter on.
        return df

    times = df['local_time']
    if start_dt is not None and end_dt is not None:
        mask = (times >= start_dt) & (times <= end_dt)
        span = f"{start_date} to {end_date}"
    elif start_dt is not None:
        mask = times >= start_dt
        span = f"from {start_date}"
    else:
        mask = times <= end_dt
        span = f"until {end_date}"

    filtered_df = df[mask]
    logger.info(f"Filtered lightning data from {len(lightning_df)} to {len(filtered_df)} records ({span})")
    return filtered_df

def validate_lightning_data(df: pd.DataFrame) -> bool:
    """
    Validate lightning data structure and content.

    Checks, in order: required columns are present, ``lat``/``lng``/``current``
    are numeric, and coordinates lie inside their valid ranges. The first
    failing check is logged as an error. Unexpected ``p_type`` values only
    produce a warning and do not fail validation.

    Args:
        df: Lightning DataFrame

    Returns:
        True if valid (an empty frame counts as valid), False otherwise
    """
    required_columns = ['lat', 'lng', 'current', 'p_type', 'local_time']

    # Handle empty dataset gracefully
    if len(df) == 0:
        logger.warning("Lightning dataset is empty - this is acceptable for analysis")
        return True

    # Check required columns
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        logger.error(f"Missing required columns: {missing_columns}")
        return False

    # Check data types
    numeric_columns = (('lat', "Latitude"), ('lng', "Longitude"), ('current', "Current"))
    for column, label in numeric_columns:
        if not pd.api.types.is_numeric_dtype(df[column]):
            logger.error(f"{label} column must be numeric")
            return False

    # Check coordinate ranges (inclusive, symmetric around zero)
    coordinate_limits = (('lat', "Latitude", 90), ('lng', "Longitude", 180))
    for column, label, limit in coordinate_limits:
        if not df[column].between(-limit, limit).all():
            logger.error(f"{label} values must be between {-limit} and {limit}")
            return False

    # Check p_type values; compared as text so 0/1 and '0'/'1' are both accepted
    valid_p_types = ['0', '1']
    invalid_count = int((~df['p_type'].astype(str).isin(valid_p_types)).sum())
    if invalid_count > 0:
        logger.warning(f"Found {invalid_count} invalid p_type values")

    logger.info(f"Lightning data validation passed: {len(df)} records")
    return True

def validate_turbine_data(df: pd.DataFrame) -> bool:
    """
    Check that a turbine frame has usable coordinates and names.

    Verifies, in order: ``lat``/``lng``/``name`` columns exist, ``lat`` and
    ``lng`` are numeric, and every coordinate lies inside its valid range.
    The first failing check is logged as an error.

    Args:
        df: Turbine DataFrame

    Returns:
        True if valid, False otherwise
    """
    absent = [name for name in ('lat', 'lng', 'name') if name not in df.columns]
    if absent:
        logger.error(f"Missing required columns: {absent}")
        return False

    latitudes = df['lat']
    longitudes = df['lng']
    is_numeric = pd.api.types.is_numeric_dtype

    # Type checks come before range checks, latitude before longitude.
    if not is_numeric(latitudes):
        logger.error("Latitude column must be numeric")
        return False
    if not is_numeric(longitudes):
        logger.error("Longitude column must be numeric")
        return False

    if not latitudes.between(-90, 90).all():
        logger.error("Latitude values must be between -90 and 90")
        return False
    if not longitudes.between(-180, 180).all():
        logger.error("Longitude values must be between -180 and 180")
        return False

    logger.info(f"Turbine data validation passed: {len(df)} records")
    return True

def ensure_datetime_column(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Convert an ``object``-typed column to datetimes when necessary.

    Args:
        df: DataFrame
        column: Column name to convert

    Returns:
        The original ``df`` when it is empty or the column is not of
        ``object`` dtype; otherwise a copy whose column was parsed with
        ``pd.to_datetime(..., errors='coerce')`` (unparseable values become NaT).
    """
    # Empty frames and already-typed columns pass through untouched.
    if len(df) == 0:
        return df
    if not (df[column].dtype == 'object'):
        return df

    converted = df.copy()
    converted[column] = pd.to_datetime(converted[column], errors='coerce')
    logger.info(f"Converted {column} to datetime")
    return converted