479 lines
16 KiB
Python

import json
import pandas as pd
import numpy as np
from datetime import datetime
from typing import Union, Dict, Any, Optional, List
from zoneinfo import ZoneInfo
import logging
import re
# Set up logging.
# NOTE: basicConfig runs at import time and configures the root logger
# (INFO level, "time - logger name - level - message" layout); it has no
# effect if the root logger already has handlers installed.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Module-level logger shared by every helper in this file.
logger = logging.getLogger(__name__)
def format_date_ddmmyyyy(dt: datetime) -> str:
    """Render ``dt`` as a day-first date string such as ``31-12-2024``."""
    day_first_pattern = '%d-%m-%Y'
    return dt.strftime(day_first_pattern)
def format_datetime_ddmmyyyy_hhmmss(dt: datetime) -> str:
    """Render ``dt`` day-first with seconds, e.g. ``31-12-2024 23:59:58``."""
    pattern_with_seconds = '%d-%m-%Y %H:%M:%S'
    return dt.strftime(pattern_with_seconds)
def format_datetime_ddmmyyyy_hhmm(dt: datetime) -> str:
    """Render ``dt`` day-first to minute precision, e.g. ``31-12-2024 23:59``."""
    pattern_to_minutes = '%d-%m-%Y %H:%M'
    return dt.strftime(pattern_to_minutes)
def get_utc_offset_label(timezone_name: Optional[str]) -> Optional[str]:
    """
    Build a short ``UTC+H`` / ``UTC-H`` label for the zone's current offset.

    Whole-hour offsets keep the compact form (``UTC+2``, ``UTC-5``, ``UTC+0``).
    Offsets with a minute component are rendered as ``UTC+5:30`` or
    ``UTC-9:30`` instead of being truncated to a (possibly wrong) hour.

    Args:
        timezone_name: IANA timezone key (e.g. ``"Europe/Berlin"``) or None.

    Returns:
        The label for the offset in effect right now, or None when no name is
        given, the zone cannot be loaded, or the zone reports no offset.
    """
    if not timezone_name:
        return None
    try:
        offset = datetime.now(ZoneInfo(timezone_name)).utcoffset()
    except Exception:
        # Best effort: ZoneInfo raises several unrelated exception types for
        # unknown or malformed keys, and a label is purely cosmetic.
        return None
    if offset is None:
        return None

    total_seconds = int(offset.total_seconds())
    sign = "-" if total_seconds < 0 else "+"
    # Work on the magnitude so negative fractional offsets are not floored
    # towards the next hour (e.g. -09:30 must not become -10).
    hours, leftover_seconds = divmod(abs(total_seconds), 3600)
    minutes = leftover_seconds // 60
    if minutes:
        return f"UTC{sign}{hours}:{minutes:02d}"
    return f"UTC{sign}{hours}"
def now_in_timezone(timezone_name: Optional[str]) -> datetime:
    """
    Return the current time in the requested timezone.

    Args:
        timezone_name: IANA timezone key, or None/empty for no timezone.

    Returns:
        A timezone-aware datetime when the zone can be loaded; otherwise a
        naive ``datetime.now()``.
    """
    zone = None
    if timezone_name:
        try:
            zone = ZoneInfo(timezone_name)
        except Exception:
            # Unknown or malformed zone: fall back to naive system time.
            zone = None
    return datetime.now(zone)
def format_datetime_to_local_display(value: Optional[str], timezone_name: Optional[str] = None) -> str:
    """
    Format a timestamp string as ``DD-MM-YYYY HH:MM`` for display.

    The input is parsed as UTC (naive inputs are treated as UTC). When
    ``timezone_name`` is given the instant is converted to that zone first;
    otherwise the UTC wall-clock time is shown.

    Args:
        value: Timestamp text; falsy, blank or ``N/A`` values yield ``'N/A'``.
        timezone_name: Optional IANA timezone key used for the conversion.

    Returns:
        The formatted string, ``'N/A'`` for missing input, or the first 19
        characters of the stripped input when it cannot be parsed/converted.
    """
    text = str(value).strip() if value else ''
    if text == '' or text.upper() == 'N/A':
        return 'N/A'
    try:
        stamp = pd.to_datetime(text, utc=True)
        if timezone_name:
            naive = stamp.tz_convert(ZoneInfo(timezone_name)).tz_localize(None)
        else:
            naive = stamp.to_pydatetime().replace(tzinfo=None)
        # The timezone branch still holds a pandas Timestamp; unwrap it.
        if hasattr(naive, 'to_pydatetime'):
            naive = naive.to_pydatetime()
        return naive.strftime('%d-%m-%Y %H:%M')
    except Exception:
        # Unparseable input is echoed back, capped at 19 characters.
        return text[:19]
def parse_period_string_to_datetime(value: Optional[str]) -> Optional[datetime]:
    """
    Parse a report-period boundary into a naive datetime.

    Day-first forms (``DD-MM-YYYY``, ``DD-MM-YYYY HH:MM:SS``,
    ``DD-MM-YYYY HH:MM``) are tried first; anything else is handed to
    ``pd.to_datetime``. Timezone-aware results are converted to UTC and
    returned without tzinfo.

    Args:
        value: Period boundary text, or None.

    Returns:
        The parsed naive datetime, or None for missing/blank/unparseable input.
    """
    text = '' if value is None else str(value).strip()
    if not text:
        return None
    try:
        if re.fullmatch(r"\d{2}-\d{2}-\d{4}", text):
            return datetime.strptime(text, '%d-%m-%Y')
        if re.match(r"\d{2}-\d{2}-\d{4}\s+\d", text):
            # Try the longer (seconds) layout before the minute-only layout;
            # the slice discards anything trailing the expected width.
            day_first_layouts = (
                (19, '%d-%m-%Y %H:%M:%S'),
                (16, '%d-%m-%Y %H:%M'),
            )
            for width, layout in day_first_layouts:
                try:
                    return datetime.strptime(text[:width], layout)
                except ValueError:
                    continue
        parsed = pd.to_datetime(text, errors='raise')
        if not isinstance(parsed, pd.Timestamp):
            return None
        if parsed.tzinfo is not None:
            parsed = parsed.tz_convert('UTC').tz_localize(None)
        return parsed.to_pydatetime()
    except Exception as e:
        logger.debug(f"parse_period_string_to_datetime failed for {text}: {e}")
        return None
def normalize_local_time_to_timezone(
    df: pd.DataFrame,
    column: str,
    timezone_name: Optional[str],
) -> pd.DataFrame:
    """
    Convert a timestamp column to naive wall-clock time in ``timezone_name``.

    Values are parsed as UTC, rows that fail to parse are dropped, and the
    remaining values are converted to the target zone with tzinfo removed.

    Args:
        df: Input frame (never modified; a copy is transformed).
        column: Name of the timestamp column.
        timezone_name: IANA timezone key; falsy means "leave data untouched".

    Returns:
        ``df`` itself when it is empty or no timezone is given, otherwise the
        converted copy.
    """
    if len(df) == 0 or not timezone_name:
        return df

    target_zone = ZoneInfo(timezone_name)
    result = df.copy()
    result[column] = pd.to_datetime(result[column], utc=True, errors='coerce')

    # Unparseable entries were coerced to NaT above; discard those rows.
    result = result[result[column].notna()]
    if len(result) == 0:
        return result

    result[column] = result[column].dt.tz_convert(target_zone).dt.tz_localize(None)
    return result
def format_period_display_for_report(start_value: Optional[str], end_value: Optional[str]) -> tuple[str, str]:
    """
    Produce display strings for the start and end of a report period.

    Args:
        start_value: Raw period start (may be None or blank).
        end_value: Raw period end (may be None or blank).

    Returns:
        ``(start_display, end_display)``. Each element is ``""`` for missing
        input, the unchanged text for plain ``DD-MM-YYYY`` dates or unparseable
        values, and ``DD-MM-YYYY HH:MM`` otherwise.
    """
    def _render(raw: Optional[str]) -> str:
        text = str(raw).strip() if raw else ""
        if not text:
            return ""
        try:
            # A bare day-first date is already in display form.
            if re.fullmatch(r"\d{2}-\d{2}-\d{4}", text):
                return text
            if "T" in text or "Z" in text:
                # ISO-like input: read as UTC, then shift into the system's
                # local timezone (astimezone(None)) for display.
                as_utc = pd.to_datetime(text, utc=True)
                localized = as_utc.to_pydatetime().astimezone(None)
                return localized.strftime('%d-%m-%Y %H:%M')
            parsed = pd.to_datetime(text, errors='raise')
            if not isinstance(parsed, pd.Timestamp):
                return text
            if parsed.tzinfo is not None:
                # Keep the wall-clock reading; only the tzinfo is discarded.
                parsed = parsed.tz_localize(None)
            return parsed.to_pydatetime().strftime('%d-%m-%Y %H:%M')
        except Exception:
            return text

    return _render(start_value), _render(end_value)
def get_analysis_radius_m() -> int:
    """
    Return the analysis radius in metres derived from the project config.

    Reads ``config.distance_rings`` and ``config.analysis_boundary_m``. A
    positive numeric boundary is capped at the largest ring when rings are
    present; without a usable boundary the largest ring (or 0) is returned.
    """
    from .config import config

    ring_distances = config.distance_rings or []
    largest_ring = int(max(ring_distances)) if ring_distances else 0

    limit = config.analysis_boundary_m
    has_usable_limit = isinstance(limit, (int, float)) and limit > 0
    if not has_usable_limit:
        return largest_ring
    if largest_ring > 0:
        return min(int(limit), largest_ring)
    return int(limit)
def get_storm_monitoring_radius_km() -> float:
    """
    Outer radius (km) used when filtering thunderstorm cells for the report.

    Uses ``config.analysis_boundary_m`` when it is a positive number, else
    the largest entry of ``config.distance_rings``; both are divided by 1000.
    Falls back to 50.0 when neither is available.
    """
    from .config import config

    boundary_m = config.analysis_boundary_m
    if isinstance(boundary_m, (int, float)) and boundary_m > 0:
        return float(boundary_m) / 1000.0

    ring_distances = config.distance_rings or []
    if not ring_distances:
        return 50.0
    return float(max(ring_distances)) / 1000.0
def get_turbine_color_by_fixed_intervals(risk_log_value: float) -> str:
    """
    Map a risk score onto a colour using fixed, half-open score bands.

    The same bands are used for every group and table so colours stay
    comparable across the report.

    Args:
        risk_log_value: Log-transformed risk score

    Returns:
        Hex colour string for the turbine
    """
    # (exclusive upper bound, colour), ordered from lowest to highest risk.
    # Palette: 577590, 43AA8B, 90BE6D, F9C74F, F8961E, F3722C, F94144, plus
    # two darker reds for the top bands.
    upper_bounds_and_colors = (
        (0.1, '#577590'),
        (0.2, '#43AA8B'),
        (0.4, '#90BE6D'),
        (0.6, '#F9C74F'),
        (0.8, '#F8961E'),
        (1.0, '#F3722C'),
        (1.2, '#F94144'),
        (1.4, '#D32F2F'),
    )
    for upper_bound, color in upper_bounds_and_colors:
        if risk_log_value < upper_bound:
            return color
    # At or above the last bound (NaN also lands here, since every
    # comparison with NaN is False).
    return '#B71C1C'
def get_risk_definition_by_fixed_intervals(
    risk_log_value: float,
    language: str | None = None,
) -> str:
    """
    Look up the textual risk definition for a risk score.

    Delegates to ``src.reporting.strings.get_risk_definition``.

    Args:
        risk_log_value: Log-transformed risk score
        language: Language passed to the lookup; when None the value of
            ``get_report_language()`` is used instead.

    Returns:
        Whatever ``get_risk_definition`` returns for the score and language
    """
    from src.reporting.strings import get_risk_definition, get_report_language

    if language is None:
        language = get_report_language()
    return get_risk_definition(risk_log_value, language)
def get_turbine_colors_by_fixed_intervals(risk_log_values: List[float]) -> List[str]:
    """
    Colour every score in a list using the fixed risk bands.

    Args:
        risk_log_values: List of log-transformed risk scores

    Returns:
        List of colour strings, one per input score and in the same order
    """
    colors: List[str] = []
    for score in risk_log_values:
        colors.append(get_turbine_color_by_fixed_intervals(score))
    return colors
def safe_datetime_conversion(time_str: str) -> Optional[datetime]:
    """
    Safely convert a value to a datetime with error handling.

    ``datetime`` instances (including ``pd.Timestamp``) are returned as-is.
    Strings are matched against a fixed list of layouts, first in full (so
    fractional seconds are kept) and then truncated to 19 characters (so a
    trailing suffix such as a UTC offset is ignored, as before). Anything
    else falls back to ``pd.to_datetime``.

    Args:
        time_str: String representation of datetime (a datetime is accepted too)

    Returns:
        datetime object or None if the value is missing or conversion fails
    """
    # Must come before any slicing: datetime objects are not subscriptable,
    # so checking this later would raise instead of passing them through.
    if isinstance(time_str, datetime):
        return None if pd.isna(time_str) else time_str

    if not time_str or pd.isna(time_str):
        return None

    if isinstance(time_str, str):
        formats = (
            '%Y-%m-%d %H:%M:%S',
            '%Y-%m-%d %H:%M:%S.%f',
            '%Y-%m-%dT%H:%M:%S',
            '%Y-%m-%dT%H:%M:%S.%f',
            '%Y-%m-%d',
        )
        # The full string goes first so the '%f' layouts can actually match;
        # the 19-character prefix preserves the historical lenient behaviour.
        candidates = [time_str]
        if len(time_str) > 19:
            candidates.append(time_str[:19])
        for candidate in candidates:
            for fmt in formats:
                try:
                    return datetime.strptime(candidate, fmt)
                except ValueError:
                    continue

    # Try pandas parsing as fallback
    try:
        parsed = pd.to_datetime(time_str, errors='coerce')
    except Exception:
        parsed = None
    if isinstance(parsed, pd.Timestamp) and not pd.isna(parsed):
        return parsed.to_pydatetime()

    logger.error(f"Failed to convert datetime: {time_str}")
    return None
def load_json_data(file_path: str) -> Dict[str, Any]:
    """
    Read a UTF-8 JSON file and return its decoded content.

    Every failure is logged and then re-raised unchanged, so callers see the
    original exception.

    Args:
        file_path: Path to JSON file

    Returns:
        The decoded JSON document (annotated as a dictionary)

    Raises:
        FileNotFoundError: If file doesn't exist
        json.JSONDecodeError: If JSON is invalid
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            payload = json.loads(handle.read())
        logger.info(f"Successfully loaded JSON data from {file_path}")
        return payload
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
        raise
    except json.JSONDecodeError as e:
        logger.error(f"Invalid JSON in {file_path}: {e}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error loading {file_path}: {e}")
        raise
def filter_lightning_data_by_date_range(lightning_df: pd.DataFrame, start_date: Optional[str] = None, end_date: Optional[str] = None) -> pd.DataFrame:
    """
    Filter lightning data by an inclusive date/time range.

    Accepted bound formats: ``DD-MM-YYYY``, ``DD-MM-YYYY HH:MM``, or any
    string ``pd.to_datetime`` understands (e.g. ISO 8601). An end bound given
    as a bare date covers that whole day and one given to the minute covers
    that whole minute, including sub-second timestamps in the final second.

    Args:
        lightning_df: DataFrame containing lightning data with 'local_time' column
        start_date: Lower bound, or None for no lower bound
        end_date: Upper bound, or None for no upper bound

    Returns:
        Filtered DataFrame containing only lightning data within the specified
        date range. The input frame itself is returned when both bounds are
        None or when a supplied bound cannot be parsed.
    """
    if start_date is None and end_date is None:
        return lightning_df

    def _parse_flexible_datetime(value: Optional[str], is_end: bool = False) -> Optional[datetime]:
        """Parse one bound; end bounds are widened to the end of their unit."""
        if value is None:
            return None
        value_str = str(value).strip()
        if not value_str:
            return None
        try:
            if re.fullmatch(r"\d{2}-\d{2}-\d{4} \d{2}:\d{2}", value_str):
                dt = datetime.strptime(value_str, '%d-%m-%Y %H:%M')
                if is_end:
                    # Include the full minute, down to the last microsecond.
                    dt = dt.replace(second=59, microsecond=999999)
                return dt
            if re.fullmatch(r"\d{2}-\d{2}-\d{4}", value_str):
                dt = datetime.strptime(value_str, '%d-%m-%Y')
                if is_end:
                    # Include the full day, down to the last microsecond.
                    dt = dt.replace(hour=23, minute=59, second=59, microsecond=999999)
                return dt
            ts = pd.to_datetime(value_str, errors='raise')
            if isinstance(ts, pd.Timestamp):
                if ts.tzinfo is not None:
                    # NOTE(review): aware bounds are normalised to UTC while
                    # the data column is local time - confirm this is intended
                    # when the configured timezone is not UTC.
                    ts = ts.tz_convert('UTC').tz_localize(None)
                return ts.to_pydatetime()
        except Exception as e:
            logger.error(f"Invalid datetime value: {value_str}. Error: {e}")
            return None
        return None

    df = lightning_df.copy()
    # Convert anything that is not already datetime-typed (object columns as
    # well as dedicated string dtypes) so the .dt accessor below is valid.
    if not pd.api.types.is_datetime64_any_dtype(df['local_time']):
        df['local_time'] = pd.to_datetime(df['local_time'])
    if df['local_time'].dt.tz is not None:
        from src.config import config
        tz_name = getattr(config, 'timezone', None) or 'UTC'
        df['local_time'] = df['local_time'].dt.tz_convert(tz_name).dt.tz_localize(None)

    start_dt = _parse_flexible_datetime(start_date, is_end=False)
    end_dt = _parse_flexible_datetime(end_date, is_end=True)
    if start_date and start_dt is None:
        logger.error(f"Invalid start_date value: {start_date}. Expected 'DD-MM-YYYY' or ISO datetime string.")
        return lightning_df
    if end_date and end_dt is None:
        logger.error(f"Invalid end_date value: {end_date}. Expected 'DD-MM-YYYY' or ISO datetime string.")
        return lightning_df

    # Apply date filtering
    if start_dt and end_dt:
        mask = (df['local_time'] >= start_dt) & (df['local_time'] <= end_dt)
        filtered_df = df[mask]
        logger.info(f"Filtered lightning data from {len(lightning_df)} to {len(filtered_df)} records ({start_date} to {end_date})")
        return filtered_df
    if start_dt:
        mask = df['local_time'] >= start_dt
        filtered_df = df[mask]
        logger.info(f"Filtered lightning data from {len(lightning_df)} to {len(filtered_df)} records (from {start_date})")
        return filtered_df
    if end_dt:
        mask = df['local_time'] <= end_dt
        filtered_df = df[mask]
        logger.info(f"Filtered lightning data from {len(lightning_df)} to {len(filtered_df)} records (until {end_date})")
        return filtered_df
    # Both bounds were blank strings: nothing to filter on.
    return df
def validate_lightning_data(df: pd.DataFrame) -> bool:
    """
    Check that a lightning DataFrame has the expected columns and values.

    An empty frame is accepted (with a warning). Otherwise the required
    columns must exist, 'lat'/'lng'/'current' must be numeric, and the
    coordinates must lie in valid ranges. Unexpected 'p_type' values only
    produce a warning.

    Args:
        df: Lightning DataFrame

    Returns:
        True if valid, False otherwise
    """
    # Handle empty dataset gracefully
    if len(df) == 0:
        logger.warning("Lightning dataset is empty - this is acceptable for analysis")
        return True

    # Check required columns
    expected = ('lat', 'lng', 'current', 'p_type', 'local_time')
    absent = [name for name in expected if name not in df.columns]
    if absent:
        logger.error(f"Missing required columns: {absent}")
        return False

    # Check data types
    numeric_checks = (
        ('lat', "Latitude column must be numeric"),
        ('lng', "Longitude column must be numeric"),
        ('current', "Current column must be numeric"),
    )
    for name, message in numeric_checks:
        if not pd.api.types.is_numeric_dtype(df[name]):
            logger.error(message)
            return False

    # Check coordinate ranges
    range_checks = (
        ('lat', -90, 90, "Latitude values must be between -90 and 90"),
        ('lng', -180, 180, "Longitude values must be between -180 and 180"),
    )
    for name, lowest, highest, message in range_checks:
        if not df[name].between(lowest, highest).all():
            logger.error(message)
            return False

    # Check p_type values (compared as text, so 0/1 and '0'/'1' both pass)
    recognised = df['p_type'].astype(str).isin(['0', '1'])
    unrecognised_count = int((~recognised).sum())
    if unrecognised_count > 0:
        logger.warning(f"Found {unrecognised_count} invalid p_type values")

    logger.info(f"Lightning data validation passed: {len(df)} records")
    return True
def validate_turbine_data(df: pd.DataFrame) -> bool:
    """
    Check that a turbine DataFrame has the expected columns and values.

    The 'lat', 'lng' and 'name' columns must exist, both coordinates must be
    numeric, and they must lie in valid latitude/longitude ranges.

    Args:
        df: Turbine DataFrame

    Returns:
        True if valid, False otherwise
    """
    # Check required columns
    absent = [name for name in ('lat', 'lng', 'name') if name not in df.columns]
    if absent:
        logger.error(f"Missing required columns: {absent}")
        return False

    # Check data types
    numeric_checks = (
        ('lat', "Latitude column must be numeric"),
        ('lng', "Longitude column must be numeric"),
    )
    for name, message in numeric_checks:
        if not pd.api.types.is_numeric_dtype(df[name]):
            logger.error(message)
            return False

    # Check coordinate ranges
    range_checks = (
        ('lat', -90, 90, "Latitude values must be between -90 and 90"),
        ('lng', -180, 180, "Longitude values must be between -180 and 180"),
    )
    for name, lowest, highest, message in range_checks:
        if not df[name].between(lowest, highest).all():
            logger.error(message)
            return False

    logger.info(f"Turbine data validation passed: {len(df)} records")
    return True
def ensure_datetime_column(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Make sure an object-typed column holds datetimes.

    Args:
        df: DataFrame
        column: Column name to convert

    Returns:
        ``df`` itself when it is empty or the column is not of object dtype;
        otherwise a copy whose column was parsed with ``pd.to_datetime``
        (unparseable entries become NaT).
    """
    # Empty frames are returned before the column is even looked up.
    needs_conversion = len(df) != 0 and df[column].dtype == 'object'
    if not needs_conversion:
        return df

    converted = df.copy()
    converted[column] = pd.to_datetime(converted[column], errors='coerce')
    logger.info(f"Converted {column} to datetime")
    return converted