BE-LightningReport/batch_generate.py

#!/usr/bin/env python3
"""
Batch generate lightning reports for multiple wind farms.
"""

import sys
import json
import argparse
import logging
from datetime import datetime
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv

from src.api.data_fetcher import APIDataFetcher
from src.data.loader import load_turbine_data, load_lightning_data_from_csv
from src.analysis.risk import calculate_turbine_risks
from src.analysis.grouping import create_turbine_groups
from src.reporting.docx import create_docx_report
from src.reporting.filename_utils import farm_local_date_range_from_config, slugify_ascii_underscore
from src.utils import (
    filter_lightning_data_by_date_range,
    format_date_ddmmyyyy,
    format_period_display_for_report,
    normalize_local_time_to_timezone,
)
from src.config import config as global_config

# python-dotenv: pull variables from a local .env file (if present) into the
# process environment.
load_dotenv()

# Log to stdout and to a per-day log file created in the current working
# directory. The file name is fixed at import time from today's date.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        # Explicit UTF-8, like every other file this module opens, so log
        # lines with non-ASCII text (e.g. farm names) do not depend on the
        # platform's default encoding.
        logging.FileHandler(
            f'batch_generation_{datetime.now().strftime("%Y-%m-%d")}.log',
            encoding='utf-8',
        ),
    ]
)
logger = logging.getLogger(__name__)


def load_wind_farms_config(config_path: str) -> dict:
    """Read the wind farms configuration JSON and return the parsed object.

    Args:
        config_path: Path of the JSON configuration file (read as UTF-8).

    Returns:
        The value produced by ``json.load`` for the file.

    Raises:
        FileNotFoundError: The file does not exist (logged, then re-raised).
        json.JSONDecodeError: The file is not valid JSON (logged, then
            re-raised).
    """
    try:
        with open(config_path, 'r', encoding='utf-8') as handle:
            loaded = json.load(handle)
    except FileNotFoundError:
        logger.error(f"Configuration file not found: {config_path}")
        raise
    except json.JSONDecodeError as exc:
        logger.error(f"Invalid JSON in configuration file: {exc}")
        raise
    else:
        # Only reached when the file was opened and parsed successfully.
        logger.info(f"Loaded configuration from {config_path}")
        return loaded


def filter_enabled_farms(wind_farms: list) -> tuple:
    """Split farms into ``(enabled, disabled)`` lists, keeping input order.

    A farm without an ``enabled`` key counts as enabled; otherwise the
    truthiness of the stored value decides which list it lands in.
    """
    buckets = {True: [], False: []}
    for entry in wind_farms:
        buckets[bool(entry.get('enabled', True))].append(entry)
    return buckets[True], buckets[False]


def get_location_bounds(farm: dict, turbine_df: pd.DataFrame, api_fetcher: APIDataFetcher) -> dict:
    """Resolve the search area used for API queries for one farm.

    Reads ``farm['api_params']['location_bounds']``. When its ``method`` is
    ``'auto'`` the bounds are delegated to
    ``api_fetcher.calculate_location_bounds`` using the largest entry of
    ``farm['distance_rings']`` and ``padding_km`` (default 5). For any other
    method the configured ``center_lat``, ``center_lng`` and ``radius_km``
    are returned as-is.
    """
    bounds_cfg = farm['api_params']['location_bounds']

    # Anything that is not 'auto' is treated as explicitly configured bounds.
    if bounds_cfg['method'] != 'auto':
        return {key: bounds_cfg[key] for key in ('center_lat', 'center_lng', 'radius_km')}

    outermost_ring = max(farm['distance_rings'])
    padding = bounds_cfg.get('padding_km', 5)
    return api_fetcher.calculate_location_bounds(turbine_df, outermost_ring, padding)


def update_global_config(farm: dict, start_date: str = None, end_date: str = None):
    """Copy one farm's settings onto the shared ``global_config`` object.

    Settings the farm omits (``distance_rings``, ``ring_colors``,
    ``lightning_source_type``, ``grouping_params``) and dates that are not
    passed keep whatever value ``global_config`` currently holds.

    Args:
        farm: Farm entry from the configuration; ``farm['report_config']``
            must exist.
        start_date: Analysis start date for reporting; applied only when
            both dates are truthy.
        end_date: Analysis end date for reporting; applied only when both
            dates are truthy.
    """
    global_config.distance_rings = farm.get('distance_rings', global_config.distance_rings)
    global_config.ring_colors = farm.get('ring_colors', global_config.ring_colors)
    # The top-level `name` field of the farm drives the DOCX title.
    global_config.wind_farm_name = farm.get('name', 'Unknown')
    global_config.timezone = farm['report_config'].get('timezone', None)

    # Lightning source: record the type and the matching input path.
    source_kind = farm.get('lightning_source_type')
    if source_kind:
        global_config.lightning_source_type = source_kind
        for kind, attribute in (('csv', 'lightning_csv'), ('api', 'lightning_json')):
            if source_kind == kind:
                setattr(global_config, attribute, farm.get(attribute))
                break

    # Reporting date range is all-or-nothing.
    if start_date and end_date:
        global_config.analysis_start_date = start_date
        global_config.analysis_end_date = end_date

    # Farm-specific grouping parameters replace the current ones wholesale.
    if 'grouping_params' in farm:
        global_config.grouping_params = farm['grouping_params']

    logger.debug(f"Updated global config: distance_rings={global_config.distance_rings}, wind_farm_name={global_config.wind_farm_name}")


def convert_api_response_to_dataframe(records: list, data_type: str = 'lightning') -> pd.DataFrame:
    """
    Convert API response to DataFrame format expected by existing code.

    For lightning data, ``local_time`` is parsed with ``pd.to_datetime``
    (falling back to ``timestamp`` when ``local_time`` is absent) and a
    ``current_abs`` column is derived from ``current`` when not supplied.
    Other data types are converted without any post-processing.

    Args:
        records: List of records from API (``None`` or empty is allowed)
        data_type: 'lightning' or 'storm'

    Returns:
        DataFrame in expected format. With no records, lightning yields an
        empty frame carrying the lightning columns (including
        ``current_abs``, so the schema matches the non-empty case); any other
        data type yields a completely empty frame.
    """
    if not records:
        if data_type == 'lightning':
            return pd.DataFrame(
                columns=['lat', 'lng', 'current', 'p_type', 'local_time', 'current_abs']
            )
        return pd.DataFrame()

    df = pd.DataFrame(records)
    if data_type != 'lightning':
        return df

    # Prefer an existing local_time column; otherwise derive it from timestamp.
    if 'local_time' in df.columns:
        df['local_time'] = pd.to_datetime(df['local_time'])
    elif 'timestamp' in df.columns:
        df['local_time'] = pd.to_datetime(df['timestamp'])

    # Never overwrite a current_abs value supplied by the API.
    if 'current_abs' not in df.columns and 'current' in df.columns:
        df['current_abs'] = df['current'].abs()

    return df


def _resolve_report_period(source_type: str, lightning_df: pd.DataFrame, method,
                           date_range_cfg: dict, start_filter, end_filter,
                           query_start, query_end) -> tuple:
    """Return the (start, end) display strings shown in the report."""
    # CSV input with data: the period is whatever the file actually covers.
    if source_type == 'csv' and len(lightning_df) > 0:
        local_times = pd.to_datetime(lightning_df['local_time'])
        start_val = local_times.min()
        end_val = local_times.max()
        return start_val.strftime('%d-%m-%Y %H:%M'), end_val.strftime('%d-%m-%Y %H:%M')

    manual_range_set = (
        method == 'manual'
        and date_range_cfg.get('start_date') is not None
        and date_range_cfg.get('end_date') is not None
    )
    if manual_range_set or (start_filter is not None and end_filter is not None):
        actual_start, actual_end = format_period_display_for_report(start_filter, end_filter)
        if actual_start and actual_end:
            return actual_start, actual_end

    # Fall back to the dates that were used for the API query.
    return format_date_ddmmyyyy(query_start), format_date_ddmmyyyy(query_end)


def process_farm(farm: dict, api_fetcher: APIDataFetcher, config: dict) -> dict:
    """Process a single farm and generate its DOCX report.

    Steps: apply the farm settings to the global config, load turbine
    coordinates, obtain lightning data (CSV file or API), fetch storm data,
    filter/normalize lightning timestamps for API data, compute turbine
    risks and groups, then write the report into the farm's
    ``report_config.output_directory``.

    Args:
        farm: Farm entry from the configuration; ``farm_id``,
            ``coordinates_file``, ``api_params`` and ``report_config`` are
            read from it.
        api_fetcher: Fetcher used for location bounds, lightning and storm
            queries.
        config: Full configuration; ``config['api_config']`` is passed to
            the query date-range resolution.

    Returns:
        A result dict with ``status`` ``'success'`` (including report path,
        location, timings and record counts) or ``'failed'`` (including the
        error text). Exceptions raised while processing are logged and turned
        into a ``'failed'`` result rather than propagated.

    Raises:
        KeyError: If ``farm`` has no ``farm_id`` (read before error handling
            starts).
    """
    farm_id = farm['farm_id']
    farm_name = farm.get('name', farm_id)

    logger.info(f"Processing farm: {farm_id} ({farm_name})")

    # Taken before the try block so the failure path can always report the
    # elapsed time.
    start_time = datetime.now()

    try:
        # Update global config with farm-specific settings BEFORE processing
        # (dates will be set later after they're determined)
        update_global_config(farm)

        turbine_file = farm['coordinates_file']
        turbine_df = load_turbine_data(turbine_file)
        logger.info(f"Loaded {len(turbine_df)} turbines")

        location_bounds = get_location_bounds(farm, turbine_df, api_fetcher)

        query_start, query_end = APIDataFetcher.determine_query_date_range(farm, config['api_config'])
        start_date_str = query_start.strftime('%Y-%m-%d')
        end_date_str = query_end.strftime('%Y-%m-%d')

        source_type = farm.get('lightning_source_type', 'api')

        if source_type == 'csv':
            lightning_df = load_lightning_data_from_csv(farm.get('lightning_csv'))
            logger.info(f"Loaded {len(lightning_df)} lightning records from CSV for {farm_id}")
        else:
            logger.info(f"Fetching lightning data from API for period: {start_date_str} to {end_date_str}")
            lightning_records = api_fetcher.fetch_lightning_data(
                center_lat=location_bounds['center_lat'],
                center_lng=location_bounds['center_lng'],
                radius_km=location_bounds['radius_km'],
                start_date=start_date_str,
                end_date=end_date_str
            )
            lightning_df = convert_api_response_to_dataframe(lightning_records, 'lightning')
            logger.info(f"Converted {len(lightning_df)} lightning records to DataFrame")

            if len(lightning_df) == 0:
                logger.warning(f"No lightning data found for {farm_id}")
                # Guarantee the full column set (including current_abs) for
                # the downstream analysis.
                lightning_df = pd.DataFrame(columns=['lat', 'lng', 'current', 'p_type', 'local_time', 'current_abs'])

        # Storm data always comes from the API, whatever the lightning source.
        storm_records = api_fetcher.fetch_storm_data(
            center_lat=location_bounds['center_lat'],
            center_lng=location_bounds['center_lng'],
            radius_km=location_bounds['radius_km'],
            start_date=start_date_str,
            end_date=end_date_str
        )

        date_range_cfg = farm.get('api_params', {}).get('date_range', {})
        start_filter = None
        end_filter = None
        method = date_range_cfg.get('method')

        # Date filtering and timezone normalization apply to API data only.
        if source_type != 'csv':
            if method == 'manual':
                start_filter = date_range_cfg.get('start_date')
                end_filter = date_range_cfg.get('end_date')
            else:
                query_range_cfg = date_range_cfg.get('query_range', {})
                start_filter = query_range_cfg.get('start_date')
                end_filter = query_range_cfg.get('end_date')

            if len(lightning_df) > 0 and (start_filter is not None or end_filter is not None):
                lightning_df = filter_lightning_data_by_date_range(lightning_df, start_filter, end_filter)

            farm_tz = farm.get('report_config', {}).get('timezone')
            if len(lightning_df) > 0 and farm_tz:
                lightning_df = normalize_local_time_to_timezone(lightning_df, 'local_time', farm_tz)

        turbine_df = calculate_turbine_risks(turbine_df, lightning_df)

        group_data = create_turbine_groups(turbine_df)
        logger.info(f"Created {group_data['total_groups']} groups")

        # Determine actual dates for report (display strings: DD-MM-YYYY or DD-MM-YYYY HH:MM in local time)
        actual_start, actual_end = _resolve_report_period(
            source_type, lightning_df, method, date_range_cfg,
            start_filter, end_filter, query_start, query_end,
        )

        # Update global config with the display dates for report generation
        update_global_config(farm, actual_start, actual_end)

        output_dir = Path(farm['report_config']['output_directory'])
        output_dir.mkdir(parents=True, exist_ok=True)

        local_range = farm_local_date_range_from_config(farm)
        safe_name = slugify_ascii_underscore(farm.get("name", farm_id))
        docx_filename = (
            f"{safe_name}_{local_range.start_date_yyyy_mm_dd}"
            f"_{local_range.end_date_yyyy_mm_dd}_report.docx"
        )
        docx_path = output_dir / docx_filename

        create_docx_report(
            str(docx_path),
            turbine_df,
            lightning_df,
            storm_data_path=None,
            storm_data_records=storm_records if storm_records else None,
        )

        processing_time = (datetime.now() - start_time).total_seconds()

        logger.info(f"Successfully generated report for {farm_id} in {processing_time:.1f}s")

        return {
            'farm_id': farm_id,
            'name': farm_name,
            'status': 'success',
            'report_path': str(docx_path),
            'docx_path': str(docx_path),
            'pdf_path': None,
            'location': location_bounds,
            'processing_time_seconds': processing_time,
            'lightning_records': len(lightning_df),
            # A missing/None storm response must not turn an already written
            # report into a failure.
            'storm_records': len(storm_records) if storm_records else 0
        }

    except Exception as e:
        processing_time = (datetime.now() - start_time).total_seconds()
        logger.error(f"Failed to process farm {farm_id}: {e}", exc_info=True)

        return {
            'farm_id': farm_id,
            'name': farm_name,
            'status': 'failed',
            'error': str(e),
            'processing_time_seconds': processing_time
        }


def generate_batch_summary(results: list, total_farms: int, enabled_count: int,
                          disabled_count: int, start_time: datetime) -> dict:
    """Generate batch summary report.

    Args:
        results: Per-farm result dicts; each must carry a ``status`` key.
        total_farms: Number of farms in the configuration.
        enabled_count: Number of enabled farms.
        disabled_count: Number of disabled farms; also reported as
            ``skipped``.
        start_time: When the batch started, used for the elapsed time.

    Returns:
        Summary dict with date/time stamps, the farm counts, the number of
        ``'success'`` and ``'failed'`` results, the elapsed seconds and the
        ``results`` list itself.
    """
    # One timestamp for date, time and duration so the three values can never
    # disagree (e.g. when the batch finishes right around midnight).
    finished_at = datetime.now()

    successful = sum(1 for r in results if r['status'] == 'success')
    failed = sum(1 for r in results if r['status'] == 'failed')

    return {
        'batch_date': finished_at.strftime('%Y-%m-%d'),
        'batch_time': finished_at.strftime('%H:%M:%S'),
        'total_farms': total_farms,
        'enabled_farms': enabled_count,
        'disabled_farms': disabled_count,
        'processed': len(results),
        'successful': successful,
        'failed': failed,
        'skipped': disabled_count,
        'processing_time_seconds': (finished_at - start_time).total_seconds(),
        'results': results
    }


def save_batch_summary(summary: dict, output_dir: str):
    """Write the batch summary as indented JSON and return the file path.

    The file is named ``batch_summary_<batch_date>.json`` inside
    ``output_dir``; missing parent directories are created first.
    """
    target = Path(output_dir) / f"batch_summary_{summary['batch_date']}.json"
    target.parent.mkdir(parents=True, exist_ok=True)

    with target.open('w', encoding='utf-8') as stream:
        json.dump(summary, stream, indent=2)

    logger.info(f"Batch summary saved to {target}")
    return target


def list_farms(config: dict):
    """Print a numbered overview of every configured farm and its status.

    Each line shows whether the farm is enabled (a missing ``enabled`` key
    counts as enabled), its ``farm_id`` and its ``name`` (``N/A`` when
    absent), followed by a totals line.
    """
    print("\nWind Farms Configuration:")
    print("=" * 60)

    farms = config['wind_farms']
    for position, entry in enumerate(farms, start=1):
        if entry.get('enabled', True):
            label = "✓ Enabled"
        else:
            label = "✗ Disabled"
        identifier = entry['farm_id']
        display_name = entry.get('name', 'N/A')

        print(f"{position}. {label}: {identifier} - {display_name}")

    active, inactive = filter_enabled_farms(farms)
    print(f"\nTotal: {len(farms)} farms ({len(active)} enabled, {len(inactive)} disabled)")


def main():
    """Command-line entry point: generate reports for the selected farms.

    Loads the configuration given by ``--config``, selects the farms to
    process (enabled farms by default, every farm with ``--force-all``, or a
    single farm with ``--farm-id``, which may be disabled when ``--force`` is
    also given), processes them one by one, then saves and prints a batch
    summary. ``--list-farms`` only prints the configured farms.

    Exits with status 1 when interrupted or when an unexpected error escapes
    the batch; individual farm failures are reported in the summary instead.
    """
    parser = argparse.ArgumentParser(description='Batch generate lightning reports')
    parser.add_argument('--config', required=True, help='Path to wind_farms_config.json')
    parser.add_argument('--farm-id', help='Process specific farm only')
    parser.add_argument('--force-all', action='store_true',
                        help='Process all farms, ignoring enabled flag')
    parser.add_argument('--force', action='store_true',
                        help='Process even if disabled')
    parser.add_argument('--list-farms', action='store_true',
                        help='List all farms and their enabled status')
    parser.add_argument('--output-dir', help='Override output directory')

    args = parser.parse_args()

    try:
        config = load_wind_farms_config(args.config)

        if args.list_farms:
            list_farms(config)
            return

        if args.output_dir:
            # NOTE(review): the flag is accepted but nothing consumes it yet;
            # reports go to each farm's report_config.output_directory and the
            # summary to output_base_directory. Decide which of those it
            # should override before wiring it in.
            logger.warning(
                "--output-dir is currently ignored; reports are written to each "
                "farm's report_config.output_directory"
            )

        api_config = config['api_config']
        api_fetcher = APIDataFetcher(
            base_url=api_config['base_url'],
            timeout=api_config.get('timeout_seconds', 30),
            retry_attempts=api_config.get('retry_attempts', 3)
        )

        wind_farms = config['wind_farms']
        enabled_farms, disabled_farms = filter_enabled_farms(wind_farms)

        if args.force_all:
            farms_to_process = wind_farms
            logger.info(f"Processing all {len(farms_to_process)} farms (--force-all)")
        elif args.force and args.farm_id:
            # --force lifts the enabled filter for the one farm picked with
            # --farm-id, so a disabled farm can be processed on demand.
            farms_to_process = wind_farms
        else:
            if args.force:
                logger.warning(
                    "--force has no effect without --farm-id; "
                    "use --force-all to process every farm"
                )
            farms_to_process = enabled_farms

            if disabled_farms:
                logger.info(f"Skipping {len(disabled_farms)} disabled farms:")
                for farm in disabled_farms:
                    logger.info(f"  - {farm['farm_id']}: {farm.get('name', 'N/A')}")

        if args.farm_id:
            farms_to_process = [f for f in farms_to_process if f['farm_id'] == args.farm_id]
            if not farms_to_process:
                logger.error(f"Farm '{args.farm_id}' not found or not enabled")
                return

        if not farms_to_process:
            logger.warning("No farms to process")
            return

        logger.info(f"Processing {len(farms_to_process)} farm(s)")
        start_time = datetime.now()

        results = []
        for i, farm in enumerate(farms_to_process, 1):
            logger.info(f"\n[{i}/{len(farms_to_process)}] Processing {farm['farm_id']}...")
            result = process_farm(farm, api_fetcher, config)
            results.append(result)

        # The summary reports the configured enabled/disabled split, whatever
        # selection flags were used for this run.
        summary = generate_batch_summary(
            results,
            len(wind_farms),
            len(enabled_farms),
            len(disabled_farms),
            start_time
        )

        output_base = config.get('output_base_directory', 'reports/')
        save_batch_summary(summary, output_base)

        print("\n" + "=" * 60)
        print("Batch Processing Summary")
        print("=" * 60)
        print(f"Total farms: {summary['total_farms']}")
        print(f"Enabled: {summary['enabled_farms']}")
        print(f"Disabled: {summary['disabled_farms']}")
        print(f"Processed: {summary['processed']}")
        print(f"Successful: {summary['successful']}")
        print(f"Failed: {summary['failed']}")
        print(f"Total time: {summary['processing_time_seconds']:.1f}s")
        print("=" * 60)

        if summary['failed'] > 0:
            print("\nFailed farms:")
            for result in [r for r in results if r['status'] == 'failed']:
                print(f"  - {result['farm_id']}: {result.get('error', 'Unknown error')}")

    except KeyboardInterrupt:
        logger.info("Batch processing interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Batch processing failed: {e}", exc_info=True)
        sys.exit(1)


# Run the batch only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()