#!/usr/bin/env python3
"""
CLI tool to load CSV data into ClickHouse from dataset configurations.

Automatically discovers all config/dataset_*.yaml files and loads the corresponding CSV data.

Usage:
    python load_csv_to_clickhouse.py                    # Load all datasets
    python load_csv_to_clickhouse.py --dataset timeline # Load specific dataset
    python load_csv_to_clickhouse.py --dry-run          # Show what would be done
    python load_csv_to_clickhouse.py --drop-existing    # Drop and recreate tables
"""

import argparse
import sys
import os
import re
import time
from pathlib import Path
from typing import Dict, Any, List, Tuple
import logging

import yaml
import pandas as pd
import clickhouse_connect
from dotenv import load_dotenv
from rich.console import Console
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, TextColumn

# Setup paths
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

console = Console()


def load_env() -> Dict[str, Any]:
    """Load ClickHouse settings from .env file."""
    env_path = PROJECT_ROOT / '.env'
    load_dotenv(env_path)
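
    # Illustrative .env contents this loader reads (values below are placeholders,
    # not taken from any real deployment; only the variable names are fixed):
    #
    #   CLICKHOUSE_HOST=localhost
    #   CLICKHOUSE_PORT=8123
    #   CLICKHOUSE_USER=default
    #   CLICKHOUSE_PASSWORD=
    #   CLICKHOUSE_DATABASE=default
    #   CLICKHOUSE_INTERFACE=http   # optional; auto-detected from the port if omitted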

    # Determine the protocol: use CLICKHOUSE_INTERFACE if set,
    # otherwise infer it from the port: 8123 = http, 9000/9440 = native
    port = int(os.getenv('CLICKHOUSE_PORT', 8123))
    interface = os.getenv('CLICKHOUSE_INTERFACE')

    if not interface:
        # Auto-detect the protocol from the port
        if port == 8123:
            interface = 'http'
        elif port in (9000, 9440):
            interface = 'native'
        else:
            interface = 'http'  # Default to HTTP

    return {
        'host': os.getenv('CLICKHOUSE_HOST', 'localhost'),
        'port': port,
        'user': os.getenv('CLICKHOUSE_USER', 'default'),
        'password': os.getenv('CLICKHOUSE_PASSWORD', ''),
        'database': os.getenv('CLICKHOUSE_DATABASE', 'default'),
        'interface': interface
    }


def discover_datasets(config_dir: Path) -> List[Path]:
    """Find all dataset_*.yaml files in config directory."""
    pattern = 'dataset_*.yaml'
    datasets = list(config_dir.glob(pattern))
    logger.info(f"Found {len(datasets)} dataset configuration files")
    return sorted(datasets)


def load_dataset_config(yaml_path: Path) -> Dict[str, Any]:
    """Load dataset configuration from YAML file."""
    with open(yaml_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    return config.get('dataset', {})


def get_clickhouse_client(config: Dict[str, Any], timeout: int = 300):
    """
    Create ClickHouse client connection.
    
    Args:
        config: ClickHouse connection config
        timeout: Request timeout in seconds (default 300 = 5 minutes for large uploads)
    """
    try:
        interface = config['interface']

        client = clickhouse_connect.get_client(
            host=config['host'],
            port=config['port'],
            username=config['user'],
            password=config['password'],
            database=config['database'],
            interface=interface,
            connect_timeout=60,  # Connection timeout
            send_receive_timeout=timeout,  # Request timeout for large uploads
            # Disable session reuse to avoid SESSION_IS_LOCKED errors
            session_id=None  # Each client gets a new session
        )

        logger.info(f"✅ Connected to ClickHouse ({interface}): {config['host']}:{config['port']}/{config['database']} (timeout={timeout}s)")
        return client
    except Exception as e:
        logger.error(f"❌ Failed to connect to ClickHouse: {e}")
        raise


def clean_numeric_column(series: pd.Series) -> pd.Series:
    """
    Convert European number format (1.234,56) to standard format (1234.56).
    Handles both dot and comma as decimal separators.
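
    Intended conversions (illustrative values, not an exhaustive spec):
        "1.234,56" -> 1234.56   (dot = thousands separator, comma = decimal)
        "12,5"     -> 12.5      (comma alone is treated as the decimal separator)
        "n/a"      -> None      (unparseable values become NULL)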
    """
    def clean_value(val):
        if pd.isna(val):
            return None
        val_str = str(val).strip()

        # Check if it's already a number
        try:
            return float(val_str)
        except ValueError:
            pass

        # European format: 1.234,56 -> 1234.56
        if ',' in val_str and '.' in val_str:
            # Remove thousands separator (.), replace comma with dot
            val_str = val_str.replace('.', '').replace(',', '.')
        elif ',' in val_str:
            # Only comma, assume it's decimal separator
            val_str = val_str.replace(',', '.')

        try:
            return float(val_str)
        except ValueError:
            return None

    return series.apply(clean_value)


def infer_clickhouse_type(dtype, col_name: str, sample_values: pd.Series) -> str:
    """Map pandas dtype to ClickHouse type with better inference."""
    # Check for numeric patterns in column name
    is_numeric_name = any(keyword in col_name.lower() for keyword in [
        'sum', 'fsum', 'count', 'total', 'volume', 'amount', 'qty', 'quantity',
        'usd', 'price', 'revenue', 'cost', '9l', 'cs'
    ])

    # Integer types: pick a width that covers the sampled value range
    if 'int' in str(dtype):
        if len(sample_values) > 0:
            min_val, max_val = sample_values.min(), sample_values.max()
        else:
            min_val = max_val = 0
        if -2**31 <= min_val and max_val < 2**31:
            return 'Int32'
        elif min_val >= 0 and max_val >= 2**63:
            return 'UInt64'
        else:
            return 'Int64'

    # Float types or numeric column names with string data
    elif 'float' in str(dtype) or (is_numeric_name and str(dtype) == 'object'):
        # Use Decimal for financial data
        if any(keyword in col_name.lower() for keyword in ['usd', 'price', 'revenue', 'cost']):
            return 'Decimal(18, 3)'
        return 'Float64'

    # Boolean
    elif 'bool' in str(dtype):
        return 'Bool'

    # Date/DateTime
    elif 'datetime' in str(dtype):
        return 'DateTime'

    # String (default)
    else:
        # Check if it's a dimension/key (low cardinality)
        if any(keyword in col_name.lower() for keyword in ['key_', 'name', 'status', 'type', 'brand', 'category']):
            return 'LowCardinality(String)'
        return 'String'


def create_table_from_schema_file(
    client,
    table_name: str,
    schema_path: Path,
    drop_existing: bool = False
) -> Tuple[List[str], bool]:
    """
    Create table(s) from SQL schema file.

    Schema file may contain:
    - CREATE TABLE statements (for staging and/or final tables)
    - CREATE MATERIALIZED VIEW (for automatic migration via ClickHouse)

    Returns: (List of created table names, has_materialized_view)
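
    Sketch of the expected schema-file shape (table and column names here are
    hypothetical): a staging table, a final table, and a Materialized View that
    copies rows from staging to final on INSERT:

        CREATE TABLE IF NOT EXISTS sales_raw (raw_value String) ENGINE = MergeTree ORDER BY tuple();
        CREATE TABLE IF NOT EXISTS sales (value Float64) ENGINE = MergeTree ORDER BY tuple();
        CREATE MATERIALIZED VIEW IF NOT EXISTS sales_mv TO sales AS
            SELECT toFloat64OrZero(raw_value) AS value FROM sales_raw;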
    """
    try:
        logger.info(f"Reading SQL schema from {schema_path}")
        with open(schema_path, 'r', encoding='utf-8') as f:
            sql_content = f.read()

        # Strip ALL comments from the SQL before parsing: leftover comment text
        # between statements would end up in the fragments produced by the
        # semicolon split below.

        # Remove single-line comments (-- ...) line by line, preserving any
        # '--' that appears inside string literals.
        lines = []
        in_string = False
        string_char = None
        
        for line in sql_content.split('\n'):
            new_line = []
            i = 0
            while i < len(line):
                char = line[i]
                
                # Track string literals (single or double quotes)
                if char in ("'", '"'):
                    # Check if escaped
                    if i == 0 or line[i-1] != '\\':
                        if not in_string:
                            in_string = True
                            string_char = char
                        elif char == string_char:
                            in_string = False
                            string_char = None
                
                # If we hit -- and we're not in a string, remove rest of line
                if not in_string and i < len(line) - 1 and line[i:i+2] == '--':
                    break
                
                new_line.append(char)
                i += 1
            
            # Only add line if it has content (not just whitespace after comment removal)
            cleaned_line = ''.join(new_line).rstrip()
            if cleaned_line:
                lines.append(cleaned_line)
        
        sql_content = '\n'.join(lines)
        
        # Also remove multi-line comments (/* ... */) if any
        sql_content = re.sub(r'/\*.*?\*/', '', sql_content, flags=re.DOTALL)

        # Split by semicolons to get individual statements
        statements = [s.strip() for s in sql_content.split(';') if s.strip()]

        if drop_existing:
            # Drop tables and views if requested
            # IMPORTANT: Drop MV first, then tables
            drop_statements = [
                f"DROP VIEW IF EXISTS {table_name}_mv",  # Drop materialized view first
                f"DROP TABLE IF EXISTS {table_name}",
                f"DROP TABLE IF EXISTS {table_name}_raw"
            ]
            for drop_sql in drop_statements:
                try:
                    client.command(drop_sql)
                    logger.info(f"✅ Dropped: {drop_sql}")
                except Exception as e:
                    logger.debug(f"Drop error (may not exist): {e}")

        # Execute CREATE statements
        created_tables = []
        has_materialized_view = False

        for i, sql in enumerate(statements):
            # Remove leading/trailing whitespace but keep the statement intact
            sql = sql.strip()
            if not sql:
                continue

            # Uppercase copy for keyword detection (comments were already stripped above)
            sql_upper = sql.upper()

            # Check if this is a CREATE MATERIALIZED VIEW statement
            if 'CREATE MATERIALIZED VIEW' in sql_upper:
                # Search the original string: .upper() can change string length for some characters, shifting indices
                mv_match = re.search(r'CREATE\s+MATERIALIZED\s+VIEW', sql, re.IGNORECASE)
                if not mv_match:
                    continue
                mv_statement = sql[mv_match.start():]
                first_line = mv_statement.split('\n')[0].strip()

                try:
                    logger.info(f"🔄 Creating Materialized View for auto-migration")
                    logger.debug(f"SQL: {first_line[:150]}...")

                    client.command(mv_statement)
                    has_materialized_view = True
                    logger.info(f"✅ Created Materialized View: ClickHouse will auto-migrate data on INSERT")
                except Exception as e:
                    logger.error(f"❌ Failed to create Materialized View: {e}")
                    logger.debug(f"Failed SQL: {sql}")
                    logger.warning(f"⚠️ Migration will not work without Materialized View")

            # Check if this statement contains CREATE TABLE
            elif 'CREATE TABLE' in sql_upper:
                # Extract the part starting from CREATE TABLE.
                # Search the original string: .upper() can change string length
                # for some characters, so indices from sql_upper may not line up.
                match = re.search(r'CREATE\s+TABLE', sql, re.IGNORECASE)
                if not match:
                    continue
                create_statement = sql[match.start():]

                # Get the first line of the CREATE statement for table name
                first_code_line = create_statement.split('\n')[0].strip()

                try:
                    logger.info(f"Executing CREATE TABLE statement {i+1}/{len(statements)}")
                    logger.debug(f"First 500 chars of statement: {create_statement[:500]!r}")

                    # Use only the part starting from CREATE TABLE (remove any garbage before it)
                    client.command(create_statement)

                    # Extract the table name from the CREATE TABLE statement (case-insensitive)
                    name_match = re.search(
                        r'CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?([`"\w.]+)',
                        first_code_line,
                        re.IGNORECASE
                    )
                    created_table = name_match.group(1).strip('`"') if name_match else table_name

                    created_tables.append(created_table)
                    logger.info(f"✅ Created table: {created_table}")
                except Exception as e:
                    logger.error(f"❌ Failed to execute statement: {e}")
                    logger.debug(f"Failed SQL: {sql}")
                    return [], False

        if len(created_tables) == 0:
            logger.warning(f"⚠️ No CREATE TABLE statements found in {schema_path}")
            return [], False

        logger.info(f"✅ Created {len(created_tables)} table(s) from schema file: {schema_path}")
        return created_tables, has_materialized_view

    except Exception as e:
        logger.error(f"❌ Failed to create tables from schema: {e}", exc_info=True)
        return [], False


def create_table_from_csv(
    client,
    table_name: str,
    df: pd.DataFrame,
    drop_existing: bool = False
) -> bool:
    """
    Auto-generate table schema from CSV DataFrame and create table.
    """
    try:
        if drop_existing:
            drop_sql = f"DROP TABLE IF EXISTS {table_name}"
            client.command(drop_sql)
            logger.info(f"Dropped existing table: {table_name}")

        # Build column definitions
        columns = []
        for col in df.columns:
            ch_type = infer_clickhouse_type(df[col].dtype, col, df[col])

            # Add Nullable if column has nulls
            if df[col].isna().any():
                if not ch_type.startswith('Nullable'):
                    ch_type = f'Nullable({ch_type})'

            columns.append(f"    `{col}` {ch_type}")

        columns_sql = ",\n".join(columns)

        create_sql = f"""
CREATE TABLE IF NOT EXISTS {table_name}
(
{columns_sql}
)
ENGINE = MergeTree()
ORDER BY tuple()
"""

        logger.info(f"Creating table with auto-generated schema:\n{create_sql}")
        client.command(create_sql)

        logger.info(f"✅ Table created with auto-generated schema: {table_name}")
        return True

    except Exception as e:
        logger.error(f"❌ Failed to create table: {e}")
        return False


def clean_dataframe_for_clickhouse(df: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare DataFrame for ClickHouse insertion.
    Convert European number formats and handle data types.
    """
    df_clean = df.copy()

    for col in df_clean.columns:
        # Try to detect and clean numeric columns
        if df_clean[col].dtype == 'object':
            # Sample a few values to check if numeric
            sample = df_clean[col].dropna().head(10)

            if len(sample) > 0:
                # Heuristic: if the first sampled value contains digits, try numeric cleaning
                sample_str = str(sample.iloc[0])
                if any(c.isdigit() for c in sample_str):
                    # Try cleaning as numeric
                    try:
                        cleaned = clean_numeric_column(df_clean[col])
                        # Check if conversion was successful
                        if cleaned.notna().sum() > len(df_clean) * 0.5:  # At least 50% success
                            df_clean[col] = cleaned
                            logger.debug(f"Cleaned numeric column: {col}")
                    except Exception as e:
                        logger.debug(f"Column {col} not numeric: {e}")

    return df_clean


def load_csv_to_table(
    client,
    table_name: str,
    csv_path: Path,
    chunk_size: int = 10000,
    max_retries: int = 3
) -> bool:
    """
    Load CSV data into ClickHouse table in chunks with retry logic.
    
    Args:
        client: ClickHouse client
        table_name: Target table name
        csv_path: Path to CSV file
        chunk_size: Number of rows per chunk
        max_retries: Maximum retry attempts for failed chunks
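
    Example (hypothetical table name and CSV path; uses the defaults above):
        load_csv_to_table(client, 'sales_raw', PROJECT_ROOT / 'data' / 'sales.csv', chunk_size=50000)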
    """
    try:
        logger.info(f"Reading CSV file: {csv_path}")

        # Detect CSV delimiter and encoding
        with open(csv_path, 'r', encoding='utf-8') as f:
            first_line = f.readline()
            delimiter = ';' if ';' in first_line else ','

        # Get file size for progress estimation
        file_size = csv_path.stat().st_size
        logger.info(f"File size: {file_size / 1024 / 1024:.2f} MB")

        # Read CSV in chunks to avoid memory issues
        total_rows = 0
        chunk_num = 0
        failed_chunks = []

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console
        ) as progress:
            task = progress.add_task(f"Inserting into {table_name}", total=None)

            # Detect encoding: try UTF-8 first (utf-8-sig handles a BOM), fall back to cp1252
            csv_encoding = 'utf-8-sig'
            try:
                with open(csv_path, encoding='utf-8-sig') as f:
                    f.read()
            except UnicodeDecodeError:
                csv_encoding = 'cp1252'
                logger.warning(f"UTF-8 decode failed, using cp1252 encoding")

            # Read CSV in chunks
            for chunk_df in pd.read_csv(
                csv_path,
                delimiter=delimiter,
                encoding=csv_encoding,
                dtype=str,
                chunksize=chunk_size
            ):
                chunk_num += 1
                chunk_rows = len(chunk_df)
                total_rows += chunk_rows

                # Clean chunk: fill NaN with empty strings
                chunk_clean = chunk_df.fillna('')

                # Retry logic for each chunk
                success = False
                last_error = None
                
                for attempt in range(max_retries):
                    try:
                        # Insert chunk
                        client.insert_df(table_name, chunk_clean)
                        success = True
                        progress.update(task, description=f"Inserting into {table_name} ({total_rows:,} rows, chunk {chunk_num})")
                        break
                    except Exception as e:
                        last_error = e
                        if attempt < max_retries - 1:
                            wait_time = 2 ** attempt  # Exponential backoff: 1s, 2s, 4s
                            logger.warning(f"Chunk {chunk_num} failed (attempt {attempt + 1}/{max_retries}), retrying in {wait_time}s...")
                            time.sleep(wait_time)
                        else:
                            logger.error(f"Chunk {chunk_num} failed after {max_retries} attempts: {e}")
                            failed_chunks.append((chunk_num, str(e)))

                if not success:
                    # Continue with next chunk even if this one failed
                    logger.warning(f"Skipping chunk {chunk_num} after {max_retries} failed attempts")

        if failed_chunks:
            logger.error(f"❌ Failed to load {len(failed_chunks)} chunks out of {chunk_num} total chunks")
            for failed_num, error in failed_chunks:
                logger.error(f"  Chunk {failed_num}: {error}")
            return False

        logger.info(f"✅ Loaded {total_rows:,} rows into {table_name} in {chunk_num} chunks")
        return True

    except Exception as e:
        logger.error(f"❌ Failed to load CSV data: {e}", exc_info=True)
        return False


def process_dataset(
    client,
    dataset_config: Dict[str, Any],
    dataset_id: str,
    config_dir: Path,
    dry_run: bool = False,
    drop_existing: bool = False
) -> bool:
    """
    Process a single dataset: create table (if needed) and load CSV data.

    If the SQL schema defines a staging table (<table_name>_raw) and a Materialized View,
    ClickHouse migrates the data into the final table automatically on INSERT.
    Otherwise, the CSV is loaded directly into <table_name>.

    Supports two formats for rawdata:
    1. String (legacy): "rawdata: ./data/file.csv"
    2. Array (new): "rawdata: [{table: 'table1', csv: './data/file1.csv'}, ...]"
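
    Illustrative config/dataset_*.yaml shape consumed here (keys mirror what is
    read below; the id, name, paths, and table names are hypothetical):

        dataset:
          id: timeline
          name: Timeline sales
          clickhouse_table: timeline
          sql_schema: ./sql/timeline_schema.sql
          rawdata:
            - table: timeline
              csv: ./data/timeline.csv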
    """
    console.rule(f"[bold blue]Dataset: {dataset_id}")

    # Extract configuration
    default_table_name = dataset_config.get('clickhouse_table')
    rawdata = dataset_config.get('rawdata')
    schema_path_str = dataset_config.get('sql_schema')

    if not rawdata:
        logger.warning(f"⚠️ No rawdata specified for {dataset_id}, skipping")
        return False

    # Normalize rawdata to array format
    if isinstance(rawdata, str):
        # Legacy format: single CSV file
        csv_files = [{'table': default_table_name, 'csv': rawdata}]
    elif isinstance(rawdata, list):
        # New format: array of {table, csv}
        csv_files = rawdata
    else:
        logger.error(f"❌ Invalid rawdata format for {dataset_id}: must be string or array")
        return False

    if not csv_files:
        logger.warning(f"⚠️ No CSV files specified for {dataset_id}, skipping")
        return False

    # Create tables from schema once (if provided)
    schema_created = False
    if schema_path_str and not dry_run:
        schema_path = PROJECT_ROOT / schema_path_str
        if schema_path.exists():
            logger.info(f"📋 Using SQL schema: {schema_path}")
            # Create ALL tables from schema file (including all _raw, final, MV)
            # We pass the first table name for compatibility, but schema creates all tables
            first_table = csv_files[0]['table']
            created_tables, _ = create_table_from_schema_file(
                client, first_table, schema_path, drop_existing
            )
            if created_tables:
                logger.info(f"📋 Created tables from schema: {', '.join(created_tables)}")
                schema_created = True
            else:
                logger.warning(f"⚠️ Failed to create tables from schema")
        else:
            logger.warning(f"⚠️ SQL schema file not found: {schema_path}")

    # Process each CSV file
    overall_success = True
    for file_entry in csv_files:
        table_name = file_entry.get('table')
        csv_path_str = file_entry.get('csv')

        if not table_name or not csv_path_str:
            logger.error(f"❌ Invalid CSV entry: {file_entry}")
            overall_success = False
            continue

        csv_path = PROJECT_ROOT / csv_path_str
        if not csv_path.exists():
            logger.warning(f"⚠️ CSV file not found: {csv_path}")
            overall_success = False
            continue

        logger.info(f"📊 Processing: Table={table_name}, CSV={csv_path.name}")

        if dry_run:
            logger.info("🔍 DRY RUN: Would load this CSV")
            continue

        # Determine target table for CSV loading
        staging_table = f"{table_name}_raw"
        staging_exists_query = f"EXISTS TABLE {staging_table}"

        # Check if staging table exists
        staging_exists = client.command(staging_exists_query) == 1

        # If no schema was provided or schema creation failed, auto-generate table
        if not schema_created:
            table_exists_query = f"EXISTS TABLE {table_name}"
            table_exists = client.command(table_exists_query) == 1

            if not table_exists or drop_existing:
                logger.info(f"📋 Auto-generating schema for {table_name} from CSV...")
                with open(csv_path, 'r', encoding='utf-8', errors='replace') as f:
                    sample_delimiter = ';' if ';' in f.readline() else ','
                df = pd.read_csv(csv_path, delimiter=sample_delimiter, nrows=1000)
                if not create_table_from_csv(client, table_name, df, drop_existing):
                    overall_success = False
                    continue

        # Determine target table
        if staging_exists:
            target_table = staging_table
            logger.info(f"📥 Loading into staging table: {target_table}")
        else:
            target_table = table_name
            logger.info(f"📥 Loading directly into: {target_table}")

        # Load CSV data
        chunk_size = 50000 if staging_exists else 10000
        success = load_csv_to_table(client, target_table, csv_path, chunk_size=chunk_size)

        if success:
            # Get row count
            count_query = f"SELECT count() FROM {target_table}"
            row_count = client.command(count_query)
            logger.info(f"✅ Loaded {row_count:,} rows into {target_table}")

            # Check migration if staging table
            if staging_exists and target_table == staging_table:
                # Give ClickHouse a moment to process the materialized view
                time.sleep(1)

                # Check final table
                final_count_query = f"SELECT count() FROM {table_name}"
                try:
                    final_row_count = client.command(final_count_query)
                    if final_row_count > 0:
                        logger.info(f"⚡ Auto-migration complete: {final_row_count:,} rows in {table_name}")
                    else:
                        logger.warning(f"⚠️ Final table {table_name} is empty - check Materialized View")
                except Exception as e:
                    logger.warning(f"⚠️ Could not check final table {table_name}: {e}")
        else:
            overall_success = False

    if dry_run:
        logger.info("🔍 DRY RUN: Would process all CSV files above")
        return True

    return overall_success


def main():
    parser = argparse.ArgumentParser(
        description='Load CSV data into ClickHouse from dataset configurations',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python load_csv_to_clickhouse.py                         # Load all datasets
  python load_csv_to_clickhouse.py --dataset timeline      # Load specific dataset
  python load_csv_to_clickhouse.py --dry-run               # Show what would be done
  python load_csv_to_clickhouse.py --drop-existing         # Drop and recreate tables
  python load_csv_to_clickhouse.py -v                      # Verbose logging

Note:
  If SQL schema contains Materialized View, data is automatically migrated from
  staging (_raw) to final table via ClickHouse on INSERT (native performance).
        """
    )

    parser.add_argument(
        '--dataset',
        type=str,
        help='Process specific dataset only (e.g., "timeline")'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without executing'
    )
    parser.add_argument(
        '--drop-existing',
        action='store_true',
        help='Drop existing tables before loading'
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Enable verbose logging'
    )

    args = parser.parse_args()

    # Setup logging level
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Display header
    console.print("\n[bold cyan]CSV to ClickHouse Loader[/bold cyan]")
    console.print("[dim]Automated data loading from dataset configurations[/dim]\n")

    try:
        # Load environment
        ch_config = load_env()

        # Discover datasets
        config_dir = PROJECT_ROOT / 'config'
        dataset_files = discover_datasets(config_dir)

        if not dataset_files:
            console.print("[red]No dataset configuration files found![/red]")
            return 1

        # Filter by specific dataset if requested
        if args.dataset:
            dataset_files = [f for f in dataset_files if args.dataset in f.stem]
            if not dataset_files:
                console.print(f"[red]Dataset '{args.dataset}' not found![/red]")
                return 1

        # Show summary table
        table = Table(title="Datasets to Process")
        table.add_column("ID", style="cyan")
        table.add_column("Name", style="green")
        table.add_column("Table", style="yellow")
        table.add_column("CSV", style="blue")

        datasets_to_process = []
        for yaml_path in dataset_files:
            config = load_dataset_config(yaml_path)
            dataset_id = config.get('id', yaml_path.stem.replace('dataset_', ''))
            datasets_to_process.append((dataset_id, config))

            # Format rawdata for display (handle both string and array)
            rawdata = config.get('rawdata', '-')
            if isinstance(rawdata, list):
                csv_display = f"{len(rawdata)} files"
            else:
                csv_display = str(rawdata) if rawdata else '-'

            table.add_row(
                dataset_id,
                config.get('name', '-'),
                config.get('clickhouse_table', '-'),
                csv_display
            )

        console.print(table)
        console.print()

        if args.dry_run:
            console.print("[yellow]DRY RUN MODE - No changes will be made[/yellow]\n")

        # Process each dataset (create separate client for each to avoid SESSION_IS_LOCKED)
        success_count = 0
        for dataset_id, config in datasets_to_process:
            try:
                # Create separate client for each dataset to avoid session conflicts
                if not args.dry_run:
                    # Use longer timeout for large datasets
                    timeout = 600 if dataset_id in ['iwsr', 'timeline'] else 300
                    client = get_clickhouse_client(ch_config, timeout=timeout)
                else:
                    client = None

                success = process_dataset(
                    client,
                    config,
                    dataset_id,
                    config_dir,
                    dry_run=args.dry_run,
                    drop_existing=args.drop_existing
                )
                if success:
                    success_count += 1
                
                # Close client after each dataset to free resources
                if client:
                    try:
                        client.close()
                    except Exception:
                        pass
                        
            except Exception as e:
                logger.error(f"Failed to process dataset {dataset_id}: {e}", exc_info=True)

        # Summary
        console.print()
        console.rule("[bold green]Summary")
        console.print(f"✅ Successfully processed: {success_count}/{len(datasets_to_process)} datasets")

        return 0 if success_count == len(datasets_to_process) else 1

    except Exception as e:
        console.print(f"[red]Fatal error: {e}[/red]")
        logger.error("Fatal error", exc_info=True)
        return 1


if __name__ == '__main__':
    sys.exit(main())
