# gymkhana/scripts/extract_dialog_keys.py

#!/usr/bin/env python3
"""
Script to extract dialog keys from ESC files and TSCN files and compare with existing translations.

This script:
1. Scans all .esc files in the gymkhana folder
2. Extracts dialog keys from 'say' commands and dialog blocks
3. Scans all .tscn files in the gymkhana folder
4. Extracts translation keys from action3_target_texts, action4_target_texts, and custom_data tooltips
5. Compares with existing translations in turno_cocina.csv
6. Generates a CSV file with missing translations
7. Generates a markdown file with keys that only have default translations
8. Generates a summary report in markdown format
"""

import os
import re
import csv
import glob
import argparse
from pathlib import Path

def extract_dialog_keys_from_esc_file(file_path):
    """
    Extract dialog keys from a single ESC file.

    Two constructs are scanned:
    - say commands carrying a key as last argument:
      say($player, "text", "dialog_key")
    - dialog options of the form - "dialog_key:text", optionally followed
      by a [condition]

    Candidate keys are whitespace-normalised and then passed through a
    heuristic filter that drops fragments which do not look like keys.

    Args:
        file_path: Path of the ESC file to read (decoded as UTF-8).

    Returns:
        A set of dialog keys found in the file. The set is empty when the
        file cannot be read; the error is printed, not raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, ValueError) as e:
        # ValueError also covers UnicodeDecodeError for non UTF-8 files
        print(f"Error reading {file_path}: {e}")
        return set()

    # Pattern for say commands with dialog keys
    # Examples: say($player, "text", "dialog_key")
    say_pattern = r'say\s*\(\s*[^,]+,\s*"[^"]*",\s*"([^"]+)"\s*\)'

    # Pattern for dialog options in dialog blocks
    # Examples: - "dialog_key:text" or - "dialog_key:text" [condition]
    # The key must not contain a quote or a newline: otherwise an option
    # without key (- "text") followed by a keyed option on the next line is
    # swallowed into one bogus key and the real key is lost.
    dialog_pattern = r'-\s*"([^:"\n]+):[^"]*"(?:\s*\[[^\]]*\])?'

    dialog_keys = set(re.findall(say_pattern, content))
    dialog_keys.update(re.findall(dialog_pattern, content))

    # Clean up any malformed keys (collapse newlines and extra whitespace)
    cleaned_keys = set()
    for key in dialog_keys:
        clean_key = re.sub(r'\s+', ' ', key.strip())

        # Minimum length for a valid key
        if len(clean_key) <= 3:
            continue
        # Fragments of quoted text or of ESC commands are not keys
        if clean_key.startswith(('"', 'stop', 'Agur')) or clean_key.endswith('"'):
            continue
        if 'say(' in clean_key or 'done' in clean_key:
            continue
        # Keys should have underscores or be alphanumeric
        if '_' in clean_key or clean_key.isalnum():
            cleaned_keys.add(clean_key)

    return cleaned_keys

def load_existing_translations(csv_path):
    """
    Load the keys already present in the translations CSV file.

    The file needs a header row containing a 'keys' column. Rows whose
    'keys' cell is empty are ignored.

    Args:
        csv_path: Path of the translations CSV file.

    Returns:
        A set of existing dialog keys. When the file cannot be read or
        parsed the error is printed and the keys collected so far (possibly
        none) are returned.
    """
    existing_keys = set()

    try:
        # utf-8-sig drops the byte order mark some editors prepend; with plain
        # utf-8 the first header would read '\ufeffkeys' and no key would be
        # recognised. newline='' is what the csv module requires so quoted
        # fields containing line breaks are parsed correctly.
        with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
            for row in csv.DictReader(f):
                key = row.get('keys')
                if key:
                    existing_keys.add(key)
    except (OSError, ValueError, csv.Error) as e:
        print(f"Error reading translations file {csv_path}: {e}")

    return existing_keys

def extract_default_translations_from_esc_files(esc_files):
    """
    Extract default translations (Spanish text) from ESC files.

    Args:
        esc_files: Iterable of ESC file paths (decoded as UTF-8).

    Returns:
        A dictionary mapping dialog keys to their default Spanish text. When
        a key occurs several times the last occurrence wins. Files that
        cannot be read are reported on stdout and skipped.
    """
    # Pattern for say commands with both text and key
    # say($player, "Spanish text", "dialog_key")
    say_with_text = re.compile(r'say\s*\(\s*[^,]+,\s*"([^"]+)",\s*"([^"]+)"\s*\)')

    # Pattern for dialog options with both key and text
    # - "dialog_key:Spanish text"
    # The key must not contain a quote or a newline: otherwise an option
    # without key (- "text") followed by a keyed option on the next line is
    # merged into one bogus key and the real key gets no text.
    dialog_with_text = re.compile(r'-\s*"([^:"\n]+):([^"]*)"(?:\s*\[[^\]]*\])?')

    key_to_text = {}

    for file_path in esc_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, ValueError) as e:
            # ValueError also covers UnicodeDecodeError for non UTF-8 files
            print(f"Error reading {file_path}: {e}")
            continue

        # say() lists the text first, dialog options list the key first;
        # normalise both to (key, text) before storing.
        pairs = [(key, text) for text, key in say_with_text.findall(content)]
        pairs.extend(dialog_with_text.findall(content))

        for key, text in pairs:
            clean_key = key.strip()
            clean_text = text.strip()
            if clean_text and clean_key:
                key_to_text[clean_key] = clean_text

    return key_to_text

def extract_keys_with_only_default_translations(esc_files):
    """
    Collect dialog entries that carry default text but no translation key.

    Every line of every file is checked (after stripping) for a say command
    with a single quoted argument and for a dialog option without a
    "key:" prefix. Only the first hit of each kind per line is considered and
    texts of three characters or fewer are ignored.

    Returns a list of dictionaries with file, line, key (always None) and
    text information, in file and line order.
    """
    # say($player, "Spanish text only")
    say_re = re.compile(r'say\s*\(\s*[^,]+,\s*"([^"]+)"\s*\)(?!\s*,\s*"[^"]+")')
    # - "Spanish text only" (but NOT - "key:text")
    option_re = re.compile(r'-\s*"([^"]+)"(?:\s*\[[^\]]*\])?(?!\s*:)')

    entries = []

    for file_path in esc_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                raw_lines = list(f)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            continue

        for number, raw in enumerate(raw_lines, start=1):
            stripped = raw.strip()
            # (match, whether a colon in the text marks it as "key:text")
            candidates = (
                (say_re.search(stripped), False),
                (option_re.search(stripped), True),
            )
            for hit, colon_means_key in candidates:
                if hit is None:
                    continue
                text = hit.group(1).strip()
                if len(text) <= 3:
                    continue  # Filter out very short texts
                if colon_means_key and ':' in text:
                    continue  # The option already has a translation key
                entries.append({
                    'file': os.path.relpath(file_path),
                    'line': number,
                    'key': None,  # No key provided
                    'text': text
                })

    return entries

# Openers of the brace-delimited blocks scanned in TSCN files; each pattern
# ends on the opening '{' so the block body can be located from match.end().
_TSCN_ACTION_OPEN = re.compile(r'(action[34]_target_texts)\s*=\s*\{')
_TSCN_CUSTOM_DATA_OPEN = re.compile(r'custom_data\s*=\s*\{')
_TSCN_TOOLTIPS_OPEN = re.compile(r'"tooltips":\s*\{')
# "name": "value" pair inside a block
_TSCN_PAIR = re.compile(r'"([^"]+)":\s*"([^"]+)"')
# Quoted string containing at least one accented Spanish character
_TSCN_SPANISH_STRING = re.compile(r'"([^"]*[áéíóúñüÁÉÍÓÚÑÜ][^"]*)"')


def _braced_body(text, open_pos):
    """
    Return the text between the '{' at open_pos and its matching '}'.

    Nested braces are balanced and braces inside double-quoted strings
    (including backslash-escaped quotes) are ignored. Returns None when the
    block is never closed.
    """
    depth = 0
    in_string = False
    pos = open_pos
    end = len(text)
    while pos < end:
        ch = text[pos]
        if in_string:
            if ch == '\\':
                pos += 1  # skip the escaped character
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return text[open_pos + 1:pos]
        pos += 1
    return None


def _iter_braced_blocks(opener, text):
    """Yield (match, body, body_start) for each opener match in text whose block is closed."""
    for match in opener.finditer(text):
        open_pos = match.end() - 1  # opener patterns end on the '{'
        body = _braced_body(text, open_pos)
        if body is not None:
            yield match, body, open_pos + 1


def _line_of(text, pos):
    """Return the 1-based line number of character offset pos in text."""
    return text.count('\n', 0, pos) + 1


def _read_tscn_text(file_path):
    """Return the UTF-8 decoded content of file_path, or None (after printing the error) on failure."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except (OSError, ValueError) as e:
        # ValueError also covers UnicodeDecodeError for non UTF-8 files
        print(f"Error reading {file_path}: {e}")
        return None


def _spanish_entries(content, body, body_start, file_path, entry_type):
    """Build one untranslated-text entry per accented string in body, located by its own line in content."""
    entries = []
    for hit in _TSCN_SPANISH_STRING.finditer(body):
        text = hit.group(1)
        if len(text) > 3:
            entries.append({
                'file': os.path.relpath(file_path),
                'line': _line_of(content, body_start + hit.start()),
                'key': None,  # No translation key provided
                'text': text,
                'type': entry_type
            })
    return entries


def extract_translation_keys_from_tscn_file(file_path):
    """
    Extract translation keys from a single TSCN file.

    The values of every "name": "value" pair are collected from
    - action3_target_texts / action4_target_texts dictionaries, and
    - the "tooltips" dictionary nested inside custom_data dictionaries.
    Values of three characters or fewer are ignored.

    Args:
        file_path: Path of the TSCN file to read (decoded as UTF-8).

    Returns:
        A list of dictionaries with the keys 'key', 'file' (path relative to
        the current directory), 'line' (line on which the enclosing
        action*_target_texts or custom_data assignment starts) and 'type'
        (the action property name or 'custom_data_tooltip'). Action entries
        come first, then tooltip entries. The list is empty when the file
        cannot be read.
    """
    content = _read_tscn_text(file_path)
    if content is None:
        return []

    translation_keys = []

    def add_pairs(body, line, entry_type):
        # Pattern: "key": "translation_key" -- only the value is a translation key
        for _name, translation_key in _TSCN_PAIR.findall(body):
            clean_key = translation_key.strip()
            if len(clean_key) > 3:
                translation_keys.append({
                    'key': clean_key,
                    'file': os.path.relpath(file_path),
                    'line': line,
                    'type': entry_type
                })

    # Each block is located from its own match, so several blocks of the
    # same property in one file are reported on their respective lines.
    for match, body, _start in _iter_braced_blocks(_TSCN_ACTION_OPEN, content):
        add_pairs(body, _line_of(content, match.start()), match.group(1))

    # custom_data holds "tooltips" as a nested dictionary, so the block has
    # to be brace-balanced; stopping at the first '}' would cut the nested
    # dictionary open and no tooltip could ever be found.
    for match, body, _start in _iter_braced_blocks(_TSCN_CUSTOM_DATA_OPEN, content):
        line = _line_of(content, match.start())
        for _tooltips_match, tooltips, _rel in _iter_braced_blocks(_TSCN_TOOLTIPS_OPEN, body):
            add_pairs(tooltips, line, 'custom_data_tooltip')

    return translation_keys


def extract_untranslated_texts_from_tscn_files(tscn_files):
    """
    Extract untranslated texts from TSCN files.

    A quoted string counts as untranslated Spanish text when it contains an
    accented character (áéíóúñü, either case) and is longer than three
    characters. Strings are searched inside action3_target_texts /
    action4_target_texts dictionaries and inside the "tooltips" dictionary
    of custom_data dictionaries; the dictionaries may span several lines.

    Args:
        tscn_files: Iterable of TSCN file paths (decoded as UTF-8).

    Returns:
        A list of dictionaries with the keys 'file', 'line' (line of the
        string itself), 'key' (always None), 'text' and 'type'
        ('action_target_text' or 'custom_data_tooltip'), ordered by file and
        then by line. Files that cannot be read are reported on stdout and
        skipped.
    """
    untranslated_texts = []

    for file_path in tscn_files:
        content = _read_tscn_text(file_path)
        if content is None:
            continue

        found = []
        for _match, body, start in _iter_braced_blocks(_TSCN_ACTION_OPEN, content):
            found.extend(_spanish_entries(content, body, start, file_path, 'action_target_text'))

        for _match, body, start in _iter_braced_blocks(_TSCN_CUSTOM_DATA_OPEN, content):
            for _tooltips_match, tooltips, rel_start in _iter_braced_blocks(_TSCN_TOOLTIPS_OPEN, body):
                # rel_start is relative to the custom_data body
                found.extend(_spanish_entries(content, tooltips, start + rel_start,
                                              file_path, 'custom_data_tooltip'))

        # Stable sort: entries on the same line keep action texts before tooltips
        found.sort(key=lambda entry: entry['line'])
        untranslated_texts.extend(found)

    return untranslated_texts

def generate_markdown_report(missing_keys, keys_with_only_default, key_to_text, tscn_missing_keys_data, tscn_untranslated_texts, output_dir):
    """
    Generate a markdown summary report.

    Args:
        missing_keys: ESC dialog keys absent from the translation file.
        keys_with_only_default: Dictionaries with 'file', 'line' and 'text'
            for ESC entries that have text but no key.
        key_to_text: Mapping of ESC dialog key to its default text.
        tscn_missing_keys_data: Dictionaries with 'key', 'file', 'line' and
            'type' for TSCN keys absent from the translation file.
        tscn_untranslated_texts: Dictionaries with 'file', 'line', 'type'
            and 'text' for TSCN entries holding plain text.
        output_dir: Existing directory in which translation_report.md is
            written (UTF-8, overwritten if present).

    Returns:
        The path of the written report.
    """
    from datetime import datetime

    report_path = os.path.join(output_dir, "translation_report.md")

    # A missing key that has default text appears in both collections;
    # count it once.
    total_esc_keys = len(set(missing_keys) | set(key_to_text))

    out = []
    out.append("# Translation Analysis Report\n\n")
    out.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

    out.append("## Summary\n\n")
    out.append(f"- **Total ESC dialog keys found**: {total_esc_keys}\n")
    out.append(f"- **ESC keys with missing translations**: {len(missing_keys)}\n")
    out.append(f"- **ESC keys with only default translations**: {len(keys_with_only_default)}\n")
    out.append(f"- **ESC keys with proper translation keys**: {len(key_to_text)}\n")
    # NOTE(review): only the missing TSCN keys are passed in, so both TSCN
    # counters below necessarily report the same number.
    out.append(f"- **TSCN translation keys found**: {len(tscn_missing_keys_data)}\n")
    out.append(f"- **TSCN keys with missing translations**: {len(tscn_missing_keys_data)}\n")
    out.append(f"- **TSCN untranslated texts**: {len(tscn_untranslated_texts)}\n\n")

    out.append("## ESC Files - Missing Translations\n\n")
    if missing_keys:
        out.append(f"Found {len(missing_keys)} dialog keys that are missing from the translation file:\n\n")
        for key in sorted(missing_keys):
            default_text = key_to_text.get(key, "")
            out.append(f"- **{key}**: {default_text}\n")
    else:
        out.append("✅ No missing ESC translations found!\n")

    out.append("\n## ESC Files - Keys with Only Default Translations\n\n")
    if keys_with_only_default:
        out.append(f"Found {len(keys_with_only_default)} dialog entries that only have default text (no translation key):\n\n")
        for item in keys_with_only_default:
            out.append(f"- **File**: `{item['file']}` (line {item['line']})\n")
            out.append(f"  - **Text**: {item['text']}\n\n")
    else:
        out.append("✅ All ESC dialog entries have proper translation keys!\n")

    out.append("\n## TSCN Files - Missing Translations\n\n")
    if tscn_missing_keys_data:
        out.append(f"Found {len(tscn_missing_keys_data)} TSCN translation keys that are missing from the translation file:\n\n")
        for item in sorted(tscn_missing_keys_data, key=lambda x: x['key']):
            out.append(f"- **{item['key']}**\n")
            out.append(f"  - **File**: `{item['file']}` (line {item['line']})\n")
            out.append(f"  - **Type**: {item['type']}\n\n")
    else:
        out.append("✅ No missing TSCN translations found!\n")

    out.append("\n## TSCN Files - Untranslated Texts\n\n")
    if tscn_untranslated_texts:
        out.append(f"Found {len(tscn_untranslated_texts)} TSCN entries with untranslated text (no translation key):\n\n")
        for item in tscn_untranslated_texts:
            out.append(f"- **File**: `{item['file']}` (line {item['line']})\n")
            out.append(f"  - **Type**: {item['type']}\n")
            out.append(f"  - **Text**: {item['text']}\n\n")
    else:
        out.append("✅ All TSCN entries have proper translation keys!\n")

    out.append("\n## Recommendations\n\n")
    applicable = [
        (missing_keys, "Add the missing ESC translation keys to your translation CSV file"),
        (keys_with_only_default, "Consider adding translation keys to ESC dialog entries that only have default text"),
        (tscn_missing_keys_data, "Add the missing TSCN translation keys to your translation CSV file"),
        (tscn_untranslated_texts, "Replace untranslated text in TSCN files with proper translation keys"),
    ]
    recommendations = [text for condition, text in applicable if condition]

    if recommendations:
        # Number the items as emitted so a partial list still starts at 1
        for number, text in enumerate(recommendations, start=1):
            out.append(f"{number}. {text}\n")
    else:
        out.append("🎉 Your translation setup looks great! All dialog entries have proper translation keys.\n")

    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("".join(out))

    return report_path

def _write_missing_translations_csv(csv_path, missing_keys, tscn_missing_keys, key_to_text):
    """Write one row per key lacking a translation to csv_path, ESC keys first, then TSCN keys."""
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['keys', 'en', 'fr', 'es'])  # Header matching the original format

        # Missing ESC keys, pre-filled with their default text when known
        for key in sorted(missing_keys):
            writer.writerow([key, "", "", key_to_text.get(key, "")])

        # Missing TSCN keys; a key also referenced from an ESC file was
        # already written above and must not produce a duplicate row
        for key in sorted(tscn_missing_keys - missing_keys):
            writer.writerow([key, "", "", ""])


def _write_default_only_markdown(md_path, keys_with_only_default, tscn_untranslated_texts):
    """Write the list of ESC/TSCN entries that hold plain text without a translation key to md_path."""
    from datetime import datetime

    with open(md_path, 'w', encoding='utf-8') as f:
        f.write("# Dialog Keys with Only Default Translations\n\n")
        f.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        f.write("## ESC Files\n\n")
        if keys_with_only_default:
            f.write(f"Found {len(keys_with_only_default)} ESC dialog entries that only have default text (no translation key):\n\n")
            for item in keys_with_only_default:
                f.write(f"## {item['file']} (line {item['line']})\n\n")
                f.write(f"**Text**: {item['text']}\n\n")
                f.write("---\n\n")
        else:
            f.write("✅ No ESC dialog entries found with only default translations!\n")

        f.write("\n## TSCN Files\n\n")
        if tscn_untranslated_texts:
            f.write(f"Found {len(tscn_untranslated_texts)} TSCN entries with untranslated text (no translation key):\n\n")
            for item in tscn_untranslated_texts:
                f.write(f"## {item['file']} (line {item['line']})\n\n")
                f.write(f"**Type**: {item['type']}\n")
                f.write(f"**Text**: {item['text']}\n\n")
                f.write("---\n\n")
        else:
            f.write("✅ No TSCN entries found with untranslated text!\n")


def main():
    """
    Command line entry point.

    Scans the ESC and TSCN files below <project root>/gymkhana, compares the
    keys found with translations/turno_cocina.csv and writes
    missing_translations.csv, default_only_translations.md and
    translation_report.md to the output directory (--output-dir, project
    root by default). Progress and a final summary are printed to stdout.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Extract dialog keys from ESC files and analyze translations')
    parser.add_argument('--output-dir', '-o',
                       help='Output directory for generated files (default: project root)',
                       default=None)
    args = parser.parse_args()

    # Get the script directory and project root
    # Script is in scripts/ folder, so project root is one level up
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)

    # Set output directory
    if args.output_dir:
        output_dir = os.path.abspath(args.output_dir)
        os.makedirs(output_dir, exist_ok=True)
    else:
        output_dir = project_root

    # Paths relative to project root
    gymkhana_path = os.path.join(project_root, "gymkhana")
    translations_path = os.path.join(project_root, "gymkhana", "translations", "turno_cocina.csv")

    # Output file paths
    missing_translations_path = os.path.join(output_dir, "missing_translations.csv")
    default_only_path = os.path.join(output_dir, "default_only_translations.md")

    print("Extracting dialog keys from ESC files...")

    # Find all ESC files; sorted so the generated reports do not depend on
    # the order in which the file system lists them
    esc_files = sorted(glob.glob(os.path.join(gymkhana_path, "**/*.esc"), recursive=True))
    print(f"Found {len(esc_files)} ESC files")

    # Extract all dialog keys
    all_dialog_keys = set()
    for esc_file in esc_files:
        keys = extract_dialog_keys_from_esc_file(esc_file)
        all_dialog_keys.update(keys)
        if keys:
            print(f"  {os.path.relpath(esc_file, gymkhana_path)}: {len(keys)} keys")

    print(f"\nTotal unique ESC dialog keys found: {len(all_dialog_keys)}")

    # Find all TSCN files
    print("\nExtracting translation keys from TSCN files...")
    tscn_files = sorted(glob.glob(os.path.join(gymkhana_path, "**/*.tscn"), recursive=True))
    print(f"Found {len(tscn_files)} TSCN files")

    # Extract all TSCN translation keys
    all_tscn_keys_data = []
    for tscn_file in tscn_files:
        keys_data = extract_translation_keys_from_tscn_file(tscn_file)
        all_tscn_keys_data.extend(keys_data)
        if keys_data:
            print(f"  {os.path.relpath(tscn_file, gymkhana_path)}: {len(keys_data)} keys")

    # Extract unique keys for comparison
    all_tscn_keys = {item['key'] for item in all_tscn_keys_data}
    print(f"\nTotal unique TSCN translation keys found: {len(all_tscn_keys)}")

    # Load existing translations
    print("\nLoading existing translations...")
    existing_keys = load_existing_translations(translations_path)
    print(f"Existing translation keys: {len(existing_keys)}")

    # Find missing keys for ESC files
    missing_keys = all_dialog_keys - existing_keys
    print(f"Missing ESC translation keys: {len(missing_keys)}")

    # Find missing keys for TSCN files; the *_data list keeps one entry per
    # occurrence (file/line) for the report
    tscn_missing_keys = all_tscn_keys - existing_keys
    tscn_missing_keys_data = [item for item in all_tscn_keys_data if item['key'] in tscn_missing_keys]
    print(f"Missing TSCN translation keys: {len(tscn_missing_keys)}")

    if missing_keys:
        print("\nMissing ESC keys:")
        for key in sorted(missing_keys):
            print(f"  - {key}")

    if tscn_missing_keys:
        print("\nMissing TSCN keys:")
        for key in sorted(tscn_missing_keys):
            print(f"  - {key}")

    # Extract default translations for missing keys
    print("\nExtracting default translations...")
    key_to_text = extract_default_translations_from_esc_files(esc_files)

    # Extract keys with only default translations
    print("\nExtracting keys with only default translations...")
    keys_with_only_default = extract_keys_with_only_default_translations(esc_files)

    # Extract untranslated texts from TSCN files
    print("\nExtracting untranslated texts from TSCN files...")
    tscn_untranslated_texts = extract_untranslated_texts_from_tscn_files(tscn_files)

    # Generate CSV output for missing translations
    print(f"\nGenerating CSV output: {missing_translations_path}")
    _write_missing_translations_csv(missing_translations_path, missing_keys, tscn_missing_keys, key_to_text)

    # Generate markdown output for keys with only default translations
    print(f"\nGenerating markdown output: {default_only_path}")
    _write_default_only_markdown(default_only_path, keys_with_only_default, tscn_untranslated_texts)

    # Generate summary report
    print("\nGenerating summary report...")
    report_path = generate_markdown_report(missing_keys, keys_with_only_default, key_to_text, tscn_missing_keys_data, tscn_untranslated_texts, output_dir)

    # Summary (all key counts are unique keys, matching the figures printed above)
    print("\nSummary:")
    print(f"  Total ESC dialog keys found: {len(all_dialog_keys)}")
    print(f"  Total TSCN translation keys found: {len(all_tscn_keys)}")
    print(f"  Existing translations: {len(existing_keys)}")
    print(f"  Missing ESC translations: {len(missing_keys)}")
    print(f"  Missing TSCN translations: {len(tscn_missing_keys)}")
    print(f"  ESC keys with only default translations: {len(keys_with_only_default)}")
    print(f"  TSCN untranslated texts: {len(tscn_untranslated_texts)}")
    print(f"  Output directory: {output_dir}")
    print("  Files generated:")
    print(f"    - {missing_translations_path}")
    print(f"    - {default_only_path}")
    print(f"    - {report_path}")

# Run the analysis only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()