Files
gymkhana/scripts/extract_dialog_keys.py

565 lines
24 KiB
Python

#!/usr/bin/env python3
"""
Script to extract dialog keys from ESC files and TSCN files and compare with existing translations.
This script:
1. Scans all .esc files in the gymkhana folder
2. Extracts dialog keys from 'say' commands and dialog blocks
3. Scans all .tscn files in the gymkhana folder
4. Extracts translation keys from action3_target_texts, action4_target_texts, and custom_data tooltips
5. Compares with existing translations in turno_cocina.csv
6. Generates a CSV file with missing translations
7. Generates a markdown file with keys that only have default translations
8. Generates a summary report in markdown format
"""
import os
import re
import csv
import glob
import argparse
from pathlib import Path
def extract_dialog_keys_from_esc_file(file_path):
    """
    Extract dialog keys from a single ESC file.

    Two constructs are recognised:

    * ``say(<speaker>, "text", "dialog_key")`` -- the third argument is the key.
    * ``- "dialog_key:text"`` dialog options, optionally followed by a
      ``[condition]`` -- the part before the first colon is the key.

    Args:
        file_path: Path of the ESC file to scan (read as UTF-8).

    Returns:
        A set of cleaned dialog keys. The set is empty when the file cannot
        be read; in that case the error is printed instead of raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error reading {file_path}: {e}")
        return set()

    # say($player, "text", "dialog_key")
    say_pattern = r'say\s*\(\s*[^,]+,\s*"[^"]*",\s*"([^"]+)"\s*\)'
    # - "dialog_key:text" or - "dialog_key:text" [condition]
    # The key may contain neither a quote nor a newline. Without that
    # restriction an option that has no key ('- "text"') lets the match run on
    # into the following line and swallow the real key found there.
    dialog_pattern = r'-\s*"([^:"\n]+):[^"]*"(?:\s*\[[^\]]*\])?'

    dialog_keys = set(re.findall(say_pattern, content))
    dialog_keys.update(re.findall(dialog_pattern, content))

    cleaned_keys = set()
    for key in dialog_keys:
        # Collapse any run of whitespace so a key is compared in one canonical form.
        clean_key = re.sub(r'\s+', ' ', key.strip())
        if not clean_key or len(clean_key) <= 3:
            continue  # too short to be a valid key
        if clean_key.startswith(('"', 'stop', 'Agur')) or clean_key.endswith('"'):
            continue
        if 'say(' in clean_key or 'done' in clean_key:
            continue  # leftovers of script commands, not keys
        # Keys should have underscores or be alphanumeric
        if '_' in clean_key or clean_key.isalnum():
            cleaned_keys.add(clean_key)
    return cleaned_keys
def load_existing_translations(csv_path):
    """
    Load existing translations from CSV file.

    Only the ``keys`` column is read; rows whose ``keys`` cell is empty are
    skipped.

    Args:
        csv_path: Path of the translations CSV (header row expected).

    Returns:
        A set of existing dialog keys. If the file cannot be read the error is
        printed and whatever was collected so far (possibly nothing) is
        returned.
    """
    existing_keys = set()
    try:
        # 'utf-8-sig' drops a leading byte-order mark; with plain 'utf-8' a BOM
        # would turn the first header into '\ufeffkeys' and no key would load.
        # newline='' is what the csv module asks for so that quoted fields
        # containing line breaks are parsed correctly.
        with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
            for row in csv.DictReader(f):
                key = row.get('keys')
                if key:
                    existing_keys.add(key)
    except (OSError, UnicodeError, csv.Error) as e:
        print(f"Error reading translations file {csv_path}: {e}")
    return existing_keys
def extract_default_translations_from_esc_files(esc_files):
    """
    Extract default translations (the text written next to a key) from ESC files.

    Both ``say(<speaker>, "text", "dialog_key")`` commands and
    ``- "dialog_key:text"`` dialog options are read. Entries whose text or key
    is empty after stripping are ignored. When a key occurs more than once the
    last occurrence wins (dialog options are processed after say commands of
    the same file).

    Args:
        esc_files: Iterable of ESC file paths (read as UTF-8). Unreadable
            files are reported with ``print`` and skipped.

    Returns:
        A dictionary mapping dialog keys to their default text.
    """
    # say($player, "Spanish text", "dialog_key")
    say_with_text_pattern = re.compile(r'say\s*\(\s*[^,]+,\s*"([^"]+)",\s*"([^"]+)"\s*\)')
    # - "dialog_key:Spanish text"
    # The key may contain neither a quote nor a newline; otherwise an option
    # without a key lets the match spill into the next line and produce a
    # bogus key that hides the real one.
    dialog_with_text_pattern = re.compile(r'-\s*"([^:"\n]+):([^"]*)"(?:\s*\[[^\]]*\])?')

    key_to_text = {}

    def remember(key, text):
        clean_text = text.strip()
        # Same whitespace normalisation as the key extraction uses, so that
        # lookups by a cleaned key find their text.
        clean_key = re.sub(r'\s+', ' ', key.strip())
        if clean_text and clean_key:
            key_to_text[clean_key] = clean_text

    for file_path in esc_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeError) as e:
            print(f"Error reading {file_path}: {e}")
            continue

        for text, key in say_with_text_pattern.findall(content):
            remember(key, text)
        for key, text in dialog_with_text_pattern.findall(content):
            remember(key, text)

    return key_to_text
def extract_keys_with_only_default_translations(esc_files):
    """
    Find ESC dialog entries that carry text but no translation key.

    Each line is checked for two shapes:

    * ``say(<speaker>, "text")`` -- a say command with a single string.
    * ``- "text"`` -- a dialog option whose quoted text contains no colon
      (a colon would mean ``key:text``).

    Texts of three characters or fewer are ignored. Only the first say command
    and the first dialog option on a line are considered.

    Args:
        esc_files: Iterable of ESC file paths (read as UTF-8). Unreadable
            files are reported with ``print`` and skipped.

    Returns:
        A list of dictionaries with ``file`` (path relative to the current
        directory), ``line`` (1-based), ``key`` (always ``None``) and ``text``.
    """
    # Compiled once instead of on every line of every file.
    # say($player, "Spanish text only")
    say_only_text_pattern = re.compile(r'say\s*\(\s*[^,]+,\s*"([^"]+)"\s*\)(?!\s*,\s*"[^"]+")')
    # - "Spanish text only" (but NOT - "key:text")
    dialog_only_text_pattern = re.compile(r'-\s*"([^"]+)"(?:\s*\[[^\]]*\])?(?!\s*:)')

    keys_with_only_default = []
    for file_path in esc_files:
        file_entries = []
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line_num, raw_line in enumerate(f, 1):
                    line = raw_line.strip()

                    match = say_only_text_pattern.search(line)
                    if match:
                        text = match.group(1).strip()
                        if len(text) > 3:  # Filter out very short texts
                            file_entries.append((line_num, text))

                    match = dialog_only_text_pattern.search(line)
                    if match:
                        text = match.group(1).strip()
                        # A colon means the string is "key:text", i.e. it has a key.
                        if len(text) > 3 and ':' not in text:
                            file_entries.append((line_num, text))
        except (OSError, UnicodeError) as e:
            # Nothing from a file that could not be read completely is reported.
            print(f"Error reading {file_path}: {e}")
            continue

        if file_entries:
            relative_path = os.path.relpath(file_path)
            keys_with_only_default.extend(
                {
                    'file': relative_path,
                    'line': line_num,
                    'key': None,  # No key provided
                    'text': text,
                }
                for line_num, text in file_entries
            )
    return keys_with_only_default
def extract_translation_keys_from_tscn_file(file_path):
    """
    Extract translation keys from a single TSCN file.

    Keys are taken from the values of ``"name": "value"`` pairs found in:

    * ``action3_target_texts = {...}`` and ``action4_target_texts = {...}``
    * the ``"tooltips": {...}`` dictionary inside ``custom_data = {...}``

    Values of three characters or fewer are ignored. Dictionaries may span
    several lines.

    Args:
        file_path: Path of the TSCN file to scan (read as UTF-8).

    Returns:
        A list of dictionaries with ``key``, ``file`` (path relative to the
        current directory), ``line`` (1-based line on which the enclosing
        ``action*_target_texts`` / ``custom_data`` assignment starts) and
        ``type`` (the action property name or ``'custom_data_tooltip'``).
        Action entries come first, then tooltip entries. The list is empty
        when the file cannot be read; the error is printed, not raised.
    """
    translation_keys = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error reading {file_path}: {e}")
        return translation_keys

    # "name": "translation_key"
    pair_pattern = re.compile(r'"([^"]+)":\s*"([^"]+)"')

    def line_of(offset):
        # Line number of the match itself. Searching the file from the top for
        # the property name would give every block the first block's line.
        return content.count('\n', 0, offset) + 1

    def collect(block, line, entry_type):
        for _name, translation_key in pair_pattern.findall(block):
            clean_key = translation_key.strip()
            if len(clean_key) > 3:
                translation_keys.append({
                    'key': clean_key,
                    'file': os.path.relpath(file_path),
                    'line': line,
                    'type': entry_type,
                })

    action_pattern = r'(action[34]_target_texts)\s*=\s*\{([^}]+)\}'
    for match in re.finditer(action_pattern, content):
        collect(match.group(2), line_of(match.start()), match.group(1))

    # The custom_data body may contain one level of nested {...}. Capturing
    # it with a plain [^}]+ stops at the first closing brace, which leaves the
    # captured text without the '}' that the tooltips pattern needs.
    custom_data_pattern = r'custom_data\s*=\s*\{((?:[^{}]|\{[^{}]*\})*)\}'
    tooltips_pattern = r'"tooltips":\s*\{([^}]+)\}'
    for match in re.finditer(custom_data_pattern, content):
        custom_data_line = line_of(match.start())
        for tooltips_block in re.findall(tooltips_pattern, match.group(1)):
            collect(tooltips_block, custom_data_line, 'custom_data_tooltip')

    return translation_keys
def extract_untranslated_texts_from_tscn_files(tscn_files):
    """
    Find literal Spanish text used where a translation key is expected in TSCN files.

    A quoted string counts as Spanish text when it contains at least one of
    the characters ``áéíóúñüÁÉÍÓÚÑÜ`` and is longer than three characters.
    Strings are looked for inside:

    * ``action3_target_texts = {...}`` / ``action4_target_texts = {...}``
    * the ``"tooltips": {...}`` dictionary inside ``custom_data = {...}``

    Dictionaries may be written on one line or span several lines.

    Args:
        tscn_files: Iterable of TSCN file paths (read as UTF-8). Unreadable
            files are reported with ``print`` and skipped.

    Returns:
        A list of dictionaries with ``file`` (path relative to the current
        directory), ``line`` (1-based line on which the quoted text starts),
        ``key`` (always ``None``), ``text`` and ``type``
        (``'action_target_text'`` or ``'custom_data_tooltip'``). Entries of one
        file are ordered by line.
    """
    action_pattern = re.compile(r'(action[34]_target_texts)\s*=\s*\{([^}]+)\}')
    # Allows one level of nested {...} in the custom_data body. With a plain
    # [^}]+ the captured text can never contain the '}' that closes the
    # tooltips dictionary, so the tooltips pattern would never match.
    custom_data_pattern = re.compile(r'custom_data\s*=\s*\{((?:[^{}]|\{[^{}]*\})*)\}')
    tooltips_pattern = re.compile(r'"tooltips":\s*\{([^}]+)\}')
    # Quoted string containing at least one accented / Spanish-specific letter.
    spanish_pattern = re.compile(r'"([^"]*[áéíóúñüÁÉÍÓÚÑÜ][^"]*)"')

    untranslated_texts = []
    for file_path in tscn_files:
        try:
            # The whole file is searched at once: matching line by line would
            # miss every dictionary that is not opened and closed on one line.
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeError) as e:
            print(f"Error reading {file_path}: {e}")
            continue

        # (dictionary body, offset of that body in content, entry type)
        blocks = [
            (match.group(2), match.start(2), 'action_target_text')
            for match in action_pattern.finditer(content)
        ]
        for data_match in custom_data_pattern.finditer(content):
            for tips_match in tooltips_pattern.finditer(data_match.group(1)):
                blocks.append((
                    tips_match.group(1),
                    data_match.start(1) + tips_match.start(1),
                    'custom_data_tooltip',
                ))

        file_entries = []
        for block, block_offset, entry_type in blocks:
            for found in spanish_pattern.finditer(block):
                spanish_text = found.group(1)
                if len(spanish_text) > 3:
                    file_entries.append({
                        'file': os.path.relpath(file_path),
                        'line': content.count('\n', 0, block_offset + found.start()) + 1,
                        'key': None,  # No translation key provided
                        'text': spanish_text,
                        'type': entry_type,
                    })

        # Stable sort: file order by line, action entries before tooltip
        # entries that share a line.
        file_entries.sort(key=lambda entry: entry['line'])
        untranslated_texts.extend(file_entries)
    return untranslated_texts
def generate_markdown_report(missing_keys, keys_with_only_default, key_to_text, tscn_missing_keys_data, tscn_untranslated_texts, output_dir):
    """
    Generate a markdown summary report.

    The report is written to ``translation_report.md`` inside ``output_dir``
    (UTF-8, overwriting an existing file). It contains a summary, one section
    per finding category and a list of recommendations.

    Args:
        missing_keys: Collection of ESC dialog keys absent from the
            translation file.
        keys_with_only_default: List of dicts with ``file``, ``line`` and
            ``text`` for ESC entries that have no translation key.
        key_to_text: Dict mapping ESC dialog keys to their default text.
        tscn_missing_keys_data: List of dicts with ``key``, ``file``, ``line``
            and ``type`` for TSCN keys absent from the translation file (one
            dict per occurrence).
        tscn_untranslated_texts: List of dicts with ``file``, ``line``,
            ``type`` and ``text`` for TSCN entries holding literal text.
        output_dir: Existing directory in which the report is created.

    Returns:
        The path of the written report.
    """
    # Imported here so the function does not depend on a module-level import.
    from datetime import datetime

    report_path = os.path.join(output_dir, "translation_report.md")

    # Keys can be both missing and have a default text; a union counts each
    # distinct key once, where adding the two lengths would count it twice.
    total_esc_keys = len(set(missing_keys) | set(key_to_text))
    # tscn_missing_keys_data holds one entry per occurrence, so the same key
    # may appear several times.
    unique_tscn_missing = {item['key'] for item in tscn_missing_keys_data}

    out = [
        "# Translation Analysis Report\n\n",
        f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n",
        "## Summary\n\n",
        f"- **Total ESC dialog keys found**: {total_esc_keys}\n",
        f"- **ESC keys with missing translations**: {len(missing_keys)}\n",
        f"- **ESC keys with only default translations**: {len(keys_with_only_default)}\n",
        f"- **ESC keys with proper translation keys**: {len(key_to_text)}\n",
        # Only missing TSCN keys are passed in, so the total number of TSCN
        # keys cannot be reported here; occurrences and distinct keys can.
        f"- **TSCN missing key occurrences**: {len(tscn_missing_keys_data)}\n",
        f"- **TSCN keys with missing translations**: {len(unique_tscn_missing)}\n",
        f"- **TSCN untranslated texts**: {len(tscn_untranslated_texts)}\n\n",
    ]

    out.append("## ESC Files - Missing Translations\n\n")
    if missing_keys:
        out.append(f"Found {len(missing_keys)} dialog keys that are missing from the translation file:\n\n")
        for key in sorted(missing_keys):
            default_text = key_to_text.get(key, "")
            out.append(f"- **{key}**: {default_text}\n")
    else:
        out.append("✅ No missing ESC translations found!\n")

    out.append("\n## ESC Files - Keys with Only Default Translations\n\n")
    if keys_with_only_default:
        out.append(f"Found {len(keys_with_only_default)} dialog entries that only have default text (no translation key):\n\n")
        for item in keys_with_only_default:
            out.append(f"- **File**: `{item['file']}` (line {item['line']})\n")
            out.append(f" - **Text**: {item['text']}\n\n")
    else:
        out.append("✅ All ESC dialog entries have proper translation keys!\n")

    out.append("\n## TSCN Files - Missing Translations\n\n")
    if tscn_missing_keys_data:
        out.append(f"Found {len(tscn_missing_keys_data)} TSCN translation keys that are missing from the translation file:\n\n")
        for item in sorted(tscn_missing_keys_data, key=lambda x: x['key']):
            out.append(f"- **{item['key']}**\n")
            out.append(f" - **File**: `{item['file']}` (line {item['line']})\n")
            out.append(f" - **Type**: {item['type']}\n\n")
    else:
        out.append("✅ No missing TSCN translations found!\n")

    out.append("\n## TSCN Files - Untranslated Texts\n\n")
    if tscn_untranslated_texts:
        out.append(f"Found {len(tscn_untranslated_texts)} TSCN entries with untranslated text (no translation key):\n\n")
        for item in tscn_untranslated_texts:
            out.append(f"- **File**: `{item['file']}` (line {item['line']})\n")
            out.append(f" - **Type**: {item['type']}\n")
            out.append(f" - **Text**: {item['text']}\n\n")
    else:
        out.append("✅ All TSCN entries have proper translation keys!\n")

    out.append("\n## Recommendations\n\n")
    recommendations = []
    if missing_keys:
        recommendations.append("Add the missing ESC translation keys to your translation CSV file")
    if keys_with_only_default:
        recommendations.append("Consider adding translation keys to ESC dialog entries that only have default text")
    if tscn_missing_keys_data:
        recommendations.append("Add the missing TSCN translation keys to your translation CSV file")
    if tscn_untranslated_texts:
        recommendations.append("Replace untranslated text in TSCN files with proper translation keys")
    if recommendations:
        # Numbered at output time so the list always starts at 1 and has no
        # gaps when only some recommendations apply.
        for number, rec in enumerate(recommendations, 1):
            out.append(f"{number}. {rec}\n")
    else:
        out.append("🎉 Your translation setup looks great! All dialog entries have proper translation keys.\n")

    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(''.join(out))
    return report_path
def main():
    """
    Command-line entry point: scan the project and write the translation reports.

    Steps:
    1. Parse ``--output-dir`` / ``-o`` (defaults to the project root, i.e. the
       parent of the directory containing this script).
    2. Collect dialog keys from every ``.esc`` file and translation keys from
       every ``.tscn`` file below ``<project root>/gymkhana``.
    3. Compare them with ``gymkhana/translations/turno_cocina.csv``.
    4. Write ``missing_translations.csv``, ``default_only_translations.md``
       and the summary report into the output directory.

    Progress and a final summary are printed to standard output.
    """
    # Imported here so the function does not depend on a module-level import.
    from datetime import datetime

    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Extract dialog keys from ESC files and analyze translations')
    parser.add_argument('--output-dir', '-o',
                        help='Output directory for generated files (default: project root)',
                        default=None)
    args = parser.parse_args()

    # Script is in scripts/ folder, so project root is one level up
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)

    # Set output directory
    if args.output_dir:
        output_dir = os.path.abspath(args.output_dir)
        os.makedirs(output_dir, exist_ok=True)
    else:
        output_dir = project_root

    # Paths relative to project root
    gymkhana_path = os.path.join(project_root, "gymkhana")
    translations_path = os.path.join(project_root, "gymkhana", "translations", "turno_cocina.csv")

    # Output file paths
    missing_translations_path = os.path.join(output_dir, "missing_translations.csv")
    default_only_path = os.path.join(output_dir, "default_only_translations.md")

    print("Extracting dialog keys from ESC files...")
    # glob returns files in an arbitrary, platform-dependent order; sorting
    # makes the console output and the generated reports reproducible.
    esc_files = sorted(glob.glob(os.path.join(gymkhana_path, "**/*.esc"), recursive=True))
    print(f"Found {len(esc_files)} ESC files")

    # Extract all dialog keys
    all_dialog_keys = set()
    for esc_file in esc_files:
        keys = extract_dialog_keys_from_esc_file(esc_file)
        all_dialog_keys.update(keys)
        if keys:
            print(f" {os.path.relpath(esc_file, gymkhana_path)}: {len(keys)} keys")
    print(f"\nTotal unique ESC dialog keys found: {len(all_dialog_keys)}")

    # Find all TSCN files
    print("\nExtracting translation keys from TSCN files...")
    tscn_files = sorted(glob.glob(os.path.join(gymkhana_path, "**/*.tscn"), recursive=True))
    print(f"Found {len(tscn_files)} TSCN files")

    # Extract all TSCN translation keys
    all_tscn_keys_data = []
    for tscn_file in tscn_files:
        keys_data = extract_translation_keys_from_tscn_file(tscn_file)
        all_tscn_keys_data.extend(keys_data)
        if keys_data:
            print(f" {os.path.relpath(tscn_file, gymkhana_path)}: {len(keys_data)} keys")

    # Extract unique keys for comparison
    all_tscn_keys = {item['key'] for item in all_tscn_keys_data}
    print(f"\nTotal unique TSCN translation keys found: {len(all_tscn_keys)}")

    # Load existing translations
    print("\nLoading existing translations...")
    existing_keys = load_existing_translations(translations_path)
    print(f"Existing translation keys: {len(existing_keys)}")

    # Find missing keys for ESC files
    missing_keys = all_dialog_keys - existing_keys
    print(f"Missing ESC translation keys: {len(missing_keys)}")

    # Find missing keys for TSCN files
    tscn_missing_keys = all_tscn_keys - existing_keys
    tscn_missing_keys_data = [item for item in all_tscn_keys_data if item['key'] in tscn_missing_keys]
    print(f"Missing TSCN translation keys: {len(tscn_missing_keys)}")

    if missing_keys:
        print("\nMissing ESC keys:")
        for key in sorted(missing_keys):
            print(f" - {key}")
    if tscn_missing_keys:
        print("\nMissing TSCN keys:")
        for key in sorted(tscn_missing_keys):
            print(f" - {key}")

    # Extract default translations for missing keys
    print("\nExtracting default translations...")
    key_to_text = extract_default_translations_from_esc_files(esc_files)

    # Extract keys with only default translations
    print("\nExtracting keys with only default translations...")
    keys_with_only_default = extract_keys_with_only_default_translations(esc_files)

    # Extract untranslated texts from TSCN files
    print("\nExtracting untranslated texts from TSCN files...")
    tscn_untranslated_texts = extract_untranslated_texts_from_tscn_files(tscn_files)

    # Generate CSV output for missing translations
    print(f"\nGenerating CSV output: {missing_translations_path}")
    with open(missing_translations_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['keys', 'en', 'fr', 'es'])  # Header matching the original format
        # Add missing ESC keys
        for key in sorted(missing_keys):
            writer.writerow([key, "", "", key_to_text.get(key, "")])
        # Add missing TSCN keys. A key missing from both scans already has its
        # ESC row above; writing it again would duplicate the key in the CSV.
        for key in sorted(tscn_missing_keys - missing_keys):
            writer.writerow([key, "", "", ""])

    # Generate markdown output for keys with only default translations
    print(f"\nGenerating markdown output: {default_only_path}")
    with open(default_only_path, 'w', encoding='utf-8') as f:
        f.write("# Dialog Keys with Only Default Translations\n\n")
        f.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write("## ESC Files\n\n")
        if keys_with_only_default:
            f.write(f"Found {len(keys_with_only_default)} ESC dialog entries that only have default text (no translation key):\n\n")
            for item in keys_with_only_default:
                f.write(f"## {item['file']} (line {item['line']})\n\n")
                f.write(f"**Text**: {item['text']}\n\n")
                f.write("---\n\n")
        else:
            f.write("✅ No ESC dialog entries found with only default translations!\n")
        f.write("\n## TSCN Files\n\n")
        if tscn_untranslated_texts:
            f.write(f"Found {len(tscn_untranslated_texts)} TSCN entries with untranslated text (no translation key):\n\n")
            for item in tscn_untranslated_texts:
                f.write(f"## {item['file']} (line {item['line']})\n\n")
                f.write(f"**Type**: {item['type']}\n")
                f.write(f"**Text**: {item['text']}\n\n")
                f.write("---\n\n")
        else:
            f.write("✅ No TSCN entries found with untranslated text!\n")

    # Generate summary report
    print("\nGenerating summary report...")
    report_path = generate_markdown_report(missing_keys, keys_with_only_default, key_to_text, tscn_missing_keys_data, tscn_untranslated_texts, output_dir)

    # Summary
    print("\nSummary:")
    print(f" Total ESC dialog keys found: {len(all_dialog_keys)}")
    print(f" Total TSCN translation keys found: {len(all_tscn_keys)}")
    print(f" Existing translations: {len(existing_keys)}")
    print(f" Missing ESC translations: {len(missing_keys)}")
    print(f" Missing TSCN translations: {len(tscn_missing_keys_data)}")
    print(f" ESC keys with only default translations: {len(keys_with_only_default)}")
    print(f" TSCN untranslated texts: {len(tscn_untranslated_texts)}")
    print(f" Output directory: {output_dir}")
    print(" Files generated:")
    print(f" - {missing_translations_path}")
    print(f" - {default_only_path}")
    print(f" - {report_path}")


# Run the extraction only when the file is executed as a script.
if __name__ == "__main__":
    main()