#!/usr/bin/env python3
"""Audit lexical (i18n) strings for a Mojolicious smanager panel.

Scans the panel's controller (.pm) and theme templates (.html.ep) for
localisation string IDs, then regenerates the per-language .lex files
(written as *.lex.new under ./output/) so they contain exactly the set
of strings the panel actually uses.
"""

import argparse
import json
import logging
import os
import re
import sys

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

SYSTEM_BASE_PATH = "/usr/share/smanager/"
# NOTE(review): "I128N" looks like a typo for the usual "I18N" -- confirm
# against the installed directory tree before renaming.
LEX_SUBDIR = "lib/SrvMngr/I128N"


def validate_panel_name(panel_name):
    """Exit(1) unless the panel name starts with a capital letter."""
    if not panel_name[0].isupper():
        logging.error(f"Error: Panel name '{panel_name}' must start with a capital letter.")
        sys.exit(1)


def check_controller_file_exists(system, panel):
    """Return the controller path for *panel*, exiting if it does not exist.

    *system* is currently unused; kept for interface compatibility.
    """
    controller_path = os.path.join(SYSTEM_BASE_PATH, "lib/SrvMngr/Controller", f"{panel}.pm")
    if not os.path.exists(controller_path):
        logging.error(f"Error: Controller file '{controller_path}' does not exist.")
        sys.exit(1)
    return controller_path


def extract_title_prefix(controller_path):
    """Extract the 2-4 letter lexical prefix from the controller's title line.

    Looks for: my $title = $c->l('<prefix>_...') with either quote style.
    Exits with an error if no prefix can be found.
    """
    with open(controller_path, 'r', encoding='utf-8') as f:
        content = f.read()
    # ['\"] matches either quote; the original class [\"|\"] was wrong -- it
    # matched only '"' and '|', never a single quote.
    match = re.search(r"my \$title = \$c->l\(['\"]([A-Za-z]{2,4})_.*?\)", content)
    if not match:
        logging.error(
            f"Error: Could not find title prefix in '{controller_path}'. "
            "Expected format: my $title = $c->l(\"prefix_something\") "
            "or my $title = $c->l('prefix_something')"
        )
        sys.exit(1)
    prefix = match.group(1)
    logging.info(f"Extracted prefix: {prefix}")
    return prefix


def scan_application_files(system, panel, prefix, scan_general=False):
    """Scan the panel's controller, templates and partials for lexical strings.

    Returns a dict mapping each found string to the list of files it appears in.
    """
    extracted_strings = {}

    # Controller file
    controller_path = os.path.join(SYSTEM_BASE_PATH, "lib/SrvMngr/Controller", f"{panel}.pm")
    logging.info(f"Scanning controller file: {controller_path}")
    scan_file_for_lexical_strings(controller_path, prefix, extracted_strings, scan_general)

    # Template files, per theme
    for theme in ("default", "AdminLTE"):
        template_base_path = os.path.join(SYSTEM_BASE_PATH, "themes", theme, "templates")
        panel_template_path = os.path.join(template_base_path, f"{panel.lower()}.html.ep")
        logging.info(f"Scanning panel template file: {panel_template_path}")
        scan_file_for_lexical_strings(panel_template_path, prefix, extracted_strings, scan_general)

        # Partial templates shipped with this theme
        partials_dir = os.path.join(template_base_path, "partials")
        if os.path.exists(partials_dir):
            for filename in os.listdir(partials_dir):
                if filename.endswith(".html.ep"):
                    partial_path = os.path.join(partials_dir, filename)
                    logging.info(f"Scanning partial template file: {partial_path}")
                    scan_file_for_lexical_strings(partial_path, prefix, extracted_strings, scan_general)

    return extracted_strings


def scan_file_for_lexical_strings(filepath, prefix, extracted_strings_dict, scan_general):
    """Collect lexical strings from *filepath* into *extracted_strings_dict*.

    With scan_general=False, collects bare <prefix>_identifier tokens.
    With scan_general=True, collects the argument of any l('...')/l("...")
    call that does NOT carry the panel prefix.
    Each dict value is the deduplicated list of files the string was seen in.
    """
    if not os.path.exists(filepath):
        logging.warning(f"Warning: File not found: {filepath}")
        return

    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    if scan_general:
        # l('...') or l("...") -- the original [\"|\"] classes never matched
        # single-quoted strings.
        pattern = re.compile(r"l[\s(]['\"](.*?)['\"]\)")
        found_strings = [s for s in pattern.findall(content)
                         if not s.startswith(f"{prefix}_")]
    else:
        # Bare <prefix>_identifier tokens.
        pattern = re.compile(rf"{prefix}_[a-zA-Z0-9_]+")
        found_strings = pattern.findall(content)

    for s in found_strings:
        files = extracted_strings_dict.setdefault(s, [])
        if filepath not in files:
            files.append(filepath)


def read_lex_file(filepath):
    """Parse a .lex file of 'key' => 'value' lines into a dict.

    Missing files are tolerated (returns an empty dict with a warning).
    """
    lex_data = {}
    if not os.path.exists(filepath):
        logging.warning(f"Lex file not found: {filepath}. Returning empty dictionary.")
        return lex_data
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            match = re.match(r"'(.*?)' => '(.*)'", line.strip())
            if match:
                key, value = match.groups()
                lex_data[key] = value
    return lex_data


def write_lex_file(filepath, lex_data):
    """Write *lex_data* back out as 'key' => 'value' lines."""
    with open(filepath, 'w', encoding='utf-8') as f:
        for key, value in lex_data.items():
            f.write(f"'{key}' => '{value}'\n")


def read_languages_json(filepath):
    """Load the languages.json list; exit(1) if the file is missing."""
    if not os.path.exists(filepath):
        logging.error(f"Error: languages.json file not found at {filepath}")
        sys.exit(1)
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)


def update_file_with_new_lexical_string(filepath, old_string, new_string):
    """Replace every occurrence of *old_string* with *new_string* in *filepath*."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        new_content = content.replace(old_string, new_string)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(new_content)
        logging.info(f"Updated '{old_string}' to '{new_string}' in file: {filepath}")
    except Exception as e:
        logging.error(f"Error updating file {filepath}: {e}")


def build_new_lex_data(wanted_keys, existing, fallback):
    """Map each key in *wanted_keys* to its existing translation, else fallback(key)."""
    return {k: existing[k] if k in existing else fallback(k) for k in wanted_keys}


def main():
    parser = argparse.ArgumentParser(description="Scan Mojolicious application files for lexical strings.")
    parser.add_argument("-p", "--panel", required=True, help="Name of the Mojolicious panel (e.g., MyPanel).")
    parser.add_argument("-s", "--system", default="SME11", help="System name (default: SME11).")
    args = parser.parse_args()

    panel = args.panel
    system = args.system

    logging.info(f"Starting scan for panel: {panel}, system: {system}")

    validate_panel_name(panel)
    controller_path = check_controller_file_exists(system, panel)
    prefix = extract_title_prefix(controller_path)
    if not prefix:
        logging.error("Could not determine prefix, exiting.")
        sys.exit(1)

    logging.info(f"Scanning application files for strings with prefix '{prefix}'...")
    extracted_panel_strings = scan_application_files(system, panel, prefix)
    logging.info(f"Deduplicated extracted panel strings: {len(extracted_panel_strings)} unique strings found.")

    # --- Panel-specific English lex file -----------------------------------
    # NOTE(review): panel.capitalize() lowercases everything after the first
    # letter ("MyPanel" -> "Mypanel") -- confirm this matches the on-disk
    # directory naming.
    panel_lex_output_dir = os.path.join(os.getcwd(), "output", panel.capitalize())
    os.makedirs(panel_lex_output_dir, exist_ok=True)

    en_lex_path = os.path.join(SYSTEM_BASE_PATH, LEX_SUBDIR, panel.capitalize(), f"{panel.lower()}_en.lex")
    en_lex_new_path = os.path.join(panel_lex_output_dir, f"{panel.lower()}_en.lex.new")

    en_lex_data = read_lex_file(en_lex_path)
    # Fallback: strip the prefix and map underscores to spaces.
    new_en_lex_data = build_new_lex_data(
        extracted_panel_strings.keys(), en_lex_data,
        lambda k: k.replace(f"{prefix}_", "").replace("_", " "))
    write_lex_file(en_lex_new_path, new_en_lex_data)
    logging.info(f"Generated {en_lex_new_path}. Lines in new file: {len(new_en_lex_data)}, Lines in original file: {len(en_lex_data)}")

    # --- languages.json -----------------------------------------------------
    languages_json_path = os.path.join(SYSTEM_BASE_PATH, "Templates", "languages.json")  # Placeholder path
    try:
        languages = read_languages_json(languages_json_path)
    except SystemExit:
        logging.warning(f"Could not read languages.json from {languages_json_path}. Skipping language-specific lexical file processing.")
        languages = []  # Empty list skips the per-language loops below

    # --- Panel-specific other-language lex files ---------------------------
    for lang_entry in languages:
        lang_code = lang_entry["code"]
        if lang_code == "en":  # English already processed
            continue

        lang_lex_path = os.path.join(SYSTEM_BASE_PATH, LEX_SUBDIR, panel.capitalize(), f"{panel.lower()}_{lang_code}.lex")
        lang_lex_new_path = os.path.join(panel_lex_output_dir, f"{panel.lower()}_{lang_code}.lex.new")

        lang_lex_data = read_lex_file(lang_lex_path)
        # Fallback: copy the (possibly generated) English text.
        new_lang_lex_data = build_new_lex_data(
            extracted_panel_strings.keys(), lang_lex_data,
            lambda k: new_en_lex_data.get(k, ""))
        write_lex_file(lang_lex_new_path, new_lang_lex_data)
        logging.info(f"Generated {lang_lex_new_path}. Lines in new file: {len(new_lang_lex_data)}, Lines in original file: {len(lang_lex_data)}")

    # --- General (non-prefixed) strings ------------------------------------
    logging.info("Scanning application files for general lexical strings...")
    extracted_general_strings = scan_application_files(system, panel, prefix, scan_general=True)
    logging.info(f"Deduplicated extracted general strings: {len(extracted_general_strings)} unique strings found.")

    general_lex_output_dir = os.path.join(os.getcwd(), "output", "general")
    os.makedirs(general_lex_output_dir, exist_ok=True)

    general_en_lex_path = os.path.join(SYSTEM_BASE_PATH, LEX_SUBDIR, "general", "general_en.lex")
    general_en_lex_new_path = os.path.join(general_lex_output_dir, "general_en.lex.new")

    general_en_lex_data = read_lex_file(general_en_lex_path)
    new_general_en_lex_data = build_new_lex_data(
        extracted_general_strings.keys(), general_en_lex_data,
        lambda k: k.replace("_", " "))
    write_lex_file(general_en_lex_new_path, new_general_en_lex_data)
    logging.info(f"Generated {general_en_lex_new_path}. Lines in new file: {len(new_general_en_lex_data)}, Lines in original file: {len(general_en_lex_data)}")

    # --- General other-language lex files ----------------------------------
    for lang_entry in languages:
        lang_code = lang_entry["code"]
        if lang_code == "en":
            continue

        general_lang_lex_path = os.path.join(SYSTEM_BASE_PATH, LEX_SUBDIR, "general", f"general_{lang_code}.lex")
        general_lang_lex_new_path = os.path.join(general_lex_output_dir, f"general_{lang_code}.lex.new")

        general_lang_lex_data = read_lex_file(general_lang_lex_path)
        new_general_lang_lex_data = build_new_lex_data(
            extracted_general_strings.keys(), general_lang_lex_data,
            lambda k: new_general_en_lex_data.get(k, ""))
        write_lex_file(general_lang_lex_new_path, new_general_lang_lex_data)
        logging.info(f"Generated {general_lang_lex_new_path}. Lines in new file: {len(new_general_lang_lex_data)}, Lines in original file: {len(general_lang_lex_data)}")

    # --- Single-word panel strings -----------------------------------------
    # A "<prefix>_word" ID whose remainder has no underscore is demoted to
    # the general lexicon as plain "word", and source files are rewritten.
    logging.info("Handling single-word lexical strings...")
    for lex_string, filepaths in extracted_panel_strings.items():
        if not lex_string.startswith(f"{prefix}_"):
            continue
        sometext_part = lex_string[len(prefix) + 1:]
        if "_" in sometext_part:  # more than one word after the prefix
            continue
        just_one_word = sometext_part

        # Ensure the word exists in general_en.lex.new.
        if just_one_word not in new_general_en_lex_data:
            new_general_en_lex_data[just_one_word] = just_one_word
            logging.info(f"Added '{just_one_word}' to {general_en_lex_new_path}")
            write_lex_file(general_en_lex_new_path, new_general_en_lex_data)

        # Ensure the word exists in every other general_<lang>.lex.new.
        for lang_entry in languages:
            lang_code = lang_entry["code"]
            if lang_code == "en":
                continue
            general_lang_lex_new_path = os.path.join(general_lex_output_dir, f"general_{lang_code}.lex.new")

            current_general_lang_lex_data = read_lex_file(general_lang_lex_new_path)  # Read the .new file
            if just_one_word not in current_general_lang_lex_data:
                current_general_lang_lex_data[just_one_word] = just_one_word  # Assuming same value for now
                write_lex_file(general_lang_lex_new_path, current_general_lang_lex_data)
                logging.info(f"Added '{just_one_word}' to {general_lang_lex_new_path}")

        # Rewrite the source files to use the shortened ID.
        for filepath in filepaths:
            update_file_with_new_lexical_string(filepath, lex_string, just_one_word)


if __name__ == "__main__":
    main()