# SM2Gen/lex_scan.py

import argparse
import json
import logging
import os
import re
import sys

# Create a custom logger
logger = logging.getLogger("lex_scan_logger")
logger.setLevel(logging.DEBUG) # Set to lowest level needed by any handler
# Formatter for both handlers
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
# File handler (INFO and above)
file_handler = logging.FileHandler('lex_scan.log')
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)
# Console handler (WARNING and above)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.WARNING)
console_handler.setFormatter(formatter)
# Add handlers to the logger
logger.addHandler(file_handler)
logger.addHandler(console_handler)
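# Paths of files that could not be found; reported at the end of main()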
missing_files = []

def validate_panel_name(panel_name):
    if not panel_name or not panel_name[0].isupper():
        logger.error(f"Error: Panel name '{panel_name}' must start with a capital letter.")
        sys.exit(1)

def get_full_base_path(system):
    return os.path.expanduser(os.path.join("~", system, "usr", "share", "smanager"))
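# Illustrative example: get_full_base_path("SME11") returns something like
# /home/<user>/SME11/usr/share/smanager after "~" expansion.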

def check_controller_file_exists(system, panel):
    full_base_path = get_full_base_path(system)
    controller_path = os.path.join(full_base_path, "lib/SrvMngr/Controller", f"{panel}.pm")
    if not os.path.exists(controller_path):
        logger.error(f"Error: Controller file '{controller_path}' does not exist.")
        sys.exit(1)
    return controller_path

def extract_title_prefix(controller_path):
    prefix = None
    with open(controller_path, 'r') as f:
        content = f.read()
    # Regex: my $title = $c->l('<prefix>_<anything>');
    match = re.search(
        r"my\s*\$title\s*=\s*\$c->l\(\s*['\"]([A-Za-z]{2,10})_[^'\"]+['\"]\s*\)",
        content
    )
    if match:
        prefix = match.group(1)
        logger.info(f"Extracted prefix: {prefix}")
    else:
        logger.error(
            f"Error: Could not find title prefix in '{controller_path}'.\n"
            "Expected format: my $title = $c->l('<prefix>_something')"
        )
        sys.exit(1)
    return prefix
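# Illustrative example (the identifier is hypothetical): a controller line like
#     my $title = $c->l('bac_Backup_configuration');
# yields the prefix "bac".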

def find_matching_files_variable_part(input_string, directory):
    # Extract the first alphanumeric part from the input string
    match = re.match(r"([A-Za-z0-9]+)", input_string)
    if not match:
        return []
    variable_part = match.group(1)
    # Try matching the full variable_part, then progressively drop trailing characters
    for length in range(len(variable_part), 1, -1):
        sub_part = variable_part[:length]
        matching_files = []
        for fname in os.listdir(directory):
            name, ext = os.path.splitext(fname)
            if name.startswith(sub_part):  # match with extra characters allowed
                matching_files.append(os.path.join(directory, fname))
        if matching_files:
            return matching_files
    return []
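# Illustrative example (filenames hypothetical): with backup.html.ep and
# backup2.html.ep present, find_matching_files_variable_part("backup", directory)
# returns the full paths of both files.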

def scan_application_files(system, panel, prefix, scan_general=False):
    extracted_strings = {}
    full_base_path = get_full_base_path(system)
    # Controller file
    controller_path = os.path.join(full_base_path, "lib/SrvMngr/Controller", f"{panel}.pm")
    logger.info(f"Scanning controller file: {controller_path}")
    scan_file_for_lexical_strings(controller_path, prefix, extracted_strings, scan_general)
    # Controller custom-code file
    controller_custom_path = os.path.join(full_base_path, "lib/SrvMngr/Controller", f"{panel}-Custom.pm")
    logger.info(f"Scanning custom controller file: {controller_custom_path}")
    scan_file_for_lexical_strings(controller_custom_path, prefix, extracted_strings, scan_general)
    # Template files
    themes = ["default", "AdminLTE"]
    for theme in themes:
        template_base_path = os.path.join(full_base_path, "themes", theme, "templates")
        if panel in ['Backup', 'Yum', 'Bugreport']:
            # Find the extra layout-type files these panels use (they have no partials);
            # find_matching_files_variable_part already returns full paths
            template_files = find_matching_files_variable_part(panel.lower(), template_base_path)
            for panel_template_path in template_files:
                logger.info(f"Scanning panel template file: {panel_template_path}")
                scan_file_for_lexical_strings(panel_template_path, prefix, extracted_strings, scan_general)
        else:
            panel_template_path = os.path.join(template_base_path, f"{panel.lower()}.html.ep")
            logger.info(f"Scanning panel template file: {panel_template_path}")
            scan_file_for_lexical_strings(panel_template_path, prefix, extracted_strings, scan_general)
        # Scan partials
        partials_dir = os.path.join(template_base_path, "partials")
        if os.path.exists(partials_dir):
            for filename in os.listdir(partials_dir):
                # Only scan partial files that match the pattern _<prefix>_<anything>.html.ep
                if filename.startswith(f"_{prefix.lower()}_") and filename.endswith(".html.ep"):
                    partial_path = os.path.join(partials_dir, filename)
                    logger.info(f"Scanning partial template file: {partial_path}")
                    scan_file_for_lexical_strings(partial_path, prefix, extracted_strings, scan_general)
    # Deduplicate lists of dicts in extracted_strings
    for key, value in extracted_strings.items():
        if isinstance(value, list) and value and isinstance(value[0], dict):
            # Deduplicate a list of dicts using JSON serialisation as a hashable key
            seen = set()
            deduped = []
            for d in value:
                ser = json.dumps(d, sort_keys=True)
                if ser not in seen:
                    seen.add(ser)
                    deduped.append(d)
            extracted_strings[key] = deduped
    return extracted_strings

def scan_file_for_lexical_strings(filepath, prefix, extracted_strings_dict, scan_general):
    if not os.path.exists(filepath):
        logger.warning(f"Missing file: {filepath}")
        missing_files.append(filepath)
        return
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    # Either scan for strings with the prefix or for general ones without it,
    # and check for dis-allowed characters
    allowed_pattern = r"[A-Za-z0-9_\-/%:,()\. @]+"
    if scan_general:
        pattern = re.compile(r"l[\s(]['\"](.*?)['\"]\)")
        found_strings1 = pattern.findall(content)
        pattern_l_call = re.compile(r"\bl\s*(['\"])(.+?)\1")
        found_l_calls = [match[1] for match in pattern_l_call.findall(content)]
        found_strings = found_strings1 + found_l_calls
        for s in found_strings:
            # Ignore strings that start with the prefix (with underscore)
            if s.startswith(f"{prefix}_"):
                continue
            s = s.replace(" ", "_")
            if re.fullmatch(allowed_pattern, s):
                if s not in extracted_strings_dict:
                    extracted_strings_dict[s] = []
                if filepath not in extracted_strings_dict[s]:
                    extracted_strings_dict[s].append(filepath)
            else:
                logger.error(f"Unexpected chars ({s}) found in {filepath}")
    else:
        pattern = re.compile(
            rf"(['\"])"             # opening quote
            rf"({prefix}_"          # prefix and underscore
            rf"(?:\\.|(?!\1).)*?)"  # non-greedy: escaped char, or any char that is not the closing quote
            rf"\1"                  # closing quote (same as opening)
        )
        found_strings = [m.group(2) for m in pattern.finditer(content)]
        for s in found_strings:
            if re.fullmatch(allowed_pattern, s):
                if s not in extracted_strings_dict:
                    extracted_strings_dict[s] = []
                if filepath not in extracted_strings_dict[s]:
                    extracted_strings_dict[s].append(filepath)
            else:
                logger.error(f"Unexpected chars ({s}) found in {filepath}")
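# Illustrative example (the key is hypothetical): with prefix "bac", a template
# fragment such as
#     <%= l('bac_backup_desc') %>
# adds 'bac_backup_desc' to extracted_strings_dict, mapped to the list of files
# it was seen in.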

def read_lex_file(filepath):
    logger.info(f"Reading file: {filepath}")
    lex_data = {}
    if not os.path.exists(filepath):
        logger.warning(f"Missing lex file: {filepath}")
        missing_files.append(filepath)
        return lex_data
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    # Regex handles single/double quotes and escaped quotes in the value
    pattern = r"""
        (['"])(.*?)\1                # key in quotes
        \s*=>\s*
        (['"])((?:\\.|(?!\3).)*)\3   # value in quotes, allowing escaped chars
    """
    matches = re.findall(pattern, content, re.DOTALL | re.VERBOSE)
    for _, key, quote, value in matches:
        # Unescape the quote character and backslashes in the value
        value = value.replace(f"\\{quote}", quote).replace("\\\\", "\\")
        lex_data[key] = value
    return lex_data
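# Lex files are Perl-style hash entries, one per line, in the format that
# write_lex_file() below emits, e.g. (hypothetical key):
#     'bac_backup_desc' => 'Backup configuration',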

def write_lex_file(filepath, lex_data):
    """
    Writes a dictionary to a lex file, sorted alphabetically by key
    (case-insensitive). Single quotes in values are mapped to double quotes
    so the single-quoted output remains valid.
    """
    sorted_items = sorted(lex_data.items(), key=lambda item: item[0].lower())
    with open(filepath, 'w', encoding='utf-8') as f:
        for key, value in sorted_items:
            value = value.replace("'", '"')
            f.write(f"'{key}' => '{value}',\n")

def read_languages_json(filepath):
    if not os.path.exists(filepath):
        missing_files.append(filepath)
        return []  # Return an empty list instead of exiting
    with open(filepath, 'r') as f:
        languages = json.load(f)
    return languages
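# Assumed structure of Templates/languages.json (only the "code" key is used here):
#     [{"code": "fr"}, {"code": "de"}, ...]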

def update_file_with_new_lexical_string(filepath, old_string, new_string):
    try:
        with open(filepath, 'r') as f:
            content = f.read()
        new_content = content.replace(old_string, new_string)
        with open(filepath, 'w') as f:
            f.write(new_content)
        logger.info(f"Updated '{old_string}' to '{new_string}' in file: {filepath}")
    except Exception as e:
        logger.error(f"Error updating file {filepath}: {e}")

def export_sorted_missing_lex(input_file1, input_file2, output_file):
    """
    Reads two lex files, finds all entries in input_file1 missing from input_file2,
    sorts them alphabetically by key (case-insensitive), and writes them to output_file.
    """
    dict1 = read_lex_file(input_file1)
    dict2 = read_lex_file(input_file2)
    # Keys in input_file1 but not in input_file2
    missing_keys = set(dict1.keys()) - set(dict2.keys())
    sorted_missing_keys = sorted(missing_keys, key=lambda x: x.lower())
    with open(output_file, 'w', encoding='utf-8') as out:
        for k in sorted_missing_keys:
            out.write(f"'{k}' => '{dict1[k]}',\n")
    logger.info(f"Missing lines written to {output_file}")
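# Illustrative usage (filenames hypothetical):
#     export_sorted_missing_lex("backup_en.lex.bak", "backup_en.lex.new",
#                               "backup_en.lex.diff")
# writes the entries that were dropped from the regenerated file.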

def main():
    parser = argparse.ArgumentParser(description="Scan Mojolicious application files for lexical strings.")
    parser.add_argument("-p", "--panel", required=True, help="Name of the Mojolicious panel (e.g., MyPanel).")
    parser.add_argument("-s", "--system", default="SME11", help="System name (default: SME11).")
    parser.add_argument("-e", "--edit", action="store_true", help="Enable editing of original files (default: False).")
    parser.add_argument("-l", "--lang", action="store_true", help="Enable other language processing (default: False).")
    args = parser.parse_args()
    panel = args.panel
    system = args.system
    edit_files = args.edit
    do_lang = args.lang
    logger.warning(f"Lex scan for panel: {panel}, system: {system}, edit: {edit_files}, lang: {do_lang}\n")
    validate_panel_name(panel)
    controller_path = check_controller_file_exists(system, panel)
    prefix = extract_title_prefix(controller_path)
    if prefix:
        logger.info(f"Scanning application files for strings with prefix '{prefix}'...")
        extracted_panel_strings = scan_application_files(system, panel, prefix)
        logger.info(f"Deduplicated extracted panel strings: {len(extracted_panel_strings)} unique strings found.")
        # Process the panel-specific English lexical file; output goes to the
        # current working directory
        panel_lex_output_dir = os.path.join(os.getcwd(), "output", panel.capitalize())
        os.makedirs(panel_lex_output_dir, exist_ok=True)
        full_base_path = get_full_base_path(system)
        en_lex_path = os.path.join(full_base_path, "lib/SrvMngr/I18N/Modules", panel, f"{panel.lower()}_en.lex.bak")
        en_lex_new_path = os.path.join(panel_lex_output_dir, f"{panel.lower()}_en.lex.new")
        en_lex_data = read_lex_file(en_lex_path)
        logger.info(f"Original English lex file lines: {len(en_lex_data)}")
        new_en_lex_data = {}
        for lex_string in extracted_panel_strings.keys():
            if lex_string in en_lex_data:
                new_en_lex_data[lex_string] = en_lex_data[lex_string]
            else:
                # Derive a plausible message from the lex string id: drop the
                # prefix, replace underscores with spaces, and sentence-case it.
                sometext = lex_string.replace(f"{prefix}_", "").replace("_", " ")
                words = sometext.split()
                if words:
                    words = [words[0].capitalize()] + [w.lower() for w in words[1:]]
                sometext = ' '.join(words)
                new_en_lex_data[lex_string] = sometext
        write_lex_file(en_lex_new_path, new_en_lex_data)
        logger.info(f"Generated {en_lex_new_path}. Lines in new file: {len(new_en_lex_data)}, lines in original file: {len(en_lex_data)}")
        # Create a file of the entries that are not in the new lex file
        output_diff_file = os.path.join(panel_lex_output_dir, f"{panel.lower()}_en.lex.diff")
        export_sorted_missing_lex(en_lex_path, en_lex_new_path, output_diff_file)
        languages = []  # Also used by the edit_files block below, even when --lang is off
        if do_lang:
            languages_json_path = os.path.join(".", "Templates", "languages.json")
            languages = read_languages_json(languages_json_path)
            for lang_entry in languages:
                lang_code = lang_entry["code"]
                if lang_code == "en":
                    continue
                lang_lex_path = os.path.join(full_base_path, "lib/SrvMngr/I18N/Modules", panel, f"{panel.lower()}_{lang_code}.lex")
                lang_lex_new_path = os.path.join(panel_lex_output_dir, f"{panel.lower()}_{lang_code}.lex.new")
                lang_lex_data = read_lex_file(lang_lex_path)
                logger.info(f"Original {lang_code} lex file lines: {len(lang_lex_data)}")
                new_lang_lex_data = {}
                for lex_string in extracted_panel_strings.keys():
                    if lex_string in lang_lex_data:
                        new_lang_lex_data[lex_string] = lang_lex_data[lex_string]
                    else:
                        # Fall back to the (possibly generated) English text
                        new_lang_lex_data[lex_string] = new_en_lex_data.get(lex_string, "")
                write_lex_file(lang_lex_new_path, new_lang_lex_data)
                logger.info(f"Generated {lang_lex_new_path}. Lines in new file: {len(new_lang_lex_data)}, lines in original file: {len(lang_lex_data)}")
                logger.info("")
        logger.info("Scanning application files for general lexical strings...")
        extracted_general_strings = scan_application_files(system, panel, prefix, scan_general=True)
        logger.info(f"Deduplicated extracted general strings: {len(extracted_general_strings)} unique strings found.")
        general_lex_output_dir = os.path.join(os.getcwd(), "output", "General")
        os.makedirs(general_lex_output_dir, exist_ok=True)
        general_en_lex_path_orig = os.path.join(full_base_path, "lib/SrvMngr/I18N/Modules", "General", "general_en.lex.bak")
        general_en_lex_new_path = os.path.join(general_lex_output_dir, "general_en.lex.new")
        general_en_lex_data_orig = read_lex_file(general_en_lex_path_orig)
        logger.info(f"Original general English lex file lines: {len(general_en_lex_data_orig)}")
        # Seed from any previously generated file so repeated runs accumulate entries
        new_general_en_lex_data = read_lex_file(general_en_lex_new_path) if os.path.exists(general_en_lex_new_path) else {}
        for lex_string in extracted_general_strings.keys():
            if lex_string in general_en_lex_data_orig:
                new_general_en_lex_data[lex_string] = general_en_lex_data_orig[lex_string]
            else:
                sometext = lex_string.replace("_", " ").replace("'", '"')
                words = sometext.split()
                if words:
                    words = [words[0].capitalize()] + [w.lower() for w in words[1:]]
                sometext = ' '.join(words)
                new_general_en_lex_data[lex_string] = sometext
        write_lex_file(general_en_lex_new_path, new_general_en_lex_data)
        logger.info(f"Generated {general_en_lex_new_path}. Lines in new file: {len(new_general_en_lex_data)}, lines in original file: {len(general_en_lex_data_orig)}")
        logger.info("")
        if do_lang:
            for lang_entry in languages:
                lang_code = lang_entry["code"]
                if lang_code == "en":
                    continue
                general_lang_lex_path = os.path.join(full_base_path, "lib/SrvMngr/I18N/Modules", "General", f"general_{lang_code}.lex")
                general_lang_lex_new_path = os.path.join(general_lex_output_dir, f"general_{lang_code}.lex.new")
                general_lang_lex_data = read_lex_file(general_lang_lex_path)
                logger.info(f"Original general {lang_code} lex file lines: {len(general_lang_lex_data)}")
                new_general_lang_lex_data = {}
                for lex_string in extracted_general_strings.keys():
                    if lex_string in general_lang_lex_data:
                        new_general_lang_lex_data[lex_string] = general_lang_lex_data[lex_string]
                    else:
                        new_general_lang_lex_data[lex_string] = new_general_en_lex_data.get(lex_string, "")
                write_lex_file(general_lang_lex_new_path, new_general_lang_lex_data)
                logger.info(f"Generated {general_lang_lex_new_path}. Lines in new file: {len(new_general_lang_lex_data)}, lines in original file: {len(general_lang_lex_data)}")
                logger.info("")
        if edit_files:
            logger.info("Handling single-word lexical strings...")
            for lex_string, filepaths in extracted_panel_strings.items():
                if lex_string.startswith(f"{prefix}_"):
                    sometext_part = lex_string[len(prefix) + 1:]
                    if "_" not in sometext_part:
                        # Single-word id: move it to the General lex files and
                        # rewrite the application files to use the bare word
                        just_one_word = sometext_part
                        if just_one_word not in new_general_en_lex_data:
                            new_general_en_lex_data[just_one_word] = just_one_word
                            logger.info(f"Added '{just_one_word}' to {general_en_lex_new_path}")
                            write_lex_file(general_en_lex_new_path, new_general_en_lex_data)
                        for lang_entry in languages:
                            lang_code = lang_entry["code"]
                            if lang_code == "en":
                                continue
                            general_lang_lex_new_path = os.path.join(general_lex_output_dir, f"general_{lang_code}.lex.new")
                            current_general_lang_lex_data = read_lex_file(general_lang_lex_new_path)
                            if just_one_word not in current_general_lang_lex_data:
                                current_general_lang_lex_data[just_one_word] = just_one_word
                                write_lex_file(general_lang_lex_new_path, current_general_lang_lex_data)
                                logger.info(f"Added '{just_one_word}' to {general_lang_lex_new_path}")
                        for filepath in filepaths:
                            update_file_with_new_lexical_string(filepath, lex_string, just_one_word)
    else:
        logger.error("Could not determine prefix, exiting.")
        sys.exit(1)
    if missing_files:
        logger.warning("The following files were not found:")
        for f in missing_files:
            logger.warning(f"- {f}")


if __name__ == "__main__":
    main()