#!/usr/bin/env python3
"""
Lex ID Reference Counter

This program parses a lex file containing quoted lex-ids and strings,
then searches through *.ep and *.pm files in a directory hierarchy to
count references to those lex-ids and prints a sorted table.

Files with "I18N" or "AdminLTE" in their pathname are excluded from the search.
Results are sorted by Weighted Score (Total Refs × File Count), highest to lowest.

Usage:
    python3 count-references.py <lex_file> <search_directory>
"""

import os
import re
import sys
import glob
from collections import defaultdict
from pathlib import Path


# One lexicon entry: a quoted lex-id, "=>", and a quoted value.  Each string
# must be closed by the same quote character that opened it, so a value such
# as "Don't panic" or 'It\'s here' is recognised.  A trailing comma is not
# required, which lets the last entry of a list be picked up as well.
_LEX_ENTRY_RE = re.compile(
    r"""
    (?P<kq>['"])                            # opening quote of the lex-id
    (?P<key>(?:\\.|(?!(?P=kq))[^\\])+)      # lex-id text (backslash escapes allowed)
    (?P=kq)                                 # same quote closes the lex-id
    \s*=>\s*
    (?P<vq>['"])                            # opening quote of the value
    (?:\\.|(?!(?P=vq))[^\\])*               # value text; may contain the other quote
    (?P=vq)                                 # same quote closes the value
    """,
    re.VERBOSE | re.DOTALL,
)

# File extensions searched for references, in the order they are globbed.
_TARGET_EXTENSIONS = ("ep", "pm")


def parse_lex_file(lex_file_path):
    """
    Parse the lex file to extract lex-ids.

    Expected format: "lex-id" => "quoted string",

    Either quote character may be used for each string. The value may contain
    the other quote character or backslash-escaped quotes, and the trailing
    comma is optional.

    Args:
        lex_file_path (str): Path of the lex file to read (UTF-8).

    Returns:
        set: A set of lex-ids (without quotes)

    Exits the program with status 1 if the file is missing or unreadable.
    """
    try:
        with open(lex_file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"Error: Lex file '{lex_file_path}' not found.")
        sys.exit(1)
    except (OSError, UnicodeError) as e:
        print(f"Error reading lex file: {e}")
        sys.exit(1)

    lex_ids = {match.group('key') for match in _LEX_ENTRY_RE.finditer(content)}
    print(f"Found {len(lex_ids)} lex-ids in {lex_file_path}")
    return lex_ids


def find_target_files(search_directory):
    """
    Find all *.ep and *.pm files in the directory hierarchy.

    Excludes files whose pathname includes "I18N" or "AdminLTE".

    Args:
        search_directory (str): Root directory to search

    Returns:
        list: List of file paths (excluding I18N and AdminLTE files)

    Exits the program with status 1 if the search directory does not exist.
    """
    if not os.path.exists(search_directory):
        print(f"Error: Search directory '{search_directory}' not found.")
        sys.exit(1)

    all_files = []
    for extension in _TARGET_EXTENSIONS:
        pattern = os.path.join(search_directory, '**', f'*.{extension}')
        # The glob also matches directories named like "Foo.pm"; keep only
        # regular files so they are neither counted nor opened later.
        all_files.extend(
            path for path in glob.glob(pattern, recursive=True)
            if os.path.isfile(path)
        )

    i18n_excluded = [path for path in all_files if "I18N" in path]
    adminlte_excluded = [path for path in all_files if "AdminLTE" in path]
    target_files = [
        path for path in all_files
        if "I18N" not in path and "AdminLTE" not in path
    ]
    excluded_count = len(all_files) - len(target_files)

    print(f"Found {len(all_files)} total files (.ep and .pm)")
    if excluded_count > 0:
        print(f"Excluded {len(i18n_excluded)} files containing 'I18N' in pathname")
        print(f"Excluded {len(adminlte_excluded)} files containing 'AdminLTE' in pathname")
        print(f"Total excluded: {excluded_count} files")
    print(f"Processing {len(target_files)} target files")

    return target_files


def count_lex_references(lex_ids, target_files):
    """
    Count references to lex-ids in target files and track file counts.

    A reference is the lex-id immediately surrounded by single or double
    quotes. Files that cannot be read produce a warning and are skipped.

    Args:
        lex_ids (set): Set of lex-ids to search for
        target_files (list): List of file paths to search in

    Returns:
        dict: Dictionary with lex-id as key and a dict containing
        'total_refs', 'file_count' and 'files' (set of paths) as value.
        Only lex-ids with at least one reference are present.
    """
    # Structure: {lex_id: {'total_refs': count, 'file_count': count, 'files': set()}}
    reference_data = defaultdict(lambda: {'total_refs': 0, 'file_count': 0, 'files': set()})

    # Compile each quoted-id pattern once instead of once per file.
    patterns = [
        (lex_id, re.compile(f"['\"]{re.escape(lex_id)}['\"]"))
        for lex_id in lex_ids
    ]

    for file_path in target_files:
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except (OSError, UnicodeError) as e:
            print(f"Warning: Could not read file {file_path}: {e}")
            continue

        for lex_id, pattern in patterns:
            # Cheap substring test: a quoted match needs the bare id to be present.
            if lex_id not in content:
                continue
            hits = len(pattern.findall(content))
            if hits:
                entry = reference_data[lex_id]
                entry['total_refs'] += hits
                entry['files'].add(file_path)
                entry['file_count'] = len(entry['files'])

    return reference_data


def print_results_table(reference_data):
    """
    Print the results in a table format, sorted by Weighted Score
    (Total Refs × File Count) highest to lowest; ties are ordered by lex-id.

    Column widths are derived from the header, every data row and the Total
    row, and the horizontal rules are exactly as wide as the rows.

    Args:
        reference_data (dict): Dictionary with lex-id as key and data dict
            ('total_refs', 'file_count', 'files') as value
    """
    if not reference_data:
        print("No references found.")
        return

    def weighted_score(data):
        return data['total_refs'] * data['file_count']

    def refs_per_file(total_refs, file_count):
        return total_refs / file_count if file_count > 0 else 0.0

    sorted_items = sorted(
        reference_data.items(),
        key=lambda item: (-weighted_score(item[1]), item[0]),
    )

    # Render every cell to text first so widths can be measured uniformly.
    header = ("Lex ID", "Total Refs", "Files", "Refs/File", "Weighted Score")
    rows = []
    total_references = 0
    total_files_with_refs = set()
    total_weighted_score = 0
    for lex_id, data in sorted_items:
        score = weighted_score(data)
        ratio = refs_per_file(data['total_refs'], data['file_count'])
        rows.append((
            lex_id,
            str(data['total_refs']),
            str(data['file_count']),
            f"{ratio:.1f}",
            str(score),
        ))
        total_references += data['total_refs']
        total_files_with_refs.update(data['files'])
        total_weighted_score += score

    overall_refs_per_file = refs_per_file(total_references, len(total_files_with_refs))
    total_row = (
        "Total",
        str(total_references),
        str(len(total_files_with_refs)),
        f"{overall_refs_per_file:.1f}",
        str(total_weighted_score),
    )

    # Include the Total row: its sums can be wider than any single data row.
    widths = [max(map(len, column)) for column in zip(header, *rows, total_row)]

    def format_row(cells):
        first, *rest = cells
        padded = [first.ljust(widths[0])]
        padded.extend(cell.rjust(width) for cell, width in zip(rest, widths[1:]))
        return " | ".join(padded)

    header_line = format_row(header)
    table_width = len(header_line)  # rules match the real row width

    print("\nReference Count Results (sorted by Weighted Score, excluding I18N and AdminLTE files):")
    print("=" * table_width)
    print(header_line)
    print("-" * table_width)
    for row in rows:
        print(format_row(row))
    print("-" * table_width)
    print(format_row(total_row))

    referenced_ids = sum(1 for data in reference_data.values() if data['total_refs'] > 0)
    print("\nSummary:")
    print(f"- Total lex-id references found: {total_references}")
    print(f"- Total unique files with references: {len(total_files_with_refs)}")
    print(f"- Total lex-ids with at least one reference: {referenced_ids}")
    print(f"- Average references per file: {overall_refs_per_file:.1f}")
    print(f"- Total weighted score: {total_weighted_score}")
    print("- Results sorted by Weighted Score (Total Refs × File Count, highest to lowest)")
    print("- Files with 'I18N' or 'AdminLTE' in pathname were excluded from search")


def main():
    """
    Main function to orchestrate the program.

    Expects exactly two command-line arguments: the lex file and the
    directory to search. Exits with status 1 on a usage error, when no
    lex-ids are found, or when no target files remain after exclusions.
    """
    if len(sys.argv) != 3:
        print("Usage: python3 count-references.py <lex_file> <search_directory>")
        print("\nExample:")
        print("  python3 count-references.py lexicon.lex /path/to/search")
        print("\nNote: Files with 'I18N' or 'AdminLTE' in their pathname will be excluded from the search.")
        print("Results are sorted by Weighted Score (Total Refs × File Count, highest to lowest).")
        sys.exit(1)

    lex_file_path = sys.argv[1]
    search_directory = sys.argv[2]

    print(f"Parsing lex file: {lex_file_path}")
    print(f"Searching directory: {search_directory}")
    print()

    # Step 1: Parse the lex file to get lex-ids
    lex_ids = parse_lex_file(lex_file_path)
    if not lex_ids:
        print("No lex-ids found in the lex file.")
        sys.exit(1)

    # Step 2: Find all target files (.ep and .pm), excluding I18N and AdminLTE files
    target_files = find_target_files(search_directory)
    if not target_files:
        print("No .ep or .pm files found in the search directory (after exclusions).")
        sys.exit(1)

    # Step 3: Count references to lex-ids in target files
    print("Counting references...")
    reference_data = count_lex_references(lex_ids, target_files)

    # Step 4: Print results table
    print_results_table(reference_data)


if __name__ == "__main__":
    main()