265 lines
10 KiB
Python
265 lines
10 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Lex ID Reference Counter
|
||
|
||
This program parses a lex file containing quoted lex-ids and strings,
|
||
then searches through *.ep and *.pm files in a directory hierarchy
|
||
to count references to those lex-ids and prints a sorted table.
|
||
Files with "I18N" or "AdminLTE" in their pathname are excluded from the search.
|
||
Results are sorted by Weighted Score (Total Refs × File Count) highest to lowest.
|
||
|
||
Usage: python3 count-references.py <lex_file> <search_directory>
|
||
"""
|
||
|
||
import os
|
||
import re
|
||
import sys
|
||
import glob
|
||
from collections import defaultdict
|
||
from pathlib import Path
|
||
|
||
|
||
def parse_lex_file(lex_file_path):
    """
    Parse the lex file and return the set of lex-ids it defines.

    Expected entry format: "lex-id" => "quoted string",
    Either quote style may delimit the id and the string. The string may
    contain the other quote character (for example "Don't") as well as
    backslash-escaped characters such as an escaped quote; entries like
    these were previously skipped, so their lex-ids were never counted.

    Args:
        lex_file_path (str): Path of the lex file to read (UTF-8).

    Returns:
        set: A set of lex-ids (without quotes)

    Exits the program with status 1 (after printing an error message) when
    the file is missing or cannot be read or decoded.
    """
    try:
        with open(lex_file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"Error: Lex file '{lex_file_path}' not found.")
        sys.exit(1)
    except (OSError, UnicodeError) as e:
        print(f"Error reading lex file: {e}")
        sys.exit(1)

    # First group: the lex-id between two quote characters (unchanged rule).
    # Value: a double- or single-quoted string that may hold the other quote
    # character and backslash escapes. The trailing comma is still required.
    # DOTALL only affects the escaped-character branch (backslash + newline).
    pattern = re.compile(
        r"""['"]([^'"]+)['"]\s*=>\s*"""
        r"""(?:"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*')\s*,""",
        re.DOTALL,
    )

    lex_ids = set(pattern.findall(content))

    print(f"Found {len(lex_ids)} lex-ids in {lex_file_path}")

    return lex_ids
|
||
|
||
|
||
def find_target_files(search_directory):
    """
    Collect every *.ep and *.pm file below a directory, minus exclusions.

    A file is dropped when its pathname contains "I18N" or "AdminLTE".
    Progress counts are printed along the way.

    Args:
        search_directory (str): Root directory to search

    Returns:
        list: List of file paths (excluding I18N and AdminLTE files)

    Exits the program with status 1 when the search directory does not exist.
    """
    if not os.path.exists(search_directory):
        print(f"Error: Search directory '{search_directory}' not found.")
        sys.exit(1)

    # Recursive glob per extension; .ep results come first, then .pm
    candidates = []
    for extension in ('*.ep', '*.pm'):
        candidates.extend(
            glob.glob(os.path.join(search_directory, '**', extension), recursive=True)
        )

    # Single pass: tally each exclusion reason and keep everything else
    kept = []
    i18n_hits = 0
    adminlte_hits = 0
    for path in candidates:
        has_i18n = "I18N" in path
        has_adminlte = "AdminLTE" in path
        if has_i18n:
            i18n_hits += 1
        if has_adminlte:
            adminlte_hits += 1
        if not (has_i18n or has_adminlte):
            kept.append(path)

    skipped = len(candidates) - len(kept)

    print(f"Found {len(candidates)} total files (.ep and .pm)")
    if skipped > 0:
        print(f"Excluded {i18n_hits} files containing 'I18N' in pathname")
        print(f"Excluded {adminlte_hits} files containing 'AdminLTE' in pathname")
        print(f"Total excluded: {skipped} files")
    print(f"Processing {len(kept)} target files")

    return kept
|
||
|
||
|
||
def count_lex_references(lex_ids, target_files):
    """
    Count references to lex-ids in target files and track file counts.

    A reference is the lex-id text enclosed by a single or double quote
    character on each side. Files that cannot be opened are reported with a
    warning and skipped.

    Args:
        lex_ids (set): Set of lex-ids to search for
        target_files (list): List of file paths to search in

    Returns:
        dict: Maps each lex-id that was found at least once to a dict with
        'total_refs' (occurrences across all files), 'file_count' (number of
        files containing it) and 'files' (set of those file paths).
    """
    # Structure: {lex_id: {'total_refs': count, 'file_count': count, 'files': set()}}
    reference_data = defaultdict(lambda: {'total_refs': 0, 'file_count': 0, 'files': set()})

    # The pattern depends only on the lex-id, so compile each one once here
    # rather than rebuilding it for every (file, lex-id) pair.
    quoted_patterns = {
        lex_id: re.compile(f"['\"]{re.escape(lex_id)}['\"]")
        for lex_id in lex_ids
    }

    for file_path in target_files:
        try:
            # errors='ignore' drops undecodable bytes, so only OS-level
            # failures (missing file, permissions, ...) can occur here.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except OSError as e:
            print(f"Warning: Could not read file {file_path}: {e}")
            continue

        for lex_id, pattern in quoted_patterns.items():
            hits = len(pattern.findall(content))
            if hits:
                entry = reference_data[lex_id]
                entry['total_refs'] += hits
                entry['files'].add(file_path)

    # Derive file counts from the collected file sets
    for entry in reference_data.values():
        entry['file_count'] = len(entry['files'])

    return reference_data
|
||
|
||
|
||
def print_results_table(reference_data):
    """
    Print the results as a table followed by a short summary.

    Rows are sorted by Weighted Score (Total Refs × File Count) from highest
    to lowest; ties are ordered by lex-id ascending. A totals row closes the
    table. The horizontal rules are exactly as wide as the table rows.

    Args:
        reference_data (dict): Dictionary with lex-id as key and a data dict
            as value; each data dict provides 'total_refs', 'file_count' and
            'files' (the set of file paths containing the lex-id).
    """
    if not reference_data:
        print("No references found.")
        return

    headers = ("Lex ID", "Total Refs", "Files", "Refs/File", "Weighted Score")

    # One tuple per lex-id: (lex_id, total_refs, file_count, refs_per_file, score)
    rows = []
    for lex_id, data in reference_data.items():
        total_refs = data['total_refs']
        file_count = data['file_count']
        refs_per_file = total_refs / file_count if file_count > 0 else 0.0
        rows.append((lex_id, total_refs, file_count, refs_per_file, total_refs * file_count))
    rows.sort(key=lambda row: (-row[4], row[0]))

    # Render every cell up front so column widths come from the printed text
    cells = [
        (lex_id, str(total_refs), str(file_count), f"{refs_per_file:.1f}", str(score))
        for lex_id, total_refs, file_count, refs_per_file, score in rows
    ]
    widths = [
        max(len(header), *(len(row[index]) for row in cells))
        for index, header in enumerate(headers)
    ]

    def format_row(values):
        # First column is left-aligned, all remaining columns right-aligned
        first, *rest = values
        parts = [f"{first:<{widths[0]}}"]
        parts.extend(f"{value:>{width}}" for value, width in zip(rest, widths[1:]))
        return " | ".join(parts)

    header_line = format_row(headers)
    # Take the rule width from the rendered header so it always matches the
    # rows (the previous fixed separator allowance overshot by 4 characters).
    table_width = len(header_line)

    # Print header
    print("\nReference Count Results (sorted by Weighted Score, excluding I18N and AdminLTE files):")
    print("=" * table_width)
    print(header_line)
    print("-" * table_width)

    # Print results
    for row_cells in cells:
        print(format_row(row_cells))

    total_references = sum(row[1] for row in rows)
    total_weighted_score = sum(row[4] for row in rows)
    total_files_with_refs = set()
    for data in reference_data.values():
        total_files_with_refs.update(data['files'])

    # Overall refs per file is taken over the unique files with any reference
    overall_refs_per_file = total_references / len(total_files_with_refs) if total_files_with_refs else 0.0

    print("-" * table_width)
    print(format_row((
        "Total",
        str(total_references),
        str(len(total_files_with_refs)),
        f"{overall_refs_per_file:.1f}",
        str(total_weighted_score),
    )))

    # Print summary
    referenced_lex_ids = sum(1 for row in rows if row[1] > 0)
    print("\nSummary:")
    print(f"- Total lex-id references found: {total_references}")
    print(f"- Total unique files with references: {len(total_files_with_refs)}")
    print(f"- Total lex-ids with at least one reference: {referenced_lex_ids}")
    print(f"- Average references per file: {overall_refs_per_file:.1f}")
    print(f"- Total weighted score: {total_weighted_score}")
    print("- Results sorted by Weighted Score (Total Refs × File Count, highest to lowest)")
    print("- Files with 'I18N' or 'AdminLTE' in pathname were excluded from search")
|
||
|
||
|
||
def main():
    """
    Command-line entry point.

    Expects exactly two arguments, the lex file and the directory to search.
    Prints usage and exits with status 1 otherwise. It then parses the lex
    file, collects the target files, counts references and prints the table.
    It exits with status 1 when no lex-ids or no target files are found.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        usage_lines = (
            "Usage: python3 count-references.py <lex_file> <search_directory>",
            "\nExample:",
            " python3 count-references.py lexicon.lex /path/to/search",
            "\nNote: Files with 'I18N' or 'AdminLTE' in their pathname will be excluded from the search.",
            "Results are sorted by Weighted Score (Total Refs × File Count, highest to lowest).",
        )
        for usage_line in usage_lines:
            print(usage_line)
        sys.exit(1)

    lex_file_path, search_directory = cli_args

    print(f"Parsing lex file: {lex_file_path}")
    print(f"Searching directory: {search_directory}")
    print()

    # Step 1: lex-ids defined in the lex file
    known_ids = parse_lex_file(lex_file_path)
    if not known_ids:
        print("No lex-ids found in the lex file.")
        sys.exit(1)

    # Step 2: .ep and .pm files to scan (I18N and AdminLTE paths excluded)
    files_to_scan = find_target_files(search_directory)
    if not files_to_scan:
        print("No .ep or .pm files found in the search directory (after exclusions).")
        sys.exit(1)

    # Step 3: count the references, then Step 4: print the table
    print("Counting references...")
    print_results_table(count_lex_references(known_ids, files_to_scan))
|
||
|
||
|
||
# Run the counter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|