# SM2Gen/count-references.py
#!/usr/bin/env python3
"""
Lex ID Reference Counter
This program parses a lex file containing quoted lex-ids and strings,
then searches through *.ep and *.pm files in a directory hierarchy
to count references to those lex-ids and prints a sorted table.
Files with "I18N" or "AdminLTE" in their pathname are excluded from the search.
Results are sorted by Weighted Score (Total Refs × File Count) highest to lowest.
Usage: python3 count-references.py <lex_file> <search_directory>
"""
import os
import re
import sys
import glob
from collections import defaultdict
from pathlib import Path
def parse_lex_file(lex_file_path):
    """
    Parse the lex file to extract lex-ids.

    Expected format: "lex-id" => "quoted string",

    Args:
        lex_file_path (str): Path to the lex file to parse.

    Returns:
        set: A set of lex-ids (without quotes).

    Exits with status 1 if the file is missing or unreadable.
    """
    lex_ids = set()
    try:
        with open(lex_file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Match "lex-id" => "quoted string", capturing the lex-id.
        # The backreferences \1 and \3 require the closing quote to match
        # the opening quote; the previous pattern ['"]...['"] also accepted
        # mismatched pairs such as 'id".
        pattern = r"""(['"])([^'"]+)\1\s*=>\s*(['"])[^'"]*\3\s*,"""
        for _open_q, lex_id, _close_q in re.findall(pattern, content):
            lex_ids.add(lex_id)
        print(f"Found {len(lex_ids)} lex-ids in {lex_file_path}")
    except FileNotFoundError:
        print(f"Error: Lex file '{lex_file_path}' not found.")
        sys.exit(1)
    except Exception as e:
        print(f"Error reading lex file: {e}")
        sys.exit(1)
    return lex_ids
def find_target_files(search_directory):
    """
    Recursively collect every *.ep and *.pm file under search_directory.

    Any path containing "I18N" or "AdminLTE" is excluded from the result.

    Args:
        search_directory (str): Root directory to search.

    Returns:
        list: Paths of the files to process (after exclusions).

    Exits with status 1 when the directory does not exist.
    """
    if not os.path.exists(search_directory):
        print(f"Error: Search directory '{search_directory}' not found.")
        sys.exit(1)
    # Gather candidates: all .ep files first, then all .pm files.
    all_files = []
    for ext in ('*.ep', '*.pm'):
        all_files.extend(glob.glob(os.path.join(search_directory, '**', ext), recursive=True))
    # Partition candidates in a single pass, remembering why each
    # excluded path was dropped (a path may match both markers).
    target_files = []
    i18n_excluded = []
    adminlte_excluded = []
    for path in all_files:
        has_i18n = "I18N" in path
        has_adminlte = "AdminLTE" in path
        if has_i18n:
            i18n_excluded.append(path)
        if has_adminlte:
            adminlte_excluded.append(path)
        if not has_i18n and not has_adminlte:
            target_files.append(path)
    excluded_count = len(all_files) - len(target_files)
    print(f"Found {len(all_files)} total files (.ep and .pm)")
    if excluded_count > 0:
        print(f"Excluded {len(i18n_excluded)} files containing 'I18N' in pathname")
        print(f"Excluded {len(adminlte_excluded)} files containing 'AdminLTE' in pathname")
        print(f"Total excluded: {excluded_count} files")
    print(f"Processing {len(target_files)} target files")
    return target_files
def count_lex_references(lex_ids, target_files):
    """
    Count references to lex-ids in target files and track file counts.

    A reference is any occurrence of the lex-id wrapped in a single or
    double quote on each side.

    Args:
        lex_ids (set): Set of lex-ids to search for.
        target_files (list): List of file paths to search in.

    Returns:
        dict: {lex_id: {'total_refs': int, 'file_count': int, 'files': set}}
    """
    # Structure: {lex_id: {'total_refs': count, 'file_count': count, 'files': set()}}
    reference_data = defaultdict(lambda: {'total_refs': 0, 'file_count': 0, 'files': set()})
    # Compile one pattern per lex-id up front. The original rebuilt the
    # pattern string for every (file, lex-id) pair inside the loop;
    # hoisting this loop-invariant work makes the scan O(files * ids)
    # findall calls with no per-iteration pattern construction.
    compiled = {
        lex_id: re.compile(f"['\"]{re.escape(lex_id)}['\"]")
        for lex_id in lex_ids
    }
    for file_path in target_files:
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
            for lex_id, pattern in compiled.items():
                matches = pattern.findall(content)
                if matches:
                    # Accumulate total hits and remember which file had them.
                    reference_data[lex_id]['total_refs'] += len(matches)
                    reference_data[lex_id]['files'].add(file_path)
        except Exception as e:
            # Best-effort scan: report unreadable files but keep going.
            print(f"Warning: Could not read file {file_path}: {e}")
            continue
    # Derive per-id file counts from the collected path sets.
    for data in reference_data.values():
        data['file_count'] = len(data['files'])
    return reference_data
def print_results_table(reference_data):
    """
    Print the results in a table format, sorted by Weighted Score
    (Total Refs × File Count) highest to lowest.

    Args:
        reference_data (dict): Dictionary with lex-id as key and a data dict
            ({'total_refs', 'file_count', 'files'}) as value.

    Returns:
        None. All output goes to stdout.
    """
    if not reference_data:
        print("No references found.")
        return
    # Calculate weighted score for each lex_id and sort by it (descending),
    # then by lex-id (ascending) for ties.
    def get_weighted_score(item):
        lex_id, data = item
        return data['total_refs'] * data['file_count']
    # Negating the score yields descending order while the lex-id
    # tiebreaker stays ascending.
    sorted_items = sorted(reference_data.items(), key=lambda x: (-get_weighted_score(x), x[0]))
    # Calculate column widths from the widest value in each column.
    max_lex_id_width = max(len(lex_id) for lex_id in reference_data.keys()) if reference_data else 0
    max_total_refs_width = max(len(str(data['total_refs'])) for data in reference_data.values()) if reference_data else 0
    max_file_count_width = max(len(str(data['file_count'])) for data in reference_data.values()) if reference_data else 0
    # Pre-render refs-per-file and weighted-score strings so their display
    # widths can be measured before printing.
    refs_per_file_values = []
    weighted_score_values = []
    for data in reference_data.values():
        if data['file_count'] > 0:
            refs_per_file = data['total_refs'] / data['file_count']
            refs_per_file_values.append(f"{refs_per_file:.1f}")
        else:
            # Guard against division by zero for ids never found in any file.
            refs_per_file_values.append("0.0")
        weighted_score = data['total_refs'] * data['file_count']
        weighted_score_values.append(str(weighted_score))
    max_refs_per_file_width = max(len(val) for val in refs_per_file_values) if refs_per_file_values else 0
    max_weighted_score_width = max(len(val) for val in weighted_score_values) if weighted_score_values else 0
    # Ensure minimum widths for headers
    lex_id_width = max(max_lex_id_width, len("Lex ID"))
    total_refs_width = max(max_total_refs_width, len("Total Refs"))
    file_count_width = max(max_file_count_width, len("Files"))
    refs_per_file_width = max(max_refs_per_file_width, len("Refs/File"))
    weighted_score_width = max(max_weighted_score_width, len("Weighted Score"))
    # Calculate total table width
    table_width = lex_id_width + total_refs_width + file_count_width + refs_per_file_width + weighted_score_width + 16 # 16 for separators
    # Print header
    print("\nReference Count Results (sorted by Weighted Score, excluding I18N and AdminLTE files):")
    print("=" * table_width)
    print(f"{'Lex ID':<{lex_id_width}} | {'Total Refs':>{total_refs_width}} | {'Files':>{file_count_width}} | {'Refs/File':>{refs_per_file_width}} | {'Weighted Score':>{weighted_score_width}}")
    print("-" * table_width)
    # Print results while accumulating grand totals for the footer row.
    total_references = 0
    total_files_with_refs = set()  # union of every file that matched any lex-id
    total_weighted_score = 0
    for lex_id, data in sorted_items:
        refs_per_file = data['total_refs'] / data['file_count'] if data['file_count'] > 0 else 0.0
        weighted_score = data['total_refs'] * data['file_count']
        print(f"{lex_id:<{lex_id_width}} | {data['total_refs']:>{total_refs_width}} | {data['file_count']:>{file_count_width}} | {refs_per_file:>{refs_per_file_width}.1f} | {weighted_score:>{weighted_score_width}}")
        total_references += data['total_refs']
        total_files_with_refs.update(data['files'])
        total_weighted_score += weighted_score
    # Calculate overall refs per file (unique files, so no double counting).
    overall_refs_per_file = total_references / len(total_files_with_refs) if total_files_with_refs else 0.0
    print("-" * table_width)
    print(f"{'Total':<{lex_id_width}} | {total_references:>{total_refs_width}} | {len(total_files_with_refs):>{file_count_width}} | {overall_refs_per_file:>{refs_per_file_width}.1f} | {total_weighted_score:>{weighted_score_width}}")
    # Print summary
    print(f"\nSummary:")
    print(f"- Total lex-id references found: {total_references}")
    print(f"- Total unique files with references: {len(total_files_with_refs)}")
    print(f"- Total lex-ids with at least one reference: {len([data for data in reference_data.values() if data['total_refs'] > 0])}")
    print(f"- Average references per file: {overall_refs_per_file:.1f}")
    print(f"- Total weighted score: {total_weighted_score}")
    print(f"- Results sorted by Weighted Score (Total Refs × File Count, highest to lowest)")
    print(f"- Files with 'I18N' or 'AdminLTE' in pathname were excluded from search")
def main():
    """Drive the pipeline: parse the lex file, find files, count, report."""
    if len(sys.argv) != 3:
        # Wrong argument count: show usage and bail out.
        usage_lines = (
            "Usage: python3 count-references.py <lex_file> <search_directory>",
            "\nExample:",
            " python3 count-references.py lexicon.lex /path/to/search",
            "\nNote: Files with 'I18N' or 'AdminLTE' in their pathname will be excluded from the search.",
            "Results are sorted by Weighted Score (Total Refs × File Count, highest to lowest).",
        )
        for line in usage_lines:
            print(line)
        sys.exit(1)
    lex_file_path, search_directory = sys.argv[1], sys.argv[2]
    print(f"Parsing lex file: {lex_file_path}")
    print(f"Searching directory: {search_directory}")
    print()
    # Step 1: Extract the lex-ids from the lexicon file.
    lex_ids = parse_lex_file(lex_file_path)
    if not lex_ids:
        print("No lex-ids found in the lex file.")
        sys.exit(1)
    # Step 2: Locate candidate .ep/.pm files (I18N / AdminLTE excluded).
    target_files = find_target_files(search_directory)
    if not target_files:
        print("No .ep or .pm files found in the search directory (after exclusions).")
        sys.exit(1)
    # Steps 3 and 4: Tally references, then print the sorted table.
    print("Counting references...")
    reference_data = count_lex_references(lex_ids, target_files)
    print_results_table(reference_data)
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()