# SM2Gen/count-references.py
#!/usr/bin/env python3
"""
Lex ID Reference Counter
This program parses a lex file containing quoted lex-ids and strings,
then searches through *.ep and *.pm files in a directory hierarchy
to count references to those lex-ids and prints a sorted table.
Files with "I18N" or "AdminLTE" in their pathname are excluded from the search.
Results are sorted by Weighted Score (Total Refs × File Count) highest to lowest.
Usage: python3 count-references.py <lex_file> <search_directory>
"""
import os
import re
import sys
import glob
from collections import defaultdict
from pathlib import Path
def parse_lex_file(lex_file_path):
    """
    Parse the lex file to extract lex-ids.

    Expected format: "lex-id" => "quoted string",

    Args:
        lex_file_path (str): Path to the lex file to parse.

    Returns:
        set: A set of lex-ids (without quotes).

    Exits with status 1 if the file is missing or unreadable.
    """
    lex_ids = set()
    try:
        with open(lex_file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Match "lex-id" => "quoted string", capturing the lex-id.
        # The backreferences \1 and \3 require the closing quote to match
        # the opening quote; the previous pattern ['"]...['"] also accepted
        # mismatched pairs such as 'id".
        pattern = r"""(['"])([^'"]+)\1\s*=>\s*(['"])[^'"]*\3\s*,"""
        for _open_q, lex_id, _close_q in re.findall(pattern, content):
            lex_ids.add(lex_id)
        print(f"Found {len(lex_ids)} lex-ids in {lex_file_path}")
    except FileNotFoundError:
        print(f"Error: Lex file '{lex_file_path}' not found.")
        sys.exit(1)
    except Exception as e:
        print(f"Error reading lex file: {e}")
        sys.exit(1)
    return lex_ids
def find_target_files(search_directory):
    """
    Recursively collect every *.ep and *.pm file under search_directory.

    Any path containing "I18N" or "AdminLTE" is excluded from the result.

    Args:
        search_directory (str): Root directory to search.

    Returns:
        list: Paths of the files to process (after exclusions).

    Exits with status 1 when the directory does not exist.
    """
    if not os.path.exists(search_directory):
        print(f"Error: Search directory '{search_directory}' not found.")
        sys.exit(1)
    # Gather candidates: all .ep files first, then all .pm files.
    all_files = []
    for ext in ('*.ep', '*.pm'):
        all_files.extend(glob.glob(os.path.join(search_directory, '**', ext), recursive=True))
    # Partition candidates in a single pass, remembering why each
    # excluded path was dropped (a path may match both markers).
    target_files = []
    i18n_excluded = []
    adminlte_excluded = []
    for path in all_files:
        has_i18n = "I18N" in path
        has_adminlte = "AdminLTE" in path
        if has_i18n:
            i18n_excluded.append(path)
        if has_adminlte:
            adminlte_excluded.append(path)
        if not has_i18n and not has_adminlte:
            target_files.append(path)
    excluded_count = len(all_files) - len(target_files)
    print(f"Found {len(all_files)} total files (.ep and .pm)")
    if excluded_count > 0:
        print(f"Excluded {len(i18n_excluded)} files containing 'I18N' in pathname")
        print(f"Excluded {len(adminlte_excluded)} files containing 'AdminLTE' in pathname")
        print(f"Total excluded: {excluded_count} files")
    print(f"Processing {len(target_files)} target files")
    return target_files
def count_lex_references(lex_ids, target_files):
    """
    Count references to lex-ids in target files and track file counts.

    A reference is any occurrence of the lex-id wrapped in a single or
    double quote on each side.

    Args:
        lex_ids (set): Set of lex-ids to search for.
        target_files (list): List of file paths to search in.

    Returns:
        dict: {lex_id: {'total_refs': int, 'file_count': int, 'files': set}}
    """
    # Structure: {lex_id: {'total_refs': count, 'file_count': count, 'files': set()}}
    reference_data = defaultdict(lambda: {'total_refs': 0, 'file_count': 0, 'files': set()})
    # Compile one pattern per lex-id up front. The original rebuilt the
    # pattern string for every (file, lex-id) pair inside the loop;
    # hoisting this loop-invariant work makes the scan O(files * ids)
    # findall calls with no per-iteration pattern construction.
    compiled = {
        lex_id: re.compile(f"['\"]{re.escape(lex_id)}['\"]")
        for lex_id in lex_ids
    }
    for file_path in target_files:
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
            for lex_id, pattern in compiled.items():
                matches = pattern.findall(content)
                if matches:
                    # Accumulate total hits and remember which file had them.
                    reference_data[lex_id]['total_refs'] += len(matches)
                    reference_data[lex_id]['files'].add(file_path)
        except Exception as e:
            # Best-effort scan: report unreadable files but keep going.
            print(f"Warning: Could not read file {file_path}: {e}")
            continue
    # Derive per-id file counts from the collected path sets.
    for data in reference_data.values():
        data['file_count'] = len(data['files'])
    return reference_data
def print_results_table(reference_data):
    """
    Print the results in a table format, sorted by Weighted Score
    (Total Refs × File Count) highest to lowest.

    Args:
        reference_data (dict): Dictionary with lex-id as key and a data dict
            ({'total_refs', 'file_count', 'files'}) as value.

    Returns:
        None. All output goes to stdout.
    """
    if not reference_data:
        print("No references found.")
        return
    # Calculate weighted score for each lex_id and sort by it (descending),
    # then by lex-id (ascending) for ties.
    def get_weighted_score(item):
        lex_id, data = item
        return data['total_refs'] * data['file_count']
    # Negating the score yields descending order while the lex-id
    # tiebreaker stays ascending.
    sorted_items = sorted(reference_data.items(), key=lambda x: (-get_weighted_score(x), x[0]))
    # Calculate column widths from the widest value in each column.
    max_lex_id_width = max(len(lex_id) for lex_id in reference_data.keys()) if reference_data else 0
    max_total_refs_width = max(len(str(data['total_refs'])) for data in reference_data.values()) if reference_data else 0
    max_file_count_width = max(len(str(data['file_count'])) for data in reference_data.values()) if reference_data else 0
    # Pre-render refs-per-file and weighted-score strings so their display
    # widths can be measured before printing.
    refs_per_file_values = []
    weighted_score_values = []
    for data in reference_data.values():
        if data['file_count'] > 0:
            refs_per_file = data['total_refs'] / data['file_count']
            refs_per_file_values.append(f"{refs_per_file:.1f}")
        else:
            # Guard against division by zero for ids never found in any file.
            refs_per_file_values.append("0.0")
        weighted_score = data['total_refs'] * data['file_count']
        weighted_score_values.append(str(weighted_score))
    max_refs_per_file_width = max(len(val) for val in refs_per_file_values) if refs_per_file_values else 0
    max_weighted_score_width = max(len(val) for val in weighted_score_values) if weighted_score_values else 0
    # Ensure minimum widths for headers
    lex_id_width = max(max_lex_id_width, len("Lex ID"))
    total_refs_width = max(max_total_refs_width, len("Total Refs"))
    file_count_width = max(max_file_count_width, len("Files"))
    refs_per_file_width = max(max_refs_per_file_width, len("Refs/File"))
    weighted_score_width = max(max_weighted_score_width, len("Weighted Score"))
    # Calculate total table width
    table_width = lex_id_width + total_refs_width + file_count_width + refs_per_file_width + weighted_score_width + 16 # 16 for separators
    # Print header
    print("\nReference Count Results (sorted by Weighted Score, excluding I18N and AdminLTE files):")
    print("=" * table_width)
    print(f"{'Lex ID':<{lex_id_width}} | {'Total Refs':>{total_refs_width}} | {'Files':>{file_count_width}} | {'Refs/File':>{refs_per_file_width}} | {'Weighted Score':>{weighted_score_width}}")
    print("-" * table_width)
    # Print results while accumulating grand totals for the footer row.
    total_references = 0
    total_files_with_refs = set()  # union of every file that matched any lex-id
    total_weighted_score = 0
    for lex_id, data in sorted_items:
        refs_per_file = data['total_refs'] / data['file_count'] if data['file_count'] > 0 else 0.0
        weighted_score = data['total_refs'] * data['file_count']
        print(f"{lex_id:<{lex_id_width}} | {data['total_refs']:>{total_refs_width}} | {data['file_count']:>{file_count_width}} | {refs_per_file:>{refs_per_file_width}.1f} | {weighted_score:>{weighted_score_width}}")
        total_references += data['total_refs']
        total_files_with_refs.update(data['files'])
        total_weighted_score += weighted_score
    # Calculate overall refs per file (unique files, so no double counting).
    overall_refs_per_file = total_references / len(total_files_with_refs) if total_files_with_refs else 0.0
    print("-" * table_width)
    print(f"{'Total':<{lex_id_width}} | {total_references:>{total_refs_width}} | {len(total_files_with_refs):>{file_count_width}} | {overall_refs_per_file:>{refs_per_file_width}.1f} | {total_weighted_score:>{weighted_score_width}}")
    # Print summary
    print(f"\nSummary:")
    print(f"- Total lex-id references found: {total_references}")
    print(f"- Total unique files with references: {len(total_files_with_refs)}")
    print(f"- Total lex-ids with at least one reference: {len([data for data in reference_data.values() if data['total_refs'] > 0])}")
    print(f"- Average references per file: {overall_refs_per_file:.1f}")
    print(f"- Total weighted score: {total_weighted_score}")
    print(f"- Results sorted by Weighted Score (Total Refs × File Count, highest to lowest)")
    print(f"- Files with 'I18N' or 'AdminLTE' in pathname were excluded from search")
def main():
    """Drive the pipeline: parse the lex file, find files, count, report."""
    if len(sys.argv) != 3:
        # Wrong argument count: show usage and bail out.
        usage_lines = (
            "Usage: python3 count-references.py <lex_file> <search_directory>",
            "\nExample:",
            " python3 count-references.py lexicon.lex /path/to/search",
            "\nNote: Files with 'I18N' or 'AdminLTE' in their pathname will be excluded from the search.",
            "Results are sorted by Weighted Score (Total Refs × File Count, highest to lowest).",
        )
        for line in usage_lines:
            print(line)
        sys.exit(1)
    lex_file_path, search_directory = sys.argv[1], sys.argv[2]
    print(f"Parsing lex file: {lex_file_path}")
    print(f"Searching directory: {search_directory}")
    print()
    # Step 1: Extract the lex-ids from the lexicon file.
    lex_ids = parse_lex_file(lex_file_path)
    if not lex_ids:
        print("No lex-ids found in the lex file.")
        sys.exit(1)
    # Step 2: Locate candidate .ep/.pm files (I18N / AdminLTE excluded).
    target_files = find_target_files(search_directory)
    if not target_files:
        print("No .ep or .pm files found in the search directory (after exclusions).")
        sys.exit(1)
    # Steps 3 and 4: Tally references, then print the sorted table.
    print("Counting references...")
    reference_data = count_lex_references(lex_ids, target_files)
    print_results_table(reference_data)
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()