From 6a1c6bb721092030458118a7fbb163a8bf9a2e7e Mon Sep 17 00:00:00 2001 From: Brian Read Date: Sat, 9 Aug 2025 15:52:03 +0100 Subject: [PATCH] GPT-5 Attempt Sat 9thAug2025 --- GPT5Spec_for_mojo_formatter.txt | 172 ++++++ mojofmt.py | 984 ++++++++++++++++++++++++++++++++ 2 files changed, 1156 insertions(+) create mode 100644 GPT5Spec_for_mojo_formatter.txt create mode 100755 mojofmt.py diff --git a/GPT5Spec_for_mojo_formatter.txt b/GPT5Spec_for_mojo_formatter.txt new file mode 100644 index 0000000..71d01da --- /dev/null +++ b/GPT5Spec_for_mojo_formatter.txt @@ -0,0 +1,172 @@ +Here’s the updated, implementation-ready spec reflecting all the additions we’ve made (self-test, output to a new file, backup-on-write, and logging). + +1) Purpose and scope +- Goal: Format Mojolicious templates that mix HTML and Embedded Perl. +- Behavior: Preserve whitespace semantics (especially chomp markers), normalize indentation, and format embedded Perl via perltidy. +- Deliverables: CLI tool and library API; idempotent formatting. + +2) Language and implementation choice +- Language: Python 3.10+. +- Dependencies: + - perltidy (Perl::Tidy) on PATH (recommended; required for Perl formatting; formatter still runs without it but doesn’t reformat Perl). +- Implementation approach: Custom line-oriented lexer/formatter; no HTML rewriter. + +3) Supported template syntax (Phase 1) +- Mojolicious tags: <% ... %>, <%= ... %>, <%== ... %>, <%# ... %>, with optional chomp markers <%- and -%>. +- Line directives: % ..., %= ..., %== ..., %# ... +- Block constructs: Perl braces { } and helper begin/end. +- HTML: all tags, comments, void elements; raw elements (pre, script, style, textarea) treated as opaque. + +4) Non-goals (Phase 1) +- No attribute reflow/wrapping. +- No text node reflow. +- No JS/CSS formatting (script/style inner content unchanged). +- No change to chomp semantics. + +5) Formatting rules +5.1 General whitespace +- Spaces-only indentation; default width 2. 
+- Trim trailing whitespace on each line. +- Ensure single terminal newline. +- EOL handling: configurable lf|crlf|preserve (default lf). + +5.2 HTML indentation and line breaking +- Indent by HTML nesting; end tags dedent before emitting the line. +- Void elements do not change indent depth. +- Raw elements (pre, script, style, textarea): do not modify inner lines; only indent opening/closing lines. + +5.3 Mojolicious delimiters and spacing +- Preserve chomp markers exactly (<%- and -%>). +- Default delimiter spacing normalization (configurable): + - One space after <% (and optional kind), and one space before %> unless adjacent to a chomp hyphen. + - Template comments <%# ... %> are not perltidy-formatted; inner spacing left as-is except optional edge trim per normalization setting. + +5.4 Indentation for code blocks +- Perl-depth changes are driven by: + - Line directives with braces and % end. + - Standalone statement tags <% ... %> containing braces. + - begin/end helper blocks: lines with begin increase depth until end. +- Total indent per line = HTML depth + Perl depth. +- Dedents from closing items apply before the current line is emitted. + +5.5 Embedded Perl formatting (perltidy) +- Statement content: <% ... %> and % ... are sent to perltidy and collapsed to a single line on return. +- Expression content: <%= ... %>, <%== ... %>, %= ..., %== ... are wrapped as do { ... } for perltidy and then unwrapped; output collapsed to single line; no trailing semicolons added. +- Default perltidy options (overridable): -i=2 -ci=2 -l=100 -q -se -nbbc -noll. +- If perltidy is unavailable or returns non-zero, leave the Perl content unmodified and log an error; formatting continues. + +6) Algorithm overview +- Tokenize line-by-line, tracking: + - HTML start/end/self-closing tags for depth. + - Mojolicious line directives and tags for Perl depth and begin/end handling. +- Substitute and optionally reformat template tags inline, preserving chomp markers. 
+- Rebuild each line with computed indentation; trim trailing spaces; normalize EOL at the end. + +7) CLI specification +- Binary name: mojofmt +- Usage: mojofmt [options] [paths...] +- Options: + - -w, --write: Overwrite files in place. Before overwriting, write a backup file named FILE.bak (the original file name with .bak appended) alongside the original (overwrites any existing .bak). + - -o, --out PATH: Write formatted output to this file. Constraints: + - Requires exactly one input file or --stdin. + - Conflicts with --write, --check, and --diff (mutually exclusive). + - --check: Exit with status 1 if any file would change; do not write. + - --diff: Print unified diff of proposed changes; do not write. + - --stdin: Read from stdin (no file paths required). + - --stdout: Write to stdout (only meaningful with --stdin; default when no --out). + - --perltidy PATH: Path to perltidy executable. + - --indent N: Indent width in spaces (default 2). + - --eol lf|crlf|preserve: EOL handling (default lf). + - --no-space-in-delims: Disable delimiter spacing normalization inside <% %>. + - --self-test: Run internal sanity checks (see section 13) and exit with 0/1. + - --log-level error|info|debug: Set logging level (default error). + - --verbose: Shorthand for --log-level info. + - --version, --help. +- File selection: + - Accept files and directories; directories are traversed recursively for extensions .ep, .htm.ep, .html.ep. +- Exit codes: + - 0: Success and no changes needed (or wrote changes). + - 1: --check found changes OR error occurred OR self-test failed. + +8) Configuration +- CLI-driven in Phase 1. Config file support may be added later. +- Config keys (if/when config file is added) remain as previously defined (indent_width, eol, normalize_delimiter_spacing, perltidy_path, perltidy_options, extensions, respect_gitignore). Logging level is CLI-only for now. 
+ +9) Library API (Python) +- format_string(src: str, config: Config) -> str +- format_file(path: Path, config: Config) -> str (if implemented) +- check_string(src: str, config: Config) -> bool (if implemented) +- Exceptions: + - ParseError for unrecoverable malformed constructs. + - PerltidyError for subprocess failures (currently errors are logged and Perl content is passed through unchanged; raising may be added later behind a flag). + +10) Logging +- Uses Python logging; logger name “mojofmt”. +- Default level: error. Levels: + - error: problems (perltidy missing, file processing error). + - info: high-level progress (found/unchanged/formatted files, backups and writes). + - debug: detailed operations (perltidy command/options, file discovery, other diagnostics). +- Output format: “mojofmt: LEVEL: message” to stderr. + +11) Error handling and diagnostics +- perltidy not found: + - Log an error once; formatter continues without Perl reformatting. + - In self-test, absence or failure of perltidy causes self-test to fail (exit 1). +- Regex/parser issues: + - If a line cannot be processed due to malformed mixed tags, log an error with filename and line; leave file unmodified in --write mode. +- I/O errors: + - Log an error with context (path); continue to next file; exit 1 overall if any errors occurred. + +12) Performance targets +- Linear time with respect to file size; thousands of lines acceptable. perltidy calls dominate runtime. + +13) Self-test mode +- Invoked with --self-test. +- Tests: + - perltidy probe: call perltidy on a tiny snippet and verify non-zero-length formatted output different from input (or matching expected spacing); failure if perltidy missing or returns non-zero. + - Idempotence: formatting a known mixed template twice yields the same result. + - Chomp markers: preserved exactly (e.g., -%> remains). + - Raw elements: inner lines of raw elements (pre, script, style, textarea) are left unchanged. + - Delimiter spacing normalization: <%my $x=1;%> becomes <% my $x = 1; %> under default settings. 
+- Exit code: 0 on pass, 1 on any failure. +- Logs: info shows probe status and “Self-test passed”; error lists failures. + +14) Test plan (expanded) +- Golden tests for the cases above plus: + - --out: single file and stdin cases; conflicts with --write/--check/--diff enforced. + - -w backups: verify .bak is created and overwritten on subsequent runs. + - Logging: run with --log-level debug to ensure expected messages appear. + - Error flows: perltidy missing; malformed tag line; unreadable file. + +15) Examples +- Format in-place (create backups): + mojofmt -w templates/ +- Check without writing (CI): + mojofmt --check templates/ +- Show diffs: + mojofmt --diff templates/ +- Format one file to a new file: + mojofmt -o out.htm.ep in.htm.ep +- Stdin to a file: + cat in.htm.ep | mojofmt --stdin -o out.htm.ep +- Self-test with logs: + mojofmt --self-test --log-level info +- Debug run: + mojofmt --log-level debug --check templates/ + +16) Milestones +- M1–M3: Core Phase 1 (lexer/indent, perltidy integration, begin/end handling, raw elements). +- M4: Hardening (idempotence, tests, EOL handling, CLI polish). +- M5: Packaging and performance tuning. +- Added in this revision: + - Logging subsystem with levels and verbose shorthand. + - --self-test mode including perltidy probe. + - --out output file support with conflict rules. + - -w backup-on-write behavior. + +17) Limitations (unchanged in spirit) +- Heuristic HTML indentation may be suboptimal on malformed HTML but is stable. +- No JS/CSS formatting; no attribute reflow. +- Perl formatting depends on perltidy availability; otherwise Perl inside tags is passed through unchanged. + +If you want any tweaks (e.g., backup filename pattern, adding a --no-backup flag, or allowing a configurable backup extension), I can amend the spec accordingly. 
\ No newline at end of file diff --git a/mojofmt.py b/mojofmt.py new file mode 100755 index 0000000..6010b86 --- /dev/null +++ b/mojofmt.py @@ -0,0 +1,984 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +mojofmt: Formatter for Mojolicious Embedded Perl templates (.ep, .htm.ep, .html.ep) + +Features: +- Indent HTML structure and Mojolicious line directives consistently +- Preserve chomp markers (<%- ... -%>) and do not alter newline semantics +- Handle helper begin/end blocks and Perl brace-based indentation for directives +- Treat pre/script/style/textarea content as opaque (unchanged) +- Optionally normalize spacing inside <% %> delimiters and after % directives +- Integrate with perltidy for Perl code formatting (if available on PATH) +- Reformat extended multi-line Perl blocks between lines with only <% and %> +- CLI with --write/--check/--diff, --out, --stdin/--stdout modes +- --self-test for sanity checks (includes perltidy probe) +- Logging: --log-level error|info|debug (and --verbose as shorthand for info) +- Optional --perl-keyword-spacing to aggressively insert spaces after Perl keywords +""" + +from __future__ import annotations + +import argparse +import difflib +import logging +import os +import re +import shutil +import subprocess +import sys +from dataclasses import dataclass, replace as dc_replace +from pathlib import Path +from typing import Iterable, List, Optional, Tuple + +VERSION = "0.1.9" + +DEFAULT_EXTENSIONS = (".ep", ".htm.ep", ".html.ep") +VOID_ELEMENTS = { + "area", "base", "br", "col", "embed", "hr", "img", "input", + "link", "meta", "param", "source", "track", "wbr", +} +RAW_ELEMENTS = {"pre", "script", "style", "textarea"} + +logger = logging.getLogger("mojofmt") + +TAG_RE = re.compile( + r""" + < + (?P/)? + (?P[A-Za-z][\w:-]*) + (?P(?:\s+[^<>]*?)?) + (?P/)? + > + """, + re.VERBOSE, +) + +# Mojolicious inline tags on a single line: <%...%> +TPL_TAG_RE = re.compile( + r""" + <% + (?P-)? # optional left chomp + (?P==|=|\#)? 
# kind: ==, =, or # + (?P.*?) # inner code/comment (non-greedy, no newlines) + (?P-)? # optional right chomp + %> + """, + re.VERBOSE, +) + +# Line directives: starts with % (possibly %= %== %#) after indentation +LINE_DIR_RE = re.compile(r"^(?P\s*)%(?P==|=|\#)?(?P.*)$") + +# Whitespace condensing for single-line normalization +WS_RE = re.compile(r"[ \t]+") + +# begin/end detection (heuristic) +BEGIN_RE = re.compile(r"\bbegin\b") +END_LINE_RE = re.compile(r"^\s*%\s*end\b") +END_TAG_ONLY_RE = re.compile(r"^\s*<%-?\s*end\s*-?%>\s*$") + +# leading } in a directive (e.g., % } or % }} ) +LEADING_RBRACE_COUNT_RE = re.compile(r"^\s*%\s*(?P\}+)") + +# <% } %> alone +TAG_CLOSING_BRACE_ONLY_RE = re.compile(r"^\s*<%-?\s*\}+\s*-?%>\s*$") + +# Detect raw element opening/closing (as standalone lines) +RAW_OPEN_RE = re.compile(r"^\s*<(?Ppre|script|style|textarea)\b[^>]*>\s*$", re.I) +RAW_CLOSE_RE = re.compile(r"^\s*pre|script|style|textarea)\s*>\s*$", re.I) + +# Extended EP block delimiters (opening/closing on their own lines) +OPEN_BLOCK_RE = re.compile(r'^(?P[ \t]*)<%(?P-?)(?![=#])\s*$') +CLOSE_BLOCK_RE = re.compile(r'^(?P[ \t]*)(?P-?)%>\s*$') + + +@dataclass +class Config: + indent_width: int = 2 + eol: str = "lf" # lf|crlf|preserve + normalize_delimiter_spacing: bool = True + perltidy_path: Optional[str] = None # if None, use PATH + perltidy_options: Optional[List[str]] = None + extensions: Tuple[str, ...] 
= DEFAULT_EXTENSIONS + respect_gitignore: bool = True + verbose: bool = False # kept for shorthand with --verbose + perl_keyword_spacing: bool = False # optional post-pass + + +def load_config(cli_args: argparse.Namespace) -> Config: + cfg = Config() + if cli_args.indent is not None: + cfg.indent_width = cli_args.indent + if cli_args.eol is not None: + cfg.eol = cli_args.eol + if cli_args.no_space_in_delims: + cfg.normalize_delimiter_spacing = False + if cli_args.perltidy: + cfg.perltidy_path = cli_args.perltidy + cfg.verbose = cli_args.verbose + cfg.perl_keyword_spacing = getattr(cli_args, "perl_keyword_spacing", False) + return cfg + + +def setup_logging(level_name: Optional[str], verbose_flag: bool) -> None: + if level_name: + name = level_name.lower() + elif verbose_flag: + name = "info" + else: + name = "error" + level = { + "error": logging.ERROR, + "warning": logging.WARNING, + "info": logging.INFO, + "debug": logging.DEBUG, + "critical": logging.CRITICAL, + }.get(name, logging.ERROR) + + fmt = "mojofmt: %(levelname)s: %(message)s" + logging.basicConfig(level=level, format=fmt) + + +def detect_eol(text: str) -> str: + if "\r\n" in text: + return "crlf" + return "lf" + + +def normalize_eol(text: str, eol: str) -> str: + if eol == "preserve": + return text + s = text.replace("\r\n", "\n").replace("\r", "\n") + if eol == "lf": + return s + elif eol == "crlf": + return s.replace("\n", "\r\n") + else: + return s + + +_PERLTIDY_WARNED = False # avoid spamming logs if perltidy missing repeatedly + + +def run_perltidy(code: str, cfg: Config) -> Tuple[int, str, str]: + global _PERLTIDY_WARNED + exe = cfg.perltidy_path or shutil.which("perltidy") + if not exe: + if not _PERLTIDY_WARNED: + logger.error("perltidy not found; Perl inside template will not be reformatted") + _PERLTIDY_WARNED = True + return (127, code, "perltidy not found") + + args: List[str] = [exe] + if cfg.perltidy_options: + args += cfg.perltidy_options + if not any(opt.startswith("-st") for opt in 
cfg.perltidy_options): + args.append("-st") + else: + args += [ + f"-i={cfg.indent_width}", + f"-ci={cfg.indent_width}", + "-l=100", + "-q", + "-se", + "-st", + "-nbbc", + "-noll", + ] + logger.debug("Running perltidy: %s", " ".join(args)) + try: + proc = subprocess.run( + args, + input=code, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=False, + ) + if proc.returncode != 0: + logger.debug("perltidy non-zero exit %s: %s", proc.returncode, (proc.stderr or "").strip()) + return (proc.returncode, proc.stdout, proc.stderr) + except FileNotFoundError: + if not _PERLTIDY_WARNED: + logger.error("perltidy not found while executing") + _PERLTIDY_WARNED = True + return (127, code, "perltidy not found") + + +def perltidy_probe(cfg: Config) -> Tuple[bool, str]: + exe = cfg.perltidy_path or shutil.which("perltidy") + if not exe: + return (False, "perltidy not found on PATH (install Perl::Tidy or pass --perltidy)") + snippet = "my $x= {a=>1,b =>2 };" + rc, out, err = run_perltidy(snippet, cfg) + if rc != 0: + return (False, f"perltidy exit {rc}: {(err or '').strip()}") + want = ["my $x = {", "a => 1", "b => 2"] + if all(w in out for w in want): + return (True, f"perltidy OK: {exe}") + if out and out.strip() and out.strip() != snippet: + return (True, f"perltidy OK (non-default style): {exe}") + return (False, "perltidy produced unexpected output") + + +def tidy_perl_statement_oneline(code: str, cfg: Config) -> str: + rc, out, _ = run_perltidy(code, cfg) + if rc != 0: + out = code + out = out.strip() + out = " ".join(out.splitlines()) + out = WS_RE.sub(" ", out).strip() + out = enforce_perl_keyword_spacing(out, cfg.perl_keyword_spacing) + return out + + +def tidy_perl_expression(code: str, cfg: Config) -> str: + wrapped = f"do {{ {code} }}" + rc, out, _ = run_perltidy(wrapped, cfg) + if rc != 0: + inner = code.strip() + return enforce_perl_keyword_spacing(inner, cfg.perl_keyword_spacing) + text = out + try: + start = text.index("{") + depth = 0 + 
end_idx = None + for i in range(start, len(text)): + ch = text[i] + if ch == "{": + depth += 1 + elif ch == "}": + depth -= 1 + if depth == 0: + end_idx = i + break + if end_idx is None: + inner = code.strip() + else: + inner = text[start + 1 : end_idx] + except ValueError: + inner = code.strip() + inner = " ".join(line.strip() for line in inner.splitlines()) + inner = WS_RE.sub(" ", inner).strip() + inner = enforce_perl_keyword_spacing(inner, cfg.perl_keyword_spacing) + return inner + + +def tidy_perl_block_multiline(code: str, cfg: Config) -> Optional[str]: + """ + Format a multi-line chunk of Perl by wrapping it in a do { ... } block for perltidy. + Returns the formatted inner text (without the wrapper) or None on failure. + """ + wrapped = "do {\n" + code + "\n}" + rc, out, _ = run_perltidy(wrapped, cfg) + if rc != 0 or not out: + return None + try: + start = out.index("{") + except ValueError: + return None + depth = 0 + end_idx = None + for i in range(start, len(out)): + ch = out[i] + if ch == "{": + depth += 1 + elif ch == "}": + depth -= 1 + if depth == 0: + end_idx = i + break + if end_idx is None: + return None + inner = out[start + 1 : end_idx] + if inner.startswith("\n"): + inner = inner[1:] + if inner.endswith("\n"): + inner = inner[:-1] + return inner + + +def _split_code_and_strings(s: str): + chunks = [] + buf: List[str] = [] + in_single = in_double = False + i = 0 + while i < len(s): + ch = s[i] + if not in_single and not in_double: + if ch == "'": + if buf: + chunks.append(("code", "".join(buf))) + buf = [] + in_single = True + buf.append(ch) + elif ch == '"': + if buf: + chunks.append(("code", "".join(buf))) + buf = [] + in_double = True + buf.append(ch) + else: + buf.append(ch) + elif in_single: + buf.append(ch) + if ch == "\\": + if i + 1 < len(s): + buf.append(s[i + 1]); i += 1 + elif ch == "'": + chunks.append(("str", "".join(buf))); buf = []; in_single = False + elif in_double: + buf.append(ch) + if ch == "\\": + if i + 1 < len(s): + 
buf.append(s[i + 1]); i += 1 + elif ch == '"': + chunks.append(("str", "".join(buf))); buf = []; in_double = False + i += 1 + if buf: + chunks.append(("code" if not (in_single or in_double) else "str", "".join(buf))) + return chunks + + +def _split_unquoted_comment(code_chunk: str): + idx = code_chunk.find("#") + if idx == -1: + return code_chunk, None + return code_chunk[:idx], code_chunk[idx:] + + +def enforce_perl_keyword_spacing(s: str, enable: bool) -> str: + if not enable or not s: + return s + # Add space after control keywords before '(' + ctrl_paren = re.compile(r"\b(?Pif|elsif|unless|while|until|for|foreach|given|when)\s*\(") + # Add space after declarators before sigils/paren + decl = re.compile(r"\b(?Pmy|our|state|local)\s*(?=[\$\@\%\*\&\\\(])") + # sub name spacing and brace spacing + sub_named = re.compile(r"\bsub\s*([A-Za-z_]\w*)") + sub_named_brace = re.compile(r"\bsub\s+([A-Za-z_]\w*)\s*\{") + sub_anon = re.compile(r"\bsub\s*\{") + # Calls which often appear without space + call_paren = re.compile(r"\b(?Preturn|print|say|die|warn|exit)\s*\(") + call_space = re.compile(r"\b(?Preturn|print|say|die|warn|exit)\s*(?=\S)") + # else/continue/do/eval blocks + else_brace = re.compile(r"\b(?Pelse|continue|do|eval)\s*\{") + # Ensure space before a brace after a closing paren: "){" -> ") {" + brace_after_paren = re.compile(r"\)\s*\{") + # Ensure space between '}' and a following keyword: "}else" -> "} else" + brace_then_kw = re.compile(r"\}\s*(?=\b(?:else|elsif|continue|when)\b)") + + out: List[str] = [] + for kind, chunk in _split_code_and_strings(s): + if kind != "code": + out.append(chunk) + continue + code, comment = _split_unquoted_comment(chunk) + code = ctrl_paren.sub(lambda m: f"{m.group('kw')} (", code) + code = decl.sub(lambda m: f"{m.group('kw')} ", code) + code = sub_named.sub(lambda m: f"sub {m.group(1)}", code) + code = sub_named_brace.sub(lambda m: f"sub {m.group(1)} {{", code) + code = sub_anon.sub("sub {", code) + code = call_paren.sub(lambda 
m: f"{m.group('kw')} (", code) + code = call_space.sub(lambda m: f"{m.group('kw')} ", code) + code = brace_then_kw.sub("} ", code) + code = else_brace.sub(lambda m: f"{m.group('kw')} {{", code) + code = brace_after_paren.sub(") {", code) + out.append(code + (comment or "")) + return "".join(out) + + +def _common_leading_ws(lines: List[str]) -> str: + ws = None + for ln in lines: + if not ln.strip(): + continue + lead = len(ln) - len(ln.lstrip(' \t')) + s = ln[:lead] + if ws is None: + ws = s + else: + i = 0 + while i < len(ws) and i < len(s) and ws[i] == s[i]: + i += 1 + ws = ws[:i] + return ws or "" + + +def _dedent_block(text: str) -> str: + lines = text.splitlines() + # Trim leading/trailing all-whitespace lines + while lines and not lines[0].strip(): + lines.pop(0) + while lines and not lines[-1].strip(): + lines.pop() + if not lines: + return "" + prefix = _common_leading_ws(lines) + if not prefix: + return "\n".join(lines) + plen = len(prefix) + out = [] + for ln in lines: + out.append(ln[plen:] if ln.startswith(prefix) else ln) + return "\n".join(out) + + +def _naive_perl_indent(code: str, width: int = 2) -> str: + lines = code.splitlines() + indent = 0 + out = [] + for raw in lines: + ln = raw.rstrip() + if not ln: + out.append("") + continue + stripped = ln.lstrip() + # dedent on leading closing braces + leading_closes = 0 + i = 0 + while i < len(stripped) and stripped[i] == '}': + leading_closes += 1 + i += 1 + indent_before = max(0, indent - leading_closes) + out.append((" " * (indent_before * width)) + stripped) + opens = ln.count("{") + closes = ln.count("}") + indent += (opens - closes) + if indent < 0: + indent = 0 + return "\n".join(out) + + +def normalize_tpl_tag( + leftchomp: Optional[str], + kind: Optional[str], + body: str, + rightchomp: Optional[str], + cfg: Config, +) -> Tuple[str, str, str, str, str]: + if not cfg.normalize_delimiter_spacing or (kind == "#"): + return ("<%", leftchomp or "", kind or "", body, (rightchomp or "") + "%>") + body 
= body.strip() + left_space = " " + right_space = " " if rightchomp == "" else "" + open_part = "<%" + (leftchomp or "") + (kind or "") + left_space + close_part = right_space + (rightchomp or "") + "%>" + return (open_part, "", "", body, close_part) + + +def substitute_tpl_tags_in_line(line: str, cfg: Config) -> str: + parts: List[str] = [] + last = 0 + for m in TPL_TAG_RE.finditer(line): + parts.append(line[last : m.start()]) + leftchomp = m.group("leftchomp") or "" + kind = m.group("kind") or "" + body = m.group("body") + rightchomp = m.group("rightchomp") or "" + open_part, _, _, new_body, close_part = normalize_tpl_tag( + leftchomp, kind, body, rightchomp, cfg + ) + if kind == "#": + inner = body + else: + if kind in ("=", "=="): + inner = tidy_perl_expression(body, cfg) + else: + inner = tidy_perl_statement_oneline(body, cfg) + parts.append(open_part + inner + close_part) + last = m.end() + parts.append(line[last:]) + return "".join(parts) + + +def derive_html_tag_deltas(line_wo_tpl: str) -> Tuple[int, int, Optional[str], Optional[str]]: + """ + Return (pre_dedent, net_total, raw_open, raw_close): + - pre_dedent: end tags at beginning of line (dedent before printing) + - net_total: total start tags (+1) minus end tags (-1) across the line for non-void, non-self-closing tags + - raw_open, raw_close: raw elements opened/closed on this line if they match exactly + """ + s = line_wo_tpl + + raw_open = None + raw_close = None + m_open = RAW_OPEN_RE.match(s) + if m_open: + raw_open = m_open.group("name").lower() + m_close = RAW_CLOSE_RE.match(s) + if m_close: + raw_close = m_close.group("name").lower() + + pre_dedent = 0 + i = 0 + while i < len(s) and s[i].isspace(): + i += 1 + while True: + m = TAG_RE.match(s, i) + if not m: + break + if m.group("slash"): + pre_dedent += 1 + i = m.end() + while i < len(s) and s[i].isspace(): + i += 1 + continue + else: + break + + net = 0 + for m in TAG_RE.finditer(s): + slash = m.group("slash") + name = (m.group("name") or 
"").lower() + selfclose = bool(m.group("self")) + if slash: + net -= 1 + else: + if selfclose or name in VOID_ELEMENTS: + pass + else: + net += 1 + + return pre_dedent, net, raw_open, raw_close + + +def strip_tpl_tags(line: str) -> str: + return TPL_TAG_RE.sub(lambda m: " " * (m.end() - m.start()), line) + + +def is_standalone_statement_tag(line: str) -> bool: + s = line.strip() + if not (s.startswith("<%") and s.endswith("%>")): + return False + if s.startswith("<%=") or s.startswith("<%=="): + return False + return True + + +def compute_perl_deltas(line: str) -> Tuple[int, int]: + """ + Return (perl_dedent_before, perl_delta_after_for_next_line). + Only line directives (starting with %) and standalone <% ... %> statement lines + affect Perl depth. Also account for % end / <% end %> and begin blocks. + """ + dedent_before = 0 + delta_after = 0 + + if END_LINE_RE.match(line) or END_TAG_ONLY_RE.match(line): + dedent_before += 1 + + m = LEADING_RBRACE_COUNT_RE.match(line) + if m: + braces = m.group("braces") or "" + dedent_before += len(braces) + + if TAG_CLOSING_BRACE_ONLY_RE.match(line): + dedent_before += 1 + + is_dir = bool(LINE_DIR_RE.match(line)) + is_stmt_tag_only = is_standalone_statement_tag(line) + + if is_dir: + body = LINE_DIR_RE.match(line).group("body") + open_count = body.count("{") + close_count = body.count("}") + delta_after += (open_count - close_count) + if BEGIN_RE.search(line): + delta_after += 1 + elif is_stmt_tag_only: + bodies = [m.group("body") or "" for m in TPL_TAG_RE.finditer(line)] + open_count = sum(b.count("{") for b in bodies) + close_count = sum(b.count("}") for b in bodies) + delta_after += (open_count - close_count) + if BEGIN_RE.search(line): + delta_after += 1 + + return dedent_before, delta_after + + +def format_line_directive(line: str, cfg: Config) -> Optional[str]: + """ + If the line is a Mojolicious line directive (% ...), return a formatted + directive string WITHOUT leading indentation (indent applied separately). 
+ Otherwise return None. + """ + m = LINE_DIR_RE.match(line) + if not m: + return None + kind = m.group("kind") or "" + body = m.group("body") + + if kind == "#": + if cfg.normalize_delimiter_spacing: + trimmed = body.strip() + return "%#" + ((" " + trimmed) if trimmed else "") + else: + return "%#" + body + + if kind in ("=", "=="): + inner = tidy_perl_expression(body, cfg) + else: + inner = tidy_perl_statement_oneline(body, cfg) + + if cfg.normalize_delimiter_spacing: + return "%" + kind + ((" " + inner) if inner else "") + else: + return "%" + kind + ((" " + inner) if inner else "") + + +def rstrip_trailing_ws(line: str) -> str: + return line.rstrip(" \t") + + +def format_extended_perl_blocks(text: str, cfg: Config) -> str: + """ + Detect blocks where <% and %> are on their own lines (with optional chomp markers), + format the inner Perl with perltidy (wrapped in do { ... }) or a naive indenter, + and reinsert with the original base indentation. + """ + lines = text.splitlines() + i = 0 + out: List[str] = [] + n = len(lines) + + while i < n: + m_open = OPEN_BLOCK_RE.match(lines[i]) + if not m_open: + out.append(lines[i]) + i += 1 + continue + + # Find closing delimiter + j = i + 1 + close = None + while j < n: + m_close = CLOSE_BLOCK_RE.match(lines[j]) + if m_close: + close = m_close + break + j += 1 + + if close is None: + out.append(lines[i]) + i += 1 + continue + + base = m_open.group("base") or "" + left = m_open.group("left") or "" + right = close.group("right") or "" + + body_lines = lines[i + 1 : j] + inner = "\n".join(body_lines) + + # Dedent before formatting + inner = _dedent_block(inner) + + # Try perltidy; fallback to naive indentation + tidied = tidy_perl_block_multiline(inner, cfg) + if tidied is None: + logger.debug("EP block %d-%d: perltidy failed/unavailable; using naive indenter", i + 1, j + 1) + tidied = _naive_perl_indent(inner, width=cfg.indent_width) + else: + logger.debug("EP block %d-%d: perltidy formatted (%d lines)", i + 1, j + 1, 
len(tidied.splitlines())) + + tidied = tidied.rstrip("\n") + out.append(f"{base}<%{left}") + if tidied: + for ln in tidied.splitlines(): + out.append((base + ln) if ln else base) + out.append(f"{base}{right}%>") + + i = j + 1 # continue after closing line + + return "\n".join(out) + ("\n" if text.endswith("\n") else "") + + +def format_string(src: str, cfg: Config) -> str: + original_eol = detect_eol(src) + text = src.replace("\r\n", "\n").replace("\r", "\n") + + lines = text.split("\n") + html_depth = 0 + perl_depth = 0 + in_raw: Optional[str] = None + + out_lines: List[str] = [] + + for orig_line in lines: + line = orig_line + + if in_raw: + m_close = RAW_CLOSE_RE.match(line) + if m_close and m_close.group("name").lower() == in_raw: + indent_level = max(0, html_depth - 1) + perl_depth + indent = " " * (cfg.indent_width * indent_level) + new_line = indent + line.lstrip() + out_lines.append(rstrip_trailing_ws(new_line)) + html_depth = max(0, html_depth - 1) + in_raw = None + else: + out_lines.append(line) + continue + + perl_dedent_before, perl_delta_after = compute_perl_deltas(line) + line_wo_tpl = strip_tpl_tags(line) + html_pre_dedent, html_net, raw_open, raw_close = derive_html_tag_deltas(line_wo_tpl) + + base_html_depth = max(0, html_depth - html_pre_dedent) + base_perl_depth = max(0, perl_depth - perl_dedent_before) + indent_level = max(0, base_html_depth + base_perl_depth) + indent = " " * (cfg.indent_width * indent_level) + + formatted_directive = format_line_directive(line, cfg) + if formatted_directive is not None: + content = formatted_directive + else: + content = substitute_tpl_tags_in_line(line, cfg).lstrip() + + new_line = indent + content.lstrip() + out_lines.append(rstrip_trailing_ws(new_line)) + + html_depth = max(0, base_html_depth + html_net + html_pre_dedent) + if raw_open and (raw_open.lower() in RAW_ELEMENTS): + in_raw = raw_open.lower() + perl_depth = max(0, base_perl_depth + perl_delta_after) + + result = "\n".join(out_lines) + + # 
Post-pass: format extended <% ... %> blocks + result = format_extended_perl_blocks(result, cfg) + + if not result.endswith("\n"): + result += "\n" + + eol_mode = cfg.eol if cfg.eol != "preserve" else original_eol + result = normalize_eol(result, eol_mode) + return result + + +def read_text(path: Path) -> str: + with path.open("rb") as f: + raw = f.read() + try: + return raw.decode("utf-8") + except UnicodeDecodeError: + return raw.decode(errors="replace") + + +def write_text(path: Path, text: str) -> None: + with path.open("wb") as f: + f.write(text.encode("utf-8")) + + +def is_supported_file(path: Path, exts: Tuple[str, ...]) -> bool: + name = path.name.lower() + return any(name.endswith(ext) for ext in exts) + + +def iter_files(paths: List[str], exts: Tuple[str, ...]) -> Iterable[Path]: + for p in paths: + pth = Path(p) + if pth.is_dir(): + for root, _, files in os.walk(pth): + for fn in files: + fp = Path(root) / fn + if is_supported_file(fp, exts): + logger.debug("Found file: %s", fp) + yield fp + else: + if is_supported_file(pth, exts): + logger.debug("Found file: %s", pth) + yield pth + + +def unified_diff(a: str, b: str, path: Path) -> str: + a_lines = a.splitlines(keepends=True) + b_lines = b.splitlines(keepends=True) + return "".join( + difflib.unified_diff( + a_lines, b_lines, fromfile=str(path), tofile=str(path) + " (formatted)" + ) + ) + + +def process_file(path: Path, cfg: Config, write: bool, show_diff: bool, backup: bool = False) -> Tuple[bool, str]: + original = read_text(path) + formatted = format_string(original, cfg) + changed = original != formatted + if changed: + logger.info("Formatted: %s", path) + if show_diff: + sys.stdout.write(unified_diff(original, formatted, path)) + if write: + if backup: + bak_path = path.with_name(path.name + ".bak") + write_text(bak_path, original) + logger.info("Backup written: %s", bak_path) + write_text(path, formatted) + logger.info("Overwritten: %s", path) + else: + logger.info("Unchanged: %s", path) + return 
def process_stdin_stdout(cfg: Config) -> int:
    """Format the text read from stdin and write the result to stdout.

    Always returns 0.
    """
    sys.stdout.write(format_string(sys.stdin.read(), cfg))
    logger.info("Formatted stdin to stdout")
    return 0


def _non_negative_int(value: str) -> int:
    """argparse ``type=`` converter accepting only integers >= 0."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 0:
        raise argparse.ArgumentTypeError(f"must be >= 0, got {number}")
    return number


def build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the formatter.

    ``--eol`` is the only option with an explicit default (``"lf"``); the
    other value options are left as ``None`` and the boolean flags as
    ``False`` when not given. ``--indent`` rejects negative values at parse
    time, since a negative width would otherwise silently yield no
    indentation at all.
    """
    p = argparse.ArgumentParser(
        description="Format Mojolicious templates (.ep, .htm.ep, .html.ep)"
    )
    p.add_argument("paths", nargs="*", help="Files or directories")

    # Output destination / mode
    p.add_argument(
        "-w", "--write", action="store_true",
        help="Overwrite files in place (writes a .bak backup)",
    )
    p.add_argument(
        "-o", "--out",
        help="Write formatted output to this file (single input file or --stdin). Conflicts with --write/--check/--diff",
    )
    p.add_argument("--check", action="store_true", help="Exit non-zero if any file would change")
    p.add_argument("--diff", action="store_true", help="Print unified diff for changes")
    p.add_argument("--stdin", action="store_true", help="Read from stdin")
    p.add_argument("--stdout", action="store_true", help="Write to stdout (with --stdin)")

    # Formatting options
    p.add_argument("--perltidy", help="Path to perltidy executable (defaults to PATH)")
    p.add_argument("--indent", type=_non_negative_int, help="Indent width (spaces, default 2)")
    p.add_argument(
        "--eol", choices=["lf", "crlf", "preserve"], default="lf",
        help="EOL handling (default lf)",
    )
    # '%%' is a literal percent sign: argparse %-formats help strings.
    p.add_argument(
        "--no-space-in-delims", action="store_true",
        help="Do not normalize spaces inside <%% %%> delimiters",
    )
    p.add_argument(
        "--perl-keyword-spacing", action="store_true",
        help="Aggressively insert a space after Perl keywords (if(...)->if (...), my$->my $, return(...)->return (...), etc.)",
    )

    # Diagnostics
    p.add_argument(
        "--self-test", dest="self_test", action="store_true",
        help="Run internal sanity checks and exit 0/1",
    )
    p.add_argument(
        "--log-level", choices=["error", "info", "debug"],
        help="Logging level (default error)",
    )
    p.add_argument("--verbose", action="store_true", help="Shorthand for --log-level info")
    p.add_argument("--version", action="store_true", help="Print version and exit")
    return p
    \n% for my $i (1..2) {\n
  • <%= $i %>
  • \n% }\n
\n% }\n" + fmt_a1 = format_string(src_a, cfg) + fmt_a2 = format_string(fmt_a1, cfg) + check("idempotence", fmt_a1 == fmt_a2) + + # T2: chomp markers preserved + src_b = "
  • <%= $title -%>\n<%= $sub %>
  • \n" + fmt_b = format_string(src_b, cfg) + check("chomp presence", "-%>" in fmt_b) + check("no-left-chomp-added", "<%-" not in fmt_b) + + # T3: raw element inner content unchanged + src_c = "\n" + fmt_c = format_string(src_c, cfg) + c_lines = src_c.splitlines() + f_lines = fmt_c.splitlines() + if len(c_lines) >= 3 and len(f_lines) >= 3: + check("raw inner unchanged", c_lines[1:-1] == f_lines[1:-1], detail=f"got {f_lines[1:-1]!r}") + else: + check("raw structure", False, "unexpected line count") + + # T4: delimiter spacing normalization for <% %> + src_d = "<%my $x=1;%>\n" + fmt_d = format_string(src_d, cfg) + check("delimiter spacing", "<% " in fmt_d and "%>" in fmt_d) + + # T5: keyword spacing with flag on + cfg_kw = dc_replace(cfg, perl_keyword_spacing=True) + fmt_k1 = format_string("<% if($x){ %>\n", cfg_kw) + check("kw if(...)", "if (" in fmt_k1 and " {" in fmt_k1) + fmt_k2 = format_string("<%= return(1) %>\n", cfg_kw) + check("kw return(...)", "return (" in fmt_k2) + fmt_k3 = format_string('<% say"hi"; %>\n', cfg_kw) + check("kw say \"...\"", 'say "' in fmt_k3) + fmt_k4 = format_string("<% my($x,$y)=@_; %>\n", cfg_kw) + check("kw my $", "my (" in fmt_k4 and " = @_" in fmt_k4) + fmt_k5 = format_string("<% sub foo{ %>\n", cfg_kw) + check("kw sub foo {", "sub foo {" in fmt_k5) + + # T6: extended EP block formatting + src_e = "<%\nmy $x=1;\nif($x){\nsay \"hi\";\n}\n%>\n" + fmt_e = format_string(src_e, cfg) + check("extended block indented", ("if (" in fmt_e and "say" in fmt_e and "{\n" in fmt_e) or ("if(" not in fmt_e)) + + if failures: + logger.error("SELF-TEST FAILURES:") + for f in failures: + logger.error(" - %s", f) + return 1 + logger.info("Self-test passed") + return 0 + + +def main(argv: Optional[List[str]] = None) -> int: + parser = build_arg_parser() + args = parser.parse_args(argv) + + setup_logging(args.log_level, args.verbose) + + if args.version: + print(f"mojofmt {VERSION}") + return 0 + + if args.self_test: + cfg = load_config(args) + return 
self_test(cfg) + + # Validate --out usage + if args.out: + if args.write or args.check or args.diff: + parser.error("--out conflicts with --write/--check/--diff") + cfg = load_config(args) + out_path = Path(args.out) + if args.stdin: + data = sys.stdin.read() + formatted = format_string(data, cfg) + write_text(out_path, formatted) + logger.info("Wrote %s (from stdin)", out_path) + return 0 + # must be exactly one input file + if not args.paths or len(args.paths) != 1: + parser.error("--out requires exactly one input file (or use --stdin)") + in_path = Path(args.paths[0]) + original = read_text(in_path) + formatted = format_string(original, cfg) + write_text(out_path, formatted) + logger.info("Wrote %s (from %s)", out_path, in_path) + return 0 + + cfg = load_config(args) + + if args.stdin: + return process_stdin_stdout(cfg) + + if not args.paths: + parser.error("No input paths provided (or use --stdin).") + + any_changed = False + any_error = False + + for path in iter_files(args.paths, cfg.extensions): + try: + changed, _ = process_file(path, cfg, write=args.write, show_diff=args.diff, backup=args.write) + any_changed = any_changed or changed + except Exception as e: + any_error = True + logger.error("Error processing %s: %s", path, e) + + if args.check and any_changed: + return 1 + return 1 if any_error else 0 + + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file