MojoTemplateFormatter/mojofmt.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
mojofmt: Formatter for Mojolicious Embedded Perl templates (.ep, .htm.ep, .html.ep)

Features:
- Indent HTML structure and Mojolicious line directives consistently
- Preserve chomp markers (<%- ... -%>) and do not alter newline semantics
- Handle helper begin/end blocks and Perl brace-based indentation for directives
- Treat pre/script/style/textarea content as opaque (unchanged)
- Optionally normalize spacing inside <% %> delimiters and after % directives
- Integrate with perltidy for Perl code formatting (if available on PATH)
- Reformat extended multi-line Perl blocks between lines with only <% and %>
- CLI with --write/--check/--diff, --out, --stdin/--stdout modes
- --self-test for sanity checks (includes perltidy probe)
- Logging: --log-level error|info|debug (and --verbose as shorthand for info)
- Optional --perl-keyword-spacing to aggressively insert spaces after Perl keywords
"""

from __future__ import annotations

import argparse
import difflib
import logging
import os
import re
import shutil
import subprocess
import sys
from dataclasses import dataclass, replace as dc_replace
from pathlib import Path
from typing import Iterable, List, Optional, Tuple

# Version string printed by the --version flag.
VERSION = "0.1.9"

# File name suffixes recognised as templates (compared against the lowercased file name).
DEFAULT_EXTENSIONS = (".ep", ".htm.ep", ".html.ep")
# HTML void elements: derive_html_tag_deltas does not count them as opening a nesting level.
VOID_ELEMENTS = {
    "area", "base", "br", "col", "embed", "hr", "img", "input",
    "link", "meta", "param", "source", "track", "wbr",
}
# Elements whose inner lines format_string copies through unchanged.
RAW_ELEMENTS = {"pre", "script", "style", "textarea"}

# Module-wide logger; its level/format are configured by setup_logging.
logger = logging.getLogger("mojofmt")

# A single HTML start or end tag: optional leading slash, tag name, optional
# attributes, optional self-closing slash.
TAG_RE = re.compile(
    r"""
    <
      (?P<slash>/)?
      (?P<name>[A-Za-z][\w:-]*)
      (?P<attrs>(?:\s+[^<>]*?)?)
      (?P<self>/)?
    >
    """,
    re.VERBOSE,
)

# Mojolicious inline tags on a single line: <%...%>
TPL_TAG_RE = re.compile(
    r"""
    <%
      (?P<leftchomp>-)?                # optional left chomp
      (?P<kind>==|=|\#)?               # kind: ==, =, or #
      (?P<body>.*?)                    # inner code/comment (non-greedy, no newlines)
      (?P<rightchomp>-)?               # optional right chomp
    %>
    """,
    re.VERBOSE,
)

# Line directives: starts with % (possibly %= %== %#) after indentation
LINE_DIR_RE = re.compile(r"^(?P<indent>\s*)%(?P<kind>==|=|\#)?(?P<body>.*)$")

# Whitespace condensing for single-line normalization (runs of spaces/tabs only)
WS_RE = re.compile(r"[ \t]+")

# begin/end detection (heuristic)
BEGIN_RE = re.compile(r"\bbegin\b")
END_LINE_RE = re.compile(r"^\s*%\s*end\b")
END_TAG_ONLY_RE = re.compile(r"^\s*<%-?\s*end\s*-?%>\s*$")

# leading } in a directive (e.g., % } or % }} )
LEADING_RBRACE_COUNT_RE = re.compile(r"^\s*%\s*(?P<braces>\}+)")

# <% } %> alone
TAG_CLOSING_BRACE_ONLY_RE = re.compile(r"^\s*<%-?\s*\}+\s*-?%>\s*$")

# Detect raw element opening/closing (as standalone lines)
RAW_OPEN_RE = re.compile(r"^\s*<(?P<name>pre|script|style|textarea)\b[^>]*>\s*$", re.I)
RAW_CLOSE_RE = re.compile(r"^\s*</(?P<name>pre|script|style|textarea)\s*>\s*$", re.I)

# Extended EP block delimiters (opening/closing on their own lines).
# The opener must not be followed by "=" or "#", so only statement blocks match.
OPEN_BLOCK_RE = re.compile(r'^(?P<base>[ \t]*)<%(?P<left>-?)(?![=#])\s*$')
CLOSE_BLOCK_RE = re.compile(r'^(?P<base>[ \t]*)(?P<right>-?)%>\s*$')


@dataclass
class Config:
    """Formatter settings shared by the formatting functions and the CLI."""

    # Number of spaces per indentation level.
    indent_width: int = 2
    # Line-ending policy applied to the formatted output.
    eol: str = "lf"  # lf|crlf|preserve
    # When True, spacing inside <% %> tags and after % directives is normalized.
    normalize_delimiter_spacing: bool = True
    # Explicit perltidy executable; run_perltidy falls back to a PATH lookup.
    perltidy_path: Optional[str] = None  # if None, use PATH
    # Arguments passed to perltidy instead of the built-in defaults
    # (run_perltidy appends "-st" when no option starts with it).
    perltidy_options: Optional[List[str]] = None
    # File name suffixes that iter_files accepts.
    extensions: Tuple[str, ...] = DEFAULT_EXTENSIONS
    # NOTE(review): no code visible in this file reads this flag - confirm whether it is used.
    respect_gitignore: bool = True
    verbose: bool = False  # kept for shorthand with --verbose
    # Enables the regex-based keyword spacing pass (enforce_perl_keyword_spacing).
    perl_keyword_spacing: bool = False  # optional post-pass


def load_config(cli_args: argparse.Namespace) -> Config:
    """Build a Config from parsed command-line arguments.

    Only options the user actually supplied override the Config defaults;
    ``verbose`` and ``perl_keyword_spacing`` are always copied across (the
    latter defaults to False when the namespace lacks the attribute).
    """
    overrides = {
        "verbose": cli_args.verbose,
        "perl_keyword_spacing": getattr(cli_args, "perl_keyword_spacing", False),
    }
    if cli_args.indent is not None:
        overrides["indent_width"] = cli_args.indent
    if cli_args.eol is not None:
        overrides["eol"] = cli_args.eol
    if cli_args.no_space_in_delims:
        overrides["normalize_delimiter_spacing"] = False
    if cli_args.perltidy:
        overrides["perltidy_path"] = cli_args.perltidy
    return Config(**overrides)


def setup_logging(level_name: Optional[str], verbose_flag: bool) -> None:
    """Configure root logging for the CLI.

    An explicit ``level_name`` wins; otherwise ``verbose_flag`` selects
    "info", and the fallback is "error". Unknown level names also map to
    ERROR.
    """
    known_levels = {
        "error": logging.ERROR,
        "warning": logging.WARNING,
        "info": logging.INFO,
        "debug": logging.DEBUG,
        "critical": logging.CRITICAL,
    }
    if level_name:
        chosen = level_name.lower()
    else:
        chosen = "info" if verbose_flag else "error"

    logging.basicConfig(
        level=known_levels.get(chosen, logging.ERROR),
        format="mojofmt: %(levelname)s: %(message)s",
    )


def detect_eol(text: str) -> str:
    """Return "crlf" if the text contains any CRLF pair, otherwise "lf"."""
    return "crlf" if text.find("\r\n") != -1 else "lf"


def normalize_eol(text: str, eol: str) -> str:
    """Convert every line ending in ``text`` to the requested style.

    "preserve" returns the text untouched. "crlf" yields CRLF endings. "lf"
    and any unrecognised value yield LF endings. Lone CR characters are
    treated as line endings.
    """
    if eol == "preserve":
        return text
    # Fold CRLF and lone CR into LF first so no ending is converted twice.
    unified = re.sub(r"\r\n?", "\n", text)
    if eol == "crlf":
        return "\r\n".join(unified.split("\n"))
    return unified


_PERLTIDY_WARNED = False  # avoid spamming logs if perltidy missing repeatedly


def run_perltidy(code: str, cfg: Config) -> Tuple[int, str, str]:
    """Pipe ``code`` through perltidy and return ``(returncode, stdout, stderr)``.

    The executable is ``cfg.perltidy_path`` or, failing that, whatever
    ``shutil.which("perltidy")`` finds. When perltidy is missing or cannot be
    started, the input is returned unchanged with a non-zero code (127 for
    "not found", 126 for any other OS-level launch failure) so callers can
    fall back gracefully. The failure is logged only once per process.

    ``cfg.perltidy_options`` replaces the default option set; ``-st`` is
    appended if no supplied option starts with it, because the result is read
    from stdout.
    """
    global _PERLTIDY_WARNED
    exe = cfg.perltidy_path or shutil.which("perltidy")
    if not exe:
        if not _PERLTIDY_WARNED:
            logger.error("perltidy not found; Perl inside template will not be reformatted")
            _PERLTIDY_WARNED = True
        return (127, code, "perltidy not found")

    args: List[str] = [exe]
    if cfg.perltidy_options:
        args += cfg.perltidy_options
        if not any(opt.startswith("-st") for opt in cfg.perltidy_options):
            args.append("-st")
    else:
        args += [
            f"-i={cfg.indent_width}",
            f"-ci={cfg.indent_width}",
            "-l=100",
            "-q",
            "-se",
            "-st",
            "-nbbc",
            "-noll",
        ]
    logger.debug("Running perltidy: %s", " ".join(args))
    try:
        proc = subprocess.run(
            args,
            input=code,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=False,
        )
    except FileNotFoundError:
        if not _PERLTIDY_WARNED:
            logger.error("perltidy not found while executing")
            _PERLTIDY_WARNED = True
        return (127, code, "perltidy not found")
    except OSError as exc:
        # e.g. PermissionError when --perltidy points at a non-executable
        # file: degrade to "leave Perl untouched" instead of crashing.
        if not _PERLTIDY_WARNED:
            logger.error("perltidy could not be executed: %s", exc)
            _PERLTIDY_WARNED = True
        return (126, code, f"perltidy could not be executed: {exc}")

    if proc.returncode != 0:
        logger.debug("perltidy non-zero exit %s: %s", proc.returncode, (proc.stderr or "").strip())
    return (proc.returncode, proc.stdout, proc.stderr)


def perltidy_probe(cfg: Config) -> Tuple[bool, str]:
    """Check that perltidy is available and actually reformats code.

    Returns ``(ok, message)``. The probe passes when a small, badly spaced
    snippet comes back in the expected default style, or at least comes back
    non-empty and different from the input (a custom style).
    """
    exe = cfg.perltidy_path or shutil.which("perltidy")
    if not exe:
        return (False, "perltidy not found on PATH (install Perl::Tidy or pass --perltidy)")

    snippet = "my $x=  {a=>1,b =>2 };"
    rc, out, err = run_perltidy(snippet, cfg)
    if rc != 0:
        return (False, f"perltidy exit {rc}: {(err or '').strip()}")

    expected_fragments = ("my $x = {", "a => 1", "b => 2")
    missing = [frag for frag in expected_fragments if frag not in out]
    if not missing:
        return (True, f"perltidy OK: {exe}")

    produced = out.strip() if out else ""
    if produced and produced != snippet:
        return (True, f"perltidy OK (non-default style): {exe}")
    return (False, "perltidy produced unexpected output")


def tidy_perl_statement_oneline(code: str, cfg: Config) -> str:
    """Tidy a Perl statement and squeeze the result onto one line.

    The code is run through perltidy (the original text is used if perltidy
    fails), its lines are joined with single spaces, and runs of spaces/tabs
    are collapsed. Whitespace inside quoted string literals is left alone,
    because collapsing it would change the value of the string. The optional
    keyword-spacing pass is applied last.
    """
    rc, out, _ = run_perltidy(code, cfg)
    if rc != 0:
        out = code
    joined = " ".join(out.strip().splitlines())
    # Collapse whitespace only in code chunks; string chunks pass through verbatim.
    collapsed = "".join(
        WS_RE.sub(" ", chunk) if kind == "code" else chunk
        for kind, chunk in _split_code_and_strings(joined)
    ).strip()
    return enforce_perl_keyword_spacing(collapsed, cfg.perl_keyword_spacing)


def tidy_perl_expression(code: str, cfg: Config) -> str:
    """Tidy a Perl expression (the body of ``<%= %>`` / ``%=``) onto one line.

    The expression is wrapped in ``do { ... }`` so perltidy sees a complete
    statement, then the wrapper is removed again. The wrapper is located by
    its first ``{`` and the *last* ``}`` of the output rather than by counting
    braces, so a ``}`` inside a string literal or comment cannot truncate the
    expression. If perltidy fails, the stripped input is used (with the
    optional keyword-spacing pass only).

    Whitespace is collapsed outside quoted string literals only, so string
    values are never altered.
    """
    fallback = code.strip()
    rc, out, _ = run_perltidy(f"do {{ {code} }}", cfg)
    if rc != 0:
        return enforce_perl_keyword_spacing(fallback, cfg.perl_keyword_spacing)

    start = out.find("{")
    end_idx = out.rfind("}")
    if start != -1 and end_idx > start:
        inner = out[start + 1 : end_idx]
    else:
        # Wrapper not recognisable in the output: keep the original text.
        inner = fallback

    inner = " ".join(line.strip() for line in inner.splitlines())
    # Collapse whitespace only in code chunks; string chunks pass through verbatim.
    inner = "".join(
        WS_RE.sub(" ", chunk) if kind == "code" else chunk
        for kind, chunk in _split_code_and_strings(inner)
    ).strip()
    return enforce_perl_keyword_spacing(inner, cfg.perl_keyword_spacing)


def tidy_perl_block_multiline(code: str, cfg: Config) -> Optional[str]:
    """
    Format a multi-line chunk of Perl by wrapping it in a do { ... } block for perltidy.
    Returns the formatted inner text (without the wrapper) or None on failure.

    The wrapper is stripped using the first "{" and the last "}" of perltidy's
    output. Counting brace depth instead would stop early at a "}" that sits
    inside a string literal or comment and silently drop the rest of the block.
    One leading and one trailing newline are removed from the inner text.
    """
    wrapped = "do {\n" + code + "\n}"
    rc, out, _ = run_perltidy(wrapped, cfg)
    if rc != 0 or not out:
        return None

    start = out.find("{")
    end_idx = out.rfind("}")
    if start == -1 or end_idx <= start:
        return None

    inner = out[start + 1 : end_idx]
    if inner.startswith("\n"):
        inner = inner[1:]
    if inner.endswith("\n"):
        inner = inner[:-1]
    return inner


def _split_code_and_strings(s: str):
    chunks = []
    buf: List[str] = []
    in_single = in_double = False
    i = 0
    while i < len(s):
        ch = s[i]
        if not in_single and not in_double:
            if ch == "'":
                if buf:
                    chunks.append(("code", "".join(buf)))
                    buf = []
                in_single = True
                buf.append(ch)
            elif ch == '"':
                if buf:
                    chunks.append(("code", "".join(buf)))
                    buf = []
                in_double = True
                buf.append(ch)
            else:
                buf.append(ch)
        elif in_single:
            buf.append(ch)
            if ch == "\\":
                if i + 1 < len(s):
                    buf.append(s[i + 1]); i += 1
            elif ch == "'":
                chunks.append(("str", "".join(buf))); buf = []; in_single = False
        elif in_double:
            buf.append(ch)
            if ch == "\\":
                if i + 1 < len(s):
                    buf.append(s[i + 1]); i += 1
            elif ch == '"':
                chunks.append(("str", "".join(buf))); buf = []; in_double = False
        i += 1
    if buf:
        chunks.append(("code" if not (in_single or in_double) else "str", "".join(buf)))
    return chunks


def _split_unquoted_comment(code_chunk: str):
    idx = code_chunk.find("#")
    if idx == -1:
        return code_chunk, None
    return code_chunk[:idx], code_chunk[idx:]


def enforce_perl_keyword_spacing(s: str, enable: bool) -> str:
    if not enable or not s:
        return s
    # Add space after control keywords before '('
    ctrl_paren = re.compile(r"\b(?P<kw>if|elsif|unless|while|until|for|foreach|given|when)\s*\(")
    # Add space after declarators before sigils/paren
    decl = re.compile(r"\b(?P<kw>my|our|state|local)\s*(?=[\$\@\%\*\&\\\(])")
    # sub name spacing and brace spacing
    sub_named = re.compile(r"\bsub\s*([A-Za-z_]\w*)")
    sub_named_brace = re.compile(r"\bsub\s+([A-Za-z_]\w*)\s*\{")
    sub_anon = re.compile(r"\bsub\s*\{")
    # Calls which often appear without space
    call_paren = re.compile(r"\b(?P<kw>return|print|say|die|warn|exit)\s*\(")
    call_space = re.compile(r"\b(?P<kw>return|print|say|die|warn|exit)\s*(?=\S)")
    # else/continue/do/eval blocks
    else_brace = re.compile(r"\b(?P<kw>else|continue|do|eval)\s*\{")
    # Ensure space before a brace after a closing paren: "){" -> ") {"
    brace_after_paren = re.compile(r"\)\s*\{")
    # Ensure space between '}' and a following keyword: "}else" -> "} else"
    brace_then_kw = re.compile(r"\}\s*(?=\b(?:else|elsif|continue|when)\b)")

    out: List[str] = []
    for kind, chunk in _split_code_and_strings(s):
        if kind != "code":
            out.append(chunk)
            continue
        code, comment = _split_unquoted_comment(chunk)
        code = ctrl_paren.sub(lambda m: f"{m.group('kw')} (", code)
        code = decl.sub(lambda m: f"{m.group('kw')} ", code)
        code = sub_named.sub(lambda m: f"sub {m.group(1)}", code)
        code = sub_named_brace.sub(lambda m: f"sub {m.group(1)} {{", code)
        code = sub_anon.sub("sub {", code)
        code = call_paren.sub(lambda m: f"{m.group('kw')} (", code)
        code = call_space.sub(lambda m: f"{m.group('kw')} ", code)
        code = brace_then_kw.sub("} ", code)
        code = else_brace.sub(lambda m: f"{m.group('kw')} {{", code)
        code = brace_after_paren.sub(") {", code)
        out.append(code + (comment or ""))
    return "".join(out)


def _common_leading_ws(lines: List[str]) -> str:
    ws = None
    for ln in lines:
        if not ln.strip():
            continue
        lead = len(ln) - len(ln.lstrip(' \t'))
        s = ln[:lead]
        if ws is None:
            ws = s
        else:
            i = 0
            while i < len(ws) and i < len(s) and ws[i] == s[i]:
                i += 1
            ws = ws[:i]
    return ws or ""


def _dedent_block(text: str) -> str:
    lines = text.splitlines()
    # Trim leading/trailing all-whitespace lines
    while lines and not lines[0].strip():
        lines.pop(0)
    while lines and not lines[-1].strip():
        lines.pop()
    if not lines:
        return ""
    prefix = _common_leading_ws(lines)
    if not prefix:
        return "\n".join(lines)
    plen = len(prefix)
    out = []
    for ln in lines:
        out.append(ln[plen:] if ln.startswith(prefix) else ln)
    return "\n".join(out)


def _naive_perl_indent(code: str, width: int = 2) -> str:
    lines = code.splitlines()
    indent = 0
    out = []
    for raw in lines:
        ln = raw.rstrip()
        if not ln:
            out.append("")
            continue
        stripped = ln.lstrip()
        # dedent on leading closing braces
        leading_closes = 0
        i = 0
        while i < len(stripped) and stripped[i] == '}':
            leading_closes += 1
            i += 1
        indent_before = max(0, indent - leading_closes)
        out.append((" " * (indent_before * width)) + stripped)
        opens = ln.count("{")
        closes = ln.count("}")
        indent += (opens - closes)
        if indent < 0:
            indent = 0
    return "\n".join(out)


def normalize_tpl_tag(
    leftchomp: Optional[str],
    kind: Optional[str],
    body: str,
    rightchomp: Optional[str],
    cfg: Config,
) -> Tuple[str, str, str, str, str]:
    """Split an inline ``<% ... %>`` tag into normalized parts.

    Returns ``(open_part, leftchomp, kind, body, close_part)``.

    * When spacing normalization is disabled, or the tag is a comment
      (kind ``"#"``), the pieces are returned verbatim: ``open_part`` is just
      ``"<%"`` and the chomp marker and kind are separate elements that the
      caller must re-insert.
    * Otherwise ``open_part`` already contains the chomp marker, the kind and
      one trailing space (the 2nd and 3rd elements are then empty), ``body``
      is stripped, and ``close_part`` carries a leading space unless a right
      chomp marker is present.

    ``None`` and ``""`` are treated the same for all optional parts.
    """
    left = leftchomp or ""
    right = rightchomp or ""
    tag_kind = kind or ""

    if not cfg.normalize_delimiter_spacing or tag_kind == "#":
        return ("<%", left, tag_kind, body, right + "%>")

    # No space is inserted between the body and a right chomp marker.
    closing_space = "" if right else " "
    open_part = "<%" + left + tag_kind + " "
    close_part = closing_space + right + "%>"
    return (open_part, "", "", body.strip(), close_part)


def substitute_tpl_tags_in_line(line: str, cfg: Config) -> str:
    """Rewrite every inline ``<% ... %>`` tag found in ``line``.

    Comment tags (``<%# ... %>``) keep their body untouched; expression tags
    (``<%=`` / ``<%==``) are tidied as expressions and all other tags as
    statements. Text outside the tags is copied through unchanged.

    The chomp-marker and kind elements returned by ``normalize_tpl_tag`` are
    always re-inserted. They are non-empty for comment tags and when spacing
    normalization is off; dropping them would turn ``<%# note %>`` into the
    code tag ``<% note %>`` and ``<%= $x %>`` into ``<%$x%>``.
    """

    def rewrite(match: "re.Match[str]") -> str:
        leftchomp = match.group("leftchomp") or ""
        kind = match.group("kind") or ""
        body = match.group("body")
        rightchomp = match.group("rightchomp") or ""
        open_part, chomp_part, kind_part, _, close_part = normalize_tpl_tag(
            leftchomp, kind, body, rightchomp, cfg
        )
        if kind == "#":
            inner = body
        elif kind in ("=", "=="):
            inner = tidy_perl_expression(body, cfg)
        else:
            inner = tidy_perl_statement_oneline(body, cfg)
        return open_part + chomp_part + kind_part + inner + close_part

    # A callable replacement is inserted literally (no backslash processing).
    return TPL_TAG_RE.sub(rewrite, line)


def derive_html_tag_deltas(line_wo_tpl: str) -> Tuple[int, int, Optional[str], Optional[str]]:
    """
    Analyse the HTML tags of one line (template tags already blanked out).

    Return (pre_dedent, net_total, raw_open, raw_close):
    - pre_dedent: number of end tags at the very beginning of the line
      (the line is dedented by this much before printing)
    - net_total: start tags (+1) minus end tags (-1) over the whole line;
      void and self-closing tags count as 0
    - raw_open, raw_close: lowercased name of a raw element whose opening /
      closing tag is the only thing on this line, else None
    """
    text = line_wo_tpl

    opened = RAW_OPEN_RE.match(text)
    closed = RAW_CLOSE_RE.match(text)
    raw_open = opened.group("name").lower() if opened else None
    raw_close = closed.group("name").lower() if closed else None

    def skip_ws(pos: int) -> int:
        while pos < len(text) and text[pos].isspace():
            pos += 1
        return pos

    # Count the uninterrupted run of end tags the line starts with.
    pre_dedent = 0
    tag = TAG_RE.match(text, skip_ws(0))
    while tag is not None and tag.group("slash"):
        pre_dedent += 1
        tag = TAG_RE.match(text, skip_ws(tag.end()))

    net = 0
    for tag in TAG_RE.finditer(text):
        if tag.group("slash"):
            net -= 1
            continue
        tag_name = (tag.group("name") or "").lower()
        opens_level = not (bool(tag.group("self")) or tag_name in VOID_ELEMENTS)
        if opens_level:
            net += 1

    return pre_dedent, net, raw_open, raw_close


def strip_tpl_tags(line: str) -> str:
    """Blank out every inline ``<% ... %>`` tag with spaces of equal length.

    Keeping the length means all other characters stay at their original
    positions.
    """

    def blank_out(match) -> str:
        return " " * len(match.group(0))

    return TPL_TAG_RE.sub(blank_out, line)


def is_standalone_statement_tag(line: str) -> bool:
    """Tell whether the line, ignoring outer whitespace, starts with ``<%``,
    ends with ``%>`` and is not an expression tag (``<%=`` / ``<%==``)."""
    tag = line.strip()
    wrapped = tag.startswith("<%") and tag.endswith("%>")
    return wrapped and not tag.startswith(("<%=", "<%=="))


def compute_perl_deltas(line: str) -> Tuple[int, int]:
    """
    Return (perl_dedent_before, perl_delta_after_for_next_line).

    Only line directives (starting with %) and standalone <% ... %> statement
    lines affect Perl depth. Also account for % end / <% end %> and begin blocks.

    - perl_dedent_before: levels to remove before printing this line
      (``% end``, ``<% end %>``, leading ``}`` of a directive, ``<% } %>``).
    - perl_delta_after: change to apply *on top of the dedented depth* for the
      following lines. Closing braces already consumed by perl_dedent_before
      are added back here so they are not subtracted twice; e.g. ``% }``
      yields (1, 0) and ``% } else {`` yields (1, 1).

    Comment directives (``%#``) and comment tags (``<%# %>``) never change the
    depth, whatever braces or words they contain.
    """
    dedent_before = 0
    delta_after = 0

    if END_LINE_RE.match(line) or END_TAG_ONLY_RE.match(line):
        dedent_before += 1

    # Closing braces that dedent this line. They are also part of the brace
    # count below, so they must be compensated for in delta_after.
    brace_dedent = 0
    m_braces = LEADING_RBRACE_COUNT_RE.match(line)
    if m_braces:
        brace_dedent += len(m_braces.group("braces") or "")
    if TAG_CLOSING_BRACE_ONLY_RE.match(line):
        brace_dedent += 1
    dedent_before += brace_dedent

    m_dir = LINE_DIR_RE.match(line)
    if m_dir:
        if m_dir.group("kind") != "#":
            body = m_dir.group("body")
            delta_after += body.count("{") - body.count("}") + brace_dedent
            if BEGIN_RE.search(body):
                delta_after += 1
    elif is_standalone_statement_tag(line):
        bodies = [
            m.group("body") or ""
            for m in TPL_TAG_RE.finditer(line)
            if m.group("kind") != "#"
        ]
        open_count = sum(b.count("{") for b in bodies)
        close_count = sum(b.count("}") for b in bodies)
        delta_after += open_count - close_count + brace_dedent
        if any(BEGIN_RE.search(b) for b in bodies):
            delta_after += 1

    return dedent_before, delta_after


def format_line_directive(line: str, cfg: Config) -> Optional[str]:
    """
    Format a Mojolicious line directive (% ..., %= ..., %== ..., %# ...).

    Returns the directive WITHOUT leading indentation (the caller applies the
    indent), or None when the line is not a directive. Comment directives keep
    their text (trimmed to a single leading space when spacing normalization
    is on); code directives are tidied and emitted as "%<kind> <code>".
    """
    parsed = LINE_DIR_RE.match(line)
    if parsed is None:
        return None

    kind = parsed.group("kind") or ""
    body = parsed.group("body")

    if kind == "#":
        if not cfg.normalize_delimiter_spacing:
            return "%#" + body
        comment = body.strip()
        return "%# " + comment if comment else "%#"

    tidy = tidy_perl_expression if kind in ("=", "==") else tidy_perl_statement_oneline
    inner = tidy(body, cfg)

    # Code directives are rendered the same way whether or not spacing
    # normalization is enabled.
    prefix = "%" + kind
    return prefix + " " + inner if inner else prefix


def rstrip_trailing_ws(line: str) -> str:
    """Remove trailing spaces and tabs (and nothing else) from ``line``."""
    end = len(line)
    while end and line[end - 1] in " \t":
        end -= 1
    return line[:end]


def format_extended_perl_blocks(text: str, cfg: Config) -> str:
    """
    Detect blocks where <% and %> are on their own lines (with optional chomp markers),
    format the inner Perl with perltidy (wrapped in do { ... }) or a naive indenter,
    and reinsert with the original base indentation.

    The base indentation is that of the opening ``<%`` line; the closing line
    is re-emitted at the same indentation. Blank lines inside a block are
    emitted as empty lines (no indentation), so the output never carries
    trailing whitespace. An opening line without a matching closing line is
    copied through unchanged. A final newline is kept if ``text`` had one.
    """
    lines = text.splitlines()
    i = 0
    out: List[str] = []
    n = len(lines)

    while i < n:
        m_open = OPEN_BLOCK_RE.match(lines[i])
        if not m_open:
            out.append(lines[i])
            i += 1
            continue

        # Find closing delimiter
        j = i + 1
        close = None
        while j < n:
            m_close = CLOSE_BLOCK_RE.match(lines[j])
            if m_close:
                close = m_close
                break
            j += 1

        if close is None:
            out.append(lines[i])
            i += 1
            continue

        base = m_open.group("base") or ""
        left = m_open.group("left") or ""
        right = close.group("right") or ""

        # Dedent before formatting
        inner = _dedent_block("\n".join(lines[i + 1 : j]))

        # Try perltidy; fallback to naive indentation
        tidied = tidy_perl_block_multiline(inner, cfg)
        if tidied is None:
            logger.debug("EP block %d-%d: perltidy failed/unavailable; using naive indenter", i + 1, j + 1)
            tidied = _naive_perl_indent(inner, width=cfg.indent_width)
        else:
            logger.debug("EP block %d-%d: perltidy formatted (%d lines)", i + 1, j + 1, len(tidied.splitlines()))

        tidied = tidied.rstrip("\n")
        out.append(f"{base}<%{left}")
        for ln in tidied.splitlines():
            # Indent real content only; blank lines stay empty.
            out.append(base + ln if ln.strip() else "")
        out.append(f"{base}{right}%>")

        i = j + 1  # continue after closing line

    return "\n".join(out) + ("\n" if text.endswith("\n") else "")


def format_string(src: str, cfg: Config) -> str:
    """Format a whole template and return the formatted text.

    Processing is line based:
    1. Line endings are unified to "\\n" for the duration of the pass.
    2. Each line is re-indented from scratch (existing indentation is
       discarded) using the sum of the current HTML depth and Perl depth.
       Line directives and inline <% %> tags are rewritten on the way.
    3. Lines between a standalone raw-element opening tag and its closing
       tag are copied through untouched.
    4. Extended multi-line <% ... %> blocks are formatted in a post-pass.
    5. The result always ends with a newline and gets the line endings
       selected by ``cfg.eol`` ("preserve" re-applies the style detected in
       ``src``).
    """
    original_eol = detect_eol(src)
    text = src.replace("\r\n", "\n").replace("\r", "\n")

    lines = text.split("\n")
    html_depth = 0
    perl_depth = 0
    # Name of the raw element we are currently inside of, or None.
    in_raw: Optional[str] = None

    out_lines: List[str] = []

    for orig_line in lines:
        line = orig_line

        if in_raw:
            m_close = RAW_CLOSE_RE.match(line)
            if m_close and m_close.group("name").lower() == in_raw:
                # The closing tag is re-indented to the level of its opening tag.
                indent_level = max(0, html_depth - 1) + perl_depth
                indent = " " * (cfg.indent_width * indent_level)
                new_line = indent + line.lstrip()
                out_lines.append(rstrip_trailing_ws(new_line))
                html_depth = max(0, html_depth - 1)
                in_raw = None
            else:
                # Raw content is emitted exactly as it was read.
                out_lines.append(line)
            continue

        perl_dedent_before, perl_delta_after = compute_perl_deltas(line)
        # Template tags are blanked out so only HTML tags are analysed.
        line_wo_tpl = strip_tpl_tags(line)
        html_pre_dedent, html_net, raw_open, raw_close = derive_html_tag_deltas(line_wo_tpl)

        # Depths used for printing this line (after leading closers are applied).
        base_html_depth = max(0, html_depth - html_pre_dedent)
        base_perl_depth = max(0, perl_depth - perl_dedent_before)
        indent_level = max(0, base_html_depth + base_perl_depth)
        indent = " " * (cfg.indent_width * indent_level)

        formatted_directive = format_line_directive(line, cfg)
        if formatted_directive is not None:
            content = formatted_directive
        else:
            content = substitute_tpl_tags_in_line(line, cfg).lstrip()

        new_line = indent + content.lstrip()
        out_lines.append(rstrip_trailing_ws(new_line))

        # html_net already includes the leading end tags, so html_pre_dedent is
        # added back to avoid subtracting them twice.
        html_depth = max(0, base_html_depth + html_net + html_pre_dedent)
        if raw_open and (raw_open.lower() in RAW_ELEMENTS):
            in_raw = raw_open.lower()
        perl_depth = max(0, base_perl_depth + perl_delta_after)

    result = "\n".join(out_lines)

    # Post-pass: format extended <% ... %> blocks
    result = format_extended_perl_blocks(result, cfg)

    if not result.endswith("\n"):
        result += "\n"

    eol_mode = cfg.eol if cfg.eol != "preserve" else original_eol
    result = normalize_eol(result, eol_mode)
    return result


def read_text(path: Path) -> str:
    """Read a file as UTF-8 text.

    Bytes that are not valid UTF-8 do not raise; the content is decoded again
    with undecodable bytes replaced by U+FFFD.
    """
    payload = path.read_bytes()
    try:
        decoded = payload.decode("utf-8")
    except UnicodeDecodeError:
        decoded = payload.decode(errors="replace")
    return decoded


def write_text(path: Path, text: str) -> None:
    """Write ``text`` to ``path`` as UTF-8 bytes, replacing any existing file.

    Binary mode is used so line endings are written exactly as given.
    """
    path.write_bytes(text.encode("utf-8"))


def is_supported_file(path: Path, exts: Tuple[str, ...]) -> bool:
    """Return True if the lowercased file name ends with one of ``exts``.

    The suffixes in ``exts`` are compared as given (they are not lowercased).
    """
    return path.name.lower().endswith(tuple(exts))


def iter_files(paths: List[str], exts: Tuple[str, ...]) -> Iterable[Path]:
    """Yield every supported template file reachable from ``paths``.

    Directories are walked recursively with ``os.walk``; any other path is
    considered as a single file. Only names accepted by ``is_supported_file``
    are yielded, and each one is logged at debug level.
    """

    def candidates(root: Path) -> Iterable[Path]:
        if not root.is_dir():
            yield root
            return
        for dirpath, _dirs, filenames in os.walk(root):
            parent = Path(dirpath)
            for filename in filenames:
                yield parent / filename

    for raw in paths:
        for candidate in candidates(Path(raw)):
            if is_supported_file(candidate, exts):
                logger.debug("Found file: %s", candidate)
                yield candidate


def unified_diff(a: str, b: str, path: Path) -> str:
    """Return a unified diff from ``a`` to ``b`` as one string.

    The "from" header is the path and the "to" header is the path followed by
    " (formatted)". Identical inputs give an empty string.
    """
    label = str(path)
    hunks = difflib.unified_diff(
        a.splitlines(keepends=True),
        b.splitlines(keepends=True),
        fromfile=label,
        tofile=label + " (formatted)",
    )
    return "".join(hunks)


def process_file(path: Path, cfg: Config, write: bool, show_diff: bool, backup: bool = False) -> Tuple[bool, str]:
    """Format one file and return ``(changed, formatted_text)``.

    When the formatted text differs from the file content, a unified diff is
    printed to stdout if ``show_diff`` is set, and the file is overwritten if
    ``write`` is set (after saving the original next to it as ``<name>.bak``
    when ``backup`` is set). Unchanged files are never written.
    """
    original = read_text(path)
    formatted = format_string(original, cfg)

    if formatted == original:
        logger.info("Unchanged: %s", path)
        return False, formatted

    logger.info("Formatted: %s", path)
    if show_diff:
        sys.stdout.write(unified_diff(original, formatted, path))
    if write:
        if backup:
            bak_path = path.with_name(path.name + ".bak")
            write_text(bak_path, original)
            logger.info("Backup written: %s", bak_path)
        write_text(path, formatted)
        logger.info("Overwritten: %s", path)
    return True, formatted


def process_stdin_stdout(cfg: Config) -> int:
    """Format all of stdin, write the result to stdout, and return exit code 0."""
    sys.stdout.write(format_string(sys.stdin.read(), cfg))
    logger.info("Formatted stdin to stdout")
    return 0


def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for mojofmt.

    Options are registered in the order in which they appear in ``--help``.
    """
    parser = argparse.ArgumentParser(
        description="Format Mojolicious templates (.ep, .htm.ep, .html.ep)"
    )
    add = parser.add_argument

    # Inputs and output targets
    add("paths", nargs="*", help="Files or directories")
    add(
        "-w", "--write",
        action="store_true",
        help="Overwrite files in place (writes a .bak backup)",
    )
    add(
        "-o", "--out",
        help="Write formatted output to this file (single input file or --stdin). Conflicts with --write/--check/--diff",
    )
    add("--check", action="store_true", help="Exit non-zero if any file would change")
    add("--diff", action="store_true", help="Print unified diff for changes")
    add("--stdin", action="store_true", help="Read from stdin")
    add("--stdout", action="store_true", help="Write to stdout (with --stdin)")

    # Formatting behaviour
    add("--perltidy", help="Path to perltidy executable (defaults to PATH)")
    add("--indent", type=int, help="Indent width (spaces, default 2)")
    add(
        "--eol",
        choices=["lf", "crlf", "preserve"],
        default="lf",
        help="EOL handling (default lf)",
    )
    add(
        "--no-space-in-delims",
        action="store_true",
        help="Do not normalize spaces inside <%% %%> delimiters",
    )
    add(
        "--perl-keyword-spacing",
        action="store_true",
        help="Aggressively insert a space after Perl keywords (if(...)->if (...), my$->my $, return(...)->return (...), etc.)",
    )

    # Diagnostics
    add(
        "--self-test",
        dest="self_test",
        action="store_true",
        help="Run internal sanity checks and exit 0/1",
    )
    add(
        "--log-level",
        choices=["error", "info", "debug"],
        help="Logging level (default error)",
    )
    add("--verbose", action="store_true", help="Shorthand for --log-level info")
    add("--version", action="store_true", help="Print version and exit")
    return parser


def self_test(cfg: Config) -> int:
    """Run the built-in sanity checks and return 0 on success, 1 on failure.

    Each failed check is collected and reported through the logger at error
    level. The perltidy probe counts as a check, so the self-test fails when
    perltidy is unavailable.
    """
    failures: List[str] = []

    def check(name: str, cond: bool, detail: Optional[str] = None):
        # Record a failure (with optional detail) when the condition is false.
        if not cond:
            failures.append(name + (": " + detail if detail else ""))

    # T0: perltidy availability and behavior
    ok, msg = perltidy_probe(cfg)
    if not ok:
        failures.append("perltidy: " + msg)
    else:
        logger.info(msg)

    # T1: idempotence on a mixed template
    src_a = "% if (1) {\n<ul>\n% for my $i (1..2) {\n<li><%= $i %></li>\n% }\n</ul>\n% }\n"
    fmt_a1 = format_string(src_a, cfg)
    fmt_a2 = format_string(fmt_a1, cfg)
    check("idempotence", fmt_a1 == fmt_a2)

    # T2: chomp markers preserved
    src_b = "<li><%= $title -%>\n<%= $sub %></li>\n"
    fmt_b = format_string(src_b, cfg)
    check("chomp presence", "-%>" in fmt_b)
    check("no-left-chomp-added", "<%-" not in fmt_b)

    # T3: raw element inner content unchanged
    src_c = "<script>\n  var x=1;    // keep spacing\nif(true){console.log(x)}\n</script>\n"
    fmt_c = format_string(src_c, cfg)
    c_lines = src_c.splitlines()
    f_lines = fmt_c.splitlines()
    if len(c_lines) >= 3 and len(f_lines) >= 3:
        # Compare everything between the opening and closing <script> lines.
        check("raw inner unchanged", c_lines[1:-1] == f_lines[1:-1], detail=f"got {f_lines[1:-1]!r}")
    else:
        check("raw structure", False, "unexpected line count")

    # T4: delimiter spacing normalization for <% %>
    src_d = "<%my $x=1;%>\n"
    fmt_d = format_string(src_d, cfg)
    check("delimiter spacing", "<% " in fmt_d and "%>" in fmt_d)

    # T5: keyword spacing with flag on
    cfg_kw = dc_replace(cfg, perl_keyword_spacing=True)
    fmt_k1 = format_string("<% if($x){ %>\n", cfg_kw)
    check("kw if(...)", "if (" in fmt_k1 and " {" in fmt_k1)
    fmt_k2 = format_string("<%= return(1) %>\n", cfg_kw)
    check("kw return(...)", "return (" in fmt_k2)
    fmt_k3 = format_string('<% say"hi"; %>\n', cfg_kw)
    check("kw say \"...\"", 'say "' in fmt_k3)
    fmt_k4 = format_string("<% my($x,$y)=@_; %>\n", cfg_kw)
    check("kw my $", "my (" in fmt_k4 and " = @_" in fmt_k4)
    fmt_k5 = format_string("<% sub foo{ %>\n", cfg_kw)
    check("kw sub foo {", "sub foo {" in fmt_k5)

    # T6: extended EP block formatting
    # Passes if the block shows the tidied form, or simply no longer contains "if(".
    src_e = "<%\nmy $x=1;\nif($x){\nsay \"hi\";\n}\n%>\n"
    fmt_e = format_string(src_e, cfg)
    check("extended block indented", ("if (" in fmt_e and "say" in fmt_e and "{\n" in fmt_e) or ("if(" not in fmt_e))

    if failures:
        logger.error("SELF-TEST FAILURES:")
        for f in failures:
            logger.error(" - %s", f)
        return 1
    logger.info("Self-test passed")
    return 0


def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point; returns the process exit code.

    Modes are handled in this order: --version, --self-test, --out (single
    output file, from stdin or exactly one input path), --stdin (to stdout),
    and finally the path-based mode that processes every supported file.

    In path-based mode the exit code is 1 when --check is given and at least
    one file would change, or when any file raised an error; otherwise 0.
    Invalid option combinations are reported through ``parser.error``.
    """
    parser = build_arg_parser()
    args = parser.parse_args(argv)

    setup_logging(args.log_level, args.verbose)

    if args.version:
        print(f"mojofmt {VERSION}")
        return 0

    if args.self_test:
        cfg = load_config(args)
        return self_test(cfg)

    # Validate --out usage
    if args.out:
        if args.write or args.check or args.diff:
            parser.error("--out conflicts with --write/--check/--diff")
        cfg = load_config(args)
        out_path = Path(args.out)
        if args.stdin:
            data = sys.stdin.read()
            formatted = format_string(data, cfg)
            write_text(out_path, formatted)
            logger.info("Wrote %s (from stdin)", out_path)
            return 0
        # must be exactly one input file
        if not args.paths or len(args.paths) != 1:
            parser.error("--out requires exactly one input file (or use --stdin)")
        in_path = Path(args.paths[0])
        original = read_text(in_path)
        formatted = format_string(original, cfg)
        write_text(out_path, formatted)
        logger.info("Wrote %s (from %s)", out_path, in_path)
        return 0

    cfg = load_config(args)

    if args.stdin:
        return process_stdin_stdout(cfg)

    if not args.paths:
        parser.error("No input paths provided (or use --stdin).")

    any_changed = False
    any_error = False

    for path in iter_files(args.paths, cfg.extensions):
        try:
            # A backup is written whenever files are overwritten in place.
            changed, _ = process_file(path, cfg, write=args.write, show_diff=args.diff, backup=args.write)
            any_changed = any_changed or changed
        except Exception as e:
            # Keep going with the remaining files; the failure is reflected in the exit code.
            any_error = True
            logger.error("Error processing %s: %s", path, e)

    if args.check and any_changed:
        return 1
    return 1 if any_error else 0


# Run the CLI when executed as a script; main()'s return value becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())