SM2Gen/sm1-html-2-json5.py

import json
import os
import re
from bs4 import BeautifulSoup
from lxml import etree  # Import lxml for HTML validation
import html
import argparse


def read_html_file(filename):
    """Read HTML content from a file."""
    with open(filename, "r", encoding="utf-8") as file:
        return file.read()


def validate_html(html):
    """Validate the HTML content."""
    try:
        parser = etree.HTMLParser()
        etree.fromstring(html, parser)  # Attempt to parse the HTML
    except Exception as e:
        raise ValueError("Invalid HTML document") from e

def convert_double_quotes_to_span(text):
    """Convert single-quoted text to <span>...</span>."""
    # Use a regular expression to find single-quoted text and replace it
    return re.sub(r'"(.*?)"', r"<span class=emphasis-para>\1</span>", text)


# def sanitize_text(text):
    # # Replace newlines with spaces
    # print(f"--{text}--")
    # decoded_text = html.unescape(text)

    # sanitized_text = decoded_text.replace("\n", "").replace(
        # "\r", " "
    # )    # Handle both Unix and Windows line endings
    # # Replace tabs with spaces
    # sanitized_text = sanitized_text.replace("\t", "")
    # # map single quotes to double
    # # sanitized_text = sanitized_text.replace("'", '"')
    # #Map signle and double quotes to nothing
    # sanitized_text.replace("'","").replace('"','')
    # #Take out any multiple spaces - reduce to one.
    # sanitized_text = ' '.join(sanitized_text.split())
    # # Strip leading and trailing whitespace
    # sanitized_text = sanitized_text.strip()
    # #sanitized_text = convert_double_quotes_to_span(sanitized_text)
    # print(f"++{sanitized_text}++")
    # return sanitized_text
    
def sanitize_text(text):
    # Replace newlines with spaces
    print(f"--{text}--")
    # Take out html entities
    decoded_text = html.unescape(text)
	# Take out newlines
    sanitized_text = decoded_text.replace('\n', ' ').replace('\r', ' ')  # Handle both Unix and Windows line endings
    # Replace tabs with spaces
    sanitized_text = sanitized_text.replace('\t', ' ')
    # Replace quote characters
    sanitized_text = sanitized_text.replace('"', '').replace("'", '')  # Remove double and single quotes
    #Take out any multiple spaces - reduce to one.
    sanitized_text = ' '.join(sanitized_text.split())
    # Strip leading and trailing whitespace
    sanitized_text = sanitized_text.strip()
    print(f"++{sanitized_text}++")
    return sanitized_text


def extract_data(html):
    """Extract paragraphs, inputs, tables, and pre blocks from HTML and organize them in order."""
    soup = BeautifulSoup(html, "lxml")
    records = []

    hidden_input_names = ["page", "page_stack", ".id", "csrf_token"]

    header_text = None
    sub_header_text = None

    # Counter for tables
    table_counter = 0

    # Extract elements while preserving order
    for element in soup.find_all(
        ["h1", "h2", "p", "pre", "input", "select", "textarea", "button", "table"]
    ):
        if element.name == "h1":
            header_text = element.get_text(strip=True)
            records.append({"Type": "Header", "Text": header_text})

        elif element.name == "h2":
            sub_header_text = element.get_text(strip=True)
            records.append({"Type": "SubHeader", "Text": sub_header_text})

        elif element.name == "p":
            text = element.get_text(strip=True)
            if text:  # Ignore empty paragraphs
                # Sanitise text freom newlines,tabs and escape quotes.
                sanitised_text = sanitize_text(text)
                if sanitised_text == "":
                   continue
                records.append({"Type": "Paragraph", "Text": sanitised_text})

        elif element.name == "pre":
            text = element.get_text(strip=True)
            if text:  # Ensure non-empty before adding
                records.append({"Type": "Preformatted", "Text": text})

        elif element.name == "input":
            if (
                element.get("type") == "hidden"
                or element.get("name") in hidden_input_names
            ):
                continue

            input_info = {
                "Type": element.get("type", "text").capitalize(),
                "Name": element.get("name"),
                "Value": element.get("value", ""),
            }
            label = element.find_next("label")
            input_info["Label"] = label.get_text(strip=True) if label else None
            records.append(input_info)

        elif element.name == "select":
            options = [
                {"Value": option.get("value"), "Text": option.get_text(strip=True)}
                for option in element.find_all("option")
            ]
            select_info = {
                "Type": "Select",
                "Name": element.get("name"),
                "Options": options,
                "Label": element.find_previous("label").get_text(strip=True)
                if element.find_previous("label")
                else None,
            }
            records.append(select_info)

        elif element.name == "textarea":
            textarea_info = {
                "Type": "Textarea",
                "Name": element.get("name"),
                "Value": element.get_text(strip=True),
            }
            label = element.find_previous("label")
            textarea_info["Label"] = label.get_text(strip=True) if label else None
            records.append(textarea_info)

        elif element.name == "button":
            button_info = {
                "Type": "Button",
                "Name": element.get("name"),
                "Value": element.get_text(strip=True),
                "Label": element.find_previous("label").get_text(strip=True)
                if label
                else None,
            }
            records.append(button_info)

        elif element.name == "table" and "sme-border" in element.get("class", []):
            # Increment the table counter
            table_counter += 1

            # Prepare the TableControl format
            table_control = f"Table{table_counter}"  # e.g., "Table1", "Table2"
            top_headings = []
            columns = []

            # Extract headings from the first row
            first_row = element.find("tr")
            if first_row:
                for th in first_row.find_all("th"):
                    top_headings.append(th.get_text(strip=True))

            # Extract only the first data row's cell values for Columns
            data_rows = element.find_all("tr")[1:]  # Skip the heading row
            if data_rows:
                first_data_row = data_rows[0]  # Take the first row of data
                for idx, th in enumerate(first_row.find_all("th")):
                    td = (
                        first_data_row.find_all("td")[idx]
                        if idx < len(first_data_row.find_all("td"))
                        else None
                    )
                    if td:
                        columns.append(
                            f"{table_control}-{th.get_text(strip=True)}"
                        )  # Format as desired

            records.append(
                {
                    "Type": "Table",
                    "TableControl": table_control,
                    "TopHeadings": top_headings,
                    "Columns": columns,
                }
            )

    return records, header_text, sub_header_text


def insert_spaces_before_caps(text):
    """Insert spaces before each capital letter in a given string."""
    return re.sub(r"(?<!^)(?=[A-Z])", " ", text)


def save_to_json5(data, output_filename, package_name, header, sub_header):
    """Save extracted data to a JSON5 file with a specific structure."""
    # Generate prefix from uppercase letters in PackageName made into lowercase
    prefix = "".join(re.findall(r"[A-Z]", package_name)).lower()

    # Prepare structured html list
    structured_html = []
    paragraph_count = 1
    preformatted_count = 1
    input_count = 1
    table_count = 1

    for record in data:
        if record["Type"] == "Paragraph":
            structured_html.append({f"Paragraph{paragraph_count}": record["Text"]})
            paragraph_count += 1
        elif record["Type"] == "Preformatted":
            structured_html.append(
                {f"Preformatted{preformatted_count}": record["Text"]}
            )
            preformatted_count += 1
        elif record["Type"] == "Header" or record["Type"] == "SubHeader":
            continue  # Skip headers for input count
        elif record["Type"] == "Table":
            # Construct the table entry
            table_structure = {
                "Type": record["Type"],
                "TableControl": record["TableControl"],
                "TopHeadings": record["TopHeadings"],
                "Columns": record["Columns"],
            }
            structured_html.append({f"Table{table_count}": table_structure})
            table_count += 1
        else:  # For inputs, selects, textareas, and buttons
            input_structure = {
                "Type": record["Type"],
                "Value": record.get("Value", ""),  # Safely access Value
            }

            # Use .get() for the Name key to avoid KeyError
            input_structure["Name"] = record.get(
                "Name", None
            )  # Set to None if not present
            input_structure["Label"] = record.get(
                "Label", None
            )  # Set to None if not present

            # Handle specific case for Select options
            if "Options" in record:
                input_structure["Options"] = record["Options"]

            structured_html.append({f"Input{input_count}": input_structure})
            input_count += 1

    # Wrap the records with the required fields
    json5_data = {
        "PackageName": package_name,
        "prefix": prefix,
        "MenuHeading": "Miscellaneous",
        "MenuDescription": insert_spaces_before_caps(package_name),
        "MenuNavigation": "2000 400",
        "firstPanel": "PARAMS",
        "signalEvent": f"smeserver-{package_name.lower()}-update",
        "html": [{
            "Name": "params",
            "route": "PARAMS",
            "Header": header if header else f"{package_name} Contrib",
            "SubHeader": sub_header
            if sub_header
            else f"Manage {package_name} settings:",
            **{
                k: v for item in structured_html for k, v in item.items()
            },  # Flatten the structured_html into the dict
        }],
    }

    # Save in JSON5 format (JSON with comments and unquoted keys)
    with open(output_filename, "w", encoding="utf-8") as json_file:
        json.dump(json5_data, json_file, ensure_ascii=False, indent=4)

    # Manually format as JSON5 by adding single quotes (for simplicity)
    with open(output_filename, "r+", encoding="utf-8") as json_file:
        content = json_file.read()
        content = content.replace(
            '"', "'"
        )  # Replace double quotes with single quotes for JSON5
        json_file.seek(0)
        json_file.write(content)
        json_file.truncate()  # Remove any old content beyond the new content length


def main():
	# command line parameters
	parser = argparse.ArgumentParser(description="sm1--html-2-jsopn5")
	parser.add_argument(
		"-f",
		"--filename",
		help="Specify a filename for the html file",
		default="CreateStarterWebsite.html",
	)
	args = parser.parse_args()
	input_file = "/home/brianr/clients/SM2/SM2Gen/venv/html/" + args.filename
	if not input_file.lower().endswith(".html"):
		# Add .html extension
		input_file += ".html"
	print(input_file)

	# Read HTML content
	html_content = read_html_file(input_file)

	# Validate the HTML before extracting data
	validate_html(html_content)

	# Extract data from HTML
	data, header, sub_header = extract_data(html_content)
	#
	# Generate output JSON5 filename based on input file name
	#
	 # Split the original path into directory and file name
	directory, filename = os.path.split(input_file)

	# Replace 'html' with 'json5' in the directory path
	new_directory = directory.replace('/html', '/json5')
	#print(new_directory)

	# Construct the new path
	output_file = os.path.join(new_directory, filename.replace('.html', '.json5'))
	print(output_file)
	#quit(1)
	
	# Generate output JSON5 filename based on input file name
	base_name = os.path.basename(input_file)  # Get the file name (with extension)
	package_name = os.path.splitext(base_name)[0]  # Use the filename without extension

	# Save extracted data to JSON5
	save_to_json5(data, output_file, package_name, header, sub_header)
	print(f"Extracted data saved to '{output_file}'.")


if __name__ == "__main__":
    main()
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`import json`
			`import os`
			`import re`
			`from bs4 import BeautifulSoup`
			`from lxml import etree # Import lxml for HTML validation`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`import html`
			`import argparse`

Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00
			`def read_html_file(filename):`
			`"""Read HTML content from a file."""`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`with open(filename, "r", encoding="utf-8") as file:`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`return file.read()`

Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`def validate_html(html):`
			`"""Validate the HTML content."""`
			`try:`
			`parser = etree.HTMLParser()`
			`etree.fromstring(html, parser) # Attempt to parse the HTML`
			`except Exception as e:`
			`raise ValueError("Invalid HTML document") from e`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00
More fixing paragraph text quotes etc 2024-09-16 15:01:17 +02:00			`def convert_double_quotes_to_span(text):`
			`"""Convert single-quoted text to <span>...</span>."""`
			`# Use a regular expression to find single-quoted text and replace it`
			`return re.sub(r'"(.*?)"', r"<span class=emphasis-para>\1</span>", text)`


			`# def sanitize_text(text):`
			`# # Replace newlines with spaces`
			`# print(f"--{text}--")`
			`# decoded_text = html.unescape(text)`

			`# sanitized_text = decoded_text.replace("\n", "").replace(`
			`# "\r", " "`
			`# ) # Handle both Unix and Windows line endings`
			`# # Replace tabs with spaces`
			`# sanitized_text = sanitized_text.replace("\t", "")`
			`# # map single quotes to double`
			`# # sanitized_text = sanitized_text.replace("'", '"')`
			`# #Map signle and double quotes to nothing`
			`# sanitized_text.replace("'","").replace('"','')`
			`# #Take out any multiple spaces - reduce to one.`
			`# sanitized_text = ' '.join(sanitized_text.split())`
			`# # Strip leading and trailing whitespace`
			`# sanitized_text = sanitized_text.strip()`
			`# #sanitized_text = convert_double_quotes_to_span(sanitized_text)`
			`# print(f"++{sanitized_text}++")`
			`# return sanitized_text`

Add in original html - play with sanitising paragraphs 2024-09-14 19:08:48 +02:00			`def sanitize_text(text):`
			`# Replace newlines with spaces`
More fixing paragraph text quotes etc 2024-09-16 15:01:17 +02:00			`print(f"--{text}--")`
			`# Take out html entities`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`decoded_text = html.unescape(text)`
More fixing paragraph text quotes etc 2024-09-16 15:01:17 +02:00			`# Take out newlines`
			`sanitized_text = decoded_text.replace('\n', ' ').replace('\r', ' ') # Handle both Unix and Windows line endings`
Add in original html - play with sanitising paragraphs 2024-09-14 19:08:48 +02:00			`# Replace tabs with spaces`
More fixing paragraph text quotes etc 2024-09-16 15:01:17 +02:00			`sanitized_text = sanitized_text.replace('\t', ' ')`
			`# Replace quote characters`
			`sanitized_text = sanitized_text.replace('"', '').replace("'", '') # Remove double and single quotes`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`#Take out any multiple spaces - reduce to one.`
			`sanitized_text = ' '.join(sanitized_text.split())`
Add in original html - play with sanitising paragraphs 2024-09-14 19:08:48 +02:00			`# Strip leading and trailing whitespace`
			`sanitized_text = sanitized_text.strip()`
More fixing paragraph text quotes etc 2024-09-16 15:01:17 +02:00			`print(f"++{sanitized_text}++")`
Add in original html - play with sanitising paragraphs 2024-09-14 19:08:48 +02:00			`return sanitized_text`

Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00
More fixing paragraph text quotes etc 2024-09-16 15:01:17 +02:00
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`def extract_data(html):`
			`"""Extract paragraphs, inputs, tables, and pre blocks from HTML and organize them in order."""`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`soup = BeautifulSoup(html, "lxml")`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`records = []`

Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`hidden_input_names = ["page", "page_stack", ".id", "csrf_token"]`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00
			`header_text = None`
			`sub_header_text = None`

			`# Counter for tables`
			`table_counter = 0`

			`# Extract elements while preserving order`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`for element in soup.find_all(`
			`["h1", "h2", "p", "pre", "input", "select", "textarea", "button", "table"]`
			`):`
			`if element.name == "h1":`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`header_text = element.get_text(strip=True)`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`records.append({"Type": "Header", "Text": header_text})`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`elif element.name == "h2":`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`sub_header_text = element.get_text(strip=True)`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`records.append({"Type": "SubHeader", "Text": sub_header_text})`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`elif element.name == "p":`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`text = element.get_text(strip=True)`
			`if text: # Ignore empty paragraphs`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`# Sanitise text freom newlines,tabs and escape quotes.`
More fixing paragraph text quotes etc 2024-09-16 15:01:17 +02:00			`sanitised_text = sanitize_text(text)`
			`if sanitised_text == "":`
			`continue`
			`records.append({"Type": "Paragraph", "Text": sanitised_text})`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`elif element.name == "pre":`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`text = element.get_text(strip=True)`
			`if text: # Ensure non-empty before adding`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`records.append({"Type": "Preformatted", "Text": text})`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`elif element.name == "input":`
			`if (`
			`element.get("type") == "hidden"`
			`or element.get("name") in hidden_input_names`
			`):`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`continue`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`input_info = {`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`"Type": element.get("type", "text").capitalize(),`
			`"Name": element.get("name"),`
			`"Value": element.get("value", ""),`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`}`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`label = element.find_next("label")`
			`input_info["Label"] = label.get_text(strip=True) if label else None`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`records.append(input_info)`

Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`elif element.name == "select":`
			`options = [`
			`{"Value": option.get("value"), "Text": option.get_text(strip=True)}`
			`for option in element.find_all("option")`
			`]`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`select_info = {`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`"Type": "Select",`
			`"Name": element.get("name"),`
			`"Options": options,`
			`"Label": element.find_previous("label").get_text(strip=True)`
			`if element.find_previous("label")`
			`else None,`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`}`
			`records.append(select_info)`

Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`elif element.name == "textarea":`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`textarea_info = {`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`"Type": "Textarea",`
			`"Name": element.get("name"),`
			`"Value": element.get_text(strip=True),`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`}`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`label = element.find_previous("label")`
			`textarea_info["Label"] = label.get_text(strip=True) if label else None`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`records.append(textarea_info)`

Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`elif element.name == "button":`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`button_info = {`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`"Type": "Button",`
			`"Name": element.get("name"),`
			`"Value": element.get_text(strip=True),`
			`"Label": element.find_previous("label").get_text(strip=True)`
			`if label`
			`else None,`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`}`
			`records.append(button_info)`

Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`elif element.name == "table" and "sme-border" in element.get("class", []):`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`# Increment the table counter`
			`table_counter += 1`

			`# Prepare the TableControl format`
			`table_control = f"Table{table_counter}" # e.g., "Table1", "Table2"`
			`top_headings = []`
			`columns = []`

			`# Extract headings from the first row`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`first_row = element.find("tr")`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`if first_row:`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`for th in first_row.find_all("th"):`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`top_headings.append(th.get_text(strip=True))`

			`# Extract only the first data row's cell values for Columns`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`data_rows = element.find_all("tr")[1:] # Skip the heading row`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`if data_rows:`
			`first_data_row = data_rows[0] # Take the first row of data`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`for idx, th in enumerate(first_row.find_all("th")):`
			`td = (`
			`first_data_row.find_all("td")[idx]`
			`if idx < len(first_data_row.find_all("td"))`
			`else None`
			`)`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`if td:`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`columns.append(`
			`f"{table_control}-{th.get_text(strip=True)}"`
			`) # Format as desired`

			`records.append(`
			`{`
			`"Type": "Table",`
			`"TableControl": table_control,`
			`"TopHeadings": top_headings,`
			`"Columns": columns,`
			`}`
			`)`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00
			`return records, header_text, sub_header_text`

Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`def insert_spaces_before_caps(text):`
			`"""Insert spaces before each capital letter in a given string."""`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`return re.sub(r"(?<!^)(?=[A-Z])", " ", text)`

Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00
			`def save_to_json5(data, output_filename, package_name, header, sub_header):`
			`"""Save extracted data to a JSON5 file with a specific structure."""`
First rnun of SM1-2-json created json5 2024-09-12 20:37:27 +02:00			`# Generate prefix from uppercase letters in PackageName made into lowercase`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`prefix = "".join(re.findall(r"[A-Z]", package_name)).lower()`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00
			`# Prepare structured html list`
			`structured_html = []`
			`paragraph_count = 1`
			`preformatted_count = 1`
			`input_count = 1`
			`table_count = 1`

			`for record in data:`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`if record["Type"] == "Paragraph":`
			`structured_html.append({f"Paragraph{paragraph_count}": record["Text"]})`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`paragraph_count += 1`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`elif record["Type"] == "Preformatted":`
			`structured_html.append(`
			`{f"Preformatted{preformatted_count}": record["Text"]}`
			`)`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`preformatted_count += 1`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`elif record["Type"] == "Header" or record["Type"] == "SubHeader":`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`continue # Skip headers for input count`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`elif record["Type"] == "Table":`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`# Construct the table entry`
			`table_structure = {`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`"Type": record["Type"],`
			`"TableControl": record["TableControl"],`
			`"TopHeadings": record["TopHeadings"],`
			`"Columns": record["Columns"],`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`}`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`structured_html.append({f"Table{table_count}": table_structure})`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`table_count += 1`
			`else: # For inputs, selects, textareas, and buttons`
			`input_structure = {`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`"Type": record["Type"],`
			`"Value": record.get("Value", ""), # Safely access Value`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`}`

			`# Use .get() for the Name key to avoid KeyError`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`input_structure["Name"] = record.get(`
			`"Name", None`
			`) # Set to None if not present`
			`input_structure["Label"] = record.get(`
			`"Label", None`
			`) # Set to None if not present`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00
			`# Handle specific case for Select options`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`if "Options" in record:`
			`input_structure["Options"] = record["Options"]`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`structured_html.append({f"Input{input_count}": input_structure})`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`input_count += 1`

			`# Wrap the records with the required fields`
			`json5_data = {`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`"PackageName": package_name,`
			`"prefix": prefix,`
			`"MenuHeading": "Miscellaneous",`
			`"MenuDescription": insert_spaces_before_caps(package_name),`
			`"MenuNavigation": "2000 400",`
			`"firstPanel": "PARAMS",`
			`"signalEvent": f"smeserver-{package_name.lower()}-update",`
			`"html": [{`
			`"Name": "params",`
			`"route": "PARAMS",`
			`"Header": header if header else f"{package_name} Contrib",`
			`"SubHeader": sub_header`
			`if sub_header`
			`else f"Manage {package_name} settings:",`
			`**{`
			`k: v for item in structured_html for k, v in item.items()`
			`}, # Flatten the structured_html into the dict`
			`}],`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`}`

			`# Save in JSON5 format (JSON with comments and unquoted keys)`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`with open(output_filename, "w", encoding="utf-8") as json_file:`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`json.dump(json5_data, json_file, ensure_ascii=False, indent=4)`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`# Manually format as JSON5 by adding single quotes (for simplicity)`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`with open(output_filename, "r+", encoding="utf-8") as json_file:`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`content = json_file.read()`
Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`content = content.replace(`
			`'"', "'"`
			`) # Replace double quotes with single quotes for JSON5`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`json_file.seek(0)`
			`json_file.write(content)`
			`json_file.truncate() # Remove any old content beyond the new content length`


Add in entries for other mojo input tags 2024-09-15 13:06:34 +02:00			`def main():`
			`# command line parameters`
			`parser = argparse.ArgumentParser(description="sm1--html-2-jsopn5")`
			`parser.add_argument(`
			`"-f",`
			`"--filename",`
			`help="Specify a filename for the html file",`
			`default="CreateStarterWebsite.html",`
			`)`
			`args = parser.parse_args()`
			`input_file = "/home/brianr/clients/SM2/SM2Gen/venv/html/" + args.filename`
			`if not input_file.lower().endswith(".html"):`
			`# Add .html extension`
			`input_file += ".html"`
			`print(input_file)`

			`# Read HTML content`
			`html_content = read_html_file(input_file)`

			`# Validate the HTML before extracting data`
			`validate_html(html_content)`

			`# Extract data from HTML`
			`data, header, sub_header = extract_data(html_content)`
			`#`
			`# Generate output JSON5 filename based on input file name`
			`#`
			`# Split the original path into directory and file name`
			`directory, filename = os.path.split(input_file)`

			`# Replace 'html' with 'json5' in the directory path`
			`new_directory = directory.replace('/html', '/json5')`
			`#print(new_directory)`

			`# Construct the new path`
			`output_file = os.path.join(new_directory, filename.replace('.html', '.json5'))`
			`print(output_file)`
			`#quit(1)`

			`# Generate output JSON5 filename based on input file name`
			`base_name = os.path.basename(input_file) # Get the file name (with extension)`
			`package_name = os.path.splitext(base_name)[0] # Use the filename without extension`

			`# Save extracted data to JSON5`
			`save_to_json5(data, output_file, package_name, header, sub_header)`
			`print(f"Extracted data saved to '{output_file}'.")`


			`if __name__ == "__main__":`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`main()`