Add in entries for other mojo input tags

2024-09-15 12:06:34 +01:00
parent 3b6e1930a0
commit 4d7cc1f076
18 changed files with 821 additions and 754 deletions
--- a/sm1-html-2-json5.py
+++ b/sm1-html-2-json5.py
@@ -3,12 +3,16 @@ import os
 import re
 from bs4 import BeautifulSoup
 from lxml import etree  # Import lxml for HTML validation
+import html
+import argparse
+

 def read_html_file(filename):
    """Read HTML content from a file."""
-    with open(filename, 'r', encoding='utf-8') as file:
+    with open(filename, "r", encoding="utf-8") as file:
        return file.read()

+
 def validate_html(html):
    """Validate the HTML content."""
    try:
@@ -16,14 +20,20 @@ def validate_html(html):
        etree.fromstring(html, parser)  # Attempt to parse the HTML
    except Exception as e:
        raise ValueError("Invalid HTML document") from e
-        
+
+
 def sanitize_text(text):
    # Replace newlines with spaces
-    sanitized_text = text.replace('\n', ' ').replace('\r', ' ')  # Handle both Unix and Windows line endings
+    decoded_text = html.unescape(text)
+    sanitized_text = decoded_text.replace("\n", "").replace(
+        "\r", " "
+    )  # Handle both Unix and Windows line endings
    # Replace tabs with spaces
-    sanitized_text = sanitized_text.replace('\t', ' ')
-    # Escape quote characters
-    sanitized_text = sanitized_text.replace('"', '\\"').replace("'", "\\'")
+    sanitized_text = sanitized_text.replace("\t", "")
+    # map single quotes to double
+    sanitized_text = sanitized_text.replace("'", '"')
+    # Collapse any run of whitespace into a single space.
+    sanitized_text = ' '.join(sanitized_text.split())
    # Strip leading and trailing whitespace
    sanitized_text = sanitized_text.strip()
    return sanitized_text
@@ -31,15 +41,10 @@ def sanitize_text(text):

 def extract_data(html):
    """Extract paragraphs, inputs, tables, and pre blocks from HTML and organize them in order."""
-    soup = BeautifulSoup(html, 'lxml')
+    soup = BeautifulSoup(html, "lxml")
    records = []

-    hidden_input_names = [
-        'page',
-        'page_stack',
-        '.id',
-        'csrf_token'
-    ]
+    hidden_input_names = ["page", "page_stack", ".id", "csrf_token"]

    header_text = None
    sub_header_text = None
@@ -48,81 +53,81 @@ def extract_data(html):
    table_counter = 0

    # Extract elements while preserving order
-    for element in soup.find_all(['h1', 'h2', 'p', 'pre', 'input', 'select', 'textarea', 'button', 'table']):
-        if element.name == 'h1':
+    for element in soup.find_all(
+        ["h1", "h2", "p", "pre", "input", "select", "textarea", "button", "table"]
+    ):
+        if element.name == "h1":
            header_text = element.get_text(strip=True)
-            records.append({
-                'Type': 'Header',
-                'Text': header_text
-            })
+            records.append({"Type": "Header", "Text": header_text})

-        elif element.name == 'h2':
+        elif element.name == "h2":
            sub_header_text = element.get_text(strip=True)
-            records.append({
-                'Type': 'SubHeader',
-                'Text': sub_header_text
-            })
+            records.append({"Type": "SubHeader", "Text": sub_header_text})

-        elif element.name == 'p':
+        elif element.name == "p":
            text = element.get_text(strip=True)
            if text:  # Ignore empty paragraphs
-				#Sanitise text freom newlines,tabs and escape quotes.
-                records.append({
-                    'Type': 'Paragraph',
-                    'Text': sanitize_text(text)
-                })
+                # Sanitise text: strip newlines/tabs and normalise quotes.
+                records.append({"Type": "Paragraph", "Text": sanitize_text(text)})

-        elif element.name == 'pre':
+        elif element.name == "pre":
            text = element.get_text(strip=True)
            if text:  # Ensure non-empty before adding
-                records.append({
-                    'Type': 'Preformatted',
-                    'Text': text
-                })
+                records.append({"Type": "Preformatted", "Text": text})

-        elif element.name == 'input':
-            if element.get('type') == 'hidden' or element.get('name') in hidden_input_names:
+        elif element.name == "input":
+            if (
+                element.get("type") == "hidden"
+                or element.get("name") in hidden_input_names
+            ):
                continue
-            
+
            input_info = {
-                'Type': element.get('type', 'text').capitalize(),
-                'Name': element.get('name'),
-                'Value': element.get('value', ''),
+                "Type": element.get("type", "text").capitalize(),
+                "Name": element.get("name"),
+                "Value": element.get("value", ""),
            }
-            label = element.find_next('label')
-            input_info['Label'] = label.get_text(strip=True) if label else None
+            label = element.find_next("label")
+            input_info["Label"] = label.get_text(strip=True) if label else None
            records.append(input_info)

-        elif element.name == 'select':
-            options = [{'Value': option.get('value'), 'Text': option.get_text(strip=True)} for option in element.find_all('option')]
+        elif element.name == "select":
+            options = [
+                {"Value": option.get("value"), "Text": option.get_text(strip=True)}
+                for option in element.find_all("option")
+            ]
            select_info = {
-                'Type': 'Select',
-                'Name': element.get('name'),
-                'Options': options,
-                'Label': element.find_previous('label').get_text(strip=True) if element.find_previous('label') else None,
+                "Type": "Select",
+                "Name": element.get("name"),
+                "Options": options,
+                "Label": element.find_previous("label").get_text(strip=True)
+                if element.find_previous("label")
+                else None,
            }
            records.append(select_info)

-        elif element.name == 'textarea':
+        elif element.name == "textarea":
            textarea_info = {
-                'Type': 'Textarea',
-                'Name': element.get('name'),
-                'Value': element.get_text(strip=True),
+                "Type": "Textarea",
+                "Name": element.get("name"),
+                "Value": element.get_text(strip=True),
            }
-            label = element.find_previous('label')
-            textarea_info['Label'] = label.get_text(strip=True) if label else None
+            label = element.find_previous("label")
+            textarea_info["Label"] = label.get_text(strip=True) if label else None
            records.append(textarea_info)

-        elif element.name == 'button':
+        elif element.name == "button":
            button_info = {
-                'Type': 'Button',
-                'Name': element.get('name'),
-                'Value': element.get_text(strip=True),
-                'Label': element.find_previous('label').get_text(strip=True) if label else None,
+                "Type": "Button",
+                "Name": element.get("name"),
+                "Value": element.get_text(strip=True),
+                "Label": element.find_previous("label").get_text(strip=True)
+                if label
+                else None,
            }
            records.append(button_info)

-        elif element.name == 'table' and 'sme-border' in element.get('class', []):
+        elif element.name == "table" and "sme-border" in element.get("class", []):
            # Increment the table counter
            table_counter += 1

@@ -132,37 +137,47 @@ def extract_data(html):
            columns = []

            # Extract headings from the first row
-            first_row = element.find('tr')
+            first_row = element.find("tr")
            if first_row:
-                for th in first_row.find_all('th'):
+                for th in first_row.find_all("th"):
                    top_headings.append(th.get_text(strip=True))

            # Extract only the first data row's cell values for Columns
-            data_rows = element.find_all('tr')[1:]  # Skip the heading row
+            data_rows = element.find_all("tr")[1:]  # Skip the heading row
            if data_rows:
                first_data_row = data_rows[0]  # Take the first row of data
-                for idx, th in enumerate(first_row.find_all('th')):
-                    td = first_data_row.find_all('td')[idx] if idx < len(first_data_row.find_all('td')) else None
+                for idx, th in enumerate(first_row.find_all("th")):
+                    td = (
+                        first_data_row.find_all("td")[idx]
+                        if idx < len(first_data_row.find_all("td"))
+                        else None
+                    )
                    if td:
-                        columns.append(f"{table_control}-{th.get_text(strip=True)}")  # Format as desired
+                        columns.append(
+                            f"{table_control}-{th.get_text(strip=True)}"
+                        )  # Format as desired

-            records.append({
-                'Type': 'Table',
-                'TableControl': table_control,
-                'TopHeadings': top_headings,
-                'Columns': columns,
-            })
+            records.append(
+                {
+                    "Type": "Table",
+                    "TableControl": table_control,
+                    "TopHeadings": top_headings,
+                    "Columns": columns,
+                }
+            )

    return records, header_text, sub_header_text

+
 def insert_spaces_before_caps(text):
    """Insert spaces before each capital letter in a given string."""
-    return re.sub(r'(?<!^)(?=[A-Z])', ' ', text)
+    return re.sub(r"(?<!^)(?=[A-Z])", " ", text)
+

 def save_to_json5(data, output_filename, package_name, header, sub_header):
    """Save extracted data to a JSON5 file with a specific structure."""
    # Generate prefix from uppercase letters in PackageName made into lowercase
-    prefix = ''.join(re.findall(r'[A-Z]', package_name)).lower()
+    prefix = "".join(re.findall(r"[A-Z]", package_name)).lower()

    # Prepare structured html list
    structured_html = []
@@ -172,103 +187,131 @@ def save_to_json5(data, output_filename, package_name, header, sub_header):
    table_count = 1

    for record in data:
-        if record['Type'] == 'Paragraph':
-            structured_html.append({
-                f'Paragraph{paragraph_count}': record['Text']
-            })
+        if record["Type"] == "Paragraph":
+            structured_html.append({f"Paragraph{paragraph_count}": record["Text"]})
            paragraph_count += 1
-        elif record['Type'] == 'Preformatted':
-            structured_html.append({
-                f'Preformatted{preformatted_count}': record['Text']
-            })
+        elif record["Type"] == "Preformatted":
+            structured_html.append(
+                {f"Preformatted{preformatted_count}": record["Text"]}
+            )
            preformatted_count += 1
-        elif record['Type'] == 'Header' or record['Type'] == 'SubHeader':
+        elif record["Type"] == "Header" or record["Type"] == "SubHeader":
            continue  # Skip headers for input count
-        elif record['Type'] == 'Table':
+        elif record["Type"] == "Table":
            # Construct the table entry
            table_structure = {
-                'Type': record['Type'],
-                'TableControl': record['TableControl'],
-                'TopHeadings': record['TopHeadings'],
-                'Columns': record['Columns']
+                "Type": record["Type"],
+                "TableControl": record["TableControl"],
+                "TopHeadings": record["TopHeadings"],
+                "Columns": record["Columns"],
            }
-            structured_html.append({
-                f'Table{table_count}': table_structure
-            })
+            structured_html.append({f"Table{table_count}": table_structure})
            table_count += 1
        else:  # For inputs, selects, textareas, and buttons
            input_structure = {
-                'Type': record['Type'],
-                'Value': record.get('Value', ''),  # Safely access Value
+                "Type": record["Type"],
+                "Value": record.get("Value", ""),  # Safely access Value
            }

            # Use .get() for the Name key to avoid KeyError
-            input_structure['Name'] = record.get('Name', None)  # Set to None if not present
-            input_structure['Label'] = record.get('Label', None)  # Set to None if not present
+            input_structure["Name"] = record.get(
+                "Name", None
+            )  # Set to None if not present
+            input_structure["Label"] = record.get(
+                "Label", None
+            )  # Set to None if not present

            # Handle specific case for Select options
-            if 'Options' in record:
-                input_structure['Options'] = record['Options']
+            if "Options" in record:
+                input_structure["Options"] = record["Options"]

-            structured_html.append({
-                f'Input{input_count}': input_structure
-            })
+            structured_html.append({f"Input{input_count}": input_structure})
            input_count += 1

    # Wrap the records with the required fields
    json5_data = {
-        'PackageName': package_name,
-        'prefix': prefix,
-        'MenuHeading': 'Miscellaneous',
-        'MenuDescription': insert_spaces_before_caps(package_name),
-        'MenuNavigation': '2000 400',
-        'firstPanel': 'PARAMS',
-        'signalEvent': f'smeserver-{package_name.lower()}-update',
-        'html': {
-            'Name': 'params',
-            'route': 'PARAMS',
-            'Header': header if header else f'{package_name} Contrib',
-            'SubHeader': sub_header if sub_header else f'Manage {package_name} settings:',
-            **{k: v for item in structured_html for k, v in item.items()}  # Flatten the structured_html into the dict
-        }
+        "PackageName": package_name,
+        "prefix": prefix,
+        "MenuHeading": "Miscellaneous",
+        "MenuDescription": insert_spaces_before_caps(package_name),
+        "MenuNavigation": "2000 400",
+        "firstPanel": "PARAMS",
+        "signalEvent": f"smeserver-{package_name.lower()}-update",
+        "html": [{
+            "Name": "params",
+            "route": "PARAMS",
+            "Header": header if header else f"{package_name} Contrib",
+            "SubHeader": sub_header
+            if sub_header
+            else f"Manage {package_name} settings:",
+            **{
+                k: v for item in structured_html for k, v in item.items()
+            },  # Flatten the structured_html into the dict
+        }],
    }

    # Save in JSON5 format (JSON with comments and unquoted keys)
-    with open(output_filename, 'w', encoding='utf-8') as json_file:
+    with open(output_filename, "w", encoding="utf-8") as json_file:
        json.dump(json5_data, json_file, ensure_ascii=False, indent=4)
-    
+
    # Manually format as JSON5 by adding single quotes (for simplicity)
-    with open(output_filename, 'r+', encoding='utf-8') as json_file:
+    with open(output_filename, "r+", encoding="utf-8") as json_file:
        content = json_file.read()
-        content = content.replace('"', "'")  # Replace double quotes with single quotes for JSON5
+        content = content.replace(
+            '"', "'"
+        )  # Replace double quotes with single quotes for JSON5
        json_file.seek(0)
        json_file.write(content)
        json_file.truncate()  # Remove any old content beyond the new content length

+
 def main():
-    input_file = '/home/brianr/clients/SM2/SM2Gen/venv/html/CreateStarterWebsite.html'  # Specify the input HTML file path
+	# command line parameters
+	parser = argparse.ArgumentParser(description="sm1--html-2-jsopn5")
+	parser.add_argument(
+		"-f",
+		"--filename",
+		help="Specify a filename for the html file",
+		default="CreateStarterWebsite.html",
+	)
+	args = parser.parse_args()
+	input_file = "/home/brianr/clients/SM2/SM2Gen/venv/html/" + args.filename
+	if not input_file.lower().endswith(".html"):
+		# Add .html extension
+		input_file += ".html"
+	print(input_file)

-    # Read HTML content
-    html_content = read_html_file(input_file)
+	# Read HTML content
+	html_content = read_html_file(input_file)

-    # Validate the HTML before extracting data
-    validate_html(html_content)
+	# Validate the HTML before extracting data
+	validate_html(html_content)

-    # Extract data from HTML
-    data, header, sub_header = extract_data(html_content)
+	# Extract data from HTML
+	data, header, sub_header = extract_data(html_content)
+	#
+	# Generate output JSON5 filename based on input file name
+	#
+	# Split the original path into directory and file name
+	directory, filename = os.path.split(input_file)

-    # Generate output JSON5 filename based on input file name
-    base_name = os.path.basename(input_file)  # Get the file name (with extension)
-    package_name = os.path.splitext(base_name)[0]  # Use the filename without extension
-    json_filename = package_name + '.json5'  # Change extension to .json5
+	# Replace 'html' with 'json5' in the directory path
+	new_directory = directory.replace('/html', '/json5')
+	#print(new_directory)

-    # Create the output file path in the same directory
-    output_directory = os.path.dirname(input_file)
-    output_file = os.path.join(output_directory, json_filename)
+	# Construct the new path
+	output_file = os.path.join(new_directory, filename.replace('.html', '.json5'))
+	print(output_file)
+	#quit(1)
+	
+	# Derive the package name from the input file name (without its extension)
+	base_name = os.path.basename(input_file)  # Get the file name (with extension)
+	package_name = os.path.splitext(base_name)[0]  # Use the filename without extension

-    # Save extracted data to JSON5
-    save_to_json5(data, output_file, package_name, header, sub_header)
-    print(f"Extracted data saved to '{output_file}'.")
+	# Save extracted data to JSON5
+	save_to_json5(data, output_file, package_name, header, sub_header)
+	print(f"Extracted data saved to '{output_file}'.")

-if __name__ == '__main__':
+
+if __name__ == "__main__":
    main()