
# 318 lines
# 11 KiB

import json
import os
import re
from bs4 import BeautifulSoup
from lxml import etree # Import lxml for HTML validation
import html
import argparse
def read_html_file(filename):
    """Read HTML content from a file.

    Args:
        filename: Path of the file to read; it is decoded as UTF-8.

    Returns:
        The complete file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    with open(filename, "r", encoding="utf-8") as file:
        return file.read()
def validate_html(html):
    """Validate the HTML content.

    The document is parsed with lxml's ``HTMLParser``; the resulting tree is
    discarded because only "did parsing raise?" matters here.

    Args:
        html: The HTML document text to check. Note that this parameter
            shadows the imported ``html`` module inside this function.

    Raises:
        ValueError: If lxml raises any exception while parsing. The original
            exception is chained as the cause.
    """
    parser = etree.HTMLParser()
    try:
        etree.fromstring(html, parser)  # Attempt to parse the HTML
    except Exception as e:
        raise ValueError("Invalid HTML document") from e
def sanitize_text(text):
    """Return ``text`` flattened to a single, cleanly spaced line.

    Steps, in order:
      1. HTML entities are decoded (``&amp;`` -> ``&``).
      2. Single quotes are mapped to double quotes.
      3. Every run of whitespace (newlines, carriage returns, tabs, spaces)
         is collapsed to one space, and leading/trailing whitespace is
         dropped.

    Args:
        text: The raw text to clean.

    Returns:
        The sanitized string; an empty string if ``text`` holds only
        whitespace.
    """
    decoded_text = html.unescape(text)
    # map single quotes to double
    requoted_text = decoded_text.replace("'", '"')
    # Line breaks and tabs must become spaces rather than vanish, otherwise
    # the words on either side are glued together ("foo\nbar" -> "foobar").
    # split() with no argument treats any whitespace run as one separator
    # and ignores leading/trailing whitespace, so no extra strip() is needed.
    return " ".join(requoted_text.split())
def _label_text(label):
    """Return the stripped text of a ``<label>`` tag, or None if there is none."""
    return label.get_text(strip=True) if label is not None else None


def extract_data(html):
    """Extract paragraphs, inputs, tables, and pre blocks from HTML and organize them in order.

    Args:
        html: The HTML document text. This parameter shadows the imported
            ``html`` module inside this function.

    Returns:
        A 3-tuple ``(records, header_text, sub_header_text)``:
          * ``records`` is a list of dicts in document order, each with a
            ``"Type"`` key (``"Header"``, ``"SubHeader"``, ``"Paragraph"``,
            ``"Preformatted"``, ``"Select"``, ``"Textarea"``, ``"Button"``,
            ``"Table"``, or the capitalized ``type`` attribute of an
            ``<input>``).
          * ``header_text`` is the text of the last ``<h1>`` seen, or None.
          * ``sub_header_text`` is the text of the last ``<h2>`` seen, or None.
    """
    soup = BeautifulSoup(html, "lxml")
    records = []
    hidden_input_names = ["page", "page_stack", ".id", "csrf_token"]
    header_text = None
    sub_header_text = None
    # Counter for tables
    table_counter = 0

    # Extract elements while preserving order
    for element in soup.find_all(
        ["h1", "h2", "p", "pre", "input", "select", "textarea", "button", "table"]
    ):
        tag = element.name

        if tag == "h1":
            header_text = element.get_text(strip=True)
            records.append({"Type": "Header", "Text": header_text})

        elif tag == "h2":
            sub_header_text = element.get_text(strip=True)
            records.append({"Type": "SubHeader", "Text": sub_header_text})

        elif tag == "p":
            text = element.get_text(strip=True)
            if text:  # Ignore empty paragraphs
                # Sanitise text from newlines, tabs and escape quotes.
                records.append({"Type": "Paragraph", "Text": sanitize_text(text)})

        elif tag == "pre":
            text = element.get_text(strip=True)
            if text:  # Ensure non-empty before adding
                records.append({"Type": "Preformatted", "Text": text})

        elif tag == "input":
            # NOTE(review): hidden inputs and framework bookkeeping fields are
            # presumably skipped here -- confirm against the original file.
            if (
                element.get("type") == "hidden"
                or element.get("name") in hidden_input_names
            ):
                continue
            input_info = {
                "Type": element.get("type", "text").capitalize(),
                "Name": element.get("name"),
                "Value": element.get("value", ""),
            }
            # Inputs take the label that FOLLOWS them; the other controls
            # below take the label that precedes them.
            input_info["Label"] = _label_text(element.find_next("label"))
            records.append(input_info)

        elif tag == "select":
            options = [
                {"Value": option.get("value"), "Text": option.get_text(strip=True)}
                for option in element.find_all("option")
            ]
            records.append(
                {
                    "Type": "Select",
                    "Name": element.get("name"),
                    "Options": options,
                    "Label": _label_text(element.find_previous("label")),
                }
            )

        elif tag == "textarea":
            records.append(
                {
                    "Type": "Textarea",
                    "Name": element.get("name"),
                    "Value": element.get_text(strip=True),
                    "Label": _label_text(element.find_previous("label")),
                }
            )

        elif tag == "button":
            # Look the label up for this button explicitly instead of relying
            # on a ``label`` variable left over from an earlier element (which
            # is unbound if no input/textarea came first).
            records.append(
                {
                    "Type": "Button",
                    "Name": element.get("name"),
                    "Value": element.get_text(strip=True),
                    "Label": _label_text(element.find_previous("label")),
                }
            )

        elif tag == "table" and "sme-border" in element.get("class", []):
            # Increment the table counter
            table_counter += 1
            # Prepare the TableControl format
            table_control = f"Table{table_counter}"  # e.g., "Table1", "Table2"
            top_headings = []
            columns = []

            # Extract headings from the first row
            first_row = element.find("tr")
            heading_cells = first_row.find_all("th") if first_row else []
            for th in heading_cells:
                top_headings.append(th.get_text(strip=True))

            # Extract only the first data row's cell values for Columns
            data_rows = element.find_all("tr")[1:]  # Skip the heading row
            if data_rows:
                first_data_cells = data_rows[0].find_all("td")
                # zip() stops at the shorter sequence, so a heading with no
                # matching cell in the first data row yields no column.
                for th, _td in zip(heading_cells, first_data_cells):
                    # NOTE(review): the column identifier format is a
                    # reconstruction -- confirm against the original file.
                    columns.append(f"{table_control}-{th.get_text(strip=True)}")

            records.append(
                {
                    "Type": "Table",
                    "TableControl": table_control,
                    "TopHeadings": top_headings,
                    "Columns": columns,
                }
            )

    return records, header_text, sub_header_text
def insert_spaces_before_caps(text):
    """Insert spaces before each capital letter in a given string.

    A capital at the very start of the string gets no leading space. Every
    other ASCII capital does, including consecutive ones, so ``"HTMLPage"``
    becomes ``"H T M L Page"``.

    Args:
        text: The string to expand, e.g. a CamelCase package name.

    Returns:
        ``text`` with a single space inserted before each ``A-Z`` character
        that is not the first character.
    """
    # Zero-width match: "not at start of string" + "next char is a capital".
    return re.sub(r"(?<!^)(?=[A-Z])", " ", text)
def save_to_json5(data, output_filename, package_name, header, sub_header):
    """Save extracted data to a JSON5 file with a specific structure.

    Args:
        data: Iterable of record dicts, each with a ``"Type"`` key.
            ``"Paragraph"``/``"Preformatted"`` records need ``"Text"``;
            ``"Table"`` records need ``"TableControl"``, ``"TopHeadings"``
            and ``"Columns"``; ``"Header"``/``"SubHeader"`` records are
            skipped; any other type is treated as a form control.
        output_filename: Path of the file to write (UTF-8, overwritten).
        package_name: Package name; also used to derive the prefix, the menu
            description and the signal event name.
        header: Panel header text; a default is used when falsy.
        sub_header: Panel sub-header text; a default is used when falsy.

    Raises:
        KeyError: If a record lacks a key required for its type.
        OSError: If the output file cannot be written.
    """
    # Generate prefix from uppercase letters in PackageName made into lowercase
    prefix = "".join(re.findall(r"[A-Z]", package_name)).lower()

    # The single panel entry: fixed fields first, then one numbered key per
    # extracted record, in document order.
    panel = {
        "Name": "params",
        "route": "PARAMS",
        "Header": header if header else f"{package_name} Contrib",
        "SubHeader": sub_header
        if sub_header
        else f"Manage {package_name} settings:",
    }

    paragraph_count = 1
    preformatted_count = 1
    input_count = 1
    table_count = 1
    for record in data:
        record_type = record["Type"]
        if record_type == "Paragraph":
            panel[f"Paragraph{paragraph_count}"] = record["Text"]
            paragraph_count += 1
        elif record_type == "Preformatted":
            panel[f"Preformatted{preformatted_count}"] = record["Text"]
            preformatted_count += 1
        elif record_type == "Header" or record_type == "SubHeader":
            continue  # Skip headers for input count
        elif record_type == "Table":
            # Construct the table entry
            panel[f"Table{table_count}"] = {
                "Type": record_type,
                "TableControl": record["TableControl"],
                "TopHeadings": record["TopHeadings"],
                "Columns": record["Columns"],
            }
            table_count += 1
        else:  # For inputs, selects, textareas, and buttons
            input_structure = {
                "Type": record_type,
                "Value": record.get("Value", ""),  # Safely access Value
                # .get() yields None for a missing Name/Label instead of
                # raising KeyError.
                "Name": record.get("Name", None),
                "Label": record.get("Label", None),
            }
            # Handle specific case for Select options
            if "Options" in record:
                input_structure["Options"] = record["Options"]
            panel[f"Input{input_count}"] = input_structure
            input_count += 1

    # Wrap the records with the required fields
    json5_data = {
        "PackageName": package_name,
        "prefix": prefix,
        "MenuHeading": "Miscellaneous",
        "MenuDescription": insert_spaces_before_caps(package_name),
        "MenuNavigation": "2000 400",
        "firstPanel": "PARAMS",
        "signalEvent": f"smeserver-{package_name.lower()}-update",
        "html": [panel],
    }

    # Serialize as JSON, then swap every double quote for a single quote to
    # get JSON5-style single-quoted strings.
    # NOTE(review): an apostrophe already present in a value is left
    # unescaped by this swap and would end the single-quoted string early.
    content = json.dumps(json5_data, ensure_ascii=False, indent=4).replace('"', "'")
    with open(output_filename, "w", encoding="utf-8") as json_file:
        json_file.write(content)
def main():
    """Convert one HTML file to a JSON5 file.

    Reads the file named on the command line from the fixed ``html``
    directory, validates and parses it, and writes the result to the sibling
    ``json5`` directory under the same base name with a ``.json5`` extension.
    The base name (without extension) doubles as the package name.
    """
    # command line parameters
    parser = argparse.ArgumentParser(description="sm1--html-2-jsopn5")
    # NOTE(review): the option names are reconstructed from the later use of
    # ``args.filename``; the original default (if any) is unknown, so the
    # option is required rather than given a made-up default.
    parser.add_argument(
        "-f",
        "--filename",
        required=True,
        help="Specify a filename for the html file",
    )
    args = parser.parse_args()

    input_file = "/home/brianr/clients/SM2/SM2Gen/venv/html/" + args.filename
    if not input_file.lower().endswith(".html"):
        # Add .html extension
        input_file += ".html"

    # Read HTML content
    html_content = read_html_file(input_file)
    # Validate the HTML before extracting data
    validate_html(html_content)
    # Extract data from HTML
    data, header, sub_header = extract_data(html_content)

    # Split the original path into directory and file name
    directory, filename = os.path.split(input_file)
    # Replace 'html' with 'json5' in the directory path
    new_directory = directory.replace('/html', '/json5')
    # Use the filename without extension as the package name
    package_name = os.path.splitext(filename)[0]
    # Swap only the real extension: splitext also handles an upper-case
    # ".HTML" and leaves any ".html" in the middle of the name untouched.
    output_file = os.path.join(new_directory, package_name + ".json5")

    # Save extracted data to JSON5
    save_to_json5(data, output_file, package_name, header, sub_header)
    print(f"Extracted data saved to '{output_file}'.")
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()