2024-09-12 19:54:38 +02:00
|
|
|
import json
|
|
|
|
import re
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from lxml import etree # Import lxml for HTML validation
|
2024-09-15 13:06:34 +02:00
|
|
|
import html
|
|
|
|
import argparse
|
2024-09-18 12:46:26 +02:00
|
|
|
import pkg_resources
|
|
|
|
import sys
|
|
|
|
import traceback
|
|
|
|
import os
|
|
|
|
from datetime import datetime, timedelta
|
|
|
|
|
|
|
|
|
|
|
|
sm1_html_2_json5_version = "0.5"  # Script version, embedded in the generated JSON5 banner.
|
|
|
|
|
|
|
|
def assemble_version_string():
    """Build the one-line version banner for this script.

    Combines the script version, the installed Chameleon package version
    (or a placeholder when Chameleon is not installed), the running
    Python version, and the current timestamp.
    """
    try:
        chameleon_version = pkg_resources.get_distribution("Chameleon").version
    except pkg_resources.DistributionNotFound:
        chameleon_version = "No version information"

    # sys.version includes build/compiler details; keep only "X.Y.Z".
    match = re.search(r"(\d{1,3}\.\d{1,3}\.\d{1,3})", sys.version)
    python_version = match.group(0) if match else "Unknown"

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    return (
        f"sm1-html-2-json5 version:{sm1_html_2_json5_version}"
        f" Chameleon version:{chameleon_version}"
        f" On Python:{python_version}"
        f" at {timestamp}"
    )
|
2024-09-15 13:06:34 +02:00
|
|
|
|
2024-09-18 12:46:26 +02:00
|
|
|
def check_file_version(filename, ThresholdSecs=3):
    """Return *filename*, or filename + ".new" when the file was hand-edited.

    Generated files carry a banner line of the form
    "// Generated by ... at YYYY-MM-DD HH:MM:SS" in their first few lines.
    If the file's modification time is later than that timestamp plus
    *ThresholdSecs* seconds, the file was modified after generation, so a
    ".new" suffix is returned to avoid clobbering the edits.

    On any error (missing file, unparsable or absent timestamp) the
    original filename is returned unchanged.
    """
    try:
        with open(filename, 'r') as file:
            # The banner is expected within the first five lines.
            header_lines = [file.readline().strip() for _ in range(5)]

        # Extract the generation timestamp from the banner line.
        timestamp_str = None
        for line in header_lines:
            if ' at ' in line:
                # The timestamp is everything after the last ' at '.
                # (The old split('at')[2] only worked because "Generated"
                # happens to contain "at".)
                timestamp_str = line.rsplit(' at ', 1)[1].strip()
                break

        if timestamp_str is None:
            print("Warning: No timestamp found. Returning original filename.")
            return filename  # No banner timestamp: keep the original name.

        # Generation time plus the allowed grace period.
        file_timestamp = datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S')
        file_timestamp += timedelta(seconds=ThresholdSecs)

        # Last-modified time, truncated to whole seconds to match the
        # one-second resolution of the banner timestamp.
        file_modified_time = datetime.fromtimestamp(
            os.path.getmtime(filename)
        ).replace(microsecond=0)

        if file_modified_time > file_timestamp:
            # Fix: the original returned the literal string "(unknown).new"
            # (a lost f-string placeholder) instead of the real filename.
            return filename + ".new"
        return filename
    except FileNotFoundError:
        print(f"Error: The file '{filename}' does not exist.")
        return filename
    except Exception:
        print(f"An error occurred: {traceback.format_exc()}")
        return filename
|
|
|
|
|
2024-09-12 19:54:38 +02:00
|
|
|
|
|
|
|
def read_html_file(filename):
    """Return the entire text of the HTML file at *filename* (UTF-8)."""
    with open(filename, "r", encoding="utf-8") as handle:
        content = handle.read()
    return content
|
|
|
|
|
2024-09-15 13:06:34 +02:00
|
|
|
|
2024-09-12 19:54:38 +02:00
|
|
|
def validate_html(html):
    """Raise ValueError if *html* cannot be parsed as an HTML document.

    NOTE: the parameter name shadows the stdlib ``html`` module imported
    at file top; kept for interface compatibility.
    """
    try:
        # lxml's forgiving HTML parser; raises on hopeless input.
        etree.fromstring(html, etree.HTMLParser())
    except Exception as exc:
        raise ValueError("Invalid HTML document") from exc
|
2024-09-15 13:06:34 +02:00
|
|
|
|
2024-09-18 12:46:26 +02:00
|
|
|
|
2024-09-16 15:01:17 +02:00
|
|
|
def convert_double_quotes_to_span(text):
    """Wrap each double-quoted segment of *text* in an emphasis <span>.

    Every "..." segment becomes <span class=emphasis-para>...</span>; the
    quote characters themselves are dropped.  (The regex matches double
    quotes, not single quotes as the old docstring claimed.)
    """
    # Non-greedy match so adjacent quoted segments are handled separately.
    pattern = r'"(.*?)"'
    replacement = r"<span class=emphasis-para>\1</span>"
    return re.sub(pattern, replacement, text)
|
|
|
|
|
|
|
|
|
|
|
|
# def sanitize_text(text):
|
2024-09-18 12:46:26 +02:00
|
|
|
# # Replace newlines with spaces
|
|
|
|
# print(f"--{text}--")
|
|
|
|
# decoded_text = html.unescape(text)
|
|
|
|
|
|
|
|
# sanitized_text = decoded_text.replace("\n", "").replace(
|
|
|
|
# "\r", " "
|
|
|
|
# ) # Handle both Unix and Windows line endings
|
|
|
|
# # Replace tabs with spaces
|
|
|
|
# sanitized_text = sanitized_text.replace("\t", "")
|
|
|
|
# # map single quotes to double
|
|
|
|
# # sanitized_text = sanitized_text.replace("'", '"')
|
|
|
|
# #Map signle and double quotes to nothing
|
|
|
|
# sanitized_text.replace("'","").replace('"','')
|
|
|
|
# #Take out any multiple spaces - reduce to one.
|
|
|
|
# sanitized_text = ' '.join(sanitized_text.split())
|
|
|
|
# # Strip leading and trailing whitespace
|
|
|
|
# sanitized_text = sanitized_text.strip()
|
|
|
|
# #sanitized_text = convert_double_quotes_to_span(sanitized_text)
|
|
|
|
# print(f"++{sanitized_text}++")
|
|
|
|
# return sanitized_text
|
|
|
|
|
|
|
|
|
2024-09-14 19:08:48 +02:00
|
|
|
def sanitize_text(text):
    """Normalise extracted HTML text into a single clean line.

    Decodes HTML entities, converts newlines, carriage returns and tabs
    to spaces, removes single and double quote characters, and collapses
    runs of whitespace into single spaces (which also trims the ends).
    Returns "" for whitespace-only input.
    """
    # Decode HTML entities (&amp; -> &, etc.).
    decoded = html.unescape(text)
    # One C-level pass: line breaks and tabs become spaces, quotes are
    # deleted.  (Replaces four chained .replace() calls and two debug
    # prints that leaked every paragraph to stdout.)
    cleaned = decoded.translate(
        str.maketrans({"\n": " ", "\r": " ", "\t": " ", '"': None, "'": None})
    )
    # Collapse whitespace runs; split()/join also strips leading/trailing.
    return " ".join(cleaned.split())
|
|
|
|
|
2024-09-12 19:54:38 +02:00
|
|
|
|
|
|
|
def extract_data(html):
    """Extract paragraphs, inputs, tables, links and pre blocks from HTML.

    Walks the document in order and returns a tuple
    ``(records, header_text, sub_header_text)`` where *records* is a list
    of dicts (one per recognised element, preserving document order) and
    the header texts come from the last <h1>/<h2> seen (None when absent).
    """
    soup = BeautifulSoup(html, "lxml")
    records = []

    # Hidden bookkeeping inputs that must not appear in the output.
    hidden_input_names = ["page", "page_stack", ".id", "csrf_token"]

    header_text = None
    sub_header_text = None

    # Counter used to build TableControl names ("Table1", "Table2", ...).
    table_counter = 0

    # Extract elements while preserving document order.
    for element in soup.find_all(
        ["h1", "h2", "p", "pre", "input", "select", "textarea", "button", "table", "a"]
    ):
        if element.name == "h1":
            header_text = element.get_text(strip=True)
            records.append({"Type": "Header", "Text": header_text})

        elif element.name == "h2":
            sub_header_text = element.get_text(strip=True)
            records.append({"Type": "SubHeader", "Text": sub_header_text})

        elif element.name == "p":
            text = element.get_text(strip=True)
            if text:  # Ignore empty paragraphs
                # Sanitise: strip newlines/tabs and quote characters.
                sanitised_text = sanitize_text(text)
                if sanitised_text == "":
                    continue
                records.append({"Type": "Paragraph", "Text": sanitised_text})

        elif element.name == "pre":
            # Preformatted text is kept verbatim (not sanitised).
            text = element.get_text(strip=True)
            if text:
                records.append({"Type": "Preformatted", "Text": text})

        elif element.name == "a":
            title = element.get_text(strip=True)
            href = element.get("href")
            records.append({"Type": "Link", "href": href, "title": title})

        elif element.name == "input":
            # Skip hidden inputs and known bookkeeping fields.
            if (
                element.get("type") == "hidden"
                or element.get("name") in hidden_input_names
            ):
                continue

            input_info = {
                "Type": element.get("type", "text").capitalize(),
                "Name": element.get("name"),
                "Value": element.get("value", ""),
            }
            label = element.find_next("label")
            input_info["Label"] = label.get_text(strip=True) if label else None
            records.append(input_info)

        elif element.name == "select":
            options = [
                {"Value": option.get("value"), "Text": option.get_text(strip=True)}
                for option in element.find_all("option")
            ]
            label = element.find_previous("label")
            select_info = {
                "Type": "Select",
                "Name": element.get("name"),
                "Options": options,
                "Label": label.get_text(strip=True) if label else None,
            }
            records.append(select_info)

        elif element.name == "textarea":
            textarea_info = {
                "Type": "Textarea",
                "Name": element.get("name"),
                "Value": element.get_text(strip=True),
            }
            label = element.find_previous("label")
            textarea_info["Label"] = label.get_text(strip=True) if label else None
            records.append(textarea_info)

        elif element.name == "button":
            # BUG FIX: the original guarded on a stale ``label`` variable
            # left over from a previous loop iteration (NameError when a
            # button is the first labelled element, wrong label otherwise).
            # Look up this button's own preceding label instead.
            label = element.find_previous("label")
            button_info = {
                "Type": "Button",
                "Name": element.get("name"),
                "Value": element.get_text(strip=True),
                "Label": label.get_text(strip=True) if label else None,
            }
            records.append(button_info)

        elif element.name == "table" and "sme-border" in element.get("class", []):
            table_counter += 1
            table_control = f"Table{table_counter}"  # e.g. "Table1", "Table2"
            top_headings = []
            columns = []

            # Extract headings from the first row.
            first_row = element.find("tr")
            if first_row:
                for th in first_row.find_all("th"):
                    top_headings.append(th.get_text(strip=True))

            # Only the first data row is used to derive the Columns entries.
            data_rows = element.find_all("tr")[1:]  # Skip the heading row
            if data_rows:
                first_data_row = data_rows[0]
                cells = first_data_row.find_all("td")  # hoisted out of the loop
                for idx, th in enumerate(first_row.find_all("th")):
                    if idx < len(cells):
                        columns.append(f"{table_control}-{th.get_text(strip=True)}")

            records.append(
                {
                    "Type": "Table",
                    "TableControl": table_control,
                    "TopHeadings": top_headings,
                    "Columns": columns,
                }
            )

    return records, header_text, sub_header_text
|
2024-09-12 19:54:38 +02:00
|
|
|
|
2024-09-15 13:06:34 +02:00
|
|
|
|
2024-09-12 19:54:38 +02:00
|
|
|
def insert_spaces_before_caps(text):
    """Insert a space before every capital letter except at position 0."""
    # Split at zero-width positions before each capital; joining the
    # non-empty pieces re-inserts exactly one space at each boundary.
    pieces = re.split(r"(?=[A-Z])", text)
    return " ".join(piece for piece in pieces if piece)
|
|
|
|
|
2024-09-12 19:54:38 +02:00
|
|
|
|
2024-09-18 12:46:26 +02:00
|
|
|
def save_to_json5(data, output_filename, package_name, header, sub_header, strVersion):
    """Save extracted data to a JSON5 file with a specific structure.

    Args:
        data: record dicts produced by extract_data(), in document order.
        output_filename: path of the JSON5 file to write.
        package_name: package/contrib name (drives prefix and menu text).
        header: page heading, or None for a default built from package_name.
        sub_header: sub-heading, or None for a default.
        strVersion: version banner embedded in the generated header comment.
    """
    # Prefix = the uppercase letters of PackageName, lowercased.
    prefix = "".join(re.findall(r"[A-Z]", package_name)).lower()

    # Number each element kind independently (Paragraph1, Input1, Table1, ...).
    structured_html = []
    paragraph_count = 1
    preformatted_count = 1
    input_count = 1
    table_count = 1
    link_count = 1

    for record in data:
        if record["Type"] == "Paragraph":
            structured_html.append({f"Paragraph{paragraph_count}": record["Text"]})
            paragraph_count += 1
        elif record["Type"] == "Preformatted":
            structured_html.append(
                {f"Preformatted{preformatted_count}": record["Text"]}
            )
            preformatted_count += 1
        elif record["Type"] == "Link":
            link_structure = {
                "Type": record["Type"],
                "href": record["href"],
                "title": record["title"],
            }
            structured_html.append({f"Link{link_count}": link_structure})
            link_count += 1
        elif record["Type"] in ("Header", "SubHeader"):
            continue  # Headings are emitted as Header/SubHeader fields below.
        elif record["Type"] == "Table":
            table_structure = {
                "Type": record["Type"],
                "TableControl": record["TableControl"],
                "TopHeadings": record["TopHeadings"],
                "Columns": record["Columns"],
            }
            structured_html.append({f"Table{table_count}": table_structure})
            table_count += 1
        else:  # Inputs, selects, textareas and buttons.
            input_structure = {
                "Type": record["Type"],
                "Value": record.get("Value", ""),  # Safely access Value
                "Name": record.get("Name"),        # None when absent
                "Label": record.get("Label"),      # None when absent
            }
            # Select elements carry their option list through unchanged.
            if "Options" in record:
                input_structure["Options"] = record["Options"]
            structured_html.append({f"Input{input_count}": input_structure})
            input_count += 1

    # Wrap the records with the required top-level fields.
    json5_data = {
        "PackageName": package_name,
        "prefix": prefix,
        "MenuHeading": "Miscellaneous",
        "MenuDescription": insert_spaces_before_caps(package_name),
        "MenuNavigation": "2000 400",
        "firstPanel": "PARAMS",
        "signalEvent": f"smeserver-{package_name.lower()}-update",
        "html": [
            {
                "Name": "params",
                "route": "PARAMS",
                "Header": header if header else f"{package_name} Contrib",
                "SubHeader": sub_header
                if sub_header
                else f"Manage {package_name} settings:",
                # Flatten the structured_html entries into this dict.
                **{k: v for item in structured_html for k, v in item.items()},
            }
        ],
    }

    # Render once in memory: JSON body, generation banner on top, then the
    # blanket double->single quote swap that gives the JSON5 house style.
    # (Values are quote-free by construction — sanitize_text strips quotes —
    # so the global replace cannot corrupt the data.)  This replaces the
    # previous dump-to-file / reopen / read / seek / truncate round trip.
    body = json.dumps(json5_data, ensure_ascii=False, indent=4)
    content = f"//\n// Generated by {strVersion}\n//\n{body}".replace('"', "'")

    with open(output_filename, "w", encoding="utf-8") as json_file:
        json_file.write(content)
|
2024-09-12 19:54:38 +02:00
|
|
|
|
|
|
|
|
2024-09-15 13:06:34 +02:00
|
|
|
def main():
    """Command-line entry point: convert an HTML page into a JSON5 file.

    Reads and validates the HTML file named by --filename (in the
    directory named by --directory), extracts its elements, and writes a
    JSON5 description into the sibling "json5" directory.
    """
    strVersion = assemble_version_string()

    # Command line parameters.  The html directory was previously a
    # hardcoded user path; it is now an option with the same default.
    parser = argparse.ArgumentParser(description="sm1-html-2-json5")
    parser.add_argument(
        "-f",
        "--filename",
        help="Specify a filename for the html file",
        default="CreateStarterWebsite.html",
    )
    parser.add_argument(
        "-d",
        "--directory",
        help="Directory containing the html file",
        default="/home/brianr/clients/SM2/SM2Gen/venv/html/",
    )
    args = parser.parse_args()

    input_file = os.path.join(args.directory, args.filename)
    if not input_file.lower().endswith(".html"):
        input_file += ".html"  # Allow the extension to be omitted.
    print(input_file)

    # Read the HTML content.
    html_content = read_html_file(input_file)

    # Validate the HTML before extracting data.
    validate_html(html_content)

    # Extract data from HTML.
    data, header, sub_header = extract_data(html_content)

    # Output path: same file name with .json5, in the sibling json5
    # directory; check_file_version appends ".new" if the existing output
    # was hand-edited since generation.
    directory, filename = os.path.split(input_file)
    new_directory = directory.replace("/html", "/json5")
    output_file = check_file_version(
        os.path.join(new_directory, filename.replace(".html", ".json5"))
    )
    print(output_file)

    # PackageName is the input file name without its extension.
    base_name = os.path.basename(input_file)
    package_name = os.path.splitext(base_name)[0]

    # Save extracted data to JSON5.
    save_to_json5(data, output_file, package_name, header, sub_header, strVersion)
    print(f"Extracted data saved to '{output_file}'.")
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the HTML -> JSON5 conversion.
    main()
|