Add in entries for other mojo input tags
This commit is contained in:
@@ -3,12 +3,16 @@ import os
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from lxml import etree # Import lxml for HTML validation
|
||||
import html
|
||||
import argparse
|
||||
|
||||
|
||||
def read_html_file(filename):
    """Read HTML content from a file.

    Args:
        filename: Path of the HTML file to read.

    Returns:
        The complete file content as a string, decoded as UTF-8.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # A single context manager is enough; the handle is closed on return.
    with open(filename, "r", encoding="utf-8") as file:
        return file.read()
|
||||
|
||||
|
||||
def validate_html(html):
|
||||
"""Validate the HTML content."""
|
||||
try:
|
||||
@@ -16,14 +20,20 @@ def validate_html(html):
|
||||
etree.fromstring(html, parser) # Attempt to parse the HTML
|
||||
except Exception as e:
|
||||
raise ValueError("Invalid HTML document") from e
|
||||
|
||||
|
||||
|
||||
def sanitize_text(text):
    """Flatten a fragment of HTML text into one clean line.

    The text is HTML-unescaped, single quotes are mapped to double quotes,
    and every run of whitespace (newlines, carriage returns, tabs, repeated
    spaces) is collapsed to a single space. Leading and trailing whitespace
    is removed.

    Args:
        text: Raw text, possibly containing HTML character references.

    Returns:
        The sanitized single-line string.
    """
    # Decode character references such as &amp; before any other processing.
    decoded_text = html.unescape(text)
    # Map single quotes to double
    requoted_text = decoded_text.replace("'", '"')
    # split() with no argument breaks on any whitespace run and drops empty
    # leading/trailing pieces, so joining with one space both collapses
    # newlines/tabs/multiple spaces and strips the ends. Line breaks and tabs
    # become a space rather than being deleted, so adjacent words are never
    # fused together.
    return " ".join(requoted_text.split())
|
||||
@@ -31,15 +41,10 @@ def sanitize_text(text):
|
||||
|
||||
def extract_data(html):
|
||||
"""Extract paragraphs, inputs, tables, and pre blocks from HTML and organize them in order."""
|
||||
soup = BeautifulSoup(html, 'lxml')
|
||||
soup = BeautifulSoup(html, "lxml")
|
||||
records = []
|
||||
|
||||
hidden_input_names = [
|
||||
'page',
|
||||
'page_stack',
|
||||
'.id',
|
||||
'csrf_token'
|
||||
]
|
||||
hidden_input_names = ["page", "page_stack", ".id", "csrf_token"]
|
||||
|
||||
header_text = None
|
||||
sub_header_text = None
|
||||
@@ -48,81 +53,81 @@ def extract_data(html):
|
||||
table_counter = 0
|
||||
|
||||
# Extract elements while preserving order
|
||||
for element in soup.find_all(['h1', 'h2', 'p', 'pre', 'input', 'select', 'textarea', 'button', 'table']):
|
||||
if element.name == 'h1':
|
||||
for element in soup.find_all(
|
||||
["h1", "h2", "p", "pre", "input", "select", "textarea", "button", "table"]
|
||||
):
|
||||
if element.name == "h1":
|
||||
header_text = element.get_text(strip=True)
|
||||
records.append({
|
||||
'Type': 'Header',
|
||||
'Text': header_text
|
||||
})
|
||||
records.append({"Type": "Header", "Text": header_text})
|
||||
|
||||
elif element.name == 'h2':
|
||||
elif element.name == "h2":
|
||||
sub_header_text = element.get_text(strip=True)
|
||||
records.append({
|
||||
'Type': 'SubHeader',
|
||||
'Text': sub_header_text
|
||||
})
|
||||
records.append({"Type": "SubHeader", "Text": sub_header_text})
|
||||
|
||||
elif element.name == 'p':
|
||||
elif element.name == "p":
|
||||
text = element.get_text(strip=True)
|
||||
if text: # Ignore empty paragraphs
|
||||
#Sanitise text freom newlines,tabs and escape quotes.
|
||||
records.append({
|
||||
'Type': 'Paragraph',
|
||||
'Text': sanitize_text(text)
|
||||
})
|
||||
# Sanitise text freom newlines,tabs and escape quotes.
|
||||
records.append({"Type": "Paragraph", "Text": sanitize_text(text)})
|
||||
|
||||
elif element.name == 'pre':
|
||||
elif element.name == "pre":
|
||||
text = element.get_text(strip=True)
|
||||
if text: # Ensure non-empty before adding
|
||||
records.append({
|
||||
'Type': 'Preformatted',
|
||||
'Text': text
|
||||
})
|
||||
records.append({"Type": "Preformatted", "Text": text})
|
||||
|
||||
elif element.name == 'input':
|
||||
if element.get('type') == 'hidden' or element.get('name') in hidden_input_names:
|
||||
elif element.name == "input":
|
||||
if (
|
||||
element.get("type") == "hidden"
|
||||
or element.get("name") in hidden_input_names
|
||||
):
|
||||
continue
|
||||
|
||||
|
||||
input_info = {
|
||||
'Type': element.get('type', 'text').capitalize(),
|
||||
'Name': element.get('name'),
|
||||
'Value': element.get('value', ''),
|
||||
"Type": element.get("type", "text").capitalize(),
|
||||
"Name": element.get("name"),
|
||||
"Value": element.get("value", ""),
|
||||
}
|
||||
label = element.find_next('label')
|
||||
input_info['Label'] = label.get_text(strip=True) if label else None
|
||||
label = element.find_next("label")
|
||||
input_info["Label"] = label.get_text(strip=True) if label else None
|
||||
records.append(input_info)
|
||||
|
||||
elif element.name == 'select':
|
||||
options = [{'Value': option.get('value'), 'Text': option.get_text(strip=True)} for option in element.find_all('option')]
|
||||
elif element.name == "select":
|
||||
options = [
|
||||
{"Value": option.get("value"), "Text": option.get_text(strip=True)}
|
||||
for option in element.find_all("option")
|
||||
]
|
||||
select_info = {
|
||||
'Type': 'Select',
|
||||
'Name': element.get('name'),
|
||||
'Options': options,
|
||||
'Label': element.find_previous('label').get_text(strip=True) if element.find_previous('label') else None,
|
||||
"Type": "Select",
|
||||
"Name": element.get("name"),
|
||||
"Options": options,
|
||||
"Label": element.find_previous("label").get_text(strip=True)
|
||||
if element.find_previous("label")
|
||||
else None,
|
||||
}
|
||||
records.append(select_info)
|
||||
|
||||
elif element.name == 'textarea':
|
||||
elif element.name == "textarea":
|
||||
textarea_info = {
|
||||
'Type': 'Textarea',
|
||||
'Name': element.get('name'),
|
||||
'Value': element.get_text(strip=True),
|
||||
"Type": "Textarea",
|
||||
"Name": element.get("name"),
|
||||
"Value": element.get_text(strip=True),
|
||||
}
|
||||
label = element.find_previous('label')
|
||||
textarea_info['Label'] = label.get_text(strip=True) if label else None
|
||||
label = element.find_previous("label")
|
||||
textarea_info["Label"] = label.get_text(strip=True) if label else None
|
||||
records.append(textarea_info)
|
||||
|
||||
elif element.name == 'button':
|
||||
elif element.name == "button":
|
||||
button_info = {
|
||||
'Type': 'Button',
|
||||
'Name': element.get('name'),
|
||||
'Value': element.get_text(strip=True),
|
||||
'Label': element.find_previous('label').get_text(strip=True) if label else None,
|
||||
"Type": "Button",
|
||||
"Name": element.get("name"),
|
||||
"Value": element.get_text(strip=True),
|
||||
"Label": element.find_previous("label").get_text(strip=True)
|
||||
if label
|
||||
else None,
|
||||
}
|
||||
records.append(button_info)
|
||||
|
||||
elif element.name == 'table' and 'sme-border' in element.get('class', []):
|
||||
elif element.name == "table" and "sme-border" in element.get("class", []):
|
||||
# Increment the table counter
|
||||
table_counter += 1
|
||||
|
||||
@@ -132,37 +137,47 @@ def extract_data(html):
|
||||
columns = []
|
||||
|
||||
# Extract headings from the first row
|
||||
first_row = element.find('tr')
|
||||
first_row = element.find("tr")
|
||||
if first_row:
|
||||
for th in first_row.find_all('th'):
|
||||
for th in first_row.find_all("th"):
|
||||
top_headings.append(th.get_text(strip=True))
|
||||
|
||||
# Extract only the first data row's cell values for Columns
|
||||
data_rows = element.find_all('tr')[1:] # Skip the heading row
|
||||
data_rows = element.find_all("tr")[1:] # Skip the heading row
|
||||
if data_rows:
|
||||
first_data_row = data_rows[0] # Take the first row of data
|
||||
for idx, th in enumerate(first_row.find_all('th')):
|
||||
td = first_data_row.find_all('td')[idx] if idx < len(first_data_row.find_all('td')) else None
|
||||
for idx, th in enumerate(first_row.find_all("th")):
|
||||
td = (
|
||||
first_data_row.find_all("td")[idx]
|
||||
if idx < len(first_data_row.find_all("td"))
|
||||
else None
|
||||
)
|
||||
if td:
|
||||
columns.append(f"{table_control}-{th.get_text(strip=True)}") # Format as desired
|
||||
columns.append(
|
||||
f"{table_control}-{th.get_text(strip=True)}"
|
||||
) # Format as desired
|
||||
|
||||
records.append({
|
||||
'Type': 'Table',
|
||||
'TableControl': table_control,
|
||||
'TopHeadings': top_headings,
|
||||
'Columns': columns,
|
||||
})
|
||||
records.append(
|
||||
{
|
||||
"Type": "Table",
|
||||
"TableControl": table_control,
|
||||
"TopHeadings": top_headings,
|
||||
"Columns": columns,
|
||||
}
|
||||
)
|
||||
|
||||
return records, header_text, sub_header_text
|
||||
|
||||
|
||||
def insert_spaces_before_caps(text):
    """Insert a space before each capital letter in a given string.

    A capital at the very start of the string gets no leading space, so
    ``"CreateStarterWebsite"`` becomes ``"Create Starter Website"``.

    Args:
        text: The string to split at ASCII capital letters.

    Returns:
        The string with a space inserted before every ``A``-``Z`` that is
        not the first character.
    """
    # Zero-width match: any position followed by a capital, except position 0.
    return re.sub(r"(?<!^)(?=[A-Z])", " ", text)
|
||||
|
||||
|
||||
def save_to_json5(data, output_filename, package_name, header, sub_header):
|
||||
"""Save extracted data to a JSON5 file with a specific structure."""
|
||||
# Generate prefix from uppercase letters in PackageName made into lowercase
|
||||
prefix = ''.join(re.findall(r'[A-Z]', package_name)).lower()
|
||||
prefix = "".join(re.findall(r"[A-Z]", package_name)).lower()
|
||||
|
||||
# Prepare structured html list
|
||||
structured_html = []
|
||||
@@ -172,103 +187,131 @@ def save_to_json5(data, output_filename, package_name, header, sub_header):
|
||||
table_count = 1
|
||||
|
||||
for record in data:
|
||||
if record['Type'] == 'Paragraph':
|
||||
structured_html.append({
|
||||
f'Paragraph{paragraph_count}': record['Text']
|
||||
})
|
||||
if record["Type"] == "Paragraph":
|
||||
structured_html.append({f"Paragraph{paragraph_count}": record["Text"]})
|
||||
paragraph_count += 1
|
||||
elif record['Type'] == 'Preformatted':
|
||||
structured_html.append({
|
||||
f'Preformatted{preformatted_count}': record['Text']
|
||||
})
|
||||
elif record["Type"] == "Preformatted":
|
||||
structured_html.append(
|
||||
{f"Preformatted{preformatted_count}": record["Text"]}
|
||||
)
|
||||
preformatted_count += 1
|
||||
elif record['Type'] == 'Header' or record['Type'] == 'SubHeader':
|
||||
elif record["Type"] == "Header" or record["Type"] == "SubHeader":
|
||||
continue # Skip headers for input count
|
||||
elif record['Type'] == 'Table':
|
||||
elif record["Type"] == "Table":
|
||||
# Construct the table entry
|
||||
table_structure = {
|
||||
'Type': record['Type'],
|
||||
'TableControl': record['TableControl'],
|
||||
'TopHeadings': record['TopHeadings'],
|
||||
'Columns': record['Columns']
|
||||
"Type": record["Type"],
|
||||
"TableControl": record["TableControl"],
|
||||
"TopHeadings": record["TopHeadings"],
|
||||
"Columns": record["Columns"],
|
||||
}
|
||||
structured_html.append({
|
||||
f'Table{table_count}': table_structure
|
||||
})
|
||||
structured_html.append({f"Table{table_count}": table_structure})
|
||||
table_count += 1
|
||||
else: # For inputs, selects, textareas, and buttons
|
||||
input_structure = {
|
||||
'Type': record['Type'],
|
||||
'Value': record.get('Value', ''), # Safely access Value
|
||||
"Type": record["Type"],
|
||||
"Value": record.get("Value", ""), # Safely access Value
|
||||
}
|
||||
|
||||
# Use .get() for the Name key to avoid KeyError
|
||||
input_structure['Name'] = record.get('Name', None) # Set to None if not present
|
||||
input_structure['Label'] = record.get('Label', None) # Set to None if not present
|
||||
input_structure["Name"] = record.get(
|
||||
"Name", None
|
||||
) # Set to None if not present
|
||||
input_structure["Label"] = record.get(
|
||||
"Label", None
|
||||
) # Set to None if not present
|
||||
|
||||
# Handle specific case for Select options
|
||||
if 'Options' in record:
|
||||
input_structure['Options'] = record['Options']
|
||||
if "Options" in record:
|
||||
input_structure["Options"] = record["Options"]
|
||||
|
||||
structured_html.append({
|
||||
f'Input{input_count}': input_structure
|
||||
})
|
||||
structured_html.append({f"Input{input_count}": input_structure})
|
||||
input_count += 1
|
||||
|
||||
# Wrap the records with the required fields
|
||||
json5_data = {
|
||||
'PackageName': package_name,
|
||||
'prefix': prefix,
|
||||
'MenuHeading': 'Miscellaneous',
|
||||
'MenuDescription': insert_spaces_before_caps(package_name),
|
||||
'MenuNavigation': '2000 400',
|
||||
'firstPanel': 'PARAMS',
|
||||
'signalEvent': f'smeserver-{package_name.lower()}-update',
|
||||
'html': {
|
||||
'Name': 'params',
|
||||
'route': 'PARAMS',
|
||||
'Header': header if header else f'{package_name} Contrib',
|
||||
'SubHeader': sub_header if sub_header else f'Manage {package_name} settings:',
|
||||
**{k: v for item in structured_html for k, v in item.items()} # Flatten the structured_html into the dict
|
||||
}
|
||||
"PackageName": package_name,
|
||||
"prefix": prefix,
|
||||
"MenuHeading": "Miscellaneous",
|
||||
"MenuDescription": insert_spaces_before_caps(package_name),
|
||||
"MenuNavigation": "2000 400",
|
||||
"firstPanel": "PARAMS",
|
||||
"signalEvent": f"smeserver-{package_name.lower()}-update",
|
||||
"html": [{
|
||||
"Name": "params",
|
||||
"route": "PARAMS",
|
||||
"Header": header if header else f"{package_name} Contrib",
|
||||
"SubHeader": sub_header
|
||||
if sub_header
|
||||
else f"Manage {package_name} settings:",
|
||||
**{
|
||||
k: v for item in structured_html for k, v in item.items()
|
||||
}, # Flatten the structured_html into the dict
|
||||
}],
|
||||
}
|
||||
|
||||
# Save in JSON5 format (JSON with comments and unquoted keys)
|
||||
with open(output_filename, 'w', encoding='utf-8') as json_file:
|
||||
with open(output_filename, "w", encoding="utf-8") as json_file:
|
||||
json.dump(json5_data, json_file, ensure_ascii=False, indent=4)
|
||||
|
||||
|
||||
# Manually format as JSON5 by adding single quotes (for simplicity)
|
||||
with open(output_filename, 'r+', encoding='utf-8') as json_file:
|
||||
with open(output_filename, "r+", encoding="utf-8") as json_file:
|
||||
content = json_file.read()
|
||||
content = content.replace('"', "'") # Replace double quotes with single quotes for JSON5
|
||||
content = content.replace(
|
||||
'"', "'"
|
||||
) # Replace double quotes with single quotes for JSON5
|
||||
json_file.seek(0)
|
||||
json_file.write(content)
|
||||
json_file.truncate() # Remove any old content beyond the new content length
|
||||
|
||||
|
||||
def main(argv=None):
    """Convert one HTML file into its JSON5 description.

    Parses the command line, reads and validates the HTML file, extracts its
    records and saves them as JSON5. The output path is the input path with
    ``/html`` replaced by ``/json5`` in the directory part and the file
    extension replaced by ``.json5``.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            lets argparse read ``sys.argv``.
    """
    # command line parameters
    parser = argparse.ArgumentParser(description="sm1--html-2-jsopn5")
    parser.add_argument(
        "-f",
        "--filename",
        help="Specify a filename for the html file",
        default="CreateStarterWebsite.html",
    )
    args = parser.parse_args(argv)
    input_file = "/home/brianr/clients/SM2/SM2Gen/venv/html/" + args.filename
    if not input_file.lower().endswith(".html"):
        # Add .html extension
        input_file += ".html"
    print(input_file)

    # Read HTML content
    html_content = read_html_file(input_file)

    # Validate the HTML before extracting data
    validate_html(html_content)

    # Extract data from HTML
    data, header, sub_header = extract_data(html_content)

    # Split the original path into directory and file name
    directory, filename = os.path.split(input_file)

    # The package name is the file name without its extension. Using
    # splitext (rather than replacing the literal '.html') keeps this in step
    # with the case-insensitive extension check above and only touches the
    # final suffix.
    package_name = os.path.splitext(filename)[0]

    # Replace 'html' with 'json5' in the directory path
    new_directory = directory.replace("/html", "/json5")

    # Construct the new path
    output_file = os.path.join(new_directory, package_name + ".json5")
    print(output_file)

    # Save extracted data to JSON5
    save_to_json5(data, output_file, package_name, header, sub_header)
    print(f"Extracted data saved to '{output_file}'.")


if __name__ == "__main__":
    main()
|
||||
|
Reference in New Issue
Block a user