# SM2Gen/sm1-html-2-json5.py
#
# (source-viewer metadata: 275 lines, 10 KiB, Python)

import json
import os
import re
from bs4 import BeautifulSoup
from lxml import etree # Import lxml for HTML validation
def read_html_file(filename):
    """Return the complete text of the file at *filename*, decoded as UTF-8."""
    with open(filename, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
def validate_html(html):
    """Check that *html* can be parsed by lxml's HTML parser.

    Returns nothing on success.

    Raises:
        ValueError: if building the parser or parsing *html* raises any
            exception; the original exception is chained as the cause.
    """
    try:
        # The parse result is discarded; only success or failure matters.
        etree.fromstring(html, etree.HTMLParser())
    except Exception as parse_error:
        raise ValueError("Invalid HTML document") from parse_error
# Single-pass character map: line breaks and tabs become spaces, and both
# quote characters gain a leading backslash.
_SANITIZE_TABLE = str.maketrans({
    '\n': ' ',
    '\r': ' ',
    '\t': ' ',
    '"': '\\"',
    "'": "\\'",
})


def sanitize_text(text):
    """Flatten whitespace control characters and backslash-escape quotes.

    Newlines, carriage returns and tabs are each replaced by one space,
    double and single quotes are prefixed with a backslash, and the result
    is stripped of leading and trailing whitespace.
    """
    return text.translate(_SANITIZE_TABLE).strip()
# Input names that are skipped even when the input is not type="hidden".
_HIDDEN_INPUT_NAMES = ('page', 'page_stack', '.id', 'csrf_token')

# Tag names collected from the document; find_all() yields them in document order.
_EXTRACTED_TAGS = ['h1', 'h2', 'p', 'pre', 'input', 'select', 'textarea', 'button', 'table']


def _label_text(label):
    """Return the stripped text of *label*, or None when no label was found."""
    return label.get_text(strip=True) if label is not None else None


def _table_record(table, table_control):
    """Build the 'Table' record for *table*, using *table_control* as its id.

    TopHeadings holds the text of every <th> in the first row. Columns holds
    one "<table_control>-<heading>" entry per heading that has a matching
    <td> (by position) in the first data row.
    """
    rows = table.find_all('tr')
    heading_cells = rows[0].find_all('th') if rows else []
    top_headings = [th.get_text(strip=True) for th in heading_cells]

    columns = []
    if len(rows) > 1:
        # Only the first data row is inspected; zip() stops at the shorter of
        # the heading list and that row's cells.
        first_data_cells = rows[1].find_all('td')
        columns = [
            f"{table_control}-{heading}"
            for heading, _cell in zip(top_headings, first_data_cells)
        ]

    return {
        'Type': 'Table',
        'TableControl': table_control,
        'TopHeadings': top_headings,
        'Columns': columns,
    }


def extract_data(html):
    """Extract headers, paragraphs, form controls and tables from *html*.

    The document is parsed with BeautifulSoup's 'lxml' parser and walked in
    document order. Each recognised element becomes one dict in the result:

    * h1 / h2        -> {'Type': 'Header' | 'SubHeader', 'Text': ...}
    * p (non-empty)  -> {'Type': 'Paragraph', 'Text': sanitized text}
    * pre (non-empty)-> {'Type': 'Preformatted', 'Text': ...}
    * input          -> Type (capitalised type attribute, default 'Text'),
                        Name, Value, Label; hidden inputs and inputs whose
                        name is in _HIDDEN_INPUT_NAMES are skipped
    * select         -> Type 'Select', Name, Options, Label
    * textarea       -> Type 'Textarea', Name, Value, Label
    * button         -> Type 'Button', Name, Value, Label
    * table with CSS class 'sme-border' -> see _table_record(); other tables
                        are ignored

    Returns:
        A tuple ``(records, header_text, sub_header_text)`` where the two
        texts are those of the last <h1> / <h2> seen, or None if absent.
    """
    soup = BeautifulSoup(html, 'lxml')
    records = []
    header_text = None
    sub_header_text = None
    table_counter = 0

    for element in soup.find_all(_EXTRACTED_TAGS):
        tag_name = element.name

        if tag_name == 'h1':
            header_text = element.get_text(strip=True)
            records.append({'Type': 'Header', 'Text': header_text})

        elif tag_name == 'h2':
            sub_header_text = element.get_text(strip=True)
            records.append({'Type': 'SubHeader', 'Text': sub_header_text})

        elif tag_name == 'p':
            text = element.get_text(strip=True)
            if text:  # Ignore empty paragraphs
                # Sanitise text: flatten newlines/tabs and escape quotes.
                records.append({'Type': 'Paragraph', 'Text': sanitize_text(text)})

        elif tag_name == 'pre':
            text = element.get_text(strip=True)
            if text:  # Ignore empty <pre> blocks
                records.append({'Type': 'Preformatted', 'Text': text})

        elif tag_name == 'input':
            if element.get('type') == 'hidden' or element.get('name') in _HIDDEN_INPUT_NAMES:
                continue
            # NOTE(review): inputs take the NEXT label in the document, while
            # select/textarea/button take the PREVIOUS one. Kept as-is;
            # confirm this matches the markup being converted.
            records.append({
                'Type': element.get('type', 'text').capitalize(),
                'Name': element.get('name'),
                'Value': element.get('value', ''),
                'Label': _label_text(element.find_next('label')),
            })

        elif tag_name == 'select':
            options = [
                {'Value': option.get('value'), 'Text': option.get_text(strip=True)}
                for option in element.find_all('option')
            ]
            records.append({
                'Type': 'Select',
                'Name': element.get('name'),
                'Options': options,
                'Label': _label_text(element.find_previous('label')),
            })

        elif tag_name == 'textarea':
            records.append({
                'Type': 'Textarea',
                'Name': element.get('name'),
                'Value': element.get_text(strip=True),
                'Label': _label_text(element.find_previous('label')),
            })

        elif tag_name == 'button':
            # Look up this button's own preceding label. The earlier code
            # tested a `label` variable left over from a previous input or
            # textarea branch, which could be unbound or refer to the wrong
            # element.
            records.append({
                'Type': 'Button',
                'Name': element.get('name'),
                'Value': element.get_text(strip=True),
                'Label': _label_text(element.find_previous('label')),
            })

        elif tag_name == 'table' and 'sme-border' in element.get('class', []):
            table_counter += 1
            # Table ids are "Table1", "Table2", ... in document order.
            records.append(_table_record(element, f"Table{table_counter}"))

    return records, header_text, sub_header_text
# Zero-width position immediately before an ASCII capital, excluding the
# very start of the string.
_CAPITAL_BOUNDARY = re.compile(r'(?<!^)(?=[A-Z])')


def insert_spaces_before_caps(text):
    """Return *text* with a space inserted before every capital A-Z except a leading one."""
    spaced = _CAPITAL_BOUNDARY.sub(' ', text)
    return spaced
# One complete double-quoted JSON string token, including its escapes.
_JSON_STRING = re.compile(r'"(?:[^"\\]|\\.)*"')

# Inside a string token: any backslash escape, or a bare apostrophe.
_ESCAPE_OR_APOSTROPHE = re.compile(r"\\.|'")


def _requote_token(token_match):
    """Rewrite one piece of a JSON string body for single-quoted output."""
    token = token_match.group(0)
    if token == '\\"':
        return '"'      # no longer needs escaping inside '...'
    if token == "'":
        return "\\'"    # must now be escaped inside '...'
    return token        # every other escape (\\, \n, \uXXXX, ...) is kept


def _single_quote_json_string(string_match):
    """Convert one double-quoted JSON string into an equivalent single-quoted one."""
    body = string_match.group(0)[1:-1]
    return "'" + _ESCAPE_OR_APOSTROPHE.sub(_requote_token, body) + "'"


def save_to_json5(data, output_filename, package_name, header, sub_header):
    """Write the extracted records to *output_filename* as single-quoted JSON5.

    Args:
        data: list of record dicts, each with a 'Type' key. 'Paragraph' and
            'Preformatted' records contribute their 'Text'; 'Table' records
            contribute TableControl/TopHeadings/Columns; 'Header' and
            'SubHeader' records are skipped; every other type is written as
            an input with Type, Value, Name, Label and (if present) Options.
        output_filename: path of the file to create or overwrite (UTF-8).
        package_name: used for PackageName, the lowercase prefix built from
            its capital letters, MenuDescription and signalEvent.
        header: page header text; falls back to '<package_name> Contrib'.
        sub_header: sub-header text; falls back to
            'Manage <package_name> settings:'.

    The output is ordinary indented JSON in which every string (keys and
    values) is re-quoted with single quotes. Re-quoting is done per string
    token so that apostrophes inside values are escaped and the decoded
    values are unchanged. A blanket '"' -> "'" replacement would terminate
    strings early at any apostrophe and turn embedded double quotes into
    single quotes.
    """
    # Prefix: the capital letters of the package name, lowercased.
    prefix = ''.join(re.findall(r'[A-Z]', package_name)).lower()

    html_section = {
        'Name': 'params',
        'route': 'PARAMS',
        'Header': header if header else f'{package_name} Contrib',
        'SubHeader': sub_header if sub_header else f'Manage {package_name} settings:',
    }

    # Each category is numbered independently, starting at 1.
    paragraph_count = 1
    preformatted_count = 1
    input_count = 1
    table_count = 1

    for record in data:
        record_type = record['Type']

        if record_type == 'Paragraph':
            html_section[f'Paragraph{paragraph_count}'] = record['Text']
            paragraph_count += 1

        elif record_type == 'Preformatted':
            html_section[f'Preformatted{preformatted_count}'] = record['Text']
            preformatted_count += 1

        elif record_type in ('Header', 'SubHeader'):
            # Already represented by the Header/SubHeader fields above.
            continue

        elif record_type == 'Table':
            html_section[f'Table{table_count}'] = {
                'Type': record_type,
                'TableControl': record['TableControl'],
                'TopHeadings': record['TopHeadings'],
                'Columns': record['Columns'],
            }
            table_count += 1

        else:  # inputs, selects, textareas and buttons
            input_structure = {
                'Type': record_type,
                'Value': record.get('Value', ''),
                'Name': record.get('Name', None),
                'Label': record.get('Label', None),
            }
            if 'Options' in record:  # only select records carry options
                input_structure['Options'] = record['Options']
            html_section[f'Input{input_count}'] = input_structure
            input_count += 1

    json5_data = {
        'PackageName': package_name,
        'prefix': prefix,
        'MenuHeading': 'Miscellaneous',
        'MenuDescription': insert_spaces_before_caps(package_name),
        'MenuNavigation': '2000 400',
        'firstPanel': 'PARAMS',
        'signalEvent': f'smeserver-{package_name.lower()}-update',
        'html': html_section,
    }

    # Serialise once in memory, re-quote each string token, then write the
    # file in a single pass.
    json_text = json.dumps(json5_data, ensure_ascii=False, indent=4)
    json5_text = _JSON_STRING.sub(_single_quote_json_string, json_text)
    with open(output_filename, 'w', encoding='utf-8') as json_file:
        json_file.write(json5_text)
# Input used when neither an argument nor a command-line path is supplied.
_DEFAULT_INPUT_FILE = '/home/brianr/clients/SM2/SM2Gen/venv/html/CreateStarterWebsite.html'


def main(input_file=None):
    """Convert one HTML file into a JSON5 file written next to it.

    Args:
        input_file: path of the HTML file to convert. When omitted, the first
            command-line argument is used if present, otherwise
            _DEFAULT_INPUT_FILE.

    The package name is the input file name without its extension, and the
    output is '<package name>.json5' in the input file's directory.

    Raises:
        ValueError: propagated from validate_html() when the HTML cannot be
            parsed.
    """
    import sys  # local import: only needed for the command-line fallback

    if input_file is None:
        input_file = sys.argv[1] if len(sys.argv) > 1 else _DEFAULT_INPUT_FILE

    # Read and validate before extracting anything.
    html_content = read_html_file(input_file)
    validate_html(html_content)

    data, header, sub_header = extract_data(html_content)

    # Derive the package name and output path from the input file name.
    base_name = os.path.basename(input_file)
    package_name = os.path.splitext(base_name)[0]
    output_file = os.path.join(os.path.dirname(input_file), package_name + '.json5')

    save_to_json5(data, output_file, package_name, header, sub_header)
    print(f"Extracted data saved to '{output_file}'.")


if __name__ == '__main__':
    main()