# SM2Gen/sm1-html-2-json5.py

import json
import os
import re
from bs4 import BeautifulSoup
from lxml import etree  # Import lxml for HTML validation

def read_html_file(filename):
    """Return the entire contents of *filename*, decoded as UTF-8 text."""
    with open(filename, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def validate_html(html):
    """Check that *html* can be parsed by lxml's HTML parser.

    Returns None on success. Any exception raised while building the parser
    or parsing the document is re-raised as ``ValueError("Invalid HTML
    document")`` with the original exception chained as its cause.

    NOTE(review): lxml's HTML parser presumably recovers from most malformed
    markup, so this may only reject input it cannot parse at all - confirm
    before relying on it as a strict validator.
    """
    try:
        # The parse result is discarded; only success or failure matters here.
        etree.fromstring(html, parser=etree.HTMLParser())
    except Exception as error:
        raise ValueError("Invalid HTML document") from error

def sanitize_text(text):
    """Flatten *text* onto one line and backslash-escape its quote characters.

    Newlines, carriage returns and tabs each become a single space, every
    double or single quote is prefixed with a backslash, and leading and
    trailing whitespace is removed from the result.
    """
    # One translation table covers all five single-character substitutions.
    substitutions = str.maketrans({
        '\n': ' ',
        '\r': ' ',
        '\t': ' ',
        '"': '\\"',
        "'": "\\'",
    })
    return text.translate(substitutions).strip()


def _label_text(label):
    """Return the stripped text of a ``<label>`` tag, or None when no tag was found."""
    return label.get_text(strip=True) if label is not None else None


def extract_data(html):
    """Extract headers, paragraphs, form controls and tables from HTML in document order.

    Args:
        html: The HTML document as a string.

    Returns:
        A tuple ``(records, header_text, sub_header_text)``:

        * ``records`` is a list of dicts, one per extracted element, in the
          order the elements appear in the document. Each dict has a
          ``'Type'`` key (``'Header'``, ``'SubHeader'``, ``'Paragraph'``,
          ``'Preformatted'``, ``'Select'``, ``'Textarea'``, ``'Button'``,
          ``'Table'``, or the capitalised ``type`` attribute of an
          ``<input>``) plus type-specific keys.
        * ``header_text`` is the text of the last ``<h1>`` seen, or None.
        * ``sub_header_text`` is the text of the last ``<h2>`` seen, or None.
    """
    soup = BeautifulSoup(html, 'lxml')
    records = []

    # Inputs with these names are skipped even when they are not type="hidden".
    hidden_input_names = ('page', 'page_stack', '.id', 'csrf_token')

    header_text = None
    sub_header_text = None

    # Only tables that are actually recorded are numbered (Table1, Table2, ...).
    table_counter = 0

    # A single find_all over every tag of interest preserves document order.
    wanted_tags = ['h1', 'h2', 'p', 'pre', 'input', 'select', 'textarea', 'button', 'table']
    for element in soup.find_all(wanted_tags):
        if element.name == 'h1':
            header_text = element.get_text(strip=True)
            records.append({
                'Type': 'Header',
                'Text': header_text
            })

        elif element.name == 'h2':
            sub_header_text = element.get_text(strip=True)
            records.append({
                'Type': 'SubHeader',
                'Text': sub_header_text
            })

        elif element.name == 'p':
            text = element.get_text(strip=True)
            if text:  # Ignore empty paragraphs
                # Flatten newlines/tabs to spaces and escape quotes in the text.
                records.append({
                    'Type': 'Paragraph',
                    'Text': sanitize_text(text)
                })

        elif element.name == 'pre':
            text = element.get_text(strip=True)
            if text:  # Ensure non-empty before adding
                records.append({
                    'Type': 'Preformatted',
                    'Text': text
                })

        elif element.name == 'input':
            if element.get('type') == 'hidden' or element.get('name') in hidden_input_names:
                continue

            # NOTE(review): inputs take the *following* <label> while selects,
            # textareas and buttons take the *preceding* one - confirm this
            # asymmetry is intentional for the pages being converted.
            records.append({
                'Type': element.get('type', 'text').capitalize(),
                'Name': element.get('name'),
                'Value': element.get('value', ''),
                'Label': _label_text(element.find_next('label')),
            })

        elif element.name == 'select':
            options = [
                {'Value': option.get('value'), 'Text': option.get_text(strip=True)}
                for option in element.find_all('option')
            ]
            records.append({
                'Type': 'Select',
                'Name': element.get('name'),
                'Options': options,
                'Label': _label_text(element.find_previous('label')),
            })

        elif element.name == 'textarea':
            records.append({
                'Type': 'Textarea',
                'Name': element.get('name'),
                'Value': element.get_text(strip=True),
                'Label': _label_text(element.find_previous('label')),
            })

        elif element.name == 'button':
            # Look the label up for this button itself; do not rely on a
            # `label` variable left over from an earlier input/textarea, which
            # may be unset or belong to a different control.
            records.append({
                'Type': 'Button',
                'Name': element.get('name'),
                'Value': element.get_text(strip=True),
                'Label': _label_text(element.find_previous('label')),
            })

        elif element.name == 'table' and 'sme-border' in element.get('class', []):
            table_counter += 1
            table_control = f"Table{table_counter}"  # e.g., "Table1", "Table2"

            # The first <tr> supplies the headings; the second, if present, is
            # the first data row.
            rows = element.find_all('tr')
            heading_cells = rows[0].find_all('th') if rows else []
            top_headings = [th.get_text(strip=True) for th in heading_cells]

            # One column name per heading that has a matching <td> in the
            # first data row; zip() stops at the shorter of the two lists.
            columns = []
            if len(rows) > 1:
                data_cells = rows[1].find_all('td')
                columns = [
                    f"{table_control}-{heading}"
                    for heading, _cell in zip(top_headings, data_cells)
                ]

            records.append({
                'Type': 'Table',
                'TableControl': table_control,
                'TopHeadings': top_headings,
                'Columns': columns,
            })

    return records, header_text, sub_header_text

def insert_spaces_before_caps(text):
    """Return *text* with a space inserted before every ASCII capital letter
    except one at the very start of the string."""
    # Zero-width match: any position (other than the start) followed by A-Z.
    capital_boundary = re.compile(r'(?<!^)(?=[A-Z])')
    return capital_boundary.sub(' ', text)

def _single_quote_json_strings(json_text):
    """Rewrite each double-quoted string literal in *json_text* as a single-quoted one.

    Inside every literal, ``\\"`` escapes are reduced to a plain ``"`` and
    bare apostrophes are escaped as ``\\'``, so the decoded string values are
    unchanged. All other escape sequences are copied through untouched.
    """
    def convert_token(match):
        token = match.group(0)
        if token == '\\"':
            return '"'    # No escape needed inside single quotes.
        if token == "'":
            return "\\'"  # Must be escaped inside single quotes.
        return token      # Any other backslash escape stays as it is.

    def convert_literal(match):
        return "'" + re.sub(r"\\.|'", convert_token, match.group(1)) + "'"

    # In json.dumps output an unescaped '"' only ever delimits a string
    # literal, so this pattern matches exactly the keys and string values.
    return re.sub(r'"((?:[^"\\]|\\.)*)"', convert_literal, json_text)


def save_to_json5(data, output_filename, package_name, header, sub_header):
    """Save extracted data to a JSON5 file with a specific structure.

    Args:
        data: List of record dicts, each with a ``'Type'`` key. ``'Paragraph'``
            and ``'Preformatted'`` records need ``'Text'``; ``'Table'`` records
            need ``'TableControl'``, ``'TopHeadings'`` and ``'Columns'``;
            ``'Header'``/``'SubHeader'`` records are skipped; every other
            record is written as an input entry.
        output_filename: Path of the file to write (overwritten if it exists).
        package_name: Package name; also used to derive the prefix, the menu
            description and the signal event name.
        header: Page header text; a default is used when it is empty or None.
        sub_header: Page sub-header text; a default is used when it is empty
            or None.

    The output is indented with four spaces and uses single-quoted strings.
    """
    # Generate prefix from uppercase letters in PackageName made into lowercase
    prefix = ''.join(re.findall(r'[A-Z]', package_name)).lower()

    # Fixed fields first; numbered entries follow in the order records arrive.
    html_section = {
        'Name': 'params',
        'route': 'PARAMS',
        'Header': header if header else f'{package_name} Contrib',
        'SubHeader': sub_header if sub_header else f'Manage {package_name} settings:',
    }

    paragraph_count = 1
    preformatted_count = 1
    input_count = 1
    table_count = 1

    for record in data:
        record_type = record['Type']
        if record_type == 'Paragraph':
            html_section[f'Paragraph{paragraph_count}'] = record['Text']
            paragraph_count += 1
        elif record_type == 'Preformatted':
            html_section[f'Preformatted{preformatted_count}'] = record['Text']
            preformatted_count += 1
        elif record_type in ('Header', 'SubHeader'):
            continue  # Headers are emitted via the fixed fields above.
        elif record_type == 'Table':
            html_section[f'Table{table_count}'] = {
                'Type': record_type,
                'TableControl': record['TableControl'],
                'TopHeadings': record['TopHeadings'],
                'Columns': record['Columns'],
            }
            table_count += 1
        else:  # For inputs, selects, textareas, and buttons
            input_structure = {
                'Type': record_type,
                'Value': record.get('Value', ''),
                'Name': record.get('Name'),    # None if not present
                'Label': record.get('Label'),  # None if not present
            }

            # Only selects carry an option list.
            if 'Options' in record:
                input_structure['Options'] = record['Options']

            html_section[f'Input{input_count}'] = input_structure
            input_count += 1

    # Wrap the records with the required fields
    json5_data = {
        'PackageName': package_name,
        'prefix': prefix,
        'MenuHeading': 'Miscellaneous',
        'MenuDescription': insert_spaces_before_caps(package_name),
        'MenuNavigation': '2000 400',
        'firstPanel': 'PARAMS',
        'signalEvent': f'smeserver-{package_name.lower()}-update',
        'html': html_section,
    }

    # Serialize once in memory, then switch to single-quoted strings with the
    # escapes adjusted. A blanket '"' -> "'" replacement would leave
    # apostrophes in values unescaped (ending the string early) and would
    # turn escaped double quotes into apostrophes.
    json_text = json.dumps(json5_data, ensure_ascii=False, indent=4)
    with open(output_filename, 'w', encoding='utf-8') as json_file:
        json_file.write(_single_quote_json_strings(json_text))

def main(input_file=None):
    """Convert one HTML file into a JSON5 file written alongside it.

    Args:
        input_file: Path of the HTML file to convert. When omitted, the first
            command-line argument is used if one was given; otherwise the
            original hard-coded path is used.

    The output file is placed in the input file's directory and named after
    the input file with its extension replaced by ``.json5``. The input file
    name without its extension is also used as the package name.

    Raises:
        ValueError: If ``validate_html`` rejects the document.
    """
    import sys  # Local import: only this entry point needs the argument list.

    if input_file is None:
        if len(sys.argv) > 1:
            input_file = sys.argv[1]
        else:
            # Fallback kept so running the script with no arguments behaves as before.
            input_file = '/home/brianr/clients/SM2/SM2Gen/venv/html/CreateStarterWebsite.html'

    # Read HTML content
    html_content = read_html_file(input_file)

    # Validate the HTML before extracting data
    validate_html(html_content)

    # Extract data from HTML
    data, header, sub_header = extract_data(html_content)

    # Generate output JSON5 filename based on input file name
    base_name = os.path.basename(input_file)  # Get the file name (with extension)
    package_name = os.path.splitext(base_name)[0]  # Use the filename without extension
    json_filename = package_name + '.json5'  # Change extension to .json5

    # Create the output file path in the same directory
    output_directory = os.path.dirname(input_file)
    output_file = os.path.join(output_directory, json_filename)

    # Save extracted data to JSON5
    save_to_json5(data, output_file, package_name, header, sub_header)
    print(f"Extracted data saved to '{output_file}'.")

# Run the conversion only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()