SM2Gen/sm1-html-2-json5.py

import json
import os
import re
from bs4 import BeautifulSoup
from lxml import etree  # Import lxml for HTML validation

def read_html_file(filename):
    """Return the entire contents of *filename*, decoded as UTF-8 text."""
    with open(filename, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def validate_html(html):
    """Check that *html* can be parsed by lxml's HTML parser.

    Nothing is returned; the parsed tree is discarded.

    Raises:
        ValueError: if constructing the parser or parsing raises any
            exception. The original exception is chained as the cause.
    """
    try:
        # Only whether parsing raises matters here, not the resulting tree.
        etree.fromstring(html, parser=etree.HTMLParser())
    except Exception as parse_error:
        raise ValueError("Invalid HTML document") from parse_error
        
def sanitize_text(text):
    """Flatten *text* onto one line and backslash-escape its quote characters.

    Each newline, carriage return and tab becomes a single space (runs are
    not collapsed), every double and single quote gets a leading backslash,
    and leading/trailing whitespace is stripped from the result.
    """
    # One translation pass is equivalent to the chain of replacements: none
    # of the substituted strings contains a character that is itself mapped
    # by a later replacement.
    substitutions = str.maketrans({
        '\n': ' ',
        '\r': ' ',
        '\t': ' ',
        '"': '\\"',
        "'": "\\'",
    })
    return text.translate(substitutions).strip()


def _label_text(label):
    """Return the stripped text of a ``<label>`` tag, or None if no tag was found."""
    return label.get_text(strip=True) if label is not None else None


def extract_data(html):
    """Extract headings, paragraphs, form controls, tables and pre blocks from HTML.

    The document is parsed with BeautifulSoup (lxml backend) and the matching
    elements are visited in the order ``find_all`` returns them, so the
    resulting records keep that order.

    Args:
        html: HTML markup to scan.

    Returns:
        A tuple ``(records, header_text, sub_header_text)`` where ``records``
        is a list of dicts (each with a ``'Type'`` key), ``header_text`` is
        the text of the last ``<h1>`` seen (or None) and ``sub_header_text``
        is the text of the last ``<h2>`` seen (or None).
    """
    soup = BeautifulSoup(html, 'lxml')
    records = []

    # Inputs carrying one of these names are skipped, as are all hidden inputs.
    hidden_input_names = [
        'page',
        'page_stack',
        '.id',
        'csrf_token'
    ]

    header_text = None
    sub_header_text = None

    # Counter used to name tables "Table1", "Table2", ...
    table_counter = 0

    wanted_tags = ['h1', 'h2', 'p', 'pre', 'input', 'select', 'textarea', 'button', 'table']

    # Extract elements while preserving order
    for element in soup.find_all(wanted_tags):
        if element.name == 'h1':
            header_text = element.get_text(strip=True)
            records.append({
                'Type': 'Header',
                'Text': header_text
            })

        elif element.name == 'h2':
            sub_header_text = element.get_text(strip=True)
            records.append({
                'Type': 'SubHeader',
                'Text': sub_header_text
            })

        elif element.name == 'p':
            text = element.get_text(strip=True)
            if text:  # Ignore empty paragraphs
                # Sanitise text: flatten newlines/tabs and escape quotes.
                records.append({
                    'Type': 'Paragraph',
                    'Text': sanitize_text(text)
                })

        elif element.name == 'pre':
            text = element.get_text(strip=True)
            if text:  # Ensure non-empty before adding
                records.append({
                    'Type': 'Preformatted',
                    'Text': text
                })

        elif element.name == 'input':
            if element.get('type') == 'hidden' or element.get('name') in hidden_input_names:
                continue

            # NOTE(review): inputs search *forward* for their label while the
            # other controls search backward - confirm this is intended.
            records.append({
                'Type': element.get('type', 'text').capitalize(),
                'Name': element.get('name'),
                'Value': element.get('value', ''),
                'Label': _label_text(element.find_next('label')),
            })

        elif element.name == 'select':
            options = [
                {'Value': option.get('value'), 'Text': option.get_text(strip=True)}
                for option in element.find_all('option')
            ]
            records.append({
                'Type': 'Select',
                'Name': element.get('name'),
                'Options': options,
                'Label': _label_text(element.find_previous('label')),
            })

        elif element.name == 'textarea':
            records.append({
                'Type': 'Textarea',
                'Name': element.get('name'),
                'Value': element.get_text(strip=True),
                'Label': _label_text(element.find_previous('label')),
            })

        elif element.name == 'button':
            # Look up the button's own preceding label. Previously this branch
            # tested a `label` variable left over from an earlier input or
            # textarea branch, which could be unbound or refer to the wrong tag.
            records.append({
                'Type': 'Button',
                'Name': element.get('name'),
                'Value': element.get_text(strip=True),
                'Label': _label_text(element.find_previous('label')),
            })

        elif element.name == 'table' and 'sme-border' in element.get('class', []):
            # Only tables with the 'sme-border' class are recorded.
            table_counter += 1
            table_control = f"Table{table_counter}"  # e.g., "Table1", "Table2"

            rows = element.find_all('tr')

            # Headings come from the <th> cells of the first row.
            heading_cells = rows[0].find_all('th') if rows else []
            top_headings = [th.get_text(strip=True) for th in heading_cells]

            # One column name per heading that has a matching <td> in the
            # first data row (the row right after the heading row).
            columns = []
            if len(rows) > 1:
                cell_count = len(rows[1].find_all('td'))
                columns = [
                    f"{table_control}-{heading}"
                    for heading in top_headings[:cell_count]
                ]

            records.append({
                'Type': 'Table',
                'TableControl': table_control,
                'TopHeadings': top_headings,
                'Columns': columns,
            })

    return records, header_text, sub_header_text

def insert_spaces_before_caps(text):
    """Return *text* with a space inserted before every capital A-Z, except at the start."""
    # Zero-width match: any position that is not the start of the string and
    # is immediately followed by an uppercase ASCII letter.
    boundary_before_capital = re.compile(r'(?<!^)(?=[A-Z])')
    return boundary_before_capital.sub(' ', text)

def _json_to_single_quoted(json_text):
    """Rewrite every double-quoted string token in *json_text* as a single-quoted JSON5 string."""

    def convert_piece(match):
        piece = match.group(0)
        if piece == '\\"':
            return '"'      # a double quote needs no escape inside '...'
        if piece == "'":
            return "\\'"    # an apostrophe must be escaped inside '...'
        return piece        # any other escape sequence is kept verbatim

    def convert_string(match):
        body = match.group(0)[1:-1]
        return "'" + re.sub(r"\\.|'", convert_piece, body) + "'"

    # Serialized JSON only contains '"' as part of string tokens, so a
    # left-to-right scan for complete string literals is sufficient.
    return re.sub(r'"(?:[^"\\]|\\.)*"', convert_string, json_text)


def save_to_json5(data, output_filename, package_name, header, sub_header):
    """Save extracted data to a JSON5 file with a specific structure.

    Args:
        data: list of record dicts, each with a ``'Type'`` key, as returned
            by ``extract_data``.
        output_filename: path of the file to write (UTF-8, overwritten).
        package_name: package name; also used to derive the prefix, menu
            description and signal event name.
        header: page header text; a default built from ``package_name`` is
            used when empty or None.
        sub_header: page sub-header text; a default built from
            ``package_name`` is used when empty or None.

    The output is JSON serialized with 4-space indentation in which every
    string (keys and values) is single-quoted. Strings that contain quote
    characters are escaped so the result stays parseable; a plain textual
    swap of '"' for "'" would end such strings early.
    """
    # Generate prefix from uppercase letters in PackageName made into lowercase
    prefix = ''.join(re.findall(r'[A-Z]', package_name)).lower()

    # Start with the fixed fields; extracted elements are appended after
    # them in document order under numbered keys.
    html_section = {
        'Name': 'params',
        'route': 'PARAMS',
        'Header': header if header else f'{package_name} Contrib',
        'SubHeader': sub_header if sub_header else f'Manage {package_name} settings:',
    }

    paragraph_count = 1
    preformatted_count = 1
    input_count = 1
    table_count = 1

    for record in data:
        record_type = record['Type']

        if record_type == 'Paragraph':
            html_section[f'Paragraph{paragraph_count}'] = record['Text']
            paragraph_count += 1
        elif record_type == 'Preformatted':
            html_section[f'Preformatted{preformatted_count}'] = record['Text']
            preformatted_count += 1
        elif record_type in ('Header', 'SubHeader'):
            # Headers are emitted through the Header/SubHeader fields above.
            continue
        elif record_type == 'Table':
            html_section[f'Table{table_count}'] = {
                'Type': record_type,
                'TableControl': record['TableControl'],
                'TopHeadings': record['TopHeadings'],
                'Columns': record['Columns'],
            }
            table_count += 1
        else:  # For inputs, selects, textareas, and buttons
            input_structure = {
                'Type': record_type,
                'Value': record.get('Value', ''),
                'Name': record.get('Name', None),    # None if not present
                'Label': record.get('Label', None),  # None if not present
            }

            # Only selects carry an option list.
            if 'Options' in record:
                input_structure['Options'] = record['Options']

            html_section[f'Input{input_count}'] = input_structure
            input_count += 1

    # Wrap the records with the required fields
    json5_data = {
        'PackageName': package_name,
        'prefix': prefix,
        'MenuHeading': 'Miscellaneous',
        'MenuDescription': insert_spaces_before_caps(package_name),
        'MenuNavigation': '2000 400',
        'firstPanel': 'PARAMS',
        'signalEvent': f'smeserver-{package_name.lower()}-update',
        'html': html_section,
    }

    # Serialize in memory, convert the quoting, then write the file once
    # (no write / re-read / rewrite round trip through the output file).
    content = _json_to_single_quoted(
        json.dumps(json5_data, ensure_ascii=False, indent=4)
    )
    with open(output_filename, 'w', encoding='utf-8') as json_file:
        json_file.write(content)

def main(input_file=None):
    """Convert one SM1 HTML file into a JSON5 file written next to it.

    Args:
        input_file: path of the HTML file to convert. When omitted, the
            first command-line argument is used if one was given, otherwise
            the original hard-coded development path.

    The package name is the input file name without its extension, and the
    output file is ``<package name>.json5`` in the input file's directory.
    """
    if input_file is None:
        import sys  # local import: only needed for the command-line fallback

        default_input = '/home/brianr/clients/SM2/SM2Gen/venv/html/CreateStarterWebsite.html'
        input_file = sys.argv[1] if len(sys.argv) > 1 else default_input

    # Read HTML content
    html_content = read_html_file(input_file)

    # Validate the HTML before extracting data
    validate_html(html_content)

    # Extract data from HTML
    data, header, sub_header = extract_data(html_content)

    # Derive the package name and output file name from the input file name
    base_name = os.path.basename(input_file)
    package_name = os.path.splitext(base_name)[0]
    json_filename = package_name + '.json5'

    # Create the output file path in the same directory as the input
    output_directory = os.path.dirname(input_file)
    output_file = os.path.join(output_directory, json_filename)

    # Save extracted data to JSON5
    save_to_json5(data, output_file, package_name, header, sub_header)
    print(f"Extracted data saved to '{output_file}'.")

# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`import json`
			`import os`
			`import re`
			`from bs4 import BeautifulSoup`
			`from lxml import etree # Import lxml for HTML validation`

			`def read_html_file(filename):`
			`"""Read HTML content from a file."""`
			`with open(filename, 'r', encoding='utf-8') as file:`
			`return file.read()`

			`def validate_html(html):`
			`"""Validate the HTML content."""`
			`try:`
			`parser = etree.HTMLParser()`
			`etree.fromstring(html, parser) # Attempt to parse the HTML`
			`except Exception as e:`
			`raise ValueError("Invalid HTML document") from e`
Add in original html - play with sanitising paragraphs 2024-09-14 19:08:48 +02:00
			`def sanitize_text(text):`
			`# Replace newlines with spaces`
			`sanitized_text = text.replace('\n', ' ').replace('\r', ' ') # Handle both Unix and Windows line endings`
			`# Replace tabs with spaces`
			`sanitized_text = sanitized_text.replace('\t', ' ')`
			`# Escape quote characters`
			`sanitized_text = sanitized_text.replace('"', '\\"').replace("'", "\\'")`
			`# Strip leading and trailing whitespace`
			`sanitized_text = sanitized_text.strip()`
			`return sanitized_text`

Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00
			`def extract_data(html):`
			`"""Extract paragraphs, inputs, tables, and pre blocks from HTML and organize them in order."""`
			`soup = BeautifulSoup(html, 'lxml')`
			`records = []`

			`hidden_input_names = [`
			`'page',`
			`'page_stack',`
			`'.id',`
			`'csrf_token'`
			`]`

			`header_text = None`
			`sub_header_text = None`

			`# Counter for tables`
			`table_counter = 0`

			`# Extract elements while preserving order`
			`for element in soup.find_all(['h1', 'h2', 'p', 'pre', 'input', 'select', 'textarea', 'button', 'table']):`
			`if element.name == 'h1':`
			`header_text = element.get_text(strip=True)`
			`records.append({`
			`'Type': 'Header',`
			`'Text': header_text`
			`})`

			`elif element.name == 'h2':`
			`sub_header_text = element.get_text(strip=True)`
			`records.append({`
			`'Type': 'SubHeader',`
			`'Text': sub_header_text`
			`})`

			`elif element.name == 'p':`
			`text = element.get_text(strip=True)`
			`if text: # Ignore empty paragraphs`
Add in original html - play with sanitising paragraphs 2024-09-14 19:08:48 +02:00			`#Sanitise text freom newlines,tabs and escape quotes.`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`records.append({`
			`'Type': 'Paragraph',`
Add in original html - play with sanitising paragraphs 2024-09-14 19:08:48 +02:00			`'Text': sanitize_text(text)`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00			`})`

			`elif element.name == 'pre':`
			`text = element.get_text(strip=True)`
			`if text: # Ensure non-empty before adding`
			`records.append({`
			`'Type': 'Preformatted',`
			`'Text': text`
			`})`

			`elif element.name == 'input':`
			`if element.get('type') == 'hidden' or element.get('name') in hidden_input_names:`
			`continue`

			`input_info = {`
			`'Type': element.get('type', 'text').capitalize(),`
			`'Name': element.get('name'),`
			`'Value': element.get('value', ''),`
			`}`
			`label = element.find_next('label')`
			`input_info['Label'] = label.get_text(strip=True) if label else None`
			`records.append(input_info)`

			`elif element.name == 'select':`
			`options = [{'Value': option.get('value'), 'Text': option.get_text(strip=True)} for option in element.find_all('option')]`
			`select_info = {`
			`'Type': 'Select',`
			`'Name': element.get('name'),`
			`'Options': options,`
			`'Label': element.find_previous('label').get_text(strip=True) if element.find_previous('label') else None,`
			`}`
			`records.append(select_info)`

			`elif element.name == 'textarea':`
			`textarea_info = {`
			`'Type': 'Textarea',`
			`'Name': element.get('name'),`
			`'Value': element.get_text(strip=True),`
			`}`
			`label = element.find_previous('label')`
			`textarea_info['Label'] = label.get_text(strip=True) if label else None`
			`records.append(textarea_info)`

			`elif element.name == 'button':`
			`button_info = {`
			`'Type': 'Button',`
			`'Name': element.get('name'),`
			`'Value': element.get_text(strip=True),`
			`'Label': element.find_previous('label').get_text(strip=True) if label else None,`
			`}`
			`records.append(button_info)`

			`elif element.name == 'table' and 'sme-border' in element.get('class', []):`
			`# Increment the table counter`
			`table_counter += 1`

			`# Prepare the TableControl format`
			`table_control = f"Table{table_counter}" # e.g., "Table1", "Table2"`
			`top_headings = []`
			`columns = []`

			`# Extract headings from the first row`
			`first_row = element.find('tr')`
			`if first_row:`
			`for th in first_row.find_all('th'):`
			`top_headings.append(th.get_text(strip=True))`

			`# Extract only the first data row's cell values for Columns`
			`data_rows = element.find_all('tr')[1:] # Skip the heading row`
			`if data_rows:`
			`first_data_row = data_rows[0] # Take the first row of data`
			`for idx, th in enumerate(first_row.find_all('th')):`
			`td = first_data_row.find_all('td')[idx] if idx < len(first_data_row.find_all('td')) else None`
			`if td:`
			`columns.append(f"{table_control}-{th.get_text(strip=True)}") # Format as desired`

			`records.append({`
			`'Type': 'Table',`
			`'TableControl': table_control,`
			`'TopHeadings': top_headings,`
			`'Columns': columns,`
			`})`

			`return records, header_text, sub_header_text`

			`def insert_spaces_before_caps(text):`
			`"""Insert spaces before each capital letter in a given string."""`
			`return re.sub(r'(?<!^)(?=[A-Z])', ' ', text)`

			`def save_to_json5(data, output_filename, package_name, header, sub_header):`
			`"""Save extracted data to a JSON5 file with a specific structure."""`
First rnun of SM1-2-json created json5 2024-09-12 20:37:27 +02:00			`# Generate prefix from uppercase letters in PackageName made into lowercase`
Maily debugging info to find reason nfsshare works but diskusage fails 2024-09-13 17:36:57 +02:00			`prefix = ''.join(re.findall(r'[A-Z]', package_name)).lower()`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00
			`# Prepare structured html list`
			`structured_html = []`
			`paragraph_count = 1`
			`preformatted_count = 1`
			`input_count = 1`
			`table_count = 1`

			`for record in data:`
			`if record['Type'] == 'Paragraph':`
			`structured_html.append({`
			`f'Paragraph{paragraph_count}': record['Text']`
			`})`
			`paragraph_count += 1`
			`elif record['Type'] == 'Preformatted':`
			`structured_html.append({`
			`f'Preformatted{preformatted_count}': record['Text']`
			`})`
			`preformatted_count += 1`
			`elif record['Type'] == 'Header' or record['Type'] == 'SubHeader':`
			`continue # Skip headers for input count`
			`elif record['Type'] == 'Table':`
			`# Construct the table entry`
			`table_structure = {`
			`'Type': record['Type'],`
			`'TableControl': record['TableControl'],`
			`'TopHeadings': record['TopHeadings'],`
			`'Columns': record['Columns']`
			`}`
			`structured_html.append({`
			`f'Table{table_count}': table_structure`
			`})`
			`table_count += 1`
			`else: # For inputs, selects, textareas, and buttons`
			`input_structure = {`
			`'Type': record['Type'],`
			`'Value': record.get('Value', ''), # Safely access Value`
			`}`

			`# Use .get() for the Name key to avoid KeyError`
			`input_structure['Name'] = record.get('Name', None) # Set to None if not present`
			`input_structure['Label'] = record.get('Label', None) # Set to None if not present`

			`# Handle specific case for Select options`
			`if 'Options' in record:`
			`input_structure['Options'] = record['Options']`

			`structured_html.append({`
			`f'Input{input_count}': input_structure`
			`})`
			`input_count += 1`

			`# Wrap the records with the required fields`
			`json5_data = {`
			`'PackageName': package_name,`
			`'prefix': prefix,`
			`'MenuHeading': 'Miscellaneous',`
			`'MenuDescription': insert_spaces_before_caps(package_name),`
			`'MenuNavigation': '2000 400',`
			`'firstPanel': 'PARAMS',`
			`'signalEvent': f'smeserver-{package_name.lower()}-update',`
			`'html': {`
			`'Name': 'params',`
			`'route': 'PARAMS',`
			`'Header': header if header else f'{package_name} Contrib',`
			`'SubHeader': sub_header if sub_header else f'Manage {package_name} settings:',`
			`**{k: v for item in structured_html for k, v in item.items()} # Flatten the structured_html into the dict`
			`}`
			`}`

			`# Save in JSON5 format (JSON with comments and unquoted keys)`
			`with open(output_filename, 'w', encoding='utf-8') as json_file:`
			`json.dump(json5_data, json_file, ensure_ascii=False, indent=4)`

			`# Manually format as JSON5 by adding single quotes (for simplicity)`
			`with open(output_filename, 'r+', encoding='utf-8') as json_file:`
			`content = json_file.read()`
			`content = content.replace('"', "'") # Replace double quotes with single quotes for JSON5`
			`json_file.seek(0)`
			`json_file.write(content)`
			`json_file.truncate() # Remove any old content beyond the new content length`

			`def main():`
Add in original html - play with sanitising paragraphs 2024-09-14 19:08:48 +02:00			`input_file = '/home/brianr/clients/SM2/SM2Gen/venv/html/CreateStarterWebsite.html' # Specify the input HTML file path`
Add in preformat and sm1 html to json5 extractor program 2024-09-12 19:54:38 +02:00
			`# Read HTML content`
			`html_content = read_html_file(input_file)`

			`# Validate the HTML before extracting data`
			`validate_html(html_content)`

			`# Extract data from HTML`
			`data, header, sub_header = extract_data(html_content)`

			`# Generate output JSON5 filename based on input file name`
			`base_name = os.path.basename(input_file) # Get the file name (with extension)`
			`package_name = os.path.splitext(base_name)[0] # Use the filename without extension`
			`json_filename = package_name + '.json5' # Change extension to .json5`

			`# Create the output file path in the same directory`
			`output_directory = os.path.dirname(input_file)`
			`output_file = os.path.join(output_directory, json_filename)`

			`# Save extracted data to JSON5`
			`save_to_json5(data, output_file, package_name, header, sub_header)`
			`print(f"Extracted data saved to '{output_file}'.")`

			`if __name__ == '__main__':`
			`main()`