Add in preformat and sm1 html to json5 extractor program

2024-09-12 18:54:38 +01:00
parent 5c5a3bfba2
commit 2ee6bd3bb6
7 changed files with 718 additions and 0 deletions
--- a/sm1-html-2-json5.py
+++ b/sm1-html-2-json5.py
@@ -0,0 +1,261 @@
+import json
+import os
+import re
+from bs4 import BeautifulSoup
+from lxml import etree  # Import lxml for HTML validation
+
+def read_html_file(filename):
+    """Load an HTML document from disk.
+
+    Args:
+        filename: Path of the file to read.
+
+    Returns:
+        The entire file content as one string, decoded as UTF-8.
+    """
+    with open(filename, mode='r', encoding='utf-8') as handle:
+        contents = handle.read()
+    return contents
+
+def validate_html(html):
+    """Check that lxml's HTML parser can consume *html*.
+
+    Args:
+        html: The HTML document to check.
+
+    Returns:
+        None. The parsed tree is discarded; only success or failure matters.
+
+    Raises:
+        ValueError: If building the parser or parsing the document raises
+            any exception. The original exception is chained as the cause.
+    """
+    # NOTE(review): lxml's HTML parser is presumably lenient about malformed
+    # markup, so this check likely rejects very little - confirm that this
+    # is the intended strictness.
+    try:
+        html_parser = etree.HTMLParser()
+        etree.fromstring(html, html_parser)
+    except Exception as err:
+        raise ValueError("Invalid HTML document") from err
+
+def _label_text(label):
+    """Return the stripped text of a ``<label>`` tag, or None if no tag was found."""
+    return label.get_text(strip=True) if label is not None else None
+
+def _table_record(table, table_control):
+    """Build the 'Table' record for one table element.
+
+    ``TopHeadings`` holds the text of every ``<th>`` in the table's first
+    ``<tr>``. ``Columns`` holds one ``"<table_control>-<heading>"`` entry for
+    each of those headings that has a ``<td>`` at the same position in the
+    second ``<tr>``; it is empty when the table has fewer than two rows.
+    """
+    rows = table.find_all('tr')
+    heading_cells = rows[0].find_all('th') if rows else []
+    top_headings = [th.get_text(strip=True) for th in heading_cells]
+
+    columns = []
+    if len(rows) > 1:
+        # Only the first data row is inspected, and only to learn how many
+        # of the headings actually have a data cell beneath them.
+        data_cell_count = len(rows[1].find_all('td'))
+        columns = [
+            f"{table_control}-{heading}"
+            for heading in top_headings[:data_cell_count]
+        ]
+
+    return {
+        'Type': 'Table',
+        'TableControl': table_control,
+        'TopHeadings': top_headings,
+        'Columns': columns,
+    }
+
+def extract_data(html):
+    """Extract headers, paragraphs, pre blocks, form controls and tables in document order.
+
+    Args:
+        html: The HTML document as a string.
+
+    Returns:
+        A tuple ``(records, header_text, sub_header_text)``:
+
+        * ``records`` is a list of dicts, one per recognised element, in the
+          order the elements appear in the document. Every dict has a
+          ``'Type'`` key: ``'Header'``, ``'SubHeader'``, ``'Paragraph'``,
+          ``'Preformatted'``, ``'Select'``, ``'Textarea'``, ``'Button'``,
+          ``'Table'``, or - for ``<input>`` - the capitalised ``type``
+          attribute (``'Text'`` when the attribute is absent).
+        * ``header_text`` is the text of the last ``<h1>`` seen, or None.
+        * ``sub_header_text`` is the text of the last ``<h2>`` seen, or None.
+
+    Hidden inputs, inputs whose name is one of the framework field names
+    listed below, empty ``<p>``/``<pre>`` elements, and tables without the
+    ``sme-border`` class are skipped.
+    """
+    soup = BeautifulSoup(html, 'lxml')
+    records = []
+
+    # Input names that are never emitted, whatever their type.
+    hidden_input_names = ('page', 'page_stack', '.id', 'csrf_token')
+
+    header_text = None
+    sub_header_text = None
+
+    # Number of 'sme-border' tables seen so far; drives the "TableN" names.
+    table_counter = 0
+
+    wanted_tags = ['h1', 'h2', 'p', 'pre', 'input', 'select', 'textarea', 'button', 'table']
+    for element in soup.find_all(wanted_tags):
+        tag = element.name
+
+        if tag == 'h1':
+            header_text = element.get_text(strip=True)
+            records.append({'Type': 'Header', 'Text': header_text})
+
+        elif tag == 'h2':
+            sub_header_text = element.get_text(strip=True)
+            records.append({'Type': 'SubHeader', 'Text': sub_header_text})
+
+        elif tag in ('p', 'pre'):
+            text = element.get_text(strip=True)
+            if text:  # Ignore empty paragraphs / pre blocks
+                records.append({
+                    'Type': 'Paragraph' if tag == 'p' else 'Preformatted',
+                    'Text': text,
+                })
+
+        elif tag == 'input':
+            if element.get('type') == 'hidden' or element.get('name') in hidden_input_names:
+                continue
+
+            # NOTE(review): inputs take the *next* <label> in the document
+            # while every other control takes the previous one. Left as-is;
+            # confirm this matches the markup being converted.
+            records.append({
+                'Type': element.get('type', 'text').capitalize(),
+                'Name': element.get('name'),
+                'Value': element.get('value', ''),
+                'Label': _label_text(element.find_next('label')),
+            })
+
+        elif tag == 'select':
+            options = [
+                {'Value': option.get('value'), 'Text': option.get_text(strip=True)}
+                for option in element.find_all('option')
+            ]
+            records.append({
+                'Type': 'Select',
+                'Name': element.get('name'),
+                'Options': options,
+                'Label': _label_text(element.find_previous('label')),
+            })
+
+        elif tag == 'textarea':
+            records.append({
+                'Type': 'Textarea',
+                'Name': element.get('name'),
+                'Value': element.get_text(strip=True),
+                'Label': _label_text(element.find_previous('label')),
+            })
+
+        elif tag == 'button':
+            # Look the label up for *this* button. The earlier code tested a
+            # 'label' variable left over from a previous input/textarea
+            # iteration, which raised UnboundLocalError when a button came
+            # first and AttributeError when no <label> preceded the button.
+            records.append({
+                'Type': 'Button',
+                'Name': element.get('name'),
+                'Value': element.get_text(strip=True),
+                'Label': _label_text(element.find_previous('label')),
+            })
+
+        elif tag == 'table' and 'sme-border' in element.get('class', []):
+            table_counter += 1
+            # e.g. "Table1", "Table2"
+            records.append(_table_record(element, f"Table{table_counter}"))
+
+    return records, header_text, sub_header_text
+
+def insert_spaces_before_caps(text):
+    """Return *text* with a space placed in front of every capital letter.
+
+    Only the ASCII letters A-Z count as capitals, and a capital at the very
+    start of the string gets no leading space. Each capital is handled
+    individually, so "DiskUsage" becomes "Disk Usage" and a run of capitals
+    is split letter by letter.
+    """
+    # Zero-width match: any position followed by A-Z that is not the string start.
+    capital_boundary = re.compile(r'(?<!^)(?=[A-Z])')
+    return capital_boundary.sub(' ', text)
+
+def _single_quote_json_strings(json_text):
+    """Re-quote every string token in serialised JSON with single quotes.
+
+    *json_text* must be the output of ``json.dumps``: there, double quotes
+    occur only as string delimiters or as the escape ``\\"`` inside a string.
+    Each string is rewritten on its own so that an embedded apostrophe is
+    escaped as ``\\'`` and an embedded ``\\"`` is relaxed to a plain ``"``;
+    every other escape sequence is left exactly as it was.
+    """
+    def swap_escape(match):
+        token = match.group(0)
+        if token == '\\"':
+            return '"'      # no longer needs escaping inside '...'
+        if token == "'":
+            return "\\'"    # would otherwise terminate the '...' string
+        return token        # \\, \n, \uXXXX, ... are unchanged
+
+    def requote(match):
+        inner = re.sub(r"\\.|'", swap_escape, match.group(1))
+        return "'" + inner + "'"
+
+    return re.sub(r'"((?:[^"\\]|\\.)*)"', requote, json_text)
+
+def save_to_json5(data, output_filename, package_name, header, sub_header):
+    """Write the extracted records to *output_filename* as single-quoted JSON5.
+
+    Args:
+        data: List of record dicts, each with a ``'Type'`` key. 'Paragraph'
+            and 'Preformatted' records need ``'Text'``; 'Table' records need
+            ``'TableControl'``, ``'TopHeadings'`` and ``'Columns'``; 'Header'
+            and 'SubHeader' records are skipped; any other type is written
+            as an input-style entry.
+        output_filename: Path of the file to create or overwrite.
+        package_name: Package name; also used to derive ``prefix`` (its
+            capital letters), ``MenuDescription`` and ``signalEvent``.
+        header: Page header, or a falsy value to use
+            ``"<package_name> Contrib"``.
+        sub_header: Page sub-header, or a falsy value to use
+            ``"Manage <package_name> settings:"``.
+
+    Returns:
+        None. The result is written to *output_filename* in UTF-8.
+
+    The output is ordinary indented JSON whose keys and string values are
+    delimited by single quotes. Entries inside ``html`` are named
+    ``Paragraph1``, ``Preformatted1``, ``Input1``, ``Table1``, ... with a
+    separate counter per kind, and keep the order of *data*.
+    """
+    # Generate prefix from uppercase letters in PackageName
+    prefix = ''.join(re.findall(r'[A-Z]', package_name))
+
+    html_section = {
+        'Name': 'params',
+        'route': 'PARAMS',
+        'Header': header if header else f'{package_name} Contrib',
+        'SubHeader': sub_header if sub_header else f'Manage {package_name} settings:',
+    }
+
+    counters = {'Paragraph': 0, 'Preformatted': 0, 'Input': 0, 'Table': 0}
+
+    def next_key(kind):
+        """Return the next numbered key for *kind*, e.g. 'Paragraph3'."""
+        counters[kind] += 1
+        return f'{kind}{counters[kind]}'
+
+    for record in data:
+        record_type = record['Type']
+
+        if record_type in ('Header', 'SubHeader'):
+            continue  # Headers are emitted through the fields above
+
+        if record_type in ('Paragraph', 'Preformatted'):
+            html_section[next_key(record_type)] = record['Text']
+        elif record_type == 'Table':
+            html_section[next_key('Table')] = {
+                'Type': record_type,
+                'TableControl': record['TableControl'],
+                'TopHeadings': record['TopHeadings'],
+                'Columns': record['Columns'],
+            }
+        else:  # Inputs, selects, textareas and buttons
+            entry = {
+                'Type': record_type,
+                'Value': record.get('Value', ''),
+                'Name': record.get('Name'),
+                'Label': record.get('Label'),
+            }
+            # Only selects carry an option list
+            if 'Options' in record:
+                entry['Options'] = record['Options']
+            html_section[next_key('Input')] = entry
+
+    json5_data = {
+        'PackageName': package_name,
+        'prefix': prefix,
+        'MenuHeading': 'Miscellaneous',
+        'MenuDescription': insert_spaces_before_caps(package_name),
+        'MenuNavigation': '2000 400',
+        'firstPanel': 'PARAMS',
+        'signalEvent': f'smeserver-{package_name.lower()}-update',
+        'html': html_section,
+    }
+
+    # Serialise once, then convert the quoting string by string. A blanket
+    # replace of '"' with "'" over the whole text yields invalid JSON5 as
+    # soon as any value contains an apostrophe (e.g. "It's").
+    json_text = json.dumps(json5_data, ensure_ascii=False, indent=4)
+    with open(output_filename, 'w', encoding='utf-8') as json_file:
+        json_file.write(_single_quote_json_strings(json_text))
+
+def main(input_file=None):
+    """Convert one HTML page into a ``.json5`` file written next to it.
+
+    Args:
+        input_file: Path of the HTML file to convert. When omitted, the
+            path that was previously hard-coded in this script is used, so
+            a bare ``main()`` call behaves as before.
+
+    The output file has the input's base name with the extension replaced
+    by ``.json5``; that base name (without extension) is also used as the
+    package name. A confirmation line is printed once the file is saved.
+
+    Raises:
+        ValueError: Propagated from ``validate_html`` when the document
+            cannot be parsed.
+    """
+    if input_file is None:
+        input_file = '/home/brianr/clients/SM2/SM1-JSONGen/DiskUsage.html'
+
+    # Read, validate, then extract
+    html_content = read_html_file(input_file)
+    validate_html(html_content)
+    data, header, sub_header = extract_data(html_content)
+
+    # Derive the package name and output path from the input file name
+    output_directory, base_name = os.path.split(input_file)
+    package_name = os.path.splitext(base_name)[0]
+    output_file = os.path.join(output_directory, package_name + '.json5')
+
+    save_to_json5(data, output_file, package_name, header, sub_header)
+    print(f"Extracted data saved to '{output_file}'.")
+
+if __name__ == '__main__':
+    import sys
+
+    # Optional first command-line argument: path of the HTML file to convert.
+    main(*sys.argv[1:2])