Add in original html - play with sanitising paragraphs

2024-09-14 18:08:48 +01:00
parent 3813f55f68
commit 3b6e1930a0
30 changed files with 562 additions and 167 deletions
--- a/sm1-html-2-json5.py
+++ b/sm1-html-2-json5.py
@@ -16,6 +16,18 @@ def validate_html(html):
        etree.fromstring(html, parser)  # Attempt to parse the HTML
    except Exception as e:
        raise ValueError("Invalid HTML document") from e
+        
+def sanitize_text(text):
+    """Flatten text to one line, backslash-escape quotes, and trim the ends."""
+    sanitized_text = text.replace('\n', ' ').replace('\r', ' ')  # '\n' and '\r' are replaced separately, so '\r\n' yields two spaces
+    # Replace tabs with spaces
+    sanitized_text = sanitized_text.replace('\t', ' ')
+    # Backslash-escape " and ' -- NOTE(review): may be escaped again if the output writer also escapes; confirm
+    sanitized_text = sanitized_text.replace('"', '\\"').replace("'", "\\'")
+    # Strip leading and trailing whitespace
+    sanitized_text = sanitized_text.strip()
+    return sanitized_text
+

 def extract_data(html):
    """Extract paragraphs, inputs, tables, and pre blocks from HTML and organize them in order."""
@@ -54,9 +66,10 @@ def extract_data(html):
        elif element.name == 'p':
            text = element.get_text(strip=True)
            if text:  # Ignore empty paragraphs
+                # Sanitise text: replace newlines and tabs with spaces, and escape quotes.
                records.append({
                    'Type': 'Paragraph',
-                    'Text': text
+                    'Text': sanitize_text(text)
                })

        elif element.name == 'pre':
@@ -233,7 +246,7 @@ def save_to_json5(data, output_filename, package_name, header, sub_header):
        json_file.truncate()  # Remove any old content beyond the new content length

 def main():
-    input_file = '/home/brianr/clients/SM2/SM1-JSONGen/DiskUsage.html'  # Specify the input HTML file path
+    input_file = '/home/brianr/clients/SM2/SM2Gen/venv/html/CreateStarterWebsite.html'  # Specify the input HTML file path

    # Read HTML content
    html_content = read_html_file(input_file)