More fixing paragraph text quotes etc

2024-09-16 14:01:17 +01:00
parent 265901414c
commit 5fc9f9b763
26 changed files with 199 additions and 216 deletions
--- a/sm1-html-2-json5.py
+++ b/sm1-html-2-json5.py
@@ -21,24 +21,54 @@ def validate_html(html):
    except Exception as e:
        raise ValueError("Invalid HTML document") from e

+def convert_double_quotes_to_span(text):
+    """Convert double-quoted text to <span class=emphasis-para>...</span>."""
+    # Use a regular expression to find double-quoted text and wrap it in a span
+    return re.sub(r'"(.*?)"', r"<span class=emphasis-para>\1</span>", text)

+
+# def sanitize_text(text):
+    # # Replace newlines with spaces
+    # print(f"--{text}--")
+    # decoded_text = html.unescape(text)
+
+    # sanitized_text = decoded_text.replace("\n", "").replace(
+        # "\r", " "
+    # )    # Handle both Unix and Windows line endings
+    # # Replace tabs with spaces
+    # sanitized_text = sanitized_text.replace("\t", "")
+    # # map single quotes to double
+    # # sanitized_text = sanitized_text.replace("'", '"')
+    # #Map single and double quotes to nothing
+    # sanitized_text.replace("'","").replace('"','')
+    # #Take out any multiple spaces - reduce to one.
+    # sanitized_text = ' '.join(sanitized_text.split())
+    # # Strip leading and trailing whitespace
+    # sanitized_text = sanitized_text.strip()
+    # #sanitized_text = convert_double_quotes_to_span(sanitized_text)
+    # print(f"++{sanitized_text}++")
+    # return sanitized_text
+    
 def sanitize_text(text):
    # Replace newlines with spaces
+    print(f"--{text}--")
+    # Decode HTML entities (e.g. &amp; -> &)
    decoded_text = html.unescape(text)
-    sanitized_text = decoded_text.replace("\n", "").replace(
-        "\r", " "
-    )  # Handle both Unix and Windows line endings
+    # Replace newlines and carriage returns with spaces
+    sanitized_text = decoded_text.replace('\n', ' ').replace('\r', ' ')  # Handle both Unix and Windows line endings
    # Replace tabs with spaces
-    sanitized_text = sanitized_text.replace("\t", "")
-    # map single quotes to double
-    sanitized_text = sanitized_text.replace("'", '"')
+    sanitized_text = sanitized_text.replace('\t', ' ')
+    # Remove quote characters
+    sanitized_text = sanitized_text.replace('"', '').replace("'", '')  # Remove double and single quotes
    #Take out any multiple spaces - reduce to one.
    sanitized_text = ' '.join(sanitized_text.split())
    # Strip leading and trailing whitespace
    sanitized_text = sanitized_text.strip()
+    print(f"++{sanitized_text}++")
    return sanitized_text


+
 def extract_data(html):
    """Extract paragraphs, inputs, tables, and pre blocks from HTML and organize them in order."""
    soup = BeautifulSoup(html, "lxml")
@@ -68,7 +98,10 @@ def extract_data(html):
            text = element.get_text(strip=True)
            if text:  # Ignore empty paragraphs
                # Sanitise text freom newlines,tabs and escape quotes.
-                records.append({"Type": "Paragraph", "Text": sanitize_text(text)})
+                sanitised_text = sanitize_text(text)
+                if sanitised_text == "":
+                   continue
+                records.append({"Type": "Paragraph", "Text": sanitised_text})

        elif element.name == "pre":
            text = element.get_text(strip=True)