Add in original html - play with sanitising paragraphs

This commit is contained in:
2024-09-14 18:08:48 +01:00
parent 3813f55f68
commit 3b6e1930a0
30 changed files with 562 additions and 167 deletions

View File

@@ -16,6 +16,18 @@ def validate_html(html):
etree.fromstring(html, parser) # Attempt to parse the HTML
except Exception as e:
raise ValueError("Invalid HTML document") from e
def sanitize_text(text):
    """Return *text* flattened onto one line with quote characters escaped.

    Every newline, carriage return, and tab is turned into a single space
    (so a Windows ``\\r\\n`` pair yields two spaces), each double or single
    quote is prefixed with a backslash, and leading/trailing whitespace is
    removed from the result.
    """
    # A single translation table applies all substitutions in one pass;
    # none of the replacement strings contain a character that is itself
    # a key, so this matches applying the replacements one after another.
    substitutions = str.maketrans({
        '\n': ' ',
        '\r': ' ',
        '\t': ' ',
        '"': '\\"',
        "'": "\\'",
    })
    return text.translate(substitutions).strip()
def extract_data(html):
"""Extract paragraphs, inputs, tables, and pre blocks from HTML and organize them in order."""
@@ -54,9 +66,10 @@ def extract_data(html):
elif element.name == 'p':
text = element.get_text(strip=True)
if text: # Ignore empty paragraphs
# Sanitise text: replace newlines and tabs with spaces, and escape quotes.
records.append({
'Type': 'Paragraph',
'Text': text
'Text': sanitize_text(text)
})
elif element.name == 'pre':
@@ -233,7 +246,7 @@ def save_to_json5(data, output_filename, package_name, header, sub_header):
json_file.truncate() # Remove any old content beyond the new content length
def main():
input_file = '/home/brianr/clients/SM2/SM1-JSONGen/DiskUsage.html' # Specify the input HTML file path
input_file = '/home/brianr/clients/SM2/SM2Gen/venv/html/CreateStarterWebsite.html' # Specify the input HTML file path
# Read HTML content
html_content = read_html_file(input_file)