Add in original html - play with sanitising paragraphs
This commit is contained in:
@@ -16,6 +16,18 @@ def validate_html(html):
|
||||
etree.fromstring(html, parser) # Attempt to parse the HTML
|
||||
except Exception as e:
|
||||
raise ValueError("Invalid HTML document") from e
|
||||
|
||||
def sanitize_text(text):
    """Flatten *text* onto a single line and backslash-escape its quotes.

    Line breaks and tabs are each replaced by a single space, double and
    single quote characters are prefixed with a backslash, and leading and
    trailing whitespace is stripped.

    Args:
        text: The string to clean up.

    Returns:
        The sanitized string.
    """
    # Collapse a Windows CRLF pair into one line break first; replacing
    # '\r' and '\n' independently would turn each pair into two spaces.
    single_line = text.replace('\r\n', '\n')

    # One pass over the string: remaining line breaks (Unix '\n', bare '\r')
    # and tabs become spaces, quote characters gain a leading backslash.
    replacements = str.maketrans({
        '\n': ' ',
        '\r': ' ',
        '\t': ' ',
        '"': '\\"',
        "'": "\\'",
    })
    single_line = single_line.translate(replacements)

    # Strip leading and trailing whitespace
    return single_line.strip()
|
||||
|
||||
|
||||
def extract_data(html):
|
||||
"""Extract paragraphs, inputs, tables, and pre blocks from HTML and organize them in order."""
|
||||
@@ -54,9 +66,10 @@ def extract_data(html):
|
||||
elif element.name == 'p':
|
||||
text = element.get_text(strip=True)
|
||||
if text: # Ignore empty paragraphs
|
||||
# Sanitise text from newlines, tabs and escape quotes.
|
||||
records.append({
|
||||
'Type': 'Paragraph',
|
||||
'Text': text
|
||||
'Text': sanitize_text(text)
|
||||
})
|
||||
|
||||
elif element.name == 'pre':
|
||||
@@ -233,7 +246,7 @@ def save_to_json5(data, output_filename, package_name, header, sub_header):
|
||||
json_file.truncate() # Remove any old content beyond the new content length
|
||||
|
||||
def main():
|
||||
input_file = '/home/brianr/clients/SM2/SM1-JSONGen/DiskUsage.html' # Specify the input HTML file path
|
||||
input_file = '/home/brianr/clients/SM2/SM2Gen/venv/html/CreateStarterWebsite.html' # Specify the input HTML file path
|
||||
|
||||
# Read HTML content
|
||||
html_content = read_html_file(input_file)
|
||||
|
Reference in New Issue
Block a user