More fixing paragraph text quotes etc

This commit is contained in:
2024-09-16 14:01:17 +01:00
parent 265901414c
commit 5fc9f9b763
26 changed files with 199 additions and 216 deletions

View File

@@ -21,24 +21,54 @@ def validate_html(html):
except Exception as e:
raise ValueError("Invalid HTML document") from e
def convert_double_quotes_to_span(text: str) -> str:
    """Wrap each double-quoted run in *text* in an emphasis ``<span>``.

    Every ``"..."`` pair is replaced by
    ``<span class=emphasis-para>...</span>``; the quote characters themselves
    are dropped. Matching is non-greedy, so several quoted runs in the same
    string are wrapped separately. Single-quoted text is left untouched.

    Args:
        text: The text to scan for double-quoted runs.

    Returns:
        The text with each double-quoted run wrapped in a span.
    """
    # ``.`` does not match a newline here, so a quoted run is only converted
    # when its opening and closing quote are on the same line.
    return re.sub(r'"(.*?)"', r"<span class=emphasis-para>\1</span>", text)
# def sanitize_text(text):
# # Replace newlines with spaces
# print(f"--{text}--")
# decoded_text = html.unescape(text)
# sanitized_text = decoded_text.replace("\n", "").replace(
# "\r", " "
# ) # Handle both Unix and Windows line endings
# # Replace tabs with spaces
# sanitized_text = sanitized_text.replace("\t", "")
# # map single quotes to double
# # sanitized_text = sanitized_text.replace("'", '"')
# # Map single and double quotes to nothing
# sanitized_text.replace("'","").replace('"','')
# #Take out any multiple spaces - reduce to one.
# sanitized_text = ' '.join(sanitized_text.split())
# # Strip leading and trailing whitespace
# sanitized_text = sanitized_text.strip()
# #sanitized_text = convert_double_quotes_to_span(sanitized_text)
# print(f"++{sanitized_text}++")
# return sanitized_text
def sanitize_text(text):
    """Return *text* as a single line of plain, quote-free text.

    HTML entities are decoded, straight single and double quote characters
    are removed, and every run of whitespace (newlines, carriage returns,
    tabs, spaces) is collapsed to one space, with leading and trailing
    whitespace dropped.

    Args:
        text: Raw text, possibly containing HTML entities.

    Returns:
        The cleaned text; an empty string when nothing but whitespace and
        quote characters remains.
    """
    print(f"--{text}--")
    # Decode entities first so that e.g. &quot; becomes a real quote
    # character and is removed together with the literal ones.
    decoded_text = html.unescape(text)
    # Remove straight double and single quotes.
    unquoted_text = decoded_text.replace('"', "").replace("'", "")
    # split() with no argument breaks on any whitespace run and discards
    # leading/trailing whitespace, so tabs and line breaks act as word
    # separators (they are not deleted, which would glue words together).
    sanitized_text = " ".join(unquoted_text.split())
    print(f"++{sanitized_text}++")
    return sanitized_text
def extract_data(html):
"""Extract paragraphs, inputs, tables, and pre blocks from HTML and organize them in order."""
soup = BeautifulSoup(html, "lxml")
@@ -68,7 +98,10 @@ def extract_data(html):
text = element.get_text(strip=True)
if text: # Ignore empty paragraphs
# Sanitise text freom newlines,tabs and escape quotes.
records.append({"Type": "Paragraph", "Text": sanitize_text(text)})
sanitised_text = sanitize_text(text)
if sanitised_text == "":
continue
records.append({"Type": "Paragraph", "Text": sanitised_text})
elif element.name == "pre":
text = element.get_text(strip=True)