More fixing paragraph text quotes etc
This commit is contained in:
@@ -21,24 +21,54 @@ def validate_html(html):
|
||||
except Exception as e:
|
||||
raise ValueError("Invalid HTML document") from e
|
||||
|
||||
def convert_double_quotes_to_span(text):
    """Wrap each double-quoted run in *text* in an emphasis ``<span>``.

    Every ``"..."`` pair (non-greedy, confined to a single line because
    ``.`` does not match a newline) is replaced with
    ``<span class=emphasis-para>...</span>``; the quote characters
    themselves are dropped. Text without a matching pair of double quotes
    is returned unchanged.

    Args:
        text: The string to transform.

    Returns:
        The string with double-quoted segments converted to span markup.
    """
    # Non-greedy group so that several quoted segments on one line are
    # wrapped separately instead of being merged into one span.
    return re.sub(r'"(.*?)"', r"<span class=emphasis-para>\1</span>", text)
|
||||
|
||||
|
||||
# def sanitize_text(text):
|
||||
# # Replace newlines with spaces
|
||||
# print(f"--{text}--")
|
||||
# decoded_text = html.unescape(text)
|
||||
|
||||
# sanitized_text = decoded_text.replace("\n", "").replace(
|
||||
# "\r", " "
|
||||
# ) # Handle both Unix and Windows line endings
|
||||
# # Replace tabs with spaces
|
||||
# sanitized_text = sanitized_text.replace("\t", "")
|
||||
# # map single quotes to double
|
||||
# # sanitized_text = sanitized_text.replace("'", '"')
|
||||
# # Map single and double quotes to nothing
|
||||
# sanitized_text.replace("'","").replace('"','')
|
||||
# #Take out any multiple spaces - reduce to one.
|
||||
# sanitized_text = ' '.join(sanitized_text.split())
|
||||
# # Strip leading and trailing whitespace
|
||||
# sanitized_text = sanitized_text.strip()
|
||||
# #sanitized_text = convert_double_quotes_to_span(sanitized_text)
|
||||
# print(f"++{sanitized_text}++")
|
||||
# return sanitized_text
|
||||
|
||||
def sanitize_text(text):
    """Return *text* flattened to a single clean line without quotes.

    Processing steps, in order:

    1. HTML entities are decoded with ``html.unescape`` (so ``&quot;``
       becomes a quote character and is then removed in step 2).
    2. All single and double quote characters are removed.
    3. Every run of whitespace (newlines, carriage returns, tabs, spaces)
       is collapsed to one space, and leading/trailing whitespace is
       dropped.

    The input and the result are echoed to stdout between ``--`` / ``++``
    markers as debugging output.

    Args:
        text: The raw text to clean.

    Returns:
        The sanitized string; empty if nothing but whitespace and quotes
        remained.
    """
    print(f"--{text}--")

    # Take out html entities
    decoded_text = html.unescape(text)

    # Remove double and single quotes in a single pass.
    unquoted_text = decoded_text.translate(str.maketrans("", "", "\"'"))

    # split() with no arguments breaks on any whitespace run, so newlines,
    # carriage returns and tabs all become word boundaries; joining with a
    # single space collapses repeats and trims both ends at once.
    sanitized_text = " ".join(unquoted_text.split())

    print(f"++{sanitized_text}++")
    return sanitized_text
|
||||
|
||||
|
||||
|
||||
def extract_data(html):
|
||||
"""Extract paragraphs, inputs, tables, and pre blocks from HTML and organize them in order."""
|
||||
soup = BeautifulSoup(html, "lxml")
|
||||
@@ -68,7 +98,10 @@ def extract_data(html):
|
||||
text = element.get_text(strip=True)
|
||||
if text: # Ignore empty paragraphs
|
||||
# Sanitise text: strip newlines, tabs and quote characters.
|
||||
records.append({"Type": "Paragraph", "Text": sanitize_text(text)})
|
||||
sanitised_text = sanitize_text(text)
|
||||
if sanitised_text == "":
|
||||
continue
|
||||
records.append({"Type": "Paragraph", "Text": sanitised_text})
|
||||
|
||||
elif element.name == "pre":
|
||||
text = element.get_text(strip=True)
|
||||
|
Reference in New Issue
Block a user