Add list of singleton parameters to custom, refine letsencrypt json
This commit is contained in:
@@ -150,136 +150,141 @@ def sanitize_text(text):
|
||||
|
||||
|
||||
def extract_data(html):
|
||||
"""Extract paragraphs, inputs, tables, and pre blocks from HTML and organize them in order."""
|
||||
soup = BeautifulSoup(html, "lxml")
|
||||
records = []
|
||||
"""Extract paragraphs, inputs, tables, and pre blocks from HTML and organize them in order."""
|
||||
soup = BeautifulSoup(html, "lxml")
|
||||
records = []
|
||||
|
||||
hidden_input_names = ["page", "page_stack", ".id", "csrf_token"]
|
||||
hidden_input_names = ["page", "page_stack", ".id", "csrf_token"]
|
||||
|
||||
header_text = None
|
||||
sub_header_text = None
|
||||
header_text = None
|
||||
sub_header_text = None
|
||||
|
||||
# Counter for tables
|
||||
table_counter = 0
|
||||
# Counter for tables
|
||||
table_counter = 0
|
||||
|
||||
# Extract elements while preserving order
|
||||
for element in soup.find_all(
|
||||
["h1", "h2", "p", "pre", "input", "select", "textarea", "button", "table"]
|
||||
):
|
||||
if element.name == "h1":
|
||||
header_text = element.get_text(strip=True)
|
||||
records.append({"Type": "Header", "Text": header_text})
|
||||
# Extract elements while preserving order
|
||||
for element in soup.find_all(
|
||||
["h1", "h2", "p", "pre", "input", "select", "textarea", "button", "table","a"]
|
||||
):
|
||||
if element.name == "h1":
|
||||
header_text = element.get_text(strip=True)
|
||||
records.append({"Type": "Header", "Text": header_text})
|
||||
|
||||
elif element.name == "h2":
|
||||
sub_header_text = element.get_text(strip=True)
|
||||
records.append({"Type": "SubHeader", "Text": sub_header_text})
|
||||
elif element.name == "h2":
|
||||
sub_header_text = element.get_text(strip=True)
|
||||
records.append({"Type": "SubHeader", "Text": sub_header_text})
|
||||
|
||||
elif element.name == "p":
|
||||
text = element.get_text(strip=True)
|
||||
if text: # Ignore empty paragraphs
|
||||
# Sanitise text freom newlines,tabs and escape quotes.
|
||||
sanitised_text = sanitize_text(text)
|
||||
if sanitised_text == "":
|
||||
continue
|
||||
records.append({"Type": "Paragraph", "Text": sanitised_text})
|
||||
elif element.name == "p":
|
||||
text = element.get_text(strip=True)
|
||||
if text: # Ignore empty paragraphs
|
||||
# Sanitise text from newlines,tabs and escape quotes.
|
||||
sanitised_text = sanitize_text(text)
|
||||
if sanitised_text == "":
|
||||
continue
|
||||
records.append({"Type": "Paragraph", "Text": sanitised_text})
|
||||
|
||||
elif element.name == "pre":
|
||||
text = element.get_text(strip=True)
|
||||
if text: # Ensure non-empty before adding
|
||||
records.append({"Type": "Preformatted", "Text": text})
|
||||
elif element.name == "pre":
|
||||
text = element.get_text(strip=True)
|
||||
if text: # Ensure non-empty before adding
|
||||
records.append({"Type": "Preformatted", "Text": text})
|
||||
|
||||
elif element.name == "input":
|
||||
if (
|
||||
element.get("type") == "hidden"
|
||||
or element.get("name") in hidden_input_names
|
||||
):
|
||||
continue
|
||||
elif element.name == "a":
|
||||
title = element.get_text(strip=True)
|
||||
href = element.get("href")
|
||||
records.append({"Type": "Link", "href": href, "title": title})
|
||||
|
||||
input_info = {
|
||||
"Type": element.get("type", "text").capitalize(),
|
||||
"Name": element.get("name"),
|
||||
"Value": element.get("value", ""),
|
||||
}
|
||||
label = element.find_next("label")
|
||||
input_info["Label"] = label.get_text(strip=True) if label else None
|
||||
records.append(input_info)
|
||||
elif element.name == "input":
|
||||
if (
|
||||
element.get("type") == "hidden"
|
||||
or element.get("name") in hidden_input_names
|
||||
):
|
||||
continue
|
||||
|
||||
elif element.name == "select":
|
||||
options = [
|
||||
{"Value": option.get("value"), "Text": option.get_text(strip=True)}
|
||||
for option in element.find_all("option")
|
||||
]
|
||||
select_info = {
|
||||
"Type": "Select",
|
||||
"Name": element.get("name"),
|
||||
"Options": options,
|
||||
"Label": element.find_previous("label").get_text(strip=True)
|
||||
if element.find_previous("label")
|
||||
else None,
|
||||
}
|
||||
records.append(select_info)
|
||||
input_info = {
|
||||
"Type": element.get("type", "text").capitalize(),
|
||||
"Name": element.get("name"),
|
||||
"Value": element.get("value", ""),
|
||||
}
|
||||
label = element.find_next("label")
|
||||
input_info["Label"] = label.get_text(strip=True) if label else None
|
||||
records.append(input_info)
|
||||
|
||||
elif element.name == "textarea":
|
||||
textarea_info = {
|
||||
"Type": "Textarea",
|
||||
"Name": element.get("name"),
|
||||
"Value": element.get_text(strip=True),
|
||||
}
|
||||
label = element.find_previous("label")
|
||||
textarea_info["Label"] = label.get_text(strip=True) if label else None
|
||||
records.append(textarea_info)
|
||||
elif element.name == "select":
|
||||
options = [
|
||||
{"Value": option.get("value"), "Text": option.get_text(strip=True)}
|
||||
for option in element.find_all("option")
|
||||
]
|
||||
select_info = {
|
||||
"Type": "Select",
|
||||
"Name": element.get("name"),
|
||||
"Options": options,
|
||||
"Label": element.find_previous("label").get_text(strip=True)
|
||||
if element.find_previous("label")
|
||||
else None,
|
||||
}
|
||||
records.append(select_info)
|
||||
|
||||
elif element.name == "button":
|
||||
button_info = {
|
||||
"Type": "Button",
|
||||
"Name": element.get("name"),
|
||||
"Value": element.get_text(strip=True),
|
||||
"Label": element.find_previous("label").get_text(strip=True)
|
||||
if label
|
||||
else None,
|
||||
}
|
||||
records.append(button_info)
|
||||
|
||||
elif element.name == "table" and "sme-border" in element.get("class", []):
|
||||
# Increment the table counter
|
||||
table_counter += 1
|
||||
elif element.name == "textarea":
|
||||
textarea_info = {
|
||||
"Type": "Textarea",
|
||||
"Name": element.get("name"),
|
||||
"Value": element.get_text(strip=True),
|
||||
}
|
||||
label = element.find_previous("label")
|
||||
textarea_info["Label"] = label.get_text(strip=True) if label else None
|
||||
records.append(textarea_info)
|
||||
|
||||
# Prepare the TableControl format
|
||||
table_control = f"Table{table_counter}" # e.g., "Table1", "Table2"
|
||||
top_headings = []
|
||||
columns = []
|
||||
elif element.name == "button":
|
||||
button_info = {
|
||||
"Type": "Button",
|
||||
"Name": element.get("name"),
|
||||
"Value": element.get_text(strip=True),
|
||||
"Label": element.find_previous("label").get_text(strip=True)
|
||||
if label
|
||||
else None,
|
||||
}
|
||||
records.append(button_info)
|
||||
|
||||
# Extract headings from the first row
|
||||
first_row = element.find("tr")
|
||||
if first_row:
|
||||
for th in first_row.find_all("th"):
|
||||
top_headings.append(th.get_text(strip=True))
|
||||
elif element.name == "table" and "sme-border" in element.get("class", []):
|
||||
# Increment the table counter
|
||||
table_counter += 1
|
||||
|
||||
# Extract only the first data row's cell values for Columns
|
||||
data_rows = element.find_all("tr")[1:] # Skip the heading row
|
||||
if data_rows:
|
||||
first_data_row = data_rows[0] # Take the first row of data
|
||||
for idx, th in enumerate(first_row.find_all("th")):
|
||||
td = (
|
||||
first_data_row.find_all("td")[idx]
|
||||
if idx < len(first_data_row.find_all("td"))
|
||||
else None
|
||||
)
|
||||
if td:
|
||||
columns.append(
|
||||
f"{table_control}-{th.get_text(strip=True)}"
|
||||
) # Format as desired
|
||||
# Prepare the TableControl format
|
||||
table_control = f"Table{table_counter}" # e.g., "Table1", "Table2"
|
||||
top_headings = []
|
||||
columns = []
|
||||
|
||||
records.append(
|
||||
{
|
||||
"Type": "Table",
|
||||
"TableControl": table_control,
|
||||
"TopHeadings": top_headings,
|
||||
"Columns": columns,
|
||||
}
|
||||
)
|
||||
# Extract headings from the first row
|
||||
first_row = element.find("tr")
|
||||
if first_row:
|
||||
for th in first_row.find_all("th"):
|
||||
top_headings.append(th.get_text(strip=True))
|
||||
|
||||
return records, header_text, sub_header_text
|
||||
# Extract only the first data row's cell values for Columns
|
||||
data_rows = element.find_all("tr")[1:] # Skip the heading row
|
||||
if data_rows:
|
||||
first_data_row = data_rows[0] # Take the first row of data
|
||||
for idx, th in enumerate(first_row.find_all("th")):
|
||||
td = (
|
||||
first_data_row.find_all("td")[idx]
|
||||
if idx < len(first_data_row.find_all("td"))
|
||||
else None
|
||||
)
|
||||
if td:
|
||||
columns.append(
|
||||
f"{table_control}-{th.get_text(strip=True)}"
|
||||
) # Format as desired
|
||||
|
||||
records.append(
|
||||
{
|
||||
"Type": "Table",
|
||||
"TableControl": table_control,
|
||||
"TopHeadings": top_headings,
|
||||
"Columns": columns,
|
||||
}
|
||||
)
|
||||
return records, header_text, sub_header_text
|
||||
|
||||
|
||||
def insert_spaces_before_caps(text):
|
||||
@@ -298,6 +303,7 @@ def save_to_json5(data, output_filename, package_name, header, sub_header,strVer
|
||||
preformatted_count = 1
|
||||
input_count = 1
|
||||
table_count = 1
|
||||
link_count = 1
|
||||
|
||||
for record in data:
|
||||
if record["Type"] == "Paragraph":
|
||||
@@ -308,6 +314,14 @@ def save_to_json5(data, output_filename, package_name, header, sub_header,strVer
|
||||
{f"Preformatted{preformatted_count}": record["Text"]}
|
||||
)
|
||||
preformatted_count += 1
|
||||
elif record["Type"] == "Link":
|
||||
link_structure = {
|
||||
"Type": record["Type"],
|
||||
"href": record["href"],
|
||||
"title": record["title"]
|
||||
}
|
||||
structured_html.append({f"Link{link_count}": link_structure})
|
||||
link_count += 1
|
||||
elif record["Type"] == "Header" or record["Type"] == "SubHeader":
|
||||
continue # Skip headers for input count
|
||||
elif record["Type"] == "Table":
|
||||
@@ -432,4 +446,4 @@ def main():
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
Reference in New Issue
Block a user