import json
import re
from bs4 import BeautifulSoup
from lxml import etree # Import lxml for HTML validation
import html
import argparse
import pkg_resources
import sys
import traceback
import os
from datetime import datetime, timedelta
# Version of this converter; embedded in the banner built by assemble_version_string().
sm1_html_2_json5_version = "0.5"
def assemble_version_string():
    """Build the one-line version banner for this tool.

    The banner contains the converter version, the installed Chameleon
    version (or a placeholder when the distribution is not installed), the
    running Python version as ``major.minor.patch`` and the current local
    date/time.

    Returns:
        The assembled banner string.
    """
    # Chameleon may be absent; report a placeholder rather than failing.
    try:
        chameleon_version = pkg_resources.get_distribution("Chameleon").version
    except pkg_resources.DistributionNotFound:
        chameleon_version = "No version information"

    # sys.version also carries build details; keep only the dotted triple.
    found = re.search(r"(\d{1,3}\.\d{1,3}\.\d{1,3})", sys.version)
    python_version = "Unknown" if found is None else found.group(0)

    formatted_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    pieces = (
        "sm1-html-2-json5 version:",
        sm1_html_2_json5_version,
        " Chameleon version:",
        chameleon_version,
        " On Python:",
        python_version,
        " at ",
        formatted_datetime,
    )
    return "".join(pieces)
def check_file_version(filename, ThresholdSecs=3):
    """Return ``filename``, or ``filename + ".new"`` if the file was edited after it was generated.

    The first five lines of the file are scanned for a header line containing
    ``" at "`` followed by a ``YYYY-MM-DD HH:MM:SS`` timestamp. That timestamp
    plus ``ThresholdSecs`` is compared with the file's last-modified time
    (truncated to whole seconds).

    Args:
        filename: Path of the file to inspect.
        ThresholdSecs: Grace period, in seconds, added to the header timestamp
            before comparing it with the modification time.

    Returns:
        ``f"{filename}.new"`` when the file was modified later than the header
        timestamp plus the threshold; otherwise ``filename`` unchanged. The
        original ``filename`` is also returned when the file is missing, has
        no timestamp header, or any other error occurs (a message is printed).
    """
    try:
        # Only the header is needed. Decode as UTF-8 (as read_html_file does)
        # and never fail on stray bytes in this best-effort check.
        with open(filename, "r", encoding="utf-8", errors="replace") as file:
            header_lines = [file.readline().strip() for _ in range(5)]

        # Extract the timestamp from the first header line that mentions " at ".
        timestamp_str = None
        for line in header_lines:
            if " at " in line:
                print(line)
                # Take what follows the LAST " at ". Splitting on the bare
                # substring "at" depended on how many other words in the line
                # happened to contain "at" (e.g. "Generated", "Template").
                timestamp_str = line.rsplit(" at ", 1)[1].strip()
                break

        if timestamp_str is None:
            print("Warning: No timestamp found. Returning original filename.")
            return filename  # Nothing to compare against

        # Header timestamp plus the grace period.
        file_timestamp = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S")
        file_timestamp += timedelta(seconds=ThresholdSecs)

        # Last-modified time, truncated to whole seconds to match the header.
        file_modified_time = datetime.fromtimestamp(
            os.path.getmtime(filename)
        ).replace(microsecond=0)
        print(file_modified_time, file_timestamp)

        if file_modified_time > file_timestamp:
            return f"{filename}.new"
        return filename
    except FileNotFoundError:
        print(f"Error: The file '{filename}' does not exist.")
        return filename
    except Exception:
        # Deliberately broad: a malformed header must not abort the caller.
        print(f"An error occurred: {traceback.format_exc()}")
        return filename
def read_html_file(filename):
    """Return the full text of ``filename``, decoded as UTF-8."""
    with open(filename, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents
def validate_html(html):
    """Check that ``html`` can be parsed by lxml's HTML parser.

    Raises:
        ValueError: If parsing fails for any reason; the underlying
            exception is chained as the cause.
    """
    try:
        # The parse result is discarded; only success or failure matters.
        etree.fromstring(html, etree.HTMLParser())
    except Exception as parse_error:
        raise ValueError("Invalid HTML document") from parse_error
def convert_double_quotes_to_span(text):
    """Drop the surrounding double quotes from each "quoted" run in ``text``.

    Each shortest run between a pair of double quotes on one line is replaced
    by its inner text. An unpaired quote is left untouched.

    NOTE(review): the name suggests the text should be wrapped in a span, but
    the replacement only removes the quotes - confirm the intended markup.
    """
    quoted_run = re.compile(r'"(.*?)"')
    return quoted_run.sub(r"\1", text)
# def sanitize_text(text):
# # Replace newlines with spaces
# print(f"--{text}--")
# decoded_text = html.unescape(text)
# sanitized_text = decoded_text.replace("\n", "").replace(
# "\r", " "
# ) # Handle both Unix and Windows line endings
# # Replace tabs with spaces
# sanitized_text = sanitized_text.replace("\t", "")
# # map single quotes to double
# # sanitized_text = sanitized_text.replace("'", '"')
# #Map single and double quotes to nothing
# sanitized_text.replace("'","").replace('"','')
# #Take out any multiple spaces - reduce to one.
# sanitized_text = ' '.join(sanitized_text.split())
# # Strip leading and trailing whitespace
# sanitized_text = sanitized_text.strip()
# #sanitized_text = convert_double_quotes_to_span(sanitized_text)
# print(f"++{sanitized_text}++")
# return sanitized_text
def sanitize_text(text):
    """Flatten ``text`` into a single clean line.

    HTML entities are decoded, newlines/carriage returns/tabs become spaces,
    single and double quotes are removed, and runs of whitespace collapse to
    one space with no leading or trailing whitespace. The raw and cleaned
    values are printed for tracing.

    Returns:
        The cleaned string.
    """
    print(f"--{text}--")
    # Decode entities first so that e.g. &quot; is removed like a literal quote.
    unescaped = html.unescape(text)
    # One pass: line breaks and tabs become spaces, quote characters vanish.
    cleanup = str.maketrans(
        {"\n": " ", "\r": " ", "\t": " ", '"': None, "'": None}
    )
    # split()/join collapses whitespace runs and trims both ends.
    sanitized_text = " ".join(unescaped.translate(cleanup).split())
    print(f"++{sanitized_text}++")
    return sanitized_text
def extract_data(html):
    """Extract headers, paragraphs, links, form controls and tables from HTML.

    The markup is parsed with BeautifulSoup (lxml parser) and the recognised
    elements are visited in document order. Each one yields a dict in
    ``records`` whose ``"Type"`` key identifies it:

    * ``h1`` / ``h2``: ``Header`` / ``SubHeader`` with ``Text``.
    * ``p``: ``Paragraph`` with sanitised ``Text`` (empty ones are skipped).
    * ``pre``: ``Preformatted`` with ``Text`` (empty ones are skipped).
    * ``a``: ``Link`` with ``href`` and ``title``.
    * ``input``: the capitalised ``type`` attribute (default ``Text``) with
      ``Name``, ``Value`` and ``Label``; hidden inputs and those named in
      ``hidden_input_names`` are skipped.
    * ``select`` / ``textarea`` / ``button``: ``Select`` / ``Textarea`` /
      ``Button`` with ``Name``, ``Options`` or ``Value``, and ``Label``.
    * ``table`` with class ``sme-border``: ``Table`` with ``TableControl``,
      ``TopHeadings`` and ``Columns``; other tables are ignored.

    Args:
        html: The HTML markup to parse.

    Returns:
        A tuple ``(records, header_text, sub_header_text)`` where the last two
        items are the text of the last ``h1`` / ``h2`` seen, or ``None``.
    """
    soup = BeautifulSoup(html, "lxml")
    records = []
    hidden_input_names = ["page", "page_stack", ".id", "csrf_token"]
    header_text = None
    sub_header_text = None
    # Counter used to name tables "Table1", "Table2", ...
    table_counter = 0

    # find_all returns matches in document order, which keeps the records ordered.
    for element in soup.find_all(
        ["h1", "h2", "p", "pre", "input", "select", "textarea", "button", "table", "a"]
    ):
        if element.name == "h1":
            header_text = element.get_text(strip=True)
            records.append({"Type": "Header", "Text": header_text})

        elif element.name == "h2":
            sub_header_text = element.get_text(strip=True)
            records.append({"Type": "SubHeader", "Text": sub_header_text})

        elif element.name == "p":
            text = element.get_text(strip=True)
            if text:  # Ignore empty paragraphs
                # Strip newlines, tabs and quotes; the result may be empty.
                sanitised_text = sanitize_text(text)
                if sanitised_text:
                    records.append({"Type": "Paragraph", "Text": sanitised_text})

        elif element.name == "pre":
            text = element.get_text(strip=True)
            if text:  # Ensure non-empty before adding
                records.append({"Type": "Preformatted", "Text": text})

        elif element.name == "a":
            title = element.get_text(strip=True)
            href = element.get("href")
            records.append({"Type": "Link", "href": href, "title": title})

        elif element.name == "input":
            if (
                element.get("type") == "hidden"
                or element.get("name") in hidden_input_names
            ):
                continue
            # Inputs take the label that FOLLOWS them in the document.
            label = element.find_next("label")
            records.append(
                {
                    "Type": element.get("type", "text").capitalize(),
                    "Name": element.get("name"),
                    "Value": element.get("value", ""),
                    "Label": label.get_text(strip=True) if label else None,
                }
            )

        elif element.name == "select":
            options = [
                {"Value": option.get("value"), "Text": option.get_text(strip=True)}
                for option in element.find_all("option")
            ]
            label = element.find_previous("label")
            records.append(
                {
                    "Type": "Select",
                    "Name": element.get("name"),
                    "Options": options,
                    "Label": label.get_text(strip=True) if label else None,
                }
            )

        elif element.name == "textarea":
            label = element.find_previous("label")
            records.append(
                {
                    "Type": "Textarea",
                    "Name": element.get("name"),
                    "Value": element.get_text(strip=True),
                    "Label": label.get_text(strip=True) if label else None,
                }
            )

        elif element.name == "button":
            # Look up this button's own preceding label. Testing a `label`
            # left over from an earlier branch is unbound when a button comes
            # first, and stale otherwise.
            label = element.find_previous("label")
            records.append(
                {
                    "Type": "Button",
                    "Name": element.get("name"),
                    "Value": element.get_text(strip=True),
                    "Label": label.get_text(strip=True) if label else None,
                }
            )

        elif element.name == "table" and "sme-border" in element.get("class", []):
            table_counter += 1
            table_control = f"Table{table_counter}"  # e.g., "Table1", "Table2"
            top_headings = []
            columns = []

            rows = element.find_all("tr")
            if rows:
                # Headings come from the <th> cells of the first row.
                header_cells = rows[0].find_all("th")
                top_headings = [th.get_text(strip=True) for th in header_cells]

                # Only the first data row is inspected: one column entry per
                # heading that has a matching <td> at the same position.
                data_rows = rows[1:]
                if data_rows:
                    data_cells = data_rows[0].find_all("td")
                    columns = [
                        f"{table_control}-{th.get_text(strip=True)}"
                        for th, _td in zip(header_cells, data_cells)
                    ]

            records.append(
                {
                    "Type": "Table",
                    "TableControl": table_control,
                    "TopHeadings": top_headings,
                    "Columns": columns,
                }
            )

    return records, header_text, sub_header_text
def insert_spaces_before_caps(text):
"""Insert spaces before each capital letter in a given string."""
return re.sub(r"(?