import json
import re
from bs4 import BeautifulSoup
from lxml import etree  # lxml is used only for HTML validation
import html
import argparse
import pkg_resources
import sys
import traceback
import os
from datetime import datetime, timedelta

sm1_html_2_json5_version = "0.5"


def assemble_version_string():
    """Build a one-line version banner.

    Combines this tool's version, the installed Chameleon version (if any),
    the Python interpreter version and the current timestamp, e.g.
    "sm1-html-2-json5 version:0.5 Chameleon version:X On Python:Y at 2024-01-01 12:00:00".
    """
    try:
        chameleon_version = pkg_resources.get_distribution("Chameleon").version
    except pkg_resources.DistributionNotFound:
        chameleon_version = "No version information"
    # Pull just the "major.minor.micro" triple out of sys.version.
    version_match = re.search(r"(\d{1,3}\.\d{1,3}\.\d{1,3})", sys.version)
    python_version = version_match.group(0) if version_match else "Unknown"
    formatted_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    strVersion = (
        "sm1-html-2-json5 version:"
        + sm1_html_2_json5_version
        + " Chameleon version:"
        + chameleon_version
        + " On Python:"
        + python_version
        + " at "
        + formatted_datetime
    )
    return strVersion


def check_file_version(filename, ThresholdSecs=3):
    """Return ``filename + ".new"`` if the file was hand-edited after generation.

    The generated file's header (written by save_to_json5) contains a line of
    the form "// Generated by ... at YYYY-mm-dd HH:MM:SS".  If the file's
    mtime is later than that timestamp plus ThresholdSecs, the file is
    considered modified and a ".new" suffixed name is returned so the edit is
    not clobbered.  Any parse or I/O problem falls back to returning the
    original filename unchanged (best effort).
    """
    try:
        with open(filename, "r") as file:
            # The generated header lives in the first five lines.
            header_lines = [file.readline().strip() for _ in range(5)]

        # The generation timestamp follows the last " at " marker.
        # NOTE: splitting on the bare substring 'at' (as before) mis-indexed
        # whenever an earlier word contained "at" (e.g. "Generated",
        # "information"), so split on the delimited " at " token instead.
        timestamp_str = None
        for line in header_lines:
            if " at " in line:
                timestamp_str = line.rsplit(" at ", 1)[1].strip()
                break

        if timestamp_str is None:
            print("Warning: No timestamp found. Returning original filename.")
            return filename

        file_timestamp = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S")
        # Grace period: edits within ThresholdSecs of generation don't count.
        file_timestamp += timedelta(seconds=ThresholdSecs)

        # Last-modified time, ignoring sub-second precision to match the
        # second-granularity header timestamp.
        file_modified_time = datetime.fromtimestamp(
            os.path.getmtime(filename)
        ).replace(microsecond=0)

        if file_modified_time > file_timestamp:
            return f"{filename}.new"
        return filename
    except FileNotFoundError:
        print(f"Error: The file '{filename}' does not exist.")
        return filename
    except Exception:
        print(f"An error occurred: {traceback.format_exc()}")
        return filename


def read_html_file(filename):
    """Read and return the HTML content of *filename* (UTF-8)."""
    with open(filename, "r", encoding="utf-8") as file:
        return file.read()


def validate_html(html):
    """Raise ValueError if *html* cannot be parsed as an HTML document."""
    try:
        parser = etree.HTMLParser()
        etree.fromstring(html, parser)  # attempt to parse the HTML
    except Exception as e:
        raise ValueError("Invalid HTML document") from e


def convert_double_quotes_to_span(text):
    """Convert double-quoted text to <span class=emphasis-para>...</span>."""
    return re.sub(r'"(.*?)"', r"<span class=emphasis-para>\1</span>", text)
def sanitize_text(text):
    """Normalise scraped text for embedding in JSON5.

    Decodes HTML entities, turns newlines/carriage returns/tabs into spaces,
    removes single and double quote characters (they would break the crude
    JSON5 quoting performed later), collapses runs of whitespace to a single
    space and strips the ends.
    """
    # Decode HTML entities (&amp; -> &, etc.).
    decoded_text = html.unescape(text)
    # Handle both Unix and Windows line endings, and tabs, as spaces.
    sanitized_text = (
        decoded_text.replace("\n", " ").replace("\r", " ").replace("\t", " ")
    )
    # Remove double and single quotes.
    sanitized_text = sanitized_text.replace('"', "").replace("'", "")
    # Collapse multiple spaces to one and strip leading/trailing whitespace.
    return " ".join(sanitized_text.split())


def extract_data(html):
    """Extract page elements from *html*, preserving document order.

    Walks headers, paragraphs, links, form controls, tables (class
    "sme-border") and <pre> blocks, producing one record dict per element.

    Returns a tuple (records, header_text, sub_header_text) where the header
    texts are the last <h1>/<h2> seen, or None.
    """
    soup = BeautifulSoup(html, "lxml")
    records = []
    # Hidden plumbing fields that should never appear as user-facing inputs.
    hidden_input_names = ["page", "page_stack", ".id", "csrf_token"]
    header_text = None
    sub_header_text = None
    table_counter = 0

    # Extract elements while preserving order.
    for element in soup.find_all(
        ["h1", "h2", "p", "pre", "input", "select", "textarea", "button", "table", "a"]
    ):
        if element.name == "h1":
            header_text = element.get_text(strip=True)
            records.append({"Type": "Header", "Text": header_text})
        elif element.name == "h2":
            sub_header_text = element.get_text(strip=True)
            records.append({"Type": "SubHeader", "Text": sub_header_text})
        elif element.name == "p":
            text = element.get_text(strip=True)
            if text:  # ignore empty paragraphs
                # Sanitise text from newlines, tabs and quotes.
                sanitised_text = sanitize_text(text)
                if sanitised_text == "":
                    continue
                records.append({"Type": "Paragraph", "Text": sanitised_text})
        elif element.name == "pre":
            text = element.get_text(strip=True)
            if text:  # ensure non-empty before adding
                records.append({"Type": "Preformatted", "Text": text})
        elif element.name == "a":
            records.append(
                {
                    "Type": "Link",
                    "href": element.get("href"),
                    "title": element.get_text(strip=True),
                }
            )
        elif element.name == "input":
            if (
                element.get("type") == "hidden"
                or element.get("name") in hidden_input_names
            ):
                continue
            input_info = {
                "Type": element.get("type", "text").capitalize(),
                "Name": element.get("name"),
                "Value": element.get("value", ""),
            }
            label = element.find_next("label")
            input_info["Label"] = label.get_text(strip=True) if label else None
            records.append(input_info)
        elif element.name == "select":
            options = [
                {"Value": option.get("value"), "Text": option.get_text(strip=True)}
                for option in element.find_all("option")
            ]
            label = element.find_previous("label")
            records.append(
                {
                    "Type": "Select",
                    "Name": element.get("name"),
                    "Options": options,
                    "Label": label.get_text(strip=True) if label else None,
                }
            )
        elif element.name == "textarea":
            label = element.find_previous("label")
            records.append(
                {
                    "Type": "Textarea",
                    "Name": element.get("name"),
                    "Value": element.get_text(strip=True),
                    "Label": label.get_text(strip=True) if label else None,
                }
            )
        elif element.name == "button":
            # BUG FIX: the original tested a stale `label` left over from an
            # earlier branch (NameError when a button is the first matched
            # element, wrong label otherwise).  Look up the button's own
            # preceding label instead.
            label = element.find_previous("label")
            records.append(
                {
                    "Type": "Button",
                    "Name": element.get("name"),
                    "Value": element.get_text(strip=True),
                    "Label": label.get_text(strip=True) if label else None,
                }
            )
        elif element.name == "table" and "sme-border" in element.get("class", []):
            table_counter += 1
            table_control = f"Table{table_counter}"  # e.g. "Table1", "Table2"
            top_headings = []
            columns = []
            # Headings come from the first row.
            first_row = element.find("tr")
            if first_row:
                for th in first_row.find_all("th"):
                    top_headings.append(th.get_text(strip=True))
                # Only the first data row is inspected when building Columns.
                data_rows = element.find_all("tr")[1:]  # skip the heading row
                if data_rows:
                    first_cells = data_rows[0].find_all("td")  # hoisted lookup
                    for idx, th in enumerate(first_row.find_all("th")):
                        td = first_cells[idx] if idx < len(first_cells) else None
                        if td:
                            columns.append(
                                f"{table_control}-{th.get_text(strip=True)}"
                            )
            records.append(
                {
                    "Type": "Table",
                    "TableControl": table_control,
                    "TopHeadings": top_headings,
                    "Columns": columns,
                }
            )
    return records, header_text, sub_header_text


def insert_spaces_before_caps(text):
    """Insert a space before each capital letter (except a leading one)."""
    return re.sub(r"(?<!^)(?=[A-Z])", " ", text)


def save_to_json5(data, output_filename, package_name, header, sub_header, strVersion):
    """Save extracted *data* to *output_filename* in a JSON5-flavoured layout.

    Wraps the records in the package-level metadata expected downstream,
    numbers each record type (Paragraph1, Input1, Table1, ...), then writes
    the serialized JSON with a "// Generated by ..." header and every double
    quote swapped for a single quote (a crude but sufficient JSON5-ification,
    since sanitize_text has already stripped quotes from the text values).
    """
    # prefix = the capital letters of the package name, lowercased.
    prefix = "".join(re.findall(r"[A-Z]", package_name)).lower()

    # Number the records per type, preserving document order.
    structured_html = []
    paragraph_count = 1
    preformatted_count = 1
    input_count = 1
    table_count = 1
    link_count = 1
    for record in data:
        rtype = record["Type"]
        if rtype == "Paragraph":
            structured_html.append({f"Paragraph{paragraph_count}": record["Text"]})
            paragraph_count += 1
        elif rtype == "Preformatted":
            structured_html.append(
                {f"Preformatted{preformatted_count}": record["Text"]}
            )
            preformatted_count += 1
        elif rtype == "Link":
            structured_html.append(
                {
                    f"Link{link_count}": {
                        "Type": rtype,
                        "href": record["href"],
                        "title": record["title"],
                    }
                }
            )
            link_count += 1
        elif rtype in ("Header", "SubHeader"):
            continue  # headers are emitted separately, not as inputs
        elif rtype == "Table":
            structured_html.append(
                {
                    f"Table{table_count}": {
                        "Type": rtype,
                        "TableControl": record["TableControl"],
                        "TopHeadings": record["TopHeadings"],
                        "Columns": record["Columns"],
                    }
                }
            )
            table_count += 1
        else:
            # Inputs, selects, textareas and buttons.
            input_structure = {
                "Type": rtype,
                "Value": record.get("Value", ""),  # safely access Value
                "Name": record.get("Name"),  # None if not present
                "Label": record.get("Label"),  # None if not present
            }
            if "Options" in record:  # Select controls carry their options
                input_structure["Options"] = record["Options"]
            structured_html.append({f"Input{input_count}": input_structure})
            input_count += 1

    # Wrap the records with the required package-level fields.
    json5_data = {
        "PackageName": package_name,
        "prefix": prefix,
        "MenuHeading": "Miscellaneous",
        "MenuDescription": insert_spaces_before_caps(package_name),
        "MenuNavigation": "2000 400",
        "firstPanel": "PARAMS",
        "signalEvent": f"smeserver-{package_name.lower()}-update",
        "html": [
            {
                "Name": "params",
                "route": "PARAMS",
                "Header": header if header else f"{package_name} Contrib",
                "SubHeader": sub_header
                if sub_header
                else f"Manage {package_name} settings:",
                # Flatten structured_html into this dict.
                **{k: v for item in structured_html for k, v in item.items()},
            }
        ],
    }

    # Build the final content in memory and write it once.  (The previous
    # implementation wrote the JSON, then re-opened, re-read and re-wrote the
    # same file to prepend the header - the bytes produced are identical.)
    json_text = json.dumps(json5_data, ensure_ascii=False, indent=4)
    content = f"//\n// Generated by {strVersion}\n//\n" + json_text
    # Crude JSON5-ification: replace double quotes with single quotes.
    content = content.replace('"', "'")
    with open(output_filename, "w", encoding="utf-8") as json_file:
        json_file.write(content)
def main():
    """Command-line entry point: convert one HTML file to a JSON5 panel file.

    Reads <directory>/<filename>.html, validates it, extracts its elements and
    writes the JSON5 description to the sibling json5/ directory, suffixing
    ".new" if an existing output file was hand-edited after generation.
    """
    strVersion = assemble_version_string()

    # Command line parameters.
    parser = argparse.ArgumentParser(description="sm1-html-2-json5")
    parser.add_argument(
        "-f",
        "--filename",
        help="Specify a filename for the html file",
        default="CreateStarterWebsite.html",
    )
    # The source directory used to be hard coded; it is now overridable while
    # keeping the original location as the default.
    parser.add_argument(
        "-d",
        "--directory",
        help="Directory containing the html file",
        default="/home/brianr/clients/SM2/SM2Gen/venv/html/",
    )
    args = parser.parse_args()

    input_file = os.path.join(args.directory, args.filename)
    if not input_file.lower().endswith(".html"):
        input_file += ".html"  # ensure the .html extension
    print(input_file)

    # Read HTML content and validate it before extracting data.
    html_content = read_html_file(input_file)
    validate_html(html_content)
    data, header, sub_header = extract_data(html_content)

    # Generate the output JSON5 path: mirror .../html/ into .../json5/ and
    # swap the extension; check_file_version protects hand-edited output.
    directory, filename = os.path.split(input_file)
    new_directory = directory.replace("/html", "/json5")
    output_file = check_file_version(
        os.path.join(new_directory, filename.replace(".html", ".json5"))
    )
    print(output_file)

    # PackageName = the input file name without its extension.
    base_name = os.path.basename(input_file)
    package_name = os.path.splitext(base_name)[0]

    # Save extracted data to JSON5.
    save_to_json5(data, output_file, package_name, header, sub_header, strVersion)
    print(f"Extracted data saved to '{output_file}'.")


if __name__ == "__main__":
    main()