import json
import re
from bs4 import BeautifulSoup
from lxml import etree  # Import lxml for HTML validation
import html
import argparse
import pkg_resources
import sys
import traceback
import os
from datetime import datetime, timedelta
# Version of this converter script; embedded in the string built by
# assemble_version_string().
sm1_html_2_json5_version = "0.5"
def assemble_version_string():
    """Build a one-line description of the tool and its runtime.

    The result contains, in order: this script's version, the installed
    Chameleon distribution version (or "No version information" when the
    distribution cannot be found), the dotted numeric version of the
    running Python interpreter (or "Unknown" when none can be extracted
    from ``sys.version``), and the current local time formatted as
    ``YYYY-MM-DD HH:MM:SS``.
    """
    try:
        chameleon_version = pkg_resources.get_distribution("Chameleon").version
    except pkg_resources.DistributionNotFound:
        chameleon_version = "No version information"

    # sys.version carries build details; keep only the leading x.y.z part.
    found = re.search(r"(\d{1,3}\.\d{1,3}\.\d{1,3})", sys.version)
    if found:
        python_version = found.group(0)
    else:
        python_version = "Unknown"

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    pieces = [
        "sm1-html-2-json5 version:",
        sm1_html_2_json5_version,
        " Chameleon version:",
        chameleon_version,
        " On Python:",
        python_version,
        " at ",
        timestamp,
    ]
    return "".join(pieces)
def check_file_version(filename, ThresholdSecs=3):
    """Decide whether output should go to ``filename`` or ``filename + ".new"``.

    The first five lines of the file are searched for a generation stamp
    of the form ``... at YYYY-MM-DD HH:MM:SS``. If the file's modification
    time is more than ``ThresholdSecs`` seconds later than that stamp, the
    file is treated as changed since it was generated and
    ``f"{filename}.new"`` is returned; otherwise ``filename`` is returned.

    Args:
        filename: Path of the file to inspect.
        ThresholdSecs: Allowed gap, in seconds, between the embedded stamp
            and the file's modification time.

    Returns:
        ``filename`` or ``filename + ".new"``. On any failure (missing
        file, no stamp found, unexpected error) a message is printed and
        the original ``filename`` is returned.
    """
    # Anchor on " at " followed by the timestamp itself. Splitting the line
    # on the bare substring "at" breaks as soon as the header contains a
    # different number of "at" occurrences than expected.
    stamp_pattern = re.compile(
        r" at (\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2})"
    )
    try:
        # Only the header matters; undecodable bytes must not abort the check.
        with open(filename, "r", encoding="utf-8", errors="replace") as file:
            header_lines = [file.readline().strip() for _ in range(5)]

        timestamp_str = None
        for line in header_lines:
            found = stamp_pattern.search(line)
            if found:
                print(line)
                timestamp_str = found.group(1)
                break

        if timestamp_str is None:
            print("Warning: No timestamp found. Returning original filename.")
            return filename

        # Embedded stamp plus the tolerance window.
        file_timestamp = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S")
        file_timestamp += timedelta(seconds=ThresholdSecs)

        # The embedded stamp has whole-second resolution, so drop sub-seconds
        # from the modification time before comparing.
        file_modified_time = datetime.fromtimestamp(
            os.path.getmtime(filename)
        ).replace(microsecond=0)

        print(file_modified_time, file_timestamp)
        if file_modified_time > file_timestamp:
            return f"{filename}.new"
        return filename
    except FileNotFoundError:
        print(f"Error: The file '{filename}' does not exist.")
        return filename
    except Exception:
        # Best effort: report the problem and fall back to the original name.
        print(f"An error occurred: {traceback.format_exc()}")
        return filename
            
def read_html_file(filename):
    """Return the full text of ``filename``, decoded as UTF-8."""
    with open(filename, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents
def validate_html(html):
    """Check that ``html`` can be parsed by lxml's HTML parser.

    Returns nothing on success.

    Raises:
        ValueError: if constructing the parser or parsing the document
            raises any exception; the original error is chained as the
            cause.
    """
    try:
        # The parse result is discarded; only success or failure matters.
        etree.fromstring(html, parser=etree.HTMLParser())
    except Exception as parse_error:
        raise ValueError("Invalid HTML document") from parse_error
def convert_double_quotes_to_span(text):
    """Strip the double quotes from around each double-quoted run in ``text``.

    Each shortest ``"..."`` run on a single line is replaced by its inner
    text. NOTE(review): the function name suggests the inner text was meant
    to be wrapped in a span element; the code as written only removes the
    quotes - confirm the intended replacement.
    """
    quoted_run = re.compile(r'"(.*?)"')
    return quoted_run.sub(r"\1", text)
# def sanitize_text(text):
# # Replace newlines with spaces
# print(f"--{text}--")
# decoded_text = html.unescape(text)
# sanitized_text = decoded_text.replace("\n", "").replace(
# "\r", " "
# )    # Handle both Unix and Windows line endings
# # Replace tabs with spaces
# sanitized_text = sanitized_text.replace("\t", "")
# # map single quotes to double
# # sanitized_text = sanitized_text.replace("'", '"')
# #Map single and double quotes to nothing
# sanitized_text.replace("'","").replace('"','')
# #Take out any multiple spaces - reduce to one.
# sanitized_text = ' '.join(sanitized_text.split())
# # Strip leading and trailing whitespace
# sanitized_text = sanitized_text.strip()
# #sanitized_text = convert_double_quotes_to_span(sanitized_text)
# print(f"++{sanitized_text}++")
# return sanitized_text
def sanitize_text(text):
    """Return ``text`` flattened to a single clean line.

    HTML entities are decoded, all single and double quote characters are
    removed, and every run of whitespace (including newlines and tabs) is
    collapsed to one space with no leading or trailing whitespace.

    The input and the result are echoed to stdout, wrapped in ``--`` and
    ``++`` markers respectively.
    """
    print(f"--{text}--")

    # Decode entities first so that e.g. &quot; is removed along with
    # literal quote characters.
    unescaped = html.unescape(text)
    quote_free = unescaped.translate({ord('"'): None, ord("'"): None})

    # split() with no arguments breaks on any whitespace run and ignores
    # leading/trailing whitespace, so one join normalises everything.
    cleaned = " ".join(quote_free.split())

    print(f"++{cleaned}++")
    return cleaned
def _label_text(label):
    """Return the stripped text of a label tag, or None when no label was found."""
    return label.get_text(strip=True) if label else None


def _table_record(table, table_control):
    """Build the "Table" record for one table element.

    ``TopHeadings`` holds the text of every ``th`` in the first row.
    ``Columns`` holds ``"<table_control>-<heading>"`` for each heading that
    has a matching ``td`` (by position) in the first data row.
    """
    rows = table.find_all("tr")
    heading_cells = rows[0].find_all("th") if rows else []
    top_headings = [th.get_text(strip=True) for th in heading_cells]

    columns = []
    if len(rows) > 1:
        # Only the first data row is consulted, and only to decide how many
        # of the headings have a corresponding cell.
        first_data_cells = rows[1].find_all("td")
        columns = [
            f"{table_control}-{heading}"
            for heading in top_headings[: len(first_data_cells)]
        ]

    return {
        "Type": "Table",
        "TableControl": table_control,
        "TopHeadings": top_headings,
        "Columns": columns,
    }


def extract_data(html):
    """Extract headers, text, form controls and tables from HTML in document order.

    Args:
        html: HTML markup to parse (handed to BeautifulSoup with the
            "lxml" parser).

    Returns:
        A tuple ``(records, header_text, sub_header_text)``:

        * ``records`` - list of dicts, one per recognised element, each
          with a ``"Type"`` key (``Header``, ``SubHeader``, ``Paragraph``,
          ``Preformatted``, the capitalised input type, ``Select``,
          ``Textarea``, ``Button`` or ``Table``).
        * ``header_text`` - text of the last ``h1`` seen, or None.
        * ``sub_header_text`` - text of the last ``h2`` seen, or None.

    Hidden inputs and inputs named page, page_stack, .id or csrf_token are
    skipped. Only tables carrying the ``sme-border`` class are recorded;
    they are numbered Table1, Table2, ... in order of appearance.
    """
    soup = BeautifulSoup(html, "lxml")
    records = []
    hidden_input_names = ("page", "page_stack", ".id", "csrf_token")
    header_text = None
    sub_header_text = None
    table_counter = 0

    # find_all with a list of names yields matches in document order.
    for element in soup.find_all(
        ["h1", "h2", "p", "pre", "input", "select", "textarea", "button", "table"]
    ):
        if element.name == "h1":
            header_text = element.get_text(strip=True)
            records.append({"Type": "Header", "Text": header_text})
        elif element.name == "h2":
            sub_header_text = element.get_text(strip=True)
            records.append({"Type": "SubHeader", "Text": sub_header_text})
        elif element.name == "p":
            text = element.get_text(strip=True)
            if text:  # Ignore empty paragraphs
                # Sanitise text: newlines, tabs and quotes are removed.
                sanitised_text = sanitize_text(text)
                if sanitised_text == "":
                    continue
                records.append({"Type": "Paragraph", "Text": sanitised_text})
        elif element.name == "pre":
            text = element.get_text(strip=True)
            if text:  # Ensure non-empty before adding
                records.append({"Type": "Preformatted", "Text": text})
        elif element.name == "input":
            if (
                element.get("type") == "hidden"
                or element.get("name") in hidden_input_names
            ):
                continue
            records.append(
                {
                    "Type": element.get("type", "text").capitalize(),
                    "Name": element.get("name"),
                    "Value": element.get("value", ""),
                    # Inputs take the label that FOLLOWS them; the other
                    # controls below take the preceding one.
                    "Label": _label_text(element.find_next("label")),
                }
            )
        elif element.name == "select":
            options = [
                {"Value": option.get("value"), "Text": option.get_text(strip=True)}
                for option in element.find_all("option")
            ]
            records.append(
                {
                    "Type": "Select",
                    "Name": element.get("name"),
                    "Options": options,
                    "Label": _label_text(element.find_previous("label")),
                }
            )
        elif element.name == "textarea":
            records.append(
                {
                    "Type": "Textarea",
                    "Name": element.get("name"),
                    "Value": element.get_text(strip=True),
                    "Label": _label_text(element.find_previous("label")),
                }
            )
        elif element.name == "button":
            records.append(
                {
                    "Type": "Button",
                    "Name": element.get("name"),
                    "Value": element.get_text(strip=True),
                    # Look the label up for THIS button rather than reusing
                    # a variable left over from an earlier element.
                    "Label": _label_text(element.find_previous("label")),
                }
            )
        elif element.name == "table" and "sme-border" in element.get("class", []):
            table_counter += 1
            records.append(_table_record(element, f"Table{table_counter}"))

    return records, header_text, sub_header_text
def insert_spaces_before_caps(text):
    """Insert spaces before each capital letter in a given string."""
    return re.sub(r"(?