import json
import re
from bs4 import BeautifulSoup
from lxml import etree  # Import lxml for HTML validation
import html
import argparse
import pkg_resources
import sys
import traceback
import os
from datetime import datetime, timedelta


sm1_html_2_json5_version = "0.5"

def assemble_version_string():
	try:
		chameleon_version = pkg_resources.get_distribution("Chameleon").version
	except pkg_resources.DistributionNotFound:
		chameleon_version = "No version information"
	python_version = sys.version
	version_pattern = r"(\d{1,3}\.\d{1,3}\.\d{1,3})"
	version_match = re.search(version_pattern, python_version)
	python_version = version_match.group(0) if version_match else "Unknown"
	current_datetime = datetime.now()
	formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
	strVersion = (
		f"sm1-html-2-json5 version:{sm1_html_2_json5_version}"
		f" Chameleon version:{chameleon_version}"
		f" On Python:{python_version}"
		f" at {formatted_datetime}"
	)
	return strVersion
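
# Illustrative output (version numbers and timestamp below are hypothetical):
#   sm1-html-2-json5 version:0.5 Chameleon version:4.4.3 On Python:3.11.4 at 2024-06-01 12:00:00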

def check_file_version(filename, ThresholdSecs=3):
	#
	# Compare the file's modification time against the generation timestamp in its
	# header and return filename + ".new" if it was modified more than ThresholdSecs
	# after generation (i.e. it has been hand-edited since being generated)
	#
	try:
		with open(filename, 'r') as file:
			# Read the first five lines, which contain the generated-by header
			header_lines = [file.readline().strip() for _ in range(5)]

		# Extract the timestamp
		timestamp_str = None
		for line in header_lines:
			if ' at ' in line:
				# The timestamp follows the last ' at ' in the header line
				timestamp_str = line.split(' at ')[-1].strip()
				break
		
		if timestamp_str is None:
			print("Warning: No timestamp found. Returning original filename.")
			return filename  # Return the original filename if no timestamp is found

		# Convert the string timestamp to a datetime object
		file_timestamp = datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S')

		# Add the threshold seconds to the creation date
		file_timestamp += timedelta(seconds=ThresholdSecs)

		# Get the last modified time of the file, ignoring milliseconds
		file_modified_time = datetime.fromtimestamp(os.path.getmtime(filename)).replace(microsecond=0)
		

		# Compare the timestamps
		if file_modified_time > file_timestamp:
			return f"{filename}.new"
		else:
			return filename
	except FileNotFoundError:
		print(f"Error: The file '{filename}' does not exist.")
		return filename
	except Exception as e:
		print(f"An error occurred: {traceback.format_exc()}")
		return filename
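
# Hypothetical example: if the header of CreateStarterWebsite.json5 says it was
# generated "... at 2024-06-01 12:00:00" but its mtime is 12:05:00 the same day,
# check_file_version("CreateStarterWebsite.json5") returns
# "CreateStarterWebsite.json5.new" so the hand-edited file is not overwritten.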
            

def read_html_file(filename):
    """Read HTML content from a file."""
    with open(filename, "r", encoding="utf-8") as file:
        return file.read()


def validate_html(html_content):
    """Validate the HTML content by parsing it with lxml."""
    try:
        # recover=False makes the parser raise on malformed input; the default
        # recovering parser would silently repair almost anything.
        parser = etree.HTMLParser(recover=False)
        etree.fromstring(html_content, parser)  # Attempt to parse the HTML
    except Exception as e:
        raise ValueError("Invalid HTML document") from e


def convert_double_quotes_to_span(text):
    """Wrap double-quoted text in <span class=emphasis-para>...</span>."""
    # Find double-quoted runs and replace them with emphasis spans
    return re.sub(r'"(.*?)"', r"<span class=emphasis-para>\1</span>", text)
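
# Illustrative example (hypothetical input):
#   convert_double_quotes_to_span('a "key" phrase')
#   -> 'a <span class=emphasis-para>key</span> phrase'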


def sanitize_text(text):
    """Normalise extracted text: decode entities, collapse whitespace, drop quotes."""
    # Decode HTML entities such as &amp;
    decoded_text = html.unescape(text)
    # Replace Unix and Windows line endings with spaces
    sanitized_text = decoded_text.replace("\n", " ").replace("\r", " ")
    # Replace tabs with spaces
    sanitized_text = sanitized_text.replace("\t", " ")
    # Remove double and single quotes (they would clash with the JSON5 quoting)
    sanitized_text = sanitized_text.replace('"', "").replace("'", "")
    # Collapse runs of whitespace to single spaces; this also strips the ends
    sanitized_text = " ".join(sanitized_text.split())
    return sanitized_text
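
# Illustrative example (hypothetical input):
#   sanitize_text('Fill in\n\tthe "Site name"  &amp; save')
#   -> 'Fill in the Site name & save'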


def extract_data(html):
	"""Extract paragraphs, inputs, tables, and pre blocks from HTML and organize them in order."""
	soup = BeautifulSoup(html, "lxml")
	records = []

	hidden_input_names = ["page", "page_stack", ".id", "csrf_token"]

	header_text = None
	sub_header_text = None

	# Counter for tables
	table_counter = 0

	# Extract elements while preserving order
	for element in soup.find_all(
		["h1", "h2", "p", "pre", "input", "select", "textarea", "button", "table","a"]
	):
		if element.name == "h1":
			header_text = element.get_text(strip=True)
			records.append({"Type": "Header", "Text": header_text})

		elif element.name == "h2":
			sub_header_text = element.get_text(strip=True)
			records.append({"Type": "SubHeader", "Text": sub_header_text})

		elif element.name == "p":
			text = element.get_text(strip=True)
			if text:  # Ignore empty paragraphs
				# Sanitise the text: decode entities, drop newlines, tabs, and quotes
				sanitised_text = sanitize_text(text)
				if sanitised_text == "":
					continue
				records.append({"Type": "Paragraph", "Text": sanitised_text})

		elif element.name == "pre":
			text = element.get_text(strip=True)
			if text:  # Ensure non-empty before adding
				records.append({"Type": "Preformatted", "Text": text})

		elif element.name == "a":
			title = element.get_text(strip=True)
			href = element.get("href")
			records.append({"Type": "Link", "href": href, "title": title})

		elif element.name == "input":
			if (
				element.get("type") == "hidden"
				or element.get("name") in hidden_input_names
			):
				continue

			input_info = {
				"Type": element.get("type", "text").capitalize(),
				"Name": element.get("name"),
				"Value": element.get("value", ""),
			}
			label = element.find_next("label")
			input_info["Label"] = label.get_text(strip=True) if label else None
			records.append(input_info)

		elif element.name == "select":
			options = [
				{"Value": option.get("value"), "Text": option.get_text(strip=True)}
				for option in element.find_all("option")
			]
			select_info = {
				"Type": "Select",
				"Name": element.get("name"),
				"Options": options,
				"Label": element.find_previous("label").get_text(strip=True)
				if element.find_previous("label")
				else None,
			}
			records.append(select_info)


		elif element.name == "textarea":
			textarea_info = {
				"Type": "Textarea",
				"Name": element.get("name"),
				"Value": element.get_text(strip=True),
			}
			label = element.find_previous("label")
			textarea_info["Label"] = label.get_text(strip=True) if label else None
			records.append(textarea_info)

		elif element.name == "button":
			button_info = {
				"Type": "Button",
				"Name": element.get("name"),
				"Value": element.get_text(strip=True),
				"Label": element.find_previous("label").get_text(strip=True)
				if label
				else None,
			}
			records.append(button_info)

		elif element.name == "table" and "sme-border" in element.get("class", []):
			# Increment the table counter
			table_counter += 1

			# Prepare the TableControl format
			table_control = f"Table{table_counter}"  # e.g., "Table1", "Table2"
			top_headings = []
			columns = []

			# Extract headings from the first row
			first_row = element.find("tr")
			if first_row:
				for th in first_row.find_all("th"):
					top_headings.append(th.get_text(strip=True))

			# Extract only the first data row's cell values for Columns
			data_rows = element.find_all("tr")[1:]  # Skip the heading row
			if data_rows:
				first_data_row = data_rows[0]  # Take the first row of data
				data_cells = first_data_row.find_all("td")
				for idx, th in enumerate(first_row.find_all("th")):
					td = data_cells[idx] if idx < len(data_cells) else None
					if td:
						# Name the column after its table and heading, e.g. "Table1-Status"
						columns.append(f"{table_control}-{th.get_text(strip=True)}")

			records.append(
				{
					"Type": "Table",
					"TableControl": table_control,
					"TopHeadings": top_headings,
					"Columns": columns,
				}
			)
	return records, header_text, sub_header_text
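
# Illustrative return value for a hypothetical page with one heading, one
# paragraph, and one text input:
#   records = [
#       {"Type": "Header", "Text": "Create Starter Website"},
#       {"Type": "Paragraph", "Text": "Fill in the fields below."},
#       {"Type": "Text", "Name": "SiteName", "Value": "", "Label": "Site name"},
#   ]
#   header_text = "Create Starter Website", sub_header_text = None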


def insert_spaces_before_caps(text):
    """Insert spaces before each capital letter in a given string."""
    return re.sub(r"(?<!^)(?=[A-Z])", " ", text)
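
# Illustrative example:
#   insert_spaces_before_caps("CreateStarterWebsite") -> "Create Starter Website"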


def save_to_json5(data, output_filename, package_name, header, sub_header, strVersion):
	"""Save extracted data to a JSON5 file with a specific structure."""
	# Derive a lowercase prefix from the uppercase letters in the package name
	prefix = "".join(re.findall(r"[A-Z]", package_name)).lower()

	# Prepare structured html list
	structured_html = []
	paragraph_count = 1
	preformatted_count = 1
	input_count = 1
	table_count = 1
	link_count = 1

	for record in data:
		if record["Type"] == "Paragraph":
			structured_html.append({f"Paragraph{paragraph_count}": record["Text"]})
			paragraph_count += 1
		elif record["Type"] == "Preformatted":
			structured_html.append(
				{f"Preformatted{preformatted_count}": record["Text"]}
			)
			preformatted_count += 1
		elif record["Type"] == "Link":
			link_structure = {
				"Type": record["Type"],
				"href": record["href"],
				"title": record["title"]
			}
			structured_html.append({f"Link{link_count}": link_structure})
			link_count += 1
		elif record["Type"] == "Header" or record["Type"] == "SubHeader":
			continue  # Skip headers for input count
		elif record["Type"] == "Table":
			# Construct the table entry
			table_structure = {
				"Type": record["Type"],
				"TableControl": record["TableControl"],
				"TopHeadings": record["TopHeadings"],
				"Columns": record["Columns"],
			}
			structured_html.append({f"Table{table_count}": table_structure})
			table_count += 1
		else:  # For inputs, selects, textareas, and buttons
			input_structure = {
				"Type": record["Type"],
				"Value": record.get("Value", ""),  # Safely access Value
			}

			# Use .get() for the Name key to avoid KeyError
			input_structure["Name"] = record.get(
				"Name", None
			)  # Set to None if not present
			input_structure["Label"] = record.get(
				"Label", None
			)  # Set to None if not present

			# Handle specific case for Select options
			if "Options" in record:
				input_structure["Options"] = record["Options"]

			structured_html.append({f"Input{input_count}": input_structure})
			input_count += 1

	# Wrap the records with the required fields
	json5_data = {
		"PackageName": package_name,
		"prefix": prefix,
		"MenuHeading": "Miscellaneous",
		"MenuDescription": insert_spaces_before_caps(package_name),
		"MenuNavigation": "2000 400",
		"firstPanel": "PARAMS",
		"signalEvent": f"smeserver-{package_name.lower()}-update",
		"html": [
			{
				"Name": "params",
				"route": "PARAMS",
				"Header": header if header else f"{package_name} Contrib",
				"SubHeader": sub_header
				if sub_header
				else f"Manage {package_name} settings:",
				**{
					k: v for item in structured_html for k, v in item.items()
				},  # Flatten the structured_html into the dict
			}
		],
	}

	# Write plain JSON first; the JSON5 touches (comment header, single quotes) come next
	with open(output_filename, "w", encoding="utf-8") as json_file:
		json.dump(json5_data, json_file, ensure_ascii=False, indent=4)

	# Rewrite as JSON5: prepend a generated-by comment header and swap double quotes
	# for single quotes (mostly safe because sanitize_text strips quotes from
	# paragraph text, though quotes in hrefs or preformatted blocks are converted too)
	with open(output_filename, "r+", encoding="utf-8") as json_file:
		content = f"//\n// Generated by {strVersion}\n//\n"
		content = content + json_file.read()
		content = content.replace('"', "'")
		json_file.seek(0)
		json_file.write(content)
		json_file.truncate()  # Remove any old content beyond the new content length
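
# Illustrative output (hypothetical package "CreateStarterWebsite"; keys abridged):
#   //
#   // Generated by sm1-html-2-json5 version:0.5 ...
#   //
#   {
#       'PackageName': 'CreateStarterWebsite',
#       'prefix': 'csw',
#       'MenuHeading': 'Miscellaneous',
#       'MenuDescription': 'Create Starter Website',
#       ...
#       'html': [{'Name': 'params', 'route': 'PARAMS', 'Header': ..., 'Paragraph1': ..., ...}]
#   }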


def main():
	strVersion = assemble_version_string()
	# command line parameters
	parser = argparse.ArgumentParser(description="sm1-html-2-json5")
	parser.add_argument(
		"-f",
		"--filename",
		help="Name of the HTML file to convert",
		default="CreateStarterWebsite.html",
	)
	args = parser.parse_args()
	input_file = "/home/brianr/clients/SM2/SM2Gen/venv/html/" + args.filename
	if not input_file.lower().endswith(".html"):
		# Add .html extension
		input_file += ".html"
	print(input_file)

	# Read HTML content
	html_content = read_html_file(input_file)

	# Validate the HTML before extracting data
	validate_html(html_content)

	# Extract data from HTML
	data, header, sub_header = extract_data(html_content)
	#
	# Generate output JSON5 filename based on input file name
	#
	# Split the original path into directory and file name
	directory, filename = os.path.split(input_file)

	# Replace 'html' with 'json5' in the directory path
	new_directory = directory.replace("/html", "/json5")

	# Construct the new path
	output_file = check_file_version(os.path.join(new_directory, filename.replace(".html", ".json5")))
	print(output_file)

	# Derive the package name from the input file name
	base_name = os.path.basename(input_file)  # Get the file name (with extension)
	package_name = os.path.splitext(base_name)[0]  # Use the filename without extension

	# Save extracted data to JSON5
	save_to_json5(data, output_file, package_name, header, sub_header, strVersion)
	print(f"Extracted data saved to '{output_file}'.")


if __name__ == "__main__":
    main()