check file changed and generate as .new if changed after creation

2024-09-18 11:46:26 +01:00
parent 1811a52dec
commit b4a6be435c
10 changed files with 627 additions and 511 deletions


@@ -1,12 +1,87 @@
import json
import os
import re
from bs4 import BeautifulSoup
from lxml import etree # Import lxml for HTML validation
import html
import argparse
import pkg_resources
import sys
import traceback
from datetime import datetime, timedelta
sm1_html_2_json5_version = "0.5"
def assemble_version_string():
try:
chameleon_version = pkg_resources.get_distribution("Chameleon").version
except pkg_resources.DistributionNotFound:
chameleon_version = "No version information"
python_version = sys.version
version_pattern = r"(\d{1,3}\.\d{1,3}\.\d{1,3})"
version_match = re.search(version_pattern, python_version)
python_version = version_match.group(0) if version_match else "Unknown"
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
strVersion = (
"sm1-html-2-json5 version:"
+ sm1_html_2_json5_version
+ " Chameleon version:"
+ chameleon_version
+ " On Python:"
+ python_version
+ " at "
+ formatted_datetime
)
return strVersion
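# Illustrative only (hypothetical version numbers and date) -- the assembled string looks like:
#   "sm1-html-2-json5 version:0.5 Chameleon version:4.4.1 On Python:3.11.2 at 2024-09-18 11:46:26"
# check_file_version() below relies on the trailing " at <timestamp>" part of this string.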
def check_file_version(filename, ThresholdSecs=3):
#
# Compare the file's modified time with the creation timestamp recorded in its header
# and return filename + ".new" if it was modified more than ThresholdSecs after creation.
#
try:
with open(filename, 'r') as file:
# Read the first five header lines
header_lines = [file.readline().strip() for _ in range(5)]
# Extract the timestamp
timestamp_str = None
for line in header_lines:
if ' at ' in line:
# Split on 'at'; the timestamp is expected in the third part
print(line)  # debug
timestamp_str = line.split('at')[2].strip()
break
if timestamp_str is None:
print("Warning: No timestamp found. Returning original filename.")
return filename # Return the original filename if no timestamp is found
# Convert the string timestamp to a datetime object
file_timestamp = datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S')
# Add the threshold seconds to the creation date
file_timestamp += timedelta(seconds=ThresholdSecs)
# Get the last modified time of the file, ignoring milliseconds
file_modified_time = datetime.fromtimestamp(os.path.getmtime(filename)).replace(microsecond=0)
print(file_modified_time, file_timestamp)  # debug
# Compare the timestamps
if file_modified_time > file_timestamp:
return f"{filename}.new"
else:
return filename
except FileNotFoundError:
print(f"Error: The file '{filename}' does not exist.")
return filename
except Exception as e:
print(f"An error occurred: {traceback.format_exc()}")
return filename
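# A minimal usage sketch (hypothetical file name): if mypackage.json5 carries a
# "... at 2024-09-18 11:46:26" header line and its mtime is more than ThresholdSecs
# later than that timestamp, the call returns "mypackage.json5.new":
#   target = check_file_version("mypackage.json5", ThresholdSecs=3)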
def read_html_file(filename):
"""Read HTML content from a file."""
with open(filename, "r", encoding="utf-8") as file:
@@ -21,6 +96,7 @@ def validate_html(html):
except Exception as e:
raise ValueError("Invalid HTML document") from e
def convert_double_quotes_to_span(text):
"""Convert single-quoted text to <span>...</span>."""
# Use a regular expression to find single-quoted text and replace it
@@ -28,47 +104,51 @@ def convert_double_quotes_to_span(text):
# def sanitize_text(text):
# # Replace newlines with spaces
# print(f"--{text}--")
# decoded_text = html.unescape(text)
# sanitized_text = decoded_text.replace("\n", "").replace(
# "\r", " "
# ) # Handle both Unix and Windows line endings
# # Replace tabs with spaces
# sanitized_text = sanitized_text.replace("\t", "")
# # map single quotes to double
# # sanitized_text = sanitized_text.replace("'", '"')
# # Map single and double quotes to nothing
# sanitized_text.replace("'","").replace('"','')
# #Take out any multiple spaces - reduce to one.
# sanitized_text = ' '.join(sanitized_text.split())
# # Strip leading and trailing whitespace
# sanitized_text = sanitized_text.strip()
# #sanitized_text = convert_double_quotes_to_span(sanitized_text)
# print(f"++{sanitized_text}++")
# return sanitized_text
def sanitize_text(text):
# Replace newlines with spaces
print(f"--{text}--")
# Take out html entities
decoded_text = html.unescape(text)
# Take out newlines
sanitized_text = decoded_text.replace("\n", " ").replace(
"\r", " "
) # Handle both Unix and Windows line endings
# Replace tabs with spaces
sanitized_text = sanitized_text.replace("\t", " ")
# Replace quote characters
sanitized_text = sanitized_text.replace('"', "").replace(
"'", ""
) # Remove double and single quotes
# Take out any multiple spaces - reduce to one.
sanitized_text = " ".join(sanitized_text.split())
# Strip leading and trailing whitespace
sanitized_text = sanitized_text.strip()
print(f"++{sanitized_text}++")
return sanitized_text
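# A quick illustration (made-up input): entities are decoded, newlines/tabs become spaces,
# quotes are removed and runs of whitespace collapse to a single space:
#   sanitize_text('Say &quot;hello&quot;\n\tworld')  ->  'Say hello world'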
def extract_data(html):
"""Extract paragraphs, inputs, tables, and pre blocks from HTML and organize them in order."""
soup = BeautifulSoup(html, "lxml")
@@ -100,7 +180,7 @@ def extract_data(html):
# Sanitise text: strip newlines, tabs and quotes.
sanitised_text = sanitize_text(text)
if sanitised_text == "":
continue
records.append({"Type": "Paragraph", "Text": sanitised_text})
elif element.name == "pre":
@@ -207,98 +287,103 @@ def insert_spaces_before_caps(text):
return re.sub(r"(?<!^)(?=[A-Z])", " ", text)
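# For example, insert_spaces_before_caps("MyContrib") returns "My Contrib".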
def save_to_json5(data, output_filename, package_name, header, sub_header,strVersion):
"""Save extracted data to a JSON5 file with a specific structure."""
# Generate prefix from uppercase letters in PackageName made into lowercase
prefix = "".join(re.findall(r"[A-Z]", package_name)).lower()
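# For example (hypothetical name): package_name "MyContrib" gives prefix "mc"
# (its uppercase letters, lowercased).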
# Prepare structured html list
structured_html = []
paragraph_count = 1
preformatted_count = 1
input_count = 1
table_count = 1
for record in data:
if record["Type"] == "Paragraph":
structured_html.append({f"Paragraph{paragraph_count}": record["Text"]})
paragraph_count += 1
elif record["Type"] == "Preformatted":
structured_html.append(
{f"Preformatted{preformatted_count}": record["Text"]}
)
preformatted_count += 1
elif record["Type"] == "Header" or record["Type"] == "SubHeader":
continue # Skip headers for input count
elif record["Type"] == "Table":
# Construct the table entry
table_structure = {
"Type": record["Type"],
"TableControl": record["TableControl"],
"TopHeadings": record["TopHeadings"],
"Columns": record["Columns"],
}
structured_html.append({f"Table{table_count}": table_structure})
table_count += 1
else: # For inputs, selects, textareas, and buttons
input_structure = {
"Type": record["Type"],
"Value": record.get("Value", ""), # Safely access Value
}
# Use .get() for the Name key to avoid KeyError
input_structure["Name"] = record.get(
"Name", None
) # Set to None if not present
input_structure["Label"] = record.get(
"Label", None
) # Set to None if not present
# Handle specific case for Select options
if "Options" in record:
input_structure["Options"] = record["Options"]
structured_html.append({f"Input{input_count}": input_structure})
input_count += 1
structured_html.append({f"Input{input_count}": input_structure})
input_count += 1
# Wrap the records with the required fields
json5_data = {
"PackageName": package_name,
"prefix": prefix,
"MenuHeading": "Miscellaneous",
"MenuDescription": insert_spaces_before_caps(package_name),
"MenuNavigation": "2000 400",
"firstPanel": "PARAMS",
"signalEvent": f"smeserver-{package_name.lower()}-update",
"html": [
{
"Name": "params",
"route": "PARAMS",
"Header": header if header else f"{package_name} Contrib",
"SubHeader": sub_header
if sub_header
else f"Manage {package_name} settings:",
**{
k: v for item in structured_html for k, v in item.items()
}, # Flatten the structured_html into the dict
}
],
}
# Save in JSON5 format (JSON with comments and unquoted keys)
with open(output_filename, "w", encoding="utf-8") as json_file:
json.dump(json5_data, json_file, ensure_ascii=False, indent=4)
# Manually format as JSON5 by adding single quotes (for simplicity)
with open(output_filename, "r+", encoding="utf-8") as json_file:
content = f"//\n// Generated by {strVersion}\n//\n"
content = content + json_file.read()
content = content.replace(
'"', "'"
) # Replace double quotes with single quotes for JSON5
json_file.seek(0)
json_file.write(content)
json_file.truncate() # Remove any old content beyond the new content length
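# With the banner prepended and double quotes swapped for single quotes, the output
# file starts roughly like this (hypothetical values):
#   //
#   // Generated by sm1-html-2-json5 version:0.5 Chameleon version:... at 2024-09-18 11:46:26
#   //
#   {
#       'PackageName': 'MyContrib',
#       ...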
def main():
strVersion = assemble_version_string()
# command line parameters
parser = argparse.ArgumentParser(description="sm1-html-2-json5")
parser.add_argument(
@@ -325,24 +410,24 @@ def main():
#
# Generate output JSON5 filename based on input file name
#
# Split the original path into directory and file name
directory, filename = os.path.split(input_file)
# Replace 'html' with 'json5' in the directory path
new_directory = directory.replace("/html", "/json5")
# print(new_directory)
# Construct the new path
output_file = check_file_version(os.path.join(new_directory, filename.replace(".html", ".json5")))
print(output_file)
# quit(1)
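# End-to-end example (hypothetical paths): an input of "contribs/html/MyContrib.html" maps to
# "contribs/json5/MyContrib.json5", or to "contribs/json5/MyContrib.json5.new" when
# check_file_version() sees the existing output was modified after it was generated.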
# Generate output JSON5 filename based on input file name
base_name = os.path.basename(input_file) # Get the file name (with extension)
package_name = os.path.splitext(base_name)[0] # Use the filename without extension
# Save extracted data to JSON5
save_to_json5(data, output_file, package_name, header, sub_header, strVersion)
print(f"Extracted data saved to '{output_file}'.")