check file changed and generate as .new if changed after creation
This commit is contained in:
@@ -1,12 +1,87 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from lxml import etree # Import lxml for HTML validation
|
||||
import html
|
||||
import argparse
|
||||
import pkg_resources
|
||||
import sys
|
||||
import traceback
|
||||
import os
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
|
||||
# Version of this converter; embedded in the banner written to generated files.
sm1_html_2_json5_version = "0.5"


def _chameleon_version():
    """Return the installed Chameleon version, or a placeholder if not installed."""
    try:
        # Preferred: stdlib metadata API (Python 3.8+). pkg_resources is
        # deprecated upstream, so it is only used as a fallback below.
        from importlib.metadata import PackageNotFoundError, version
    except ImportError:
        # Older interpreters: fall back to the module-level pkg_resources import.
        try:
            return pkg_resources.get_distribution("Chameleon").version
        except pkg_resources.DistributionNotFound:
            return "No version information"
    try:
        return version("Chameleon")
    except PackageNotFoundError:
        return "No version information"


def assemble_version_string():
    """Build the one-line provenance banner for generated files.

    The banner has the form::

        sm1-html-2-json5 version:<v> Chameleon version:<v> On Python:<x.y.z> at <YYYY-MM-DD HH:MM:SS>

    The trailing timestamp is the current local time; check_file_version()
    later parses it back out of the generated file, so the " at " separator
    and the timestamp format must stay in sync with that function.

    Returns:
        str: The assembled banner text (no trailing newline).
    """
    chameleon_version = _chameleon_version()

    # sys.version carries build details after the number; keep only "x.y.z".
    version_match = re.search(r"(\d{1,3}\.\d{1,3}\.\d{1,3})", sys.version)
    python_version = version_match.group(0) if version_match else "Unknown"

    formatted_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    return (
        f"sm1-html-2-json5 version:{sm1_html_2_json5_version}"
        f" Chameleon version:{chameleon_version}"
        f" On Python:{python_version}"
        f" at {formatted_datetime}"
    )
|
||||
|
||||
def check_file_version(filename, ThresholdSecs=3):
    """Choose the output path, diverting to "<filename>.new" if the file was edited.

    The generated file carries a "... at YYYY-MM-DD HH:MM:SS" creation stamp in
    its header comment (see assemble_version_string / save_to_json5). If the
    file's last-modified time is later than that stamp plus ``ThresholdSecs``,
    the file is taken to have been changed after generation and the returned
    name gets a ".new" suffix so the changes are not overwritten.

    Args:
        filename: Path of the previously generated file.
        ThresholdSecs: Grace period, in seconds, added to the creation stamp
            before comparing against the modification time.

    Returns:
        str: ``filename + ".new"`` when the file was modified after creation
        (beyond the threshold); otherwise ``filename`` unchanged. The original
        name is also returned when the file is missing, has no parsable
        timestamp in its header, or any other error occurs (best effort).
    """
    try:
        # The generator writes its output as UTF-8, so read it back the same way.
        with open(filename, "r", encoding="utf-8") as file:
            # Only the first five lines are scanned for the header stamp.
            header_lines = [file.readline().strip() for _ in range(5)]

        file_timestamp = None
        for line in header_lines:
            if " at " not in line:
                continue
            print(line)
            # Take the text after the LAST " at " separator. Splitting on the
            # bare substring "at" (and picking a fixed index) breaks as soon as
            # the banner contains another "at", e.g. "No version information".
            candidate = line.rsplit(" at ", 1)[1].strip()
            try:
                file_timestamp = datetime.strptime(candidate, "%Y-%m-%d %H:%M:%S")
            except ValueError:
                # " at " appeared in ordinary text; keep looking.
                continue
            break

        if file_timestamp is None:
            print("Warning: No timestamp found. Returning original filename.")
            return filename  # Return the original filename if no timestamp is found

        # Allow a short grace period after the recorded creation time.
        file_timestamp += timedelta(seconds=ThresholdSecs)

        # The header stamp has whole-second resolution, so drop microseconds.
        file_modified_time = datetime.fromtimestamp(
            os.path.getmtime(filename)
        ).replace(microsecond=0)

        print(file_modified_time, file_timestamp)

        if file_modified_time > file_timestamp:
            return f"{filename}.new"
        return filename
    except FileNotFoundError:
        print(f"Error: The file '{filename}' does not exist.")
        return filename
    except Exception:
        # Deliberately best effort: any failure falls back to the original name.
        print(f"An error occurred: {traceback.format_exc()}")
        return filename
|
||||
|
||||
|
||||
def read_html_file(filename):
|
||||
"""Read HTML content from a file."""
|
||||
with open(filename, "r", encoding="utf-8") as file:
|
||||
@@ -21,6 +96,7 @@ def validate_html(html):
|
||||
except Exception as e:
|
||||
raise ValueError("Invalid HTML document") from e
|
||||
|
||||
|
||||
def convert_double_quotes_to_span(text):
|
||||
"""Convert single-quoted text to <span>...</span>."""
|
||||
# Use a regular expression to find single-quoted text and replace it
|
||||
@@ -28,47 +104,51 @@ def convert_double_quotes_to_span(text):
|
||||
|
||||
|
||||
# def sanitize_text(text):
|
||||
# # Replace newlines with spaces
|
||||
# print(f"--{text}--")
|
||||
# decoded_text = html.unescape(text)
|
||||
# # Replace newlines with spaces
|
||||
# print(f"--{text}--")
|
||||
# decoded_text = html.unescape(text)
|
||||
|
||||
# sanitized_text = decoded_text.replace("\n", "").replace(
|
||||
# "\r", " "
|
||||
# ) # Handle both Unix and Windows line endings
|
||||
# # Replace tabs with spaces
|
||||
# sanitized_text = sanitized_text.replace("\t", "")
|
||||
# # map single quotes to double
|
||||
# # sanitized_text = sanitized_text.replace("'", '"')
|
||||
# #Map signle and double quotes to nothing
|
||||
# sanitized_text.replace("'","").replace('"','')
|
||||
# #Take out any multiple spaces - reduce to one.
|
||||
# sanitized_text = ' '.join(sanitized_text.split())
|
||||
# # Strip leading and trailing whitespace
|
||||
# sanitized_text = sanitized_text.strip()
|
||||
# #sanitized_text = convert_double_quotes_to_span(sanitized_text)
|
||||
# print(f"++{sanitized_text}++")
|
||||
# return sanitized_text
|
||||
|
||||
|
||||
# sanitized_text = decoded_text.replace("\n", "").replace(
|
||||
# "\r", " "
|
||||
# ) # Handle both Unix and Windows line endings
|
||||
# # Replace tabs with spaces
|
||||
# sanitized_text = sanitized_text.replace("\t", "")
|
||||
# # map single quotes to double
|
||||
# # sanitized_text = sanitized_text.replace("'", '"')
|
||||
# #Map signle and double quotes to nothing
|
||||
# sanitized_text.replace("'","").replace('"','')
|
||||
# #Take out any multiple spaces - reduce to one.
|
||||
# sanitized_text = ' '.join(sanitized_text.split())
|
||||
# # Strip leading and trailing whitespace
|
||||
# sanitized_text = sanitized_text.strip()
|
||||
# #sanitized_text = convert_double_quotes_to_span(sanitized_text)
|
||||
# print(f"++{sanitized_text}++")
|
||||
# return sanitized_text
|
||||
|
||||
def sanitize_text(text):
    """Flatten a fragment of HTML text into a single clean line.

    Steps: decode HTML entities, drop every single and double quote
    character, then collapse all runs of whitespace (newlines, carriage
    returns, tabs, repeated spaces) into one space and trim both ends.

    Args:
        text: Raw text taken from the HTML document.

    Returns:
        str: The sanitised text; empty when nothing but whitespace/quotes remain.
    """
    # Trace the raw input (diagnostic output).
    print(f"--{text}--")

    # Turn entities such as &amp; or &quot; back into literal characters.
    decoded_text = html.unescape(text)

    # Remove double and single quotes in one pass.
    unquoted_text = decoded_text.translate({ord('"'): None, ord("'"): None})

    # split() with no argument breaks on any whitespace run (this covers
    # \n, \r and \t) and discards leading/trailing whitespace, so joining
    # with one space both normalises and strips the text.
    sanitized_text = " ".join(unquoted_text.split())

    # Trace the result (diagnostic output).
    print(f"++{sanitized_text}++")
    return sanitized_text
|
||||
|
||||
|
||||
|
||||
def extract_data(html):
|
||||
"""Extract paragraphs, inputs, tables, and pre blocks from HTML and organize them in order."""
|
||||
soup = BeautifulSoup(html, "lxml")
|
||||
@@ -100,7 +180,7 @@ def extract_data(html):
|
||||
# Sanitise text: strip newlines, tabs and quote characters.
|
||||
sanitised_text = sanitize_text(text)
|
||||
if sanitised_text == "":
|
||||
continue
|
||||
continue
|
||||
records.append({"Type": "Paragraph", "Text": sanitised_text})
|
||||
|
||||
elif element.name == "pre":
|
||||
@@ -207,98 +287,103 @@ def insert_spaces_before_caps(text):
|
||||
return re.sub(r"(?<!^)(?=[A-Z])", " ", text)
|
||||
|
||||
|
||||
def _single_quote_json_strings(json_text):
    """Re-quote every string literal in JSON text with single quotes (JSON5)."""

    def _requote(match):
        inner = match.group(0)[1:-1]
        # Walk escape pairs left to right so "\\" followed by a quote is not
        # misread: \" no longer needs escaping, a bare ' now does, and every
        # other escape sequence is kept as-is.
        inner = re.sub(
            r"\\.|'",
            lambda m: '"'
            if m.group(0) == '\\"'
            else ("\\'" if m.group(0) == "'" else m.group(0)),
            inner,
        )
        return f"'{inner}'"

    # In json.dumps output, double quotes only ever delimit string literals.
    return re.sub(r'"(?:[^"\\]|\\.)*"', _requote, json_text)


def save_to_json5(data, output_filename, package_name, header, sub_header, strVersion):
    """Save extracted data to a JSON5 file with a specific structure.

    Args:
        data: Iterable of record dicts. Each has a "Type" key; "Paragraph" and
            "Preformatted" records carry "Text", "Table" records carry
            "TableControl", "TopHeadings" and "Columns", "Header"/"SubHeader"
            records are skipped, and every other type is treated as an input
            with optional "Value", "Name", "Label" and "Options" keys.
        output_filename: Path of the JSON5 file to (over)write, as UTF-8.
        package_name: Package name; also drives the prefix, menu description
            and signal event name.
        header: Panel header text; a default based on package_name is used
            when empty.
        sub_header: Panel sub-header text; a default based on package_name is
            used when empty.
        strVersion: Banner text placed in the "// Generated by ..." comment at
            the top of the file.
    """
    # Generate prefix from uppercase letters in PackageName made into lowercase
    prefix = "".join(re.findall(r"[A-Z]", package_name)).lower()

    # The single panel; extracted elements are appended as numbered keys in
    # document order after the fixed fields.
    panel = {
        "Name": "params",
        "route": "PARAMS",
        "Header": header if header else f"{package_name} Contrib",
        "SubHeader": sub_header
        if sub_header
        else f"Manage {package_name} settings:",
    }

    paragraph_count = 1
    preformatted_count = 1
    input_count = 1
    table_count = 1

    for record in data:
        record_type = record["Type"]
        if record_type == "Paragraph":
            panel[f"Paragraph{paragraph_count}"] = record["Text"]
            paragraph_count += 1
        elif record_type == "Preformatted":
            panel[f"Preformatted{preformatted_count}"] = record["Text"]
            preformatted_count += 1
        elif record_type in ("Header", "SubHeader"):
            continue  # Headers are passed in separately; not numbered entries
        elif record_type == "Table":
            panel[f"Table{table_count}"] = {
                "Type": record_type,
                "TableControl": record["TableControl"],
                "TopHeadings": record["TopHeadings"],
                "Columns": record["Columns"],
            }
            table_count += 1
        else:  # For inputs, selects, textareas, and buttons
            input_structure = {
                "Type": record_type,
                "Value": record.get("Value", ""),  # Safely access Value
                "Name": record.get("Name", None),  # None if not present
                "Label": record.get("Label", None),  # None if not present
            }
            # Handle specific case for Select options
            if "Options" in record:
                input_structure["Options"] = record["Options"]
            panel[f"Input{input_count}"] = input_structure
            input_count += 1

    # Wrap the records with the required fields
    json5_data = {
        "PackageName": package_name,
        "prefix": prefix,
        "MenuHeading": "Miscellaneous",
        "MenuDescription": insert_spaces_before_caps(package_name),
        "MenuNavigation": "2000 400",
        "firstPanel": "PARAMS",
        "signalEvent": f"smeserver-{package_name.lower()}-update",
        "html": [panel],
    }

    # Serialise once in memory, convert the string literals to JSON5-style
    # single quotes (escaping any apostrophes inside values), and write the
    # file a single time with the provenance banner on top.
    body = json.dumps(json5_data, ensure_ascii=False, indent=4)
    content = f"//\n// Generated by {strVersion}\n//\n" + _single_quote_json_strings(body)

    with open(output_filename, "w", encoding="utf-8") as json_file:
        json_file.write(content)
|
||||
|
||||
|
||||
def main():
|
||||
strVersion = assemble_version_string()
|
||||
# command line parameters
|
||||
parser = argparse.ArgumentParser(description="sm1--html-2-jsopn5")
|
||||
parser.add_argument(
|
||||
@@ -325,24 +410,24 @@ def main():
|
||||
#
|
||||
# Generate output JSON5 filename based on input file name
|
||||
#
|
||||
# Split the original path into directory and file name
|
||||
# Split the original path into directory and file name
|
||||
directory, filename = os.path.split(input_file)
|
||||
|
||||
# Replace 'html' with 'json5' in the directory path
|
||||
new_directory = directory.replace('/html', '/json5')
|
||||
#print(new_directory)
|
||||
new_directory = directory.replace("/html", "/json5")
|
||||
# print(new_directory)
|
||||
|
||||
# Construct the new path
|
||||
output_file = os.path.join(new_directory, filename.replace('.html', '.json5'))
|
||||
output_file = check_file_version(os.path.join(new_directory, filename.replace(".html", ".json5")))
|
||||
print(output_file)
|
||||
#quit(1)
|
||||
|
||||
# quit(1)
|
||||
|
||||
# Generate output JSON5 filename based on input file name
|
||||
base_name = os.path.basename(input_file) # Get the file name (with extension)
|
||||
package_name = os.path.splitext(base_name)[0] # Use the filename without extension
|
||||
|
||||
# Save extracted data to JSON5
|
||||
save_to_json5(data, output_file, package_name, header, sub_header)
|
||||
save_to_json5(data, output_file, package_name, header, sub_header, strVersion)
|
||||
print(f"Extracted data saved to '{output_file}'.")
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user