import json
import os
import re

from bs4 import BeautifulSoup
from lxml import etree  # Import lxml for HTML validation

# Input names that are skipped even when the input is not type="hidden".
_SKIPPED_INPUT_NAMES = ('page', 'page_stack', '.id', 'csrf_token')

# Tags collected by extract_data, visited in document order.
_EXTRACTED_TAGS = [
    'h1', 'h2', 'p', 'pre', 'input', 'select', 'textarea', 'button', 'table',
]

# Single-pass equivalent of the whitespace/quote replacements in sanitize_text.
_SANITIZE_TABLE = str.maketrans({
    '\n': ' ',
    '\r': ' ',
    '\t': ' ',
    '"': '\\"',
    "'": "\\'",
})


def read_html_file(filename):
    """Return the full contents of *filename*, decoded as UTF-8."""
    with open(filename, 'r', encoding='utf-8') as file:
        return file.read()


def validate_html(html):
    """Check that *html* can be parsed by lxml's HTML parser.

    Returns None on success.

    Raises:
        ValueError: if parsing raises any exception; the original error is
            chained as ``__cause__``.
    """
    # NOTE(review): lxml's HTMLParser recovers from most malformed markup, so
    # this presumably rejects only empty or otherwise unparseable input --
    # confirm that this is the intended strictness.
    try:
        parser = etree.HTMLParser()
        etree.fromstring(html, parser)  # Attempt to parse the HTML
    except Exception as e:
        raise ValueError("Invalid HTML document") from e


def sanitize_text(text):
    """Flatten whitespace and backslash-escape quotes in *text*.

    Newlines, carriage returns and tabs each become a single space, double
    and single quotes are prefixed with a backslash, and the result is
    stripped of leading and trailing whitespace.
    """
    return text.translate(_SANITIZE_TABLE).strip()


def _label_text(label):
    """Return the stripped text of a label tag, or None if there is no label."""
    return label.get_text(strip=True) if label is not None else None


def _table_record(table, table_control):
    """Build the 'Table' record for one table element.

    TopHeadings holds the text of every <th> in the first row. Columns holds
    one "<table_control>-<heading>" entry per heading that has a matching
    <td> (by position) in the first data row.
    """
    rows = table.find_all('tr')
    top_headings = []
    columns = []
    if rows:
        top_headings = [th.get_text(strip=True) for th in rows[0].find_all('th')]
        if len(rows) > 1:
            # Only the first data row decides how many columns are reported.
            cell_count = len(rows[1].find_all('td'))
            columns = [
                f"{table_control}-{heading}"
                for heading in top_headings[:cell_count]
            ]
    return {
        'Type': 'Table',
        'TableControl': table_control,
        'TopHeadings': top_headings,
        'Columns': columns,
    }


def extract_data(html):
    """Extract headers, text blocks, form controls and tables from HTML.

    Elements are visited in document order and each one that qualifies is
    appended to the result as a dict whose 'Type' key identifies it:

    * h1 / h2   -> 'Header' / 'SubHeader' with 'Text'
    * p         -> 'Paragraph' with sanitized 'Text' (empty ones skipped)
    * pre       -> 'Preformatted' with stripped 'Text' (empty ones skipped)
    * input     -> capitalised input type with 'Name', 'Value', 'Label';
                   hidden inputs and the names in _SKIPPED_INPUT_NAMES
                   are skipped
    * select    -> 'Select' with 'Name', 'Options', 'Label'
    * textarea  -> 'Textarea' with 'Name', 'Value', 'Label'
    * button    -> 'Button' with 'Name', 'Value', 'Label'
    * table     -> 'Table' (only tables carrying the 'sme-border' class)

    Returns:
        A tuple ``(records, header_text, sub_header_text)`` where the last
        two are the text of the last h1 and h2 seen, or None if absent.
    """
    soup = BeautifulSoup(html, 'lxml')
    records = []
    header_text = None
    sub_header_text = None
    table_counter = 0

    for element in soup.find_all(_EXTRACTED_TAGS):
        tag = element.name

        if tag == 'h1':
            header_text = element.get_text(strip=True)
            records.append({'Type': 'Header', 'Text': header_text})

        elif tag == 'h2':
            sub_header_text = element.get_text(strip=True)
            records.append({'Type': 'SubHeader', 'Text': sub_header_text})

        elif tag == 'p':
            text = element.get_text(strip=True)
            if text:  # Ignore empty paragraphs
                # Sanitise text: flatten newlines/tabs and escape quotes.
                records.append({
                    'Type': 'Paragraph',
                    'Text': sanitize_text(text),
                })

        elif tag == 'pre':
            text = element.get_text(strip=True)
            if text:  # Ensure non-empty before adding
                records.append({'Type': 'Preformatted', 'Text': text})

        elif tag == 'input':
            if (element.get('type') == 'hidden'
                    or element.get('name') in _SKIPPED_INPUT_NAMES):
                continue
            # NOTE(review): inputs take the label that FOLLOWS them, while
            # select/textarea/button take the preceding one -- confirm that
            # this asymmetry matches the markup being scraped.
            records.append({
                'Type': element.get('type', 'text').capitalize(),
                'Name': element.get('name'),
                'Value': element.get('value', ''),
                'Label': _label_text(element.find_next('label')),
            })

        elif tag == 'select':
            options = [
                {'Value': option.get('value'),
                 'Text': option.get_text(strip=True)}
                for option in element.find_all('option')
            ]
            records.append({
                'Type': 'Select',
                'Name': element.get('name'),
                'Options': options,
                'Label': _label_text(element.find_previous('label')),
            })

        elif tag == 'textarea':
            records.append({
                'Type': 'Textarea',
                'Name': element.get('name'),
                'Value': element.get_text(strip=True),
                'Label': _label_text(element.find_previous('label')),
            })

        elif tag == 'button':
            # Look up the button's own preceding label. The previous code
            # tested a `label` variable left over from an earlier input or
            # textarea branch, which was unbound (NameError) or stale.
            records.append({
                'Type': 'Button',
                'Name': element.get('name'),
                'Value': element.get_text(strip=True),
                'Label': _label_text(element.find_previous('label')),
            })

        elif tag == 'table' and 'sme-border' in element.get('class', []):
            table_counter += 1
            # TableControl is "Table1", "Table2", ... in document order.
            records.append(_table_record(element, f"Table{table_counter}"))

    return records, header_text, sub_header_text