From 944ea98ee8fe1d122e29b9924cc40e6d156f8fcf Mon Sep 17 00:00:00 2001
From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org>
Date: Fri, 21 Feb 2025 09:41:06 +0100
Subject: [PATCH] Improvements for grid tables conversion

---
 toMkdocs/toMkdocs.py | 172 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 141 insertions(+), 31 deletions(-)

diff --git a/toMkdocs/toMkdocs.py b/toMkdocs/toMkdocs.py
index ae0be2a..0222748 100644
--- a/toMkdocs/toMkdocs.py
+++ b/toMkdocs/toMkdocs.py
@@ -486,18 +486,31 @@ def parse_pandoc_table_with_spans(pandoc_table):
 			self.list_flag = False
 
 		def set_alignment(self):
-			header_delimiter_index = 0
-			while header_delimiter_index in range(len(default_alignments)) and self.position > header_delimiter_positions[header_delimiter_index]:
-				header_delimiter_index += 1
-			if header_delimiter_index in range(len(default_alignments)):
-				if self.position < header_delimiter_positions[header_delimiter_index]:
-					self.alignment = default_alignments[header_delimiter_index]
-				elif self.position == header_delimiter_positions[header_delimiter_index]:
-					self.alignment = default_alignments[header_delimiter_index]
+			if has_header:  # header separator found: compare against header_delimiter_positions
+				header_delimiter_index = 0  # index of the candidate column
+				while header_delimiter_index in range(len(default_alignments)) and self.position > header_delimiter_positions[header_delimiter_index]:  # skip columns whose delimiter lies before self.position
 					header_delimiter_index += 1
+				if header_delimiter_index in range(len(default_alignments)):  # a column was found within the known ones
+					if self.position < header_delimiter_positions[header_delimiter_index]:
+						self.alignment = default_alignments[header_delimiter_index]
+					elif self.position == header_delimiter_positions[header_delimiter_index]:  # same assignment as the "<" branch
+						self.alignment = default_alignments[header_delimiter_index]
+						header_delimiter_index += 1  # NOTE(review): no effect, the index is local and not read afterwards
+				else:
+					raise ValueError("Invalid table formatting")  # self.position is beyond every known delimiter position
 			else:
-				raise ValueError("Invalid table formatting")
-
+				body_delimiter_index = 0  # no header: same lookup, but against delimiter_positions
+				while body_delimiter_index in range(len(default_alignments)) and self.position > \
+						delimiter_positions[body_delimiter_index]:  # skip columns whose delimiter lies before self.position
+					body_delimiter_index += 1
+				if body_delimiter_index in range(len(default_alignments)):  # a column was found within the known ones
+					if self.position < delimiter_positions[body_delimiter_index]:
+						self.alignment = default_alignments[body_delimiter_index]
+					elif self.position == delimiter_positions[body_delimiter_index]:  # same assignment as the "<" branch
+						self.alignment = default_alignments[body_delimiter_index]
+						body_delimiter_index += 1  # NOTE(review): no effect, the index is local and not read afterwards
+				else:
+					raise ValueError("Invalid table formatting")  # self.position is beyond every known delimiter position
 	class Row():
 		"""	Represents a row in the markdown file. """
 		cells:list[Cell] = []
@@ -534,12 +547,15 @@ def parse_pandoc_table_with_spans(pandoc_table):
 			if content.strip().startswith("- "):  # List
 				cell.list_flag = True
 				#print(content)
-				cell.content = content.strip() + "\n"  # Add newline to know when the list element ends
+				content = re.sub(r'\\\s*$', "\n", content.strip())
+				cell.content = content + "@"  # Add list element end mark to know when the list element ends
 			elif cell.list_flag and content.strip() != "":  # any other content when handling list is concatenated to the last list element
-				cell.content += content.strip() + "\n"
+				content = re.sub(r'\\\s*$', "\n", content.strip())
+				cell.content += content + "@" #add the list element end mark
 			elif content.strip == "":  # separation between list and other paragraph
-				cell.list_flag = False
-				cell.content += "\n" #if not cell['content'].endswith("\n") else ""
+				#if cell.list_flag:
+				#	cell.list_flag = False
+				cell.content += "\n" if not cell['content'].endswith("\n") else ""
 			else:
 				cell.content = re.sub(r'\\\s*$', "\n", content.strip())
 		else:
@@ -548,12 +564,16 @@ def parse_pandoc_table_with_spans(pandoc_table):
 					cell.content += "\n"
 					#cell['content'] = cell['content'].strip("\n")
 				cell.list_flag = True
-				cell.content += content.strip() + "\n"  # Add newline to know when the list element ends
+				content = re.sub(r'\\\s*$', "\n", content.strip())
+				cell.content += content + "@"  # Add list element end mark to know when the list element ends
 			elif cell.list_flag and content.strip() != "":  # any other content when handling list is concatenated to the last list element
-				cell.content = cell.content.strip("\n")
-				cell.content += " " + content.strip() + "\n"
+				cell.content = cell.content.strip("@") #remove list element end mark
+				content = re.sub(r'\\\s*$', "\n", content.strip())
+				cell.content += " " + content + "@" #add list element end mark
 			elif content.strip() == "":  # separation between list and other paragraph
-				cell.list_flag = False
+				if cell.list_flag:
+					cell.list_flag = False
+					cell.content += "\n\n" #end list by \n
 				#content = re.sub(r'\\\s*$', "\n", content.strip())
 				cell.content += "\n" if not cell.content.endswith("\n") else ""
 			else:
@@ -604,6 +624,7 @@ def parse_pandoc_table_with_spans(pandoc_table):
 				delimiter_positions.append(min(del_positions) if del_positions else -1)
 	has_header = False
 	header_delimiter_positions = []
+	header_rows = []
 	for index in separator_indices:
 		if _matchGridTableHeaderSeparator.match(lines[index]):
 			has_header = True
@@ -624,6 +645,18 @@ def parse_pandoc_table_with_spans(pandoc_table):
 				del_positions = [lines[index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[index][delimiter_positions_start + 1:]]
 				header_delimiter_positions.append(min(del_positions) if del_positions else -1)
 
+	if not has_header:
+		#Set default alignments from the first separator
+		parts = re.split(r"\+", lines[0].strip("+"))
+		default_alignments = []
+		# Calculate default alignments and positions of delimiters
+		for part_index in range(len(parts)):
+			if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
+				default_alignments.append("align=\"left\"")
+			elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
+				default_alignments.append("align=\"right\"")
+			else:
+				default_alignments.append("align=\"center\"")
 	data_rows = []
 	for row in range(len(separator_indices) - 1):
 		rows = []
@@ -636,6 +669,10 @@ def parse_pandoc_table_with_spans(pandoc_table):
 			for line in row_lines:
 				if is_separator(line) and not in_data_row:
 					in_data_row = True
+					# Add delimiter alignment check for separator lines
+					if not check_delimiter_alignment(line, delimiter_positions):
+						raise ValueError(f"Misaligned delimiters in separator row: {line}")
+					
 					parts = re.split(r"\s*\+\s*", line.strip("+"))
 					delimiter_index = 0
 					# Determine the alignment of the cell - In order to replicate Pandoc's behaviour (do not support of alignment colons on separator lines (just header separator)
@@ -667,7 +704,11 @@ def parse_pandoc_table_with_spans(pandoc_table):
 				elif in_data_row:
 					# Regular data row or partial separator
 					if _matchGridTableBodySeparator.match(line): # Partial separator
-						cells_content = re.split(r"[\|\+]", line.strip("|").strip("+"))  # (?<!\\)[\|\+]
+						# Add delimiter alignment check for partial separators
+						if not check_delimiter_alignment(line, delimiter_positions):
+							raise ValueError(f"Misaligned delimiters in partial separator: {line}")
+							
+						cells_content = re.split(r"[\|\+]", line.strip("|").strip("+"))
 						#Add another row, set delimiters for each cell
 						rows.append(Row(number_of_columns))
 						aux_delimiter_index = 0
@@ -717,7 +758,13 @@ def parse_pandoc_table_with_spans(pandoc_table):
 						else:
 							raise ValueError("More cells than columns found")
 					else: # Data row
-						cells_content = re.split(r"\s*\|\s*", line.strip("|"))
+						cells_content = line.strip()
+						cells_content = re.split(r"\|", line.strip("|"))
+						
+						# Add delimiter alignment check
+						if not check_delimiter_alignment(line, delimiter_positions):
+							raise ValueError(f"Misaligned delimiters in row: {line}")
+							
 						column_index = 0
 						if len(cells_content) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
 							for i in range(len(cells_content)):
@@ -744,6 +791,10 @@ def parse_pandoc_table_with_spans(pandoc_table):
 			elif has_header and start < header_separator_index: # table_row and auxiliar_row are part of header_rows
 				for header_row in rows:
 					header_rows.append(header_row.cells)
+			else:
+				#only body
+				for body_row in rows:
+					data_rows.append(body_row.cells)
 
 	#print(header_rows)
 	#print(data_rows)
@@ -821,19 +872,32 @@ def parse_pandoc_table_with_spans(pandoc_table):
 
 	return header_rows, data_rows
 
-def generate_html_table_with_spans(pandoc_table):
+def generate_html_table_with_spans(pandoc_table: str) -> str:
 	"""
 	Generate an HTML table from a Pandoc-style grid table with row and column spans.
 
-	:param pandoc_table: String of the Pandoc-style grid table.
-	:return: HTML string.
+	Args:
+		pandoc_table (str): String of the Pandoc-style grid table.
+
+	Returns:
+		str: Generated HTML table markup, or error message if generation fails.
 	"""
+	debug_output = []
+	def debug_print(msg):
+		debug_output.append(str(msg))  # Convert message to string
+
 	try:
+		# Redirect print statements to our debug collector
+		global print
+		original_print = print
+		print = debug_print
+		
 		grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table)
-	except:
-		logging.ERROR("Grid table could not be generated")
-		return "HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS"
-	else:
+		
+		# Restore original print
+		print = original_print
+		
+		# Generate table HTML...
 		html = "<table>\n"
 		has_header = False
 
@@ -851,7 +915,7 @@ def generate_html_table_with_spans(pandoc_table):
 					else:
 						# Prepare content, in case there's a list
 						#print(cell.content)
-						if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>",
+						if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+)@",
 												 cell.content):  # Update cell in new row
 							#print("MATCHING")
 							list = "<ul>"
@@ -859,7 +923,7 @@ def generate_html_table_with_spans(pandoc_table):
 							for match in matches:
 								list += "<li>" + match[1] + "</li>"
 							list += "</ul>"
-							cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell.content)
+							cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+(?:(?!@).)+@)+", list, cell.content)
 							# Enforce left alignment if cell contains a list
 							cell.alignment = "align=\"left\""
 						#else:
@@ -880,7 +944,7 @@ def generate_html_table_with_spans(pandoc_table):
 				else:
 					#Prepare content, in case there's a list
 					#print(cell.content)
-					if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell.content):  # Update cell in new row
+					if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+)@", cell.content):  # Update cell in new row
 						#print("MATCHING")
 						#print(cell.content)
 						list = "<ul>"
@@ -888,7 +952,7 @@ def generate_html_table_with_spans(pandoc_table):
 						for match in matches:
 							list += "<li>" + match[1] + "</li>"
 						list += "</ul>"
-						cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell.content)
+						cell.content = re.sub(r"\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+@)+",list, cell.content)
 						# Enforce left alignment if cell contains a list
 						cell.alignment = "align=\"left\""
 					#else:
@@ -901,6 +965,52 @@ def generate_html_table_with_spans(pandoc_table):
 		html += "    </tbody>\n"
 		html += "</table>"
 		return html
+	except Exception:
+		print = original_print  # restore print on failure too; otherwise debug_print stays installed module-wide
+		logging.exception("Grid table could not be generated")  # same message, now with the traceback of the cause
+		return "HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE.<br><pre>" + "<br>".join(debug_output) + "</pre>"
+
+def check_delimiter_alignment(line: str, delimiter_positions: list[int], delimiters: str = "|+") -> bool:
+    """
+    Check if delimiters in a row align with expected positions.
+
+    A line passes only when it starts with a delimiter, has a delimiter on the last
+    expected position, and every delimiter after column 0 is on an expected position.
+
+    Args:
+        line: The line of text to check
+        delimiter_positions: List of expected positions (based on + characters)
+        delimiters: Delimiter characters searched on lines that are neither
+            pure "+" nor pure "|" lines (default: "|+")
+
+    Returns:
+        bool: True if delimiters align correctly, False otherwise (also for an
+            empty line, no expected positions, or no delimiter after column 0)
+    """
+    if not line or not delimiter_positions:
+        return False
+
+    print(f"\nChecking line: '{line}'")
+    print(f"Expected delimiter positions: {delimiter_positions}")
+
+    has_plus, has_pipe = '+' in line, '|' in line
+    if has_plus and not has_pipe:  # Full separator line (only +)
+        valid, starts, label = "+", "+", "Full separator line - Found + at positions"
+    elif has_pipe and not has_plus:  # Data line (only |)
+        valid, starts, label = "|", "|", "Data line - Found | at positions"
+    else:  # Partial separator (mix of + and |)
+        valid, starts, label = delimiters, ("+", "|"), "Partial separator - Found delimiters at positions"
+    # Column 0 is skipped here; the opening delimiter is checked with startswith()
+    current_positions = [i for i, char in enumerate(line) if i != 0 and char in valid]
+    print(f"{label}: {current_positions}")
+    if has_plus == has_pipe:  # i.e. the partial-separator branch above
+        print(f"Characters at those positions: {[line[pos] for pos in current_positions]}")
+
+    # Explicit checks rather than all(...): all() over an empty list is True, which
+    # accepted lines with no delimiter after column 0 (e.g. a truncated row)
+    if not line.startswith(starts) or delimiter_positions[-1] not in current_positions:
+        return False
+    return set(current_positions) <= set(delimiter_positions)
 
 def analyseMarkdown(filename:str) -> Document:
 	"""	Analyse the markdown file and split it into clauses.
-- 
GitLab