From 2451610e5d7b6278076cd1640cc04870de606ce6 Mon Sep 17 00:00:00 2001
From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org>
Date: Wed, 27 Nov 2024 15:06:25 +0100
Subject: [PATCH] More cleanup + cell alignment as defined in header separator
 line (Pandoc's behaviour)

---
 toMkdocs/toMkdocs.py | 222 +++++++++++++++++--------------------------
 1 file changed, 88 insertions(+), 134 deletions(-)

diff --git a/toMkdocs/toMkdocs.py b/toMkdocs/toMkdocs.py
index 69427be..33947cb 100644
--- a/toMkdocs/toMkdocs.py
+++ b/toMkdocs/toMkdocs.py
@@ -422,6 +422,7 @@ _matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE)
 _matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE)
 _matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE)
 _matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE)
+_matchGridTableBodySeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)  # Used with .match() on a split cell: the cell consists only of '-' and ':' characters
 _match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
 _matchListInContent = re.compile(r'^(?:\s*(P<marker>[-*+]|\s*\d+\.))\s+(P<content>.+)$', re.IGNORECASE)
 _markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)
@@ -502,8 +503,24 @@ def parse_pandoc_table_with_spans(pandoc_table):
 		#print(cell['content'])
 		return list_flag, cell
 
+	def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions):
+		"""Widen row[column_index]['colspan'] while the next '|'/'+' in line lies past the expected delimiter; return that cell (ValueError if it lies before)."""
+		for j in range(column_index, number_of_parts):
+			delimiter_start = row[j - 1]['position'] if j != 0 else 0
+			positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "|+" if delimiter in line[delimiter_start + 1:]]
+			position = min(positions) if positions else -1
+			if position > delimiter_positions[j]:  # Colspan to be increased
+				# Use column_index explicitly; do not depend on the caller's loop variable `i` through the closure
+				row[column_index]['colspan'] += 1
+				if position == delimiter_positions[-1]:  # last cell in row, adjust colspan to get max number columns
+					colspan_allocated = sum(row[cell_index]['colspan'] for cell_index in range(number_of_parts))
+					row[column_index]['colspan'] += number_of_columns - colspan_allocated
+			elif position < delimiter_positions[j]:
+				raise ValueError("Wrong cell formatting")
+			else:
+				break
+		return row[column_index]
 
-	_matchGridTableSeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
 	separator_indices = [i for i, line in enumerate(lines) if is_separator(line)]
 
 	print(separator_indices)
@@ -522,11 +539,26 @@ def parse_pandoc_table_with_spans(pandoc_table):
 				del_positions = [lines[separator_index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[separator_index][delimiter_positions_start + 1:]]
 				delimiter_positions.append(min(del_positions) if del_positions else -1)
 	has_header = False
+	header_delimiter_positions = []
 	for index in separator_indices:
 		if _matchGridTableHeaderSeparator.match(lines[index]):
 			has_header = True
 			header_separator_index = index
 			header_rows = []
+			parts = re.split(r"\s*\+\s*", lines[index].strip("+"))
+			default_alignments = []
+			#Calculate default alignments and positions of delimiters
+			for part_index in range(len(parts)):
+				if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
+					default_alignments.append("align=\"left\"")
+				elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
+					default_alignments.append("align=\"right\"")
+				else:
+					default_alignments.append("align=\"center\"")
+				# Delimiter position
+				delimiter_positions_start = delimiter_positions[part_index - 1] if part_index != 0 else 0
+				del_positions = [lines[index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[index][delimiter_positions_start + 1:]]
+				header_delimiter_positions.append(min(del_positions) if del_positions else -1)
 
 	data_rows = []
 	for row in range(len(separator_indices) - 1):
@@ -550,14 +582,15 @@ def parse_pandoc_table_with_spans(pandoc_table):
 					# Determine the alignment of the cell - In order to replicate Pandoc's behaviour (do not support of alignment colons on separator lines (just header separator)
 					# we need to assign the default alignment as defined in the header separator line
 					# We may not need the code below, as that supports alignment per cell and row
-					alignments = []
-					for part_index in range(len(parts)):
-						if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
-							alignments.append("align=\"left\"")
-						elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
-							alignments.append("align=\"right\"")
-						else:
-							alignments.append("align=\"center\"")
+					#alignments = []
+					#for part_index in range(len(parts)):
+					#	if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
+					#		alignments.append("align=\"left\"")
+					#	elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
+					#		alignments.append("align=\"right\"")
+					#	else:
+					#		alignments.append("align=\"center\"")
+					header_delimiter_index = 0
 					for i in range(number_of_columns_row):
 						delimiter_index += len(parts[i]) + 1
 						table_row.append({
@@ -565,9 +598,21 @@ def parse_pandoc_table_with_spans(pandoc_table):
 							"rowspan": 0,
 							"colspan": 0,
 							"colspan_adjusted": False,
-							"alignment": alignments[i] if alignments[i] else "align=\"center\"",
+							"alignment": default_alignments[i] if i == 0 else "align=\"center\"",
 							"position": delimiter_index # Position of cell delimiter +
 						})
+						#Set alignment as defined by header separator line
+						while header_delimiter_index in range(len(default_alignments)) and table_row[i]['position'] > header_delimiter_positions[header_delimiter_index]:
+							header_delimiter_index += 1
+						if header_delimiter_index in range(len(default_alignments)):
+							if table_row[i]['position'] < header_delimiter_positions[header_delimiter_index]:
+								table_row[i]['alignment'] = default_alignments[header_delimiter_index]
+							elif table_row[i]['position'] == header_delimiter_positions[header_delimiter_index]:
+								table_row[i]['alignment'] = default_alignments[i]
+								header_delimiter_index += 1
+						else:
+							raise ValueError("Invalid table formatting")
+
 					for i in range(number_of_columns):
 						auxiliar_row.append({
 							"content": None,
@@ -585,10 +630,11 @@ def parse_pandoc_table_with_spans(pandoc_table):
 					if _matchGridTableBodySeparator.match(line): # Partial separator
 						has_merged_cells = True
 						cells = re.split(r"\s*[\|\+]\s*", line.strip("|").strip("+")) # (?<!\\)[\|\+]
-						if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
+						if len(cells) <= number_of_columns: # Colspan: Positions of | with respect to + need to be determined
 							for i in range(len(cells)):
-								if _matchGridTableSeparatorLine.match(cells[i]):  # A new row is to be added
+								if _matchGridTableBodySeparatorLine.match(cells[i]):  # A new row is to be added
 									use_auxiliar_row[i] = True
+									list_flags[i] = False
 									if cells[i].startswith(":") and not cells[i].endswith(":"):
 										auxiliar_row[i]['alignment'] = "align=\"left\""
 									elif not cells[i].startswith(":") and  cells[i].endswith(":"):
@@ -596,145 +642,49 @@ def parse_pandoc_table_with_spans(pandoc_table):
 									else:
 										auxiliar_row[i]['alignment'] = "align=\"center\""
 								else:
+									#Handle content of the cell
 									list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
-									#if table_row[i]['content'] is None:
-									#	table_row[i]['rowspan'] += 1
-									#	table_row[i]['colspan'] += 1
-										#if cells[i].strip().startswith("- "):  # List
-										#	handling_list = True
-										#	print(cells[i])
-										#	table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
-										#elif handling_list: # any other content when handling list is concatenated to the last list element
-										#	table_row[i]['content'].strip("\n")
-										#	table_row[i]['content'] += cells[i] + "\n"
-										#elif cells[i].strip(): #separation between list and other paragraph
-										#	handling_list = False
-										#	table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
-										#else:
-										#	table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
-									#else:
-									#	if cells[i].strip().startswith("- "): # List
-									#		print(cells[i])
-									#		table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
-									#	else:
-									#		table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])									# Cell which is not separator
+									# Cell which is not separator
 									table_row[i]['rowspan'] += 1
 									if not table_row[i]['colspan_adjusted']:
 										table_row[i]['colspan_adjusted'] = True
-										for j in range(i, len(cells)):
-											delimiter_start = table_row[j-1]['position'] if j != 0 else 0
-											positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "|+" if delimiter in line[delimiter_start + 1:]]
-											position = min(positions) if positions else -1
-											if position > delimiter_positions_start[j]: # Colspan to add
-												table_row[i]['colspan'] += 1
-											elif position < delimiter_positions_start[j]:
-												raise ValueError("Wrong cell formatting")
-											else:
-												break
-						elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added
-							for i in range(len(cells)):
-								if _matchGridTableSeparatorLine.match(cells[i]):  # Update cell in new row
-									use_auxiliar_row[i] = True
-									if cells[i].startswith(":") and not cells[i].endswith(":"):
-										auxiliar_row[i]['alignment'] = "align=\"left\""
-									elif not cells[i].startswith(":") and  cells[i].endswith(":"):
-										auxiliar_row[i]['alignment'] = "align=\"right\""
-									else:
-										auxiliar_row[i]['alignment'] = "align=\"center\""
-								else:
-									list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
-
-#									if table_row[i]['content'] is None:
-#										table_row[i]['rowspan'] += 1
-#										table_row[i]['colspan'] += 1
-#										if cells[i].strip().startswith("- "): # List
-#											print(cells[i])
-#											table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
-#										else:
-#											table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
-#									else:
-#										if cells[i].strip().startswith("- "): # List
-#											print(cells[i])
-#											table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
-#										else:
-#											table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
-
-									# Cell which is not separator
-									table_row[i]['rowspan'] += 1
-									# Not needed, no colspan as number of cells is equal to number of columns
-									#for j in range(i, len(cells)):
-									#	delimiter_start = table_row[j-1]['position'] if j != 0 else 0
-									#	positions = [line.find(delimiter,delimiter_start+1) for delimiter in "|+" if delimiter in line[delimiter_start+1:]]
-									#	position = min(positions) if positions else -1
-									#	if position > table_row[i]['position']:  # Only colspan to be increased
-									#		table_row[i]['colspan'] += 1
-									#	elif position + 1  < table_row[i]['position']:
-									#		raise ValueError("Wrong cell formatting")
-									#	else:
-									#		break
-
+										#TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
+										table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions)
+						#elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added
+						#	for i in range(len(cells)):
+						#		if _matchGridTableBodySeparatorLine.match(cells[i]):  # Update cell in new row
+						#			use_auxiliar_row[i] = True
+						#			list_flags[i] = False
+						#			if cells[i].startswith(":") and not cells[i].endswith(":"):
+						#				auxiliar_row[i]['alignment'] = "align=\"left\""
+						#			elif not cells[i].startswith(":") and  cells[i].endswith(":"):
+						#				auxiliar_row[i]['alignment'] = "align=\"right\""
+						#			else:
+						#				auxiliar_row[i]['alignment'] = "align=\"center\""
+						#		else:
+						#			#Handle content of the cell
+						#			list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
+						#			# Cell which is not separator
+						#			table_row[i]['rowspan'] += 1
+						#			# Adjusting of colspan not needed, no colspan as number of cells is equal to number of columns
 						else:
 							raise ValueError("More cells than columns found")
 					else: # Data row
 						cells = re.split(r"\s*\|\s*", line.strip("|"))
 						if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
 							for i in range(len(cells)):
+								# Handle content of the cell
 								list_flags[i], table_row[i] = handling_content(table_row[i], cells[i], list_flags[i])
-#								if table_row[i]['content'] is None:
-#									table_row[i]['rowspan'] += 1
-#									table_row[i]['colspan'] += 1
-#									if cells[i].strip().startswith("- "):  # List
-#										print(cells[i])
-#										table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
-#									else:
-#										table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
-#								else:
-#									if cells[i].strip().startswith("- "):  # List
-#										print(cells[i])
-#										table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
-#									else:
-#										table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
 								if not table_row[i]['colspan_adjusted']:
 									table_row[i]['colspan_adjusted'] = True
-									for j in range(i, len(cells)):
-										delimiter_start = table_row[j-1]['position'] if j != 0 else 0
-										if line.find("|", delimiter_start+1) > delimiter_positions[j]: # Colspan to be increased
-											table_row[i]['colspan'] += 1
-											if line.find("|", delimiter_start + 1) == delimiter_positions[len(delimiter_positions) - 1]:  # last cell in row, adjust colspan to get max number columns
-												colspan_remaining = 0
-												for cell_index in range(number_of_columns_row):
-													colspan_remaining += table_row[cell_index]['colspan']
-												table_row[i]['colspan'] += number_of_columns - colspan_remaining
-										elif line.find("|", delimiter_start+1) < delimiter_positions[j]:
-											raise ValueError("Wrong cell formatting")
-										else:
-											break
-
+									table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions)
 						elif len(cells) == number_of_columns: # Simple row
 							for i in range(len(cells)):
 								if use_auxiliar_row[i]:
-									if auxiliar_row[i]['content'] is None:
-										auxiliar_row[i]['rowspan'] += 1
-										auxiliar_row[i]['colspan'] += 1
-										auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
-									else:
-										auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
+									list_flags[i], auxiliar_row[i] = handling_content(auxiliar_row[i], cells[i],list_flags[i])
 								else:
+									# Handle content of the cell
 									list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
-#									if table_row[i]['content'] is None:
-#										table_row[i]['rowspan'] += 1
-#										table_row[i]['colspan'] += 1
-#										if cells[i].strip().startswith("- "):  # List
-#											print(cells[i])
-#											table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
-#										else:
-#											table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
-#									else:
-#										if cells[i].strip().startswith("- "):  # List
-#											print(cells[i])
-#											table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
-#										else:
-#											table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
 						else:
 							raise ValueError("More cells than columns found")
 				else:
@@ -850,7 +800,7 @@ def generate_html_table_with_spans(pandoc_table):
 					continue
 				else:
 					# Prepare content, in case there's a list
-					print(cell['content'])
+					#print(cell['content'])
 					if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>",
 											 cell['content']):  # Update cell in new row
 						#print("MATCHING")
@@ -860,6 +810,8 @@ def generate_html_table_with_spans(pandoc_table):
 							list += "<li>" + match[1] + "</li>"
 						list += "</ul>"
 						cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell['content'])
+						# Enforce left alignment if cell contains a list
+						cell['alignment'] = "align=\"left\""
 					#else:
 					#	print("NOT MATCHING")
 
@@ -886,6 +838,8 @@ def generate_html_table_with_spans(pandoc_table):
 						list += "<li>" + match[1] + "</li>"
 					list += "</ul>"
 					cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell['content'])
+					# Enforce left alignment if cell contains a list
+					cell['alignment'] = "align=\"left\""
 				#else:
 					#print("NOT MATCHING")
 				rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
-- 
GitLab