From d59cfbc2f99f58e9a1fb7f9ce7c4f7df5eeb040e Mon Sep 17 00:00:00 2001
From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org>
Date: Tue, 26 Nov 2024 14:11:30 +0100
Subject: [PATCH] Some cleanup + parsing converting lists in cells to html
 lists

---
 toMkdocs/toMkdocs.py | 266 +++++++++++++++++++++++++++++++++----------
 1 file changed, 204 insertions(+), 62 deletions(-)

diff --git a/toMkdocs/toMkdocs.py b/toMkdocs/toMkdocs.py
index 3908718..69427be 100644
--- a/toMkdocs/toMkdocs.py
+++ b/toMkdocs/toMkdocs.py
@@ -417,12 +417,13 @@ _matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE)
 _matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE)
 _matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE)
 _matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECASE)
-_matchTable = re.compile(r'^\s*\|.*\|\s$', re.IGNORECASE)
+_matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE)
 _matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE)
 _matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE)
-_matchGridTableBodySeparator = re.compile(r'.*\+([-:]+\+)+.*$', re.IGNORECASE)
+_matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE)
 _matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE)
 _match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
+_matchListInContent = re.compile(r'^(?:\s*(?P<marker>[-*+]|\s*\d+\.))\s+(?P<content>.+)$', re.IGNORECASE)  # list item: named groups 'marker' and 'content'
 _markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)
 _htmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE)
 _htmlAnchorLink = re.compile(r'<a\s+name="([^"]*)">[^<]*</a>', re.IGNORECASE)
@@ -466,6 +467,42 @@ def parse_pandoc_table_with_spans(pandoc_table):
 		_matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE)
 		return _matchGridTableSeparator.match(line)
 
+	def handling_content(cell, content, list_flag):
+		"""Append one text line to `cell`; return `(list_flag, cell)`, list_flag being True inside a "- " list."""
+		stripped = content.strip()  # use the argument only, not the caller's `cells[i]` closure variables
+		if cell['content'] is None:
+			# First text seen for this cell: both span counters are bumped once
+			cell['rowspan'] += 1
+			cell['colspan'] += 1
+			if stripped.startswith("- "):  # List
+				list_flag = True
+				cell['content'] = stripped + "\n"  # Add newline to know when the list element ends
+			elif list_flag:  # stale flag with no content yet: assign, `+=` on None would raise TypeError
+				cell['content'] = stripped + "\n"
+			elif stripped == "":  # blank line: content is kept unstripped
+				list_flag = False
+				cell['content'] = re.sub(r'\\\s*$', "\n", content)
+			else:
+				cell['content'] = re.sub(r'\\\s*$', "\n", stripped)  # trailing backslash -> line break
+		else:
+			if stripped.startswith("- "):  # List
+				if not list_flag:
+					cell['content'] += "\n"  # separate the list from the preceding text
+				list_flag = True
+				cell['content'] += stripped + "\n"  # Add newline to know when the list element ends
+			elif list_flag:  # any other content when handling list is concatenated to the last list element
+				# NOTE(review): blank lines also land here, so a list is never closed once started - confirm intended
+				cell['content'] = cell['content'].strip("\n")
+				cell['content'] += " " + stripped + "\n"
+			elif stripped == "":  # separation between paragraphs
+				list_flag = False
+				cell['content'] += "\n" if not cell['content'].endswith("\n") else ""
+			else:
+				content = re.sub(r'\\\s*$', "\n", stripped)  # trailing backslash -> line break
+				cell['content'] += " " + content
+		return list_flag, cell
+
+
 	_matchGridTableSeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
 	separator_indices = [i for i, line in enumerate(lines) if is_separator(line)]
 
@@ -490,11 +527,13 @@ def parse_pandoc_table_with_spans(pandoc_table):
 			has_header = True
 			header_separator_index = index
 			header_rows = []
+
 	data_rows = []
 	for row in range(len(separator_indices) - 1):
 		table_row = []
 		auxiliar_row = []
 		use_auxiliar_row = []
+		list_flags = []
 		has_merged_cells = False
 		in_data_row = False
 		start, end = separator_indices[row], separator_indices[row + 1]
@@ -508,24 +547,38 @@ def parse_pandoc_table_with_spans(pandoc_table):
 					parts = re.split(r"\s*\+\s*", line.strip("+"))
 					# Add as many cells as columns with span attributes
 					delimiter_index = 0
+					# Determine the alignment of the cell. Pandoc does not support alignment colons on separator lines other than the header separator,
+					# so to replicate its behaviour we need to assign the default alignment as defined in the header separator line.
+					# We may not need the code below, as it supports alignment per cell and row.
+					alignments = []
+					for part_index in range(len(parts)):
+						if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
+							alignments.append("align=\"left\"")
+						elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
+							alignments.append("align=\"right\"")
+						else:
+							alignments.append("align=\"center\"")
 					for i in range(number_of_columns_row):
 						delimiter_index += len(parts[i]) + 1
 						table_row.append({
-							"content": "NOCONTENT",
+							"content": None,
 							"rowspan": 0,
 							"colspan": 0,
 							"colspan_adjusted": False,
+							"alignment": alignments[i] if alignments[i] else "align=\"center\"",
 							"position": delimiter_index # Position of cell delimiter +
 						})
 					for i in range(number_of_columns):
 						auxiliar_row.append({
-							"content": "NOCONTENT",
+							"content": None,
 							"rowspan": 0,
 							"colspan": 0,
 							"colspan_adjusted": False,
+							"alignment": "align=\"center\"",
 							"position": 0
 						})
 						use_auxiliar_row.append(False)
+						list_flags.append(False)
 
 				elif in_data_row:
 					# Regular data row or partial separator
@@ -536,14 +589,35 @@ def parse_pandoc_table_with_spans(pandoc_table):
 							for i in range(len(cells)):
 								if _matchGridTableSeparatorLine.match(cells[i]):  # A new row is to be added
 									use_auxiliar_row[i] = True
-								else:
-									if table_row[i]['content'] == "NOCONTENT":
-										table_row[i]['rowspan'] += 1
-										table_row[i]['colspan'] += 1
-										table_row[i]['content'] = cells[i]
+									if cells[i].startswith(":") and not cells[i].endswith(":"):
+										auxiliar_row[i]['alignment'] = "align=\"left\""
+									elif not cells[i].startswith(":") and  cells[i].endswith(":"):
+										auxiliar_row[i]['alignment'] = "align=\"right\""
 									else:
-										table_row[i]['content'] += cells[i]
-									# Cell which is not separator
+										auxiliar_row[i]['alignment'] = "align=\"center\""
+								else:
+									list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
+									#if table_row[i]['content'] is None:
+									#	table_row[i]['rowspan'] += 1
+									#	table_row[i]['colspan'] += 1
+										#if cells[i].strip().startswith("- "):  # List
+										#	handling_list = True
+										#	print(cells[i])
+										#	table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
+										#elif handling_list: # any other content when handling list is concatenated to the last list element
+										#	table_row[i]['content'].strip("\n")
+										#	table_row[i]['content'] += cells[i] + "\n"
+										#elif cells[i].strip(): #separation between list and other paragraph
+										#	handling_list = False
+										#	table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
+										#else:
+										#	table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
+									#else:
+									#	if cells[i].strip().startswith("- "): # List
+									#		print(cells[i])
+									#		table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
+									#	else:
+									#		table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])									# Cell which is not separator
 									table_row[i]['rowspan'] += 1
 									if not table_row[i]['colspan_adjusted']:
 										table_row[i]['colspan_adjusted'] = True
@@ -561,13 +635,30 @@ def parse_pandoc_table_with_spans(pandoc_table):
 							for i in range(len(cells)):
 								if _matchGridTableSeparatorLine.match(cells[i]):  # Update cell in new row
 									use_auxiliar_row[i] = True
-								else:
-									if table_row[i]['content'] == "NOCONTENT":
-										table_row[i]['rowspan'] += 1
-										table_row[i]['colspan'] += 1
-										table_row[i]['content'] = cells[i]
+									if cells[i].startswith(":") and not cells[i].endswith(":"):
+										auxiliar_row[i]['alignment'] = "align=\"left\""
+									elif not cells[i].startswith(":") and  cells[i].endswith(":"):
+										auxiliar_row[i]['alignment'] = "align=\"right\""
 									else:
-										table_row[i]['content'] += cells[i]
+										auxiliar_row[i]['alignment'] = "align=\"center\""
+								else:
+									list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
+
+#									if table_row[i]['content'] is None:
+#										table_row[i]['rowspan'] += 1
+#										table_row[i]['colspan'] += 1
+#										if cells[i].strip().startswith("- "): # List
+#											print(cells[i])
+#											table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
+#										else:
+#											table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
+#									else:
+#										if cells[i].strip().startswith("- "): # List
+#											print(cells[i])
+#											table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
+#										else:
+#											table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
+
 									# Cell which is not separator
 									table_row[i]['rowspan'] += 1
 									# Not needed, no colspan as number of cells is equal to number of columns
@@ -588,40 +679,62 @@ def parse_pandoc_table_with_spans(pandoc_table):
 						cells = re.split(r"\s*\|\s*", line.strip("|"))
 						if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
 							for i in range(len(cells)):
-								if table_row[i]['content'] == "NOCONTENT":
-									table_row[i]['rowspan'] += 1
-									table_row[i]['colspan'] += 1
-									table_row[i]['content'] = cells[i]
-								else:
-									table_row[i]['content'] += cells[i]
+								list_flags[i], table_row[i] = handling_content(table_row[i], cells[i], list_flags[i])
+#								if table_row[i]['content'] is None:
+#									table_row[i]['rowspan'] += 1
+#									table_row[i]['colspan'] += 1
+#									if cells[i].strip().startswith("- "):  # List
+#										print(cells[i])
+#										table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
+#									else:
+#										table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
+#								else:
+#									if cells[i].strip().startswith("- "):  # List
+#										print(cells[i])
+#										table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
+#									else:
+#										table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
 								if not table_row[i]['colspan_adjusted']:
 									table_row[i]['colspan_adjusted'] = True
 									for j in range(i, len(cells)):
 										delimiter_start = table_row[j-1]['position'] if j != 0 else 0
 										if line.find("|", delimiter_start+1) > delimiter_positions[j]: # Colspan to be increased
 											table_row[i]['colspan'] += 1
+											if line.find("|", delimiter_start + 1) == delimiter_positions[len(delimiter_positions) - 1]:  # last cell in row, adjust colspan to get max number columns
+												colspan_remaining = 0
+												for cell_index in range(number_of_columns_row):
+													colspan_remaining += table_row[cell_index]['colspan']
+												table_row[i]['colspan'] += number_of_columns - colspan_remaining
 										elif line.find("|", delimiter_start+1) < delimiter_positions[j]:
 											raise ValueError("Wrong cell formatting")
 										else:
-
 											break
 
 						elif len(cells) == number_of_columns: # Simple row
 							for i in range(len(cells)):
 								if use_auxiliar_row[i]:
-									if auxiliar_row[i]['content'] == "NOCONTENT":
+									if auxiliar_row[i]['content'] is None:
 										auxiliar_row[i]['rowspan'] += 1
 										auxiliar_row[i]['colspan'] += 1
-										auxiliar_row[i]['content'] = cells[i]
+										auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
 									else:
-										auxiliar_row[i]['content'] += cells[i]
+										auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
 								else:
-									if table_row[i]['content'] == "NOCONTENT":
-										table_row[i]['rowspan'] += 1
-										table_row[i]['colspan'] += 1
-										table_row[i]['content'] = cells[i]
-									else:
-										table_row[i]['content'] += cells[i]
+									list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
+#									if table_row[i]['content'] is None:
+#										table_row[i]['rowspan'] += 1
+#										table_row[i]['colspan'] += 1
+#										if cells[i].strip().startswith("- "):  # List
+#											print(cells[i])
+#											table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
+#										else:
+#											table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
+#									else:
+#										if cells[i].strip().startswith("- "):  # List
+#											print(cells[i])
+#											table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
+#										else:
+#											table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
 						else:
 							raise ValueError("More cells than columns found")
 				else:
@@ -638,39 +751,47 @@ def parse_pandoc_table_with_spans(pandoc_table):
 
 	#print(header_rows)
 	#print(data_rows)
-	# Correct newlines characters
-	for row in header_rows:
-		for cell in row:
-			cell['content'] = cell['content'].replace("\\", "<br>")
-	for row in data_rows:
-		for cell in row:
-			cell['content'] = cell['content'].replace("\\", "<br>")
 	# Check if there are any data rows
 	if not data_rows and not header_rows:
 		raise ValueError("No valid rows found in the provided Pandoc table.")
 
 	# Format text
-	bold = "<strong>"
+	for rows in [header_rows, data_rows]:
+		bold = "<strong>"
+		italic = "<i>"
+		for row in rows:
+			for cell in row:
+				if cell['content'] is not None:
+					# Replacing "<" by &lt;
+					cell['content'] = cell['content'].replace("<", "&lt;")
+
+					#Bold
+					for bold_characters in ["**", "__"]:
+						while cell['content'].find(bold_characters) != -1:
+							cell['content'] = cell['content'].replace(bold_characters, bold, 1)
+							if bold == "<strong>":
+								bold = "</strong>"
+							else:
+								bold = "<strong>"
+					#Italic
+					while cell['content'].find("_") != -1 and cell['content'].find("\_") == -1:
+						cell['content'] = cell['content'].rstrip() .replace("_", italic, 1)
+						if italic == "<i>":
+							italic = "</i>"
+						else:
+							italic = "<i>"
+					while cell['content'].find("\_") != -1:
+						cell['content'] = cell['content'].rstrip().replace("\_", "_", 1)
+
+	# Correct newlines characters
 	for row in header_rows:
 		for cell in row:
-			while cell['content'].find("**") != -1:
-				cell['content'] = cell['content'].replace("**", bold, 1)
-				if bold == "<strong>":
-					bold = "</strong>"
-				else:
-					bold = "<strong>"
-	bold = "<strong>"
+			cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None
 	for row in data_rows:
 		for cell in row:
-			while cell['content'].find("**") != -1:
-				cell['content'] = cell['content'].replace("**", bold, 1)
-				if bold == "<strong>":
-					bold = "</strong>"
-				else:
-					bold = "<strong>"
+			cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None
 
 	# Checking that the grid is correct Not too much tested - need to take into account rowspan of previous rows
-
 	forward_rowspan = []
 	for row_index in range(len(header_rows)):
 		if len(forward_rowspan) == 0:
@@ -701,12 +822,7 @@ def parse_pandoc_table_with_spans(pandoc_table):
 				forward_rowspan[cell_index] = data_rows[row_index][cell_index]['rowspan'] - 1
 		if not sum == number_of_columns:
 			raise ValueError("Grid table not converted properly")
-	#if has_header:
-	#	table_with_spans = header_rows
-
-	#table_with_spans += data_rows
 
-	#return table_with_spans
 	return header_rows, data_rows
 
 def generate_html_table_with_spans(pandoc_table):
@@ -733,9 +849,23 @@ def generate_html_table_with_spans(pandoc_table):
 				if cell['rowspan'] == 0 or cell['colspan'] == 0:
 					continue
 				else:
+					# Prepare content, in case there's a list
+					print(cell['content'])
+					if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>",
+											 cell['content']):  # Update cell in new row
+						#print("MATCHING")
+						list = "<ul>"
+						# Build the list from the matches
+						for match in matches:
+							list += "<li>" + match[1] + "</li>"
+						list += "</ul>"
+						cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell['content'])
+					#else:
+					#	print("NOT MATCHING")
+
 					rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
 					colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
-					html += f"            <td{rowspan}{colspan}>{cell['content']}</td>\n"
+					html += f"            <th{rowspan}{colspan} {cell['alignment']}>{cell['content']}</th>\n"
 			html += "        </tr>\n"
 		html += "    </thead>\n"
 
@@ -746,9 +876,21 @@ def generate_html_table_with_spans(pandoc_table):
 			if cell['rowspan'] == 0 or cell['colspan'] == 0:
 				continue
 			else:
+				#Prepare content, in case there's a list
+				#print(cell['content'])
+				if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell['content']):  # Update cell in new row
+					#print("MATCHING")
+					list = "<ul>"
+					# Build the list from the matches
+					for match in matches:
+						list += "<li>" + match[1] + "</li>"
+					list += "</ul>"
+					cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell['content'])
+				#else:
+					#print("NOT MATCHING")
 				rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
 				colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
-				html += f"            <td{rowspan}{colspan}>{cell['content']}</td>\n"
+				html += f"            <td{rowspan}{colspan} {cell['alignment']}>{cell['content']}</td>\n"
 		html += "        </tr>\n"
 
 	html += "    </tbody>\n"
-- 
GitLab