From bc7807601c2098fdb9de22f53de43558cd16639e Mon Sep 17 00:00:00 2001
From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org>
Date: Thu, 21 Nov 2024 10:41:55 +0100
Subject: [PATCH] Support for grid tables and equations on mkdocs

---
 generateChangemarks/.gitlab-ci.yml |   4 +-
 toMkdocs/mkdocs.yml                |   6 +
 toMkdocs/toMkdocs.py               | 346 ++++++++++++++++++++++++++++-
 3 files changed, 350 insertions(+), 6 deletions(-)

diff --git a/generateChangemarks/.gitlab-ci.yml b/generateChangemarks/.gitlab-ci.yml
index d99c090..5e48e0e 100644
--- a/generateChangemarks/.gitlab-ci.yml
+++ b/generateChangemarks/.gitlab-ci.yml
@@ -197,12 +197,12 @@ pages:
      curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/toMkdocs%2Fstylesheets%2Fextra%2Ecss/raw?ref=master" >> extra.css
     - mkdir -p docs/stylesheets && mv extra.css docs/stylesheets/
     - |
-     curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/toMkdocs%2Fmkdocs%2Eyml/raw?ref=master" >> mkdocs.yml
+     curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/toMkdocs%2Fmkdocs%2Eyml/raw?ref=gridtables" >> mkdocs.yml
     - |
      curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/toMkdocs%2FindexDownload%2Emd/raw?ref=master" >> indexDownload.md
     - mkdir -p docs/download && mv indexDownload.md docs/download/index.md
     - |
-     curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/toMkdocs%2FtoMkdocs%2Epy/raw?ref=master" >> toMkdocs.py
+     curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/toMkdocs%2FtoMkdocs%2Epy/raw?ref=gridtables" >> toMkdocs.py
     - |
      export SPEC_NAME=$(ls | grep -E "(TS|TR|WI).*\.md" | cut -d'.' -f1)
     - |
diff --git a/toMkdocs/mkdocs.yml b/toMkdocs/mkdocs.yml
index c1c3ac8..dc5cf77 100644
--- a/toMkdocs/mkdocs.yml
+++ b/toMkdocs/mkdocs.yml
@@ -60,6 +60,8 @@ markdown_extensions:
       pygments_lang_class: true
   - pymdownx.inlinehilite
   - pymdownx.snippets
+  - pymdownx.arithmatex:
+      generic: true
   - pymdownx.superfences:
       custom_fences:
         - name: mermaid
@@ -69,6 +71,10 @@ markdown_extensions:
      alternate_style: true
   - tables
 
+extra_javascript:
+  - javascripts/mathjax.js
+  - https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js
+
 ##############################################################################
 
 extra:
diff --git a/toMkdocs/toMkdocs.py b/toMkdocs/toMkdocs.py
index 49778de..3908718 100644
--- a/toMkdocs/toMkdocs.py
+++ b/toMkdocs/toMkdocs.py
@@ -11,6 +11,7 @@ from enum import Enum, auto
 import argparse, re, os, shutil, hashlib, base64
 from dataclasses import dataclass
 from rich import print
+from html import escape
 
 verbose = False
 veryVerbose = False
@@ -418,6 +419,9 @@ _matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE)
 _matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECASE)
 _matchTable = re.compile(r'^\s*\|.*\|\s$', re.IGNORECASE)
 _matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE)
+_matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE)
+_matchGridTableBodySeparator = re.compile(r'.*\+([-:]+\+)+.*$', re.IGNORECASE)
+_matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE)
 _match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
 _markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)
 _htmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE)
@@ -447,6 +451,309 @@ def shortHash(value:str, length:int) -> str:
 				).digest()
 			 ).decode()[:length]
 
+def parse_pandoc_table_with_spans(pandoc_table):
+	"""
+	Parse a Pandoc-style grid table into a structure for HTML conversion with rowspan and colspan.
+
+	Every cell is a dict with the keys ``content`` (``"NOCONTENT"`` while unfilled), ``rowspan``,
+	``colspan``, ``colspan_adjusted`` and ``position``. A cell whose spans are still 0 was never filled.
+	In the cell contents every backslash becomes ``<br>`` and ``**`` markers become ``<strong>`` tags.
+
+	:param pandoc_table: String of the Pandoc-style grid table.
+	:return: Tuple ``(header_rows, data_rows)``: two lists of rows, each row being a list of cell dicts.
+		``header_rows`` is empty when the table has no header separator.
+	:raises ValueError: If no separator or row is found, a row is malformed or the spans do not add up.
+	"""
+	# Split the input into lines
+	lines = [line.strip() for line in pandoc_table.strip().split("\n")]
+
+	# Detect full separator lines by pattern (partial separators are handled row by row below)
+	_matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE)
+	def is_separator(line):
+		return _matchGridTableSeparator.match(line)
+
+	# Matches a cell of a partial separator line that consists only of '-' and ':'
+	_matchGridTableSeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
+	separator_indices = [i for i, line in enumerate(lines) if is_separator(line)]
+
+	if not separator_indices:
+		raise ValueError("No valid separators found in the provided Pandoc table.")
+
+	# Calculate max number of columns and the position of the closing '+' of each column
+	delimiter_positions = []
+	number_of_columns = 0
+	for separator_index in separator_indices:
+		if lines[separator_index].count("+") - 1 > number_of_columns:
+			number_of_columns = lines[separator_index].count("+") - 1
+			delimiter_positions = []
+			for j in range(number_of_columns):
+				delimiter_positions_start = delimiter_positions[j - 1] if j != 0 else 0
+				# str.find() returns -1 when there is no further '+'
+				delimiter_positions.append(lines[separator_index].find("+", delimiter_positions_start + 1))
+	# The last separator matching the header pattern splits header rows from data rows
+	has_header = False
+	for index in separator_indices:
+		if _matchGridTableHeaderSeparator.match(lines[index]):
+			has_header = True
+			header_separator_index = index
+	# Always defined so that tables without a header separator can be processed as well
+	header_rows = []
+	data_rows = []
+	# Every pair of consecutive separators delimits one table row. A partial separator inside that
+	# span opens a second row (auxiliar_row) for the cells that are not merged across both rows.
+	for row in range(len(separator_indices) - 1):
+		table_row = []
+		auxiliar_row = []
+		use_auxiliar_row = []
+		has_merged_cells = False
+		in_data_row = False
+		start, end = separator_indices[row], separator_indices[row + 1]
+		row_lines = lines[start:end]  # Lines between separators including separator line start as it gives information about the number of columns of the row
+		if row_lines:
+			# Combine multiline content into single strings for each cell
+			for line in row_lines:
+				if is_separator(line) and not in_data_row:
+					number_of_columns_row = line.count("+") - 1
+					in_data_row = True
+					parts = re.split(r"\s*\+\s*", line.strip("+"))
+					# Add as many cells as columns with span attributes
+					delimiter_index = 0
+					for i in range(number_of_columns_row):
+						delimiter_index += len(parts[i]) + 1
+						table_row.append({
+							"content": "NOCONTENT",
+							"rowspan": 0,
+							"colspan": 0,
+							"colspan_adjusted": False,
+							"position": delimiter_index # Position of cell delimiter +
+						})
+					# The auxiliary row always gets the maximum number of columns of the table
+					for i in range(number_of_columns):
+						auxiliar_row.append({
+							"content": "NOCONTENT",
+							"rowspan": 0,
+							"colspan": 0,
+							"colspan_adjusted": False,
+							"position": 0
+						})
+						use_auxiliar_row.append(False)
+
+				elif in_data_row:
+					# Regular data row or partial separator
+					if _matchGridTableBodySeparator.match(line): # Partial separator
+						has_merged_cells = True
+						cells = re.split(r"\s*[\|\+]\s*", line.strip("|").strip("+"))
+						if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
+							for i in range(len(cells)):
+								if _matchGridTableSeparatorLine.match(cells[i]):  # A new row is to be added
+									use_auxiliar_row[i] = True
+								else:
+									if table_row[i]['content'] == "NOCONTENT":
+										table_row[i]['rowspan'] += 1
+										table_row[i]['colspan'] += 1
+										table_row[i]['content'] = cells[i]
+									else:
+										table_row[i]['content'] += cells[i]
+									# Cell which is not separator
+									table_row[i]['rowspan'] += 1
+									if not table_row[i]['colspan_adjusted']:
+										table_row[i]['colspan_adjusted'] = True
+										for j in range(i, len(cells)):
+											delimiter_start = table_row[j-1]['position'] if j != 0 else 0
+											positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "|+" if delimiter in line[delimiter_start + 1:]]
+											position = min(positions) if positions else -1
+											if position > delimiter_positions[j]: # Colspan to add
+												table_row[i]['colspan'] += 1
+											elif position < delimiter_positions[j]:
+												raise ValueError("Wrong cell formatting")
+											else:
+												break
+						elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added
+							for i in range(len(cells)):
+								if _matchGridTableSeparatorLine.match(cells[i]):  # Update cell in new row
+									use_auxiliar_row[i] = True
+								else:
+									if table_row[i]['content'] == "NOCONTENT":
+										table_row[i]['rowspan'] += 1
+										table_row[i]['colspan'] += 1
+										table_row[i]['content'] = cells[i]
+									else:
+										table_row[i]['content'] += cells[i]
+									# Cell which is not separator
+									table_row[i]['rowspan'] += 1
+									# No colspan handling needed: the number of cells equals the number of columns
+
+						else:
+							raise ValueError("More cells than columns found")
+					else: # Data row
+						cells = re.split(r"\s*\|\s*", line.strip("|"))
+						if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
+							for i in range(len(cells)):
+								if table_row[i]['content'] == "NOCONTENT":
+									table_row[i]['rowspan'] += 1
+									table_row[i]['colspan'] += 1
+									table_row[i]['content'] = cells[i]
+								else:
+									table_row[i]['content'] += cells[i]
+								if not table_row[i]['colspan_adjusted']:
+									table_row[i]['colspan_adjusted'] = True
+									for j in range(i, len(cells)):
+										delimiter_start = table_row[j-1]['position'] if j != 0 else 0
+										position = line.find("|", delimiter_start + 1)
+										if position > delimiter_positions[j]: # Colspan to be increased
+											table_row[i]['colspan'] += 1
+										elif position < delimiter_positions[j]:
+											raise ValueError("Wrong cell formatting")
+										else:
+											break
+
+						elif len(cells) == number_of_columns: # Simple row
+							for i in range(len(cells)):
+								if use_auxiliar_row[i]:
+									if auxiliar_row[i]['content'] == "NOCONTENT":
+										auxiliar_row[i]['rowspan'] += 1
+										auxiliar_row[i]['colspan'] += 1
+										auxiliar_row[i]['content'] = cells[i]
+									else:
+										auxiliar_row[i]['content'] += cells[i]
+								else:
+									if table_row[i]['content'] == "NOCONTENT":
+										table_row[i]['rowspan'] += 1
+										table_row[i]['colspan'] += 1
+										table_row[i]['content'] = cells[i]
+									else:
+										table_row[i]['content'] += cells[i]
+						else:
+							raise ValueError("More cells than columns found")
+				else:
+					raise ValueError("No separator line found for row starting")
+
+			# Without a header separator every row is a data row (has_header guards the unbound index)
+			if not has_header or start >= header_separator_index: # table_row and auxiliar_row are part of data_rows
+				data_rows.append(table_row)
+				if has_merged_cells:
+					data_rows.append(auxiliar_row)
+			else: # table_row and auxiliar_row are part of header_rows
+				header_rows.append(table_row)
+				if has_merged_cells:
+					header_rows.append(auxiliar_row)
+
+	# Correct newlines characters: a backslash is treated as a line break marker
+	# NOTE(review): every backslash is replaced, also those that do not mark a line break
+	# (e.g. escapes or LaTeX commands inside a cell) - confirm that this is intended
+	for row in header_rows:
+		for cell in row:
+			cell['content'] = cell['content'].replace("\\", "<br>")
+	for row in data_rows:
+		for cell in row:
+			cell['content'] = cell['content'].replace("\\", "<br>")
+	# Check if there are any data rows
+	if not data_rows and not header_rows:
+		raise ValueError("No valid rows found in the provided Pandoc table.")
+
+	# Format text
+	# The open/close state is shared by all cells of a section, so an unbalanced ** spills into the next cells
+	bold = "<strong>"
+	for row in header_rows:
+		for cell in row:
+			while "**" in cell['content']:
+				cell['content'] = cell['content'].replace("**", bold, 1)
+				if bold == "<strong>":
+					bold = "</strong>"
+				else:
+					bold = "<strong>"
+	bold = "<strong>"
+	for row in data_rows:
+		for cell in row:
+			while "**" in cell['content']:
+				cell['content'] = cell['content'].replace("**", bold, 1)
+				if bold == "<strong>":
+					bold = "</strong>"
+				else:
+					bold = "<strong>"
+
+	# Checking that the grid is correct Not too much tested - need to take into account rowspan of previous rows
+	# For every row the colspans, plus one for each unfilled cell still covered by a rowspan from above
+	# (tracked in forward_rowspan), have to add up to the number of columns
+
+	forward_rowspan = []
+	for row_index, current_row in enumerate(header_rows):
+		if len(forward_rowspan) == 0:
+			forward_rowspan = [0 for _ in range(len(current_row))]
+		span_total = 0
+		for cell_index, cell in enumerate(current_row):
+			span_total += cell['colspan']
+			if row_index > 0 and cell['colspan'] == 0:
+				if forward_rowspan[cell_index] > 0:
+					span_total += 1
+				forward_rowspan[cell_index] -= 1
+			if forward_rowspan[cell_index] == 0 and cell['rowspan'] > 1:
+				forward_rowspan[cell_index] = cell['rowspan'] - 1
+		if span_total != number_of_columns:
+			raise ValueError("Grid table not converted properly")
+	forward_rowspan = []
+	for row_index, current_row in enumerate(data_rows):
+		if len(forward_rowspan) == 0:
+			forward_rowspan = [0 for _ in range(len(current_row))]
+		span_total = 0
+		for cell_index, cell in enumerate(current_row):
+			span_total += cell['colspan']
+			if row_index > 0 and cell['colspan'] == 0:
+				if forward_rowspan[cell_index] > 0:
+					span_total += 1
+				forward_rowspan[cell_index] -= 1
+			if forward_rowspan[cell_index] == 0 and cell['rowspan'] > 1:
+				forward_rowspan[cell_index] = cell['rowspan'] - 1
+		if span_total != number_of_columns:
+			raise ValueError("Grid table not converted properly")
+	# Header and data rows are returned separately (header_rows may be empty)
+	return header_rows, data_rows
+
+def generate_html_table_with_spans(pandoc_table):
+	"""
+	Render a Pandoc-style grid table as an HTML ``<table>`` with row and column spans.
+
+	The table is parsed by ``parse_pandoc_table_with_spans``. Cells whose ``rowspan`` or
+	``colspan`` is 0 are not rendered, and ``rowspan``/``colspan`` attributes are only written
+	for values greater than 1. A ``<thead>`` section is emitted only if at least one header
+	cell is rendered; the ``<tbody>`` section is always emitted.
+
+	:param pandoc_table: String of the Pandoc-style grid table.
+	:return: HTML string.
+	"""
+	def is_rendered(cell):
+		# Cells with a zero rowspan or colspan produce no <td>
+		return cell['rowspan'] != 0 and cell['colspan'] != 0
+
+	def render_rows(rows):
+		# One <tr> per row, one <td> per rendered cell
+		fragments = []
+		for row in rows:
+			fragments.append("        <tr>\n")
+			for cell in row:
+				if not is_rendered(cell):
+					continue
+				rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
+				colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
+				fragments.append(f"            <td{rowspan}{colspan}>{cell['content']}</td>\n")
+			fragments.append("        </tr>\n")
+		return "".join(fragments)
+
+	grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table)
+
+	html = "<table>\n"
+	# Header section, skipped when no header cell would be rendered
+	if any(is_rendered(cell) for row in grid_header for cell in row):
+		html += "    <thead>\n"
+		html += render_rows(grid_header)
+		html += "    </thead>\n"
+
+	# Body section
+	html += "    <tbody>\n"
+	html += render_rows(grid_body)
+	html += "    </tbody>\n"
+	html += "</table>"
+	return html
 
 def analyseMarkdown(filename:str) -> Document:
 	"""	Analyse the markdown file and split it into clauses.
@@ -473,6 +780,9 @@ def analyseMarkdown(filename:str) -> Document:
 	inCodefence = False
 	inTable = False
 	tableHasSeparator = False
+	inGridTable = False
+	gridTableHasSeparator = False
+	gridTable = ""
 	for line in inLines:
 
 		# Detect and handle codefences
@@ -493,7 +803,7 @@ def analyseMarkdown(filename:str) -> Document:
 			continue
 
 		# Detect and handle tables
-		if _matchTable.match(line) and not inTable:
+		if _matchTable.match(line) and not inTable and not inGridTable:
 			inTable = True
 			outClauses[-1].append(Line(line, LineType.TABLEHEADER))
 			continue
@@ -512,8 +822,36 @@ def analyseMarkdown(filename:str) -> Document:
 				outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW
 				# continue with other matches
 
+		#Detect grid tables and convert them to html table
+		if _matchGridTable.match(line) and not inGridTable:
+			inGridTable = True
+			#outClauses[-1].append(Line(line, LineType.TABLEHEADER))
+			gridTable += line
+			continue
+		if inGridTable:
+			if _matchGridTableHeaderSeparator.match(line) or _matchGridTableBodySeparator.match(line):
+				#outClauses[-1].append(Line(line, LineType.TABLESEPARATOR))
+				gridTable += line
+				continue
+			elif _matchTable.match(line):
+				#outClauses[-1].append(Line(line, LineType.TABLEROW))
+				gridTable += line
+				continue
+			else:
+				inGridTable = False
+				# Mark the previous line as the last row in the table
+				#outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW
+				print(gridTable)
+				htmltable = ""
+				htmltable = generate_html_table_with_spans(gridTable)
+				print(htmltable)
+				for row in htmltable:
+					outClauses[-1].append(Line(row, LineType.TABLEROW))
+				gridTable = ""
+		# continue with other matches
+
 		# Detect notes
-  		# Notes are lines that start with a '>'.
+		# Notes are lines that start with a '>'.
 		if _matchNote.match(line):
 			outClauses[-1].append(Line(line, LineType.NOTE))
 			continue
@@ -537,7 +875,7 @@ def analyseMarkdown(filename:str) -> Document:
 			clauseTitle = re.sub(_htmlTag, '', clauseTitle)
 			headerNumber = _matchHeaderNumber.search(clauseTitle)
 			outClauses.append(Clause(len(m.groups()[0]), # level
-						  		   headerNumber.group() if headerNumber else shortHash(clauseTitle, 6),
+								   headerNumber.group() if headerNumber else shortHash(clauseTitle, 6),
 								   clauseTitle, 
 								   []))
 			_lineType = LineType.HEADING
@@ -591,7 +929,7 @@ def processDocument(args:argparse.Namespace) -> None:
 
 if __name__ == '__main__':
 	parser = argparse.ArgumentParser(description = 'Convert oneM2M markdown specificatios to MkDocs format',
-								     formatter_class = argparse.ArgumentDefaultsHelpFormatter)
+									 formatter_class = argparse.ArgumentDefaultsHelpFormatter)
 
 	parser.add_argument('--verbose', '-v', action = 'store_true', help = 'verbose output during processing')
 	parser.add_argument('--very-verbose', '-vv', action = 'store_true', help = 'very verbose output during processing')
-- 
GitLab