Support for grid tables and equations on mkdocs

bc780760 · Miguel Angel Reina Ortega · fd0dfa13 · bc780760 · bc780760 · bc780760
Commit bc780760 authored 7 months ago by Miguel Angel Reina Ortega
--- a/generateChangemarks/.gitlab-ci.yml
+++ b/generateChangemarks/.gitlab-ci.yml
@@ -197,12 +197,12 @@ pages:
     curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/toMkdocs%2Fstylesheets%2Fextra%2Ecss/raw?ref=master" >> extra.css
    - mkdir -p docs/stylesheets && mv extra.css docs/stylesheets/
    - |
-     curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/toMkdocs%2Fmkdocs%2Eyml/raw?ref=master" >> mkdocs.yml
+     curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/toMkdocs%2Fmkdocs%2Eyml/raw?ref=gridtables" >> mkdocs.yml
    - |
     curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/toMkdocs%2FindexDownload%2Emd/raw?ref=master" >> indexDownload.md
    - mkdir -p docs/download && mv indexDownload.md docs/download/index.md
    - |
-     curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/toMkdocs%2FtoMkdocs%2Epy/raw?ref=master" >> toMkdocs.py
+     curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/toMkdocs%2FtoMkdocs%2Epy/raw?ref=gridtables" >> toMkdocs.py
    - |
     export SPEC_NAME=$(ls | grep -E "(TS|TR|WI).*\.md" | cut -d'.' -f1)
    - |

--- a/toMkdocs/mkdocs.yml
+++ b/toMkdocs/mkdocs.yml
@@ -60,6 +60,8 @@ markdown_extensions:
      pygments_lang_class: true
  - pymdownx.inlinehilite
  - pymdownx.snippets
+  - pymdownx.arithmatex:
+      generic: true
  - pymdownx.superfences:
      custom_fences:
        - name: mermaid
@@ -69,6 +71,10 @@ markdown_extensions:
     alternate_style: true
  - tables

+extra_javascript:
+  - javascripts/mathjax.js
+  - https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js
+
 ##############################################################################

 extra:

--- a/toMkdocs/toMkdocs.py
+++ b/toMkdocs/toMkdocs.py
@@ -11,6 +11,7 @@ from enum import Enum, auto
 import argparse, re, os, shutil, hashlib, base64
 from dataclasses import dataclass
 from rich import print
+from html import escape

 verbose = False
 veryVerbose = False
@@ -418,6 +419,9 @@ _matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE)
 _matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECASE)
 _matchTable = re.compile(r'^\s*\|.*\|\s$', re.IGNORECASE)
 _matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE)
+_matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE)  # opening border of a grid table: optional indent, starts with "+-", ends with "+" followed by one whitespace character
+_matchGridTableBodySeparator = re.compile(r'.*\+([-:]+\+)+.*$', re.IGNORECASE)  # any line containing a "+---+" run of dashes/colons closed by "+" (full or partial body separator)
+_matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE)  # any line containing a "+===+" run of equal signs/colons closed by "+" (header separator)
 _match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
 _markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)
 _htmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE)
@@ -447,6 +451,309 @@ def shortHash(value:str, length:int) -> str:
 				).digest()
 			 ).decode()[:length]

+def parse_pandoc_table_with_spans(pandoc_table):
+	"""
+	Parse a Pandoc-style grid table into a structure for HTML conversion with rowspan and colspan.
+
+	:param pandoc_table: String of the Pandoc-style grid table.
+	:return: List of lists representing the table with metadata for spans.
+	"""
+	# Split the input into lines
+	lines = [line.strip() for line in pandoc_table.strip().split("\n")]
+
+	# Detect separator lines by pattern (it does not take into account partial separators
+	def is_separator(line):
+		_matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE)
+		return _matchGridTableSeparator.match(line)
+
+	_matchGridTableSeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
+	separator_indices = [i for i, line in enumerate(lines) if is_separator(line)]
+
+	print(separator_indices)
+	if not separator_indices:
+		raise ValueError("No valid separators found in the provided Pandoc table.")
+
+	# Calculate max number of columns
+	delimiter_positions = []
+	number_of_columns = 0
+	for separator_index in separator_indices:
+		if lines[separator_index].count("+") - 1 > number_of_columns:
+			number_of_columns = lines[separator_index].count("+") - 1
+			delimiter_positions = []
+			for j in range(number_of_columns):
+				delimiter_positions_start = delimiter_positions[j - 1] if j != 0 else 0
+				del_positions = [lines[separator_index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[separator_index][delimiter_positions_start + 1:]]
+				delimiter_positions.append(min(del_positions) if del_positions else -1)
+	has_header = False
+	for index in separator_indices:
+		if _matchGridTableHeaderSeparator.match(lines[index]):
+			has_header = True
+			header_separator_index = index
+			header_rows = []
+	data_rows = []
+	for row in range(len(separator_indices) - 1):
+		table_row = []
+		auxiliar_row = []
+		use_auxiliar_row = []
+		has_merged_cells = False
+		in_data_row = False
+		start, end = separator_indices[row], separator_indices[row + 1]
+		row_lines = lines[start:end]  # Lines between separators including separator line start as it gives information about the number of columns of the row
+		if row_lines:
+			# Combine multiline content into single strings for each cell
+			for line in row_lines:
+				if is_separator(line) and not in_data_row:
+					number_of_columns_row = line.count("+") - 1
+					in_data_row = True
+					parts = re.split(r"\s*\+\s*", line.strip("+"))
+					# Add as many cells as columns with span attributes
+					delimiter_index = 0
+					for i in range(number_of_columns_row):
+						delimiter_index += len(parts[i]) + 1
+						table_row.append({
+							"content": "NOCONTENT",
+							"rowspan": 0,
+							"colspan": 0,
+							"colspan_adjusted": False,
+							"position": delimiter_index # Position of cell delimiter +
+						})
+					for i in range(number_of_columns):
+						auxiliar_row.append({
+							"content": "NOCONTENT",
+							"rowspan": 0,
+							"colspan": 0,
+							"colspan_adjusted": False,
+							"position": 0
+						})
+						use_auxiliar_row.append(False)
+
+				elif in_data_row:
+					# Regular data row or partial separator
+					if _matchGridTableBodySeparator.match(line): # Partial separator
+						has_merged_cells = True
+						cells = re.split(r"\s*[\|\+]\s*", line.strip("|").strip("+")) # (?<!\\)[\|\+]
+						if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
+							for i in range(len(cells)):
+								if _matchGridTableSeparatorLine.match(cells[i]):  # A new row is to be added
+									use_auxiliar_row[i] = True
+								else:
+									if table_row[i]['content'] == "NOCONTENT":
+										table_row[i]['rowspan'] += 1
+										table_row[i]['colspan'] += 1
+										table_row[i]['content'] = cells[i]
+									else:
+										table_row[i]['content'] += cells[i]
+									# Cell which is not separator
+									table_row[i]['rowspan'] += 1
+									if not table_row[i]['colspan_adjusted']:
+										table_row[i]['colspan_adjusted'] = True
+										for j in range(i, len(cells)):
+											delimiter_start = table_row[j-1]['position'] if j != 0 else 0
+											positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "|+" if delimiter in line[delimiter_start + 1:]]
+											position = min(positions) if positions else -1
+											if position > delimiter_positions_start[j]: # Colspan to add
+												table_row[i]['colspan'] += 1
+											elif position < delimiter_positions_start[j]:
+												raise ValueError("Wrong cell formatting")
+											else:
+												break
+						elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added
+							for i in range(len(cells)):
+								if _matchGridTableSeparatorLine.match(cells[i]):  # Update cell in new row
+									use_auxiliar_row[i] = True
+								else:
+									if table_row[i]['content'] == "NOCONTENT":
+										table_row[i]['rowspan'] += 1
+										table_row[i]['colspan'] += 1
+										table_row[i]['content'] = cells[i]
+									else:
+										table_row[i]['content'] += cells[i]
+									# Cell which is not separator
+									table_row[i]['rowspan'] += 1
+									# Not needed, no colspan as number of cells is equal to number of columns
+									#for j in range(i, len(cells)):
+									#	delimiter_start = table_row[j-1]['position'] if j != 0 else 0
+									#	positions = [line.find(delimiter,delimiter_start+1) for delimiter in "|+" if delimiter in line[delimiter_start+1:]]
+									#	position = min(positions) if positions else -1
+									#	if position > table_row[i]['position']:  # Only colspan to be increased
+									#		table_row[i]['colspan'] += 1
+									#	elif position + 1  < table_row[i]['position']:
+									#		raise ValueError("Wrong cell formatting")
+									#	else:
+									#		break
+
+						else:
+							raise ValueError("More cells than columns found")
+					else: # Data row
+						cells = re.split(r"\s*\|\s*", line.strip("|"))
+						if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
+							for i in range(len(cells)):
+								if table_row[i]['content'] == "NOCONTENT":
+									table_row[i]['rowspan'] += 1
+									table_row[i]['colspan'] += 1
+									table_row[i]['content'] = cells[i]
+								else:
+									table_row[i]['content'] += cells[i]
+								if not table_row[i]['colspan_adjusted']:
+									table_row[i]['colspan_adjusted'] = True
+									for j in range(i, len(cells)):
+										delimiter_start = table_row[j-1]['position'] if j != 0 else 0
+										if line.find("|", delimiter_start+1) > delimiter_positions[j]: # Colspan to be increased
+											table_row[i]['colspan'] += 1
+										elif line.find("|", delimiter_start+1) < delimiter_positions[j]:
+											raise ValueError("Wrong cell formatting")
+										else:
+
+											break
+
+						elif len(cells) == number_of_columns: # Simple row
+							for i in range(len(cells)):
+								if use_auxiliar_row[i]:
+									if auxiliar_row[i]['content'] == "NOCONTENT":
+										auxiliar_row[i]['rowspan'] += 1
+										auxiliar_row[i]['colspan'] += 1
+										auxiliar_row[i]['content'] = cells[i]
+									else:
+										auxiliar_row[i]['content'] += cells[i]
+								else:
+									if table_row[i]['content'] == "NOCONTENT":
+										table_row[i]['rowspan'] += 1
+										table_row[i]['colspan'] += 1
+										table_row[i]['content'] = cells[i]
+									else:
+										table_row[i]['content'] += cells[i]
+						else:
+							raise ValueError("More cells than columns found")
+				else:
+					raise ValueError("No separator line found for row starting")
+
+			if has_header and start >= header_separator_index: # table_row and auxiliar_row are part of data_rows
+				data_rows.append(table_row)
+				if has_merged_cells:
+					data_rows.append(auxiliar_row)
+			elif has_header and start < header_separator_index: # table_row and auxiliar_row are part of header_rows
+				header_rows.append(table_row)
+				if has_merged_cells:
+					header_rows.append(auxiliar_row)
+
+	#print(header_rows)
+	#print(data_rows)
+	# Correct newlines characters
+	for row in header_rows:
+		for cell in row:
+			cell['content'] = cell['content'].replace("\\", "<br>")
+	for row in data_rows:
+		for cell in row:
+			cell['content'] = cell['content'].replace("\\", "<br>")
+	# Check if there are any data rows
+	if not data_rows and not header_rows:
+		raise ValueError("No valid rows found in the provided Pandoc table.")
+
+	# Format text
+	bold = "<strong>"
+	for row in header_rows:
+		for cell in row:
+			while cell['content'].find("**") != -1:
+				cell['content'] = cell['content'].replace("**", bold, 1)
+				if bold == "<strong>":
+					bold = "</strong>"
+				else:
+					bold = "<strong>"
+	bold = "<strong>"
+	for row in data_rows:
+		for cell in row:
+			while cell['content'].find("**") != -1:
+				cell['content'] = cell['content'].replace("**", bold, 1)
+				if bold == "<strong>":
+					bold = "</strong>"
+				else:
+					bold = "<strong>"
+
+	# Checking that the grid is correct Not too much tested - need to take into account rowspan of previous rows
+
+	forward_rowspan = []
+	for row_index in range(len(header_rows)):
+		if len(forward_rowspan) == 0:
+			forward_rowspan = [0 for _ in range(len(header_rows[row_index]))]
+		sum = 0
+		for cell_index in range(len(header_rows[row_index])):
+			sum += header_rows[row_index][cell_index]['colspan']
+			if row_index > 0 and header_rows[row_index][cell_index]['colspan'] == 0:
+				if forward_rowspan[cell_index] > 0:
+					sum += 1
+				forward_rowspan[cell_index] -= 1
+			if forward_rowspan[cell_index] == 0 and header_rows[row_index][cell_index]['rowspan'] > 1:
+				forward_rowspan[cell_index] = header_rows[row_index][cell_index]['rowspan'] -1
+		if not sum == number_of_columns:
+			raise ValueError("Grid table not converted properly")
+	forward_rowspan = []
+	for row_index in range(len(data_rows)):
+		if len(forward_rowspan) == 0:
+			forward_rowspan = [0 for _ in range(len(data_rows[row_index]))]
+		sum = 0
+		for cell_index in range(len(data_rows[row_index])):
+			sum += data_rows[row_index][cell_index]['colspan']
+			if row_index > 0 and data_rows[row_index][cell_index]['colspan'] == 0:
+				if forward_rowspan[cell_index] > 0:
+					sum += 1
+				forward_rowspan[cell_index] -= 1
+			if forward_rowspan[cell_index] == 0 and data_rows[row_index][cell_index]['rowspan'] > 1:
+				forward_rowspan[cell_index] = data_rows[row_index][cell_index]['rowspan'] - 1
+		if not sum == number_of_columns:
+			raise ValueError("Grid table not converted properly")
+	#if has_header:
+	#	table_with_spans = header_rows
+
+	#table_with_spans += data_rows
+
+	#return table_with_spans
+	return header_rows, data_rows
+
+def generate_html_table_with_spans(pandoc_table):
+	"""
+	Generate an HTML table from a Pandoc-style grid table with row and column spans.
+
+	:param pandoc_table: String of the Pandoc-style grid table.
+	:return: HTML string.
+	"""
+	grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table)
+
+	html = "<table>\n"
+	has_header = False
+
+	for row in grid_header:
+		for cell in row:
+			if cell['rowspan'] != 0 and cell['colspan'] != 0:
+				has_header = True
+	if has_header:
+		html += "    <thead>\n"
+		for row in grid_header:
+			html += "        <tr>\n"
+			for cell in row:
+				if cell['rowspan'] == 0 or cell['colspan'] == 0:
+					continue
+				else:
+					rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
+					colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
+					html += f"            <td{rowspan}{colspan}>{cell['content']}</td>\n"
+			html += "        </tr>\n"
+		html += "    </thead>\n"
+
+	html += "    <tbody>\n"
+	for row in grid_body:
+		html += "        <tr>\n"
+		for cell in row:
+			if cell['rowspan'] == 0 or cell['colspan'] == 0:
+				continue
+			else:
+				rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
+				colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
+				html += f"            <td{rowspan}{colspan}>{cell['content']}</td>\n"
+		html += "        </tr>\n"
+
+	html += "    </tbody>\n"
+	html += "</table>"
+	return html

 def analyseMarkdown(filename:str) -> Document:
 	"""	Analyse the markdown file and split it into clauses.
@@ -473,6 +780,9 @@ def analyseMarkdown(filename:str) -> Document:
 	inCodefence = False
 	inTable = False
 	tableHasSeparator = False
+	inGridTable = False
+	gridTableHasSeparator = False
+	gridTable = ""
 	for line in inLines:

 		# Detect and handle codefences
@@ -493,7 +803,7 @@ def analyseMarkdown(filename:str) -> Document:
 			continue

 		# Detect and handle tables
-		if _matchTable.match(line) and not inTable:
+		if _matchTable.match(line) and not inTable and not inGridTable:
 			inTable = True
 			outClauses[-1].append(Line(line, LineType.TABLEHEADER))
 			continue
@@ -512,6 +822,34 @@ def analyseMarkdown(filename:str) -> Document:
 				outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW
 				# continue with other matches

+		#Detect grid tables and convert them to html table
+		if _matchGridTable.match(line) and not inGridTable:
+			inGridTable = True
+			#outClauses[-1].append(Line(line, LineType.TABLEHEADER))
+			gridTable += line
+			continue
+		if inGridTable:
+			if _matchGridTableHeaderSeparator.match(line) or _matchGridTableBodySeparator.match(line):
+				#outClauses[-1].append(Line(line, LineType.TABLESEPARATOR))
+				gridTable += line
+				continue
+			elif _matchTable.match(line):
+				#outClauses[-1].append(Line(line, LineType.TABLEROW))
+				gridTable += line
+				continue
+			else:
+				inGridTable = False
+				# Mark the previous line as the last row in the table
+				#outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW
+				print(gridTable)
+				htmltable = ""
+				htmltable = generate_html_table_with_spans(gridTable)
+				print(htmltable)
+				for row in htmltable:
+					outClauses[-1].append(Line(row, LineType.TABLEROW))
+				gridTable = ""
+		# continue with other matches
+
 		# Detect notes
 		# Notes are lines that start with a '>'.
 		if _matchNote.match(line):