From 4bfb77b766d333ac67734eba80fb83aca3b120b6 Mon Sep 17 00:00:00 2001
From: ankraft <an.kraft@gmail.com>
Date: Mon, 7 Apr 2025 13:10:23 +0200
Subject: [PATCH] Improved support for handling grid tables (reduced width when
 possible, support rowspans). However, rogue empty cells cannot always be
 detected.

---
 gridTable.py | 146 +++++++++++++++++++++++++++++++++++++++++----------
 spec2md.py   |  76 ++++++++++++++++++---------
 2 files changed, 168 insertions(+), 54 deletions(-)

diff --git a/gridTable.py b/gridTable.py
index c859bef..35d7f5d 100644
--- a/gridTable.py
+++ b/gridTable.py
@@ -9,6 +9,7 @@
 import re
 
 colspanMarker = '~~COLSPAN~~'
+rowspanMarker = '~~ROWSPAN~~'
 
 def markdownToGrid(markdownLines:list[str]) -> list[str]:
 	"""	Convert a markdown table to a grid table. 
@@ -34,7 +35,6 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]:
 		for line in markdownLines
 	]
 
-
 	# Get maximum width for each column 
 	colWidths = []
 	maxCols = max(len(row) for row in rows)
@@ -45,12 +45,11 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]:
 
 	# Process merged cells - combine content with previous cell
 	for row in rows:
-		for i in range(len(row)-1, 0, -1): # Work backwards to avoid index issues
+		for i in range(len(row)-1, -1, -1): # Work backwards to avoid index issues
 			if row[i].strip() == colspanMarker:
 				row[i-1] = row[i-1] + ' '*(colWidths[i-1] - len(row[i-1]))+ ' '*(colWidths[i]+3) # Merge with empty content
 				# row[i] = None 	# type:ignore[call-overload] # Indicate removal
 
-	
 	# Pad any rows that are too short
 	for row in rows:
 		while len(row) < maxCols:
@@ -64,19 +63,41 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]:
 	
 	# Header row
 	result.append('|' + '|'.join(
-		f' {rows[0][i]:<{colWidths[i]}} ' for i in range(len(rows[0])) if rows[0][i] is not None
+		f'{rows[0][i]:<{colWidths[i]}}' for i in range(len(rows[0])) if rows[0][i] is not None
 	) + '|')
 	
 	# Header separator
 	result.append('+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+')
 	
 	# Data rows
-	for row in rows[2:]:
+	for rowIndex, row in enumerate(rows[2:]):
+
+		# The following code detects if cells in the next row have rowspan marker(s)
+		# If so, it will merge the cells with the current one and remove the rowspan marker
+		# from that cell
+		nextRowCellsMerged:list[bool] = []
+
+		if rowIndex < len(rows)-3:
+			for cellIndex, cell in enumerate(rows[rowIndex+3]):
+				if cell.strip() == rowspanMarker:
+					nextRowCellsMerged.append(True)
+					rows[rowIndex+3][cellIndex] = cell.replace(rowspanMarker, ' '*len(rowspanMarker))
+				else:	
+					nextRowCellsMerged.append(False)
+			# nextRowCellsMerged = [ cell.strip() == rowspanMarker for cell in rows[rowIndex+3] ]
+		else:
+			nextRowCellsMerged = [ False for _ in rows[rowIndex+2] ]
+
 		result.append('|' + '|'.join(
-			f' {row[i]:<{colWidths[i]}} ' for i in range(len(row)) if row[i] is not None
+			f'{row[i]:<{colWidths[i]}}' 
+				if row[i] != rowspanMarker else '' 
+				for i in range(len(row)) 
+				if row[i] is not None
 		) + '|')
-		result.append('+' + '+'.join('-' * (w + 2) for w in colWidths) + '+')
-	
+
+		# Add separator line, if not merged
+		result.append('+' + '+'.join('-' * (w + 2) if not nextRowCellsMerged[cellIndex] else ' ' * (w + 2)
+							   for cellIndex, w in enumerate(colWidths)) + '+')
 	return result
 
 
@@ -90,6 +111,49 @@ def formatGridTable(lines: list[str]) -> list[str]:
 	Returns:
 		Formatted grid table as list of strings
 	"""
+
+	def _getCellsFromRow(row:str) -> list[str]:
+		"""Helper function to extract cells from a row.
+
+			This is done by splitting the row string by the '|' character
+			and returning the cells as a list. The first and last elements
+			are ignored as they are empty strings.
+
+			Args:
+				row: The row string to split.
+
+			Returns:
+				A list of cells extracted from the row.
+		"""
+		return row.strip().split('|')[1:-1]
+	
+
+	def _guessColumnWidth(columnID:int) -> int:
+		"""Helper function to guess the width of a column.
+
+			This is done by checking the content of the cells in the column
+			and returning the maximum width found. This value may not be 
+			accurate if the column contains merged cells, but it is a good
+			approximation.
+
+			Args:
+				columnID: The column ID to check.
+
+			Returns:
+				The guessed width of the column.
+		"""
+		width = 0
+		for row in lines:
+			if row.startswith('|'):
+				rowCells = _getCellsFromRow(row)
+				if columnID < len(rowCells):
+					cellLines = rowCells[columnID].rstrip().split('\\\n')
+					for line in cellLines:
+						if line != colspanMarker:
+							width = max(width, len(line.rstrip()))
+		return width
+
+	
 	if not lines or len(lines) < 3:
 		return lines
 
@@ -101,55 +165,81 @@ def formatGridTable(lines: list[str]) -> list[str]:
 	for row in lines:
 		if row.startswith('|'):
 			# Split cells and get their lengths
-			rowCells = row.strip().split('|')[1:-1]
+			rowCells = _getCellsFromRow(row)
 			for i, cell in enumerate(rowCells):
 				if i >= len(colWidths):
 					continue
 				# Calculate maximum width of each line in the cell. Lines could be multilines, so we need to split them.
-				cellLines = cell.strip().split('\\\n')
-				cellWidth = max(len(line.strip()) if line != colspanMarker else 0
+				cellLines = cell.rstrip().split('\\\n')
+				requiredCellWidth = max(len(line.rstrip()) if line != colspanMarker else 0
 								for line in cellLines)
-				if cellWidth > colWidths[i]:
-					colWidths[i] = cellWidth
+
+				if requiredCellWidth > colWidths[i]:
+					# Check if the next cell or cells are colspan markers
+					# If so, then sum the widths of the current and next cells and increase the width
+					# only if the required size is still bigger than the current one
+					# Check for colspan markers
+					overAllCellWidth = colWidths[i]
+					nextIdx = i + 1
+					while nextIdx < len(rowCells) and rowCells[nextIdx].strip() == colspanMarker:
+						cw = colWidths[nextIdx]
+						if cw == 0:
+							cw = _guessColumnWidth(nextIdx)
+						overAllCellWidth += cw
+						nextIdx += 1
+					if requiredCellWidth > overAllCellWidth:
+						# Increase the width of the current cell
+						colWidths[i] += requiredCellWidth-overAllCellWidth
+
 
 	# Process each line
 	for line in lines:
-		if line.startswith('+-'):
+		# Normal separator line can either start with '+ ' or '+-'
+		if line.startswith('+-') or line.startswith('+ '):	
+			# Get the kind of row separator for each column
+			_originalSeparator = [ l[0] for l in line.split('+')[1:-1] ]
 			# Separator line - rebuild with correct column widths
-			result.append('+' + '+'.join('-' * (w + 2) for w in colWidths) + '+')
+			result.append('+' + '+'.join(_originalSeparator[colIndex] * (w) 
+							   			 for colIndex, w in enumerate(colWidths)
+							   			 if colWidths[colIndex] > 0  ) + '+')
 			continue
 		elif line.startswith('+='):
 			# Separator line - rebuild with correct column widths
-			result.append('+' + '+'.join('=' * (w + 2) for w in colWidths) + '+')
+			result.append('+' + '+'.join('=' * (w) 
+										 for colIndex, w in enumerate(colWidths)
+										 if colWidths[colIndex] > 0 ) + '+')
 			continue
 		elif line.startswith('+:='):
 			# Separator line - rebuild with correct column widths
 			# ATTN: This is a special casse. It assumes that all columns are left-aligned.
-			result.append('+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+')
+			result.append('+:' + '+:'.join('=' * (w-1) 
+								  		   for colIndex, w in enumerate(colWidths)
+										   if colWidths[colIndex] > 0 ) + '+')
 			continue
 
+
 		elif line.startswith('|'):
 			# Content line
-			cells = line.strip().split('|')[1:-1]
+			cells = line.rstrip().split('|')[1:-1]
 			formattedCells = []
 			i = 0
 			while i < len(cells):
-				cell = cells[i].strip()
-				if cell == colspanMarker:
+				cell = cells[i].rstrip()
+				if cell.strip() == colspanMarker:
 					# Skip merged cells - they were handled with previous cell
 					i += 1
 					continue
-				
+
 				# Calculate width for potentially merged cells
 				width = colWidths[i]
 				nextIdx = i + 1
 				while nextIdx < len(cells) and cells[nextIdx].strip() == colspanMarker:
-					width += colWidths[nextIdx] + 3  # +3 for the cell borders
+					width += colWidths[nextIdx] + 1
 					nextIdx += 1
-				
+
 				# Format the cell content
-				formattedCells.append(f' {cell:<{width}} ')
-				i += 1
+				formattedCells.append(f'{cell:<{width}}')
+				i = nextIdx
 
 			result.append('|' + '|'.join(formattedCells) + '|')
 
@@ -204,10 +294,10 @@ def handleMultiLineGridTable(lines: list[str]) -> list[str]:
 						else:
 							# Use the part if available, otherwise empty string
 							text = cellParts[line_idx] if line_idx < len(cellParts) else ''
-						newCells.append(text.strip())
-					new_line = '|' + '|'.join(f' {cell} ' for cell in newCells) + '|'
+						newCells.append(text.rstrip())
+					newLine = '|' + '|'.join(f'{cell}' for cell in newCells) + '|'
 					# Store with original line index as key
-					rowLines[i] = rowLines.get(i, []) + [new_line]
+					rowLines[i] = rowLines.get(i, []) + [newLine]
 			else:
 				# No line breaks, keep original line
 				rowLines[i] = [line]
diff --git a/spec2md.py b/spec2md.py
index a1bcac8..c2f9d58 100644
--- a/spec2md.py
+++ b/spec2md.py
@@ -28,7 +28,8 @@ from rich import inspect
 import configparser, zipfile
 from lxml import etree as ET
 
-from gridTable import markdownToGrid, isGridTableStart, handleMultiLineGridTable, formatGridTable, colspanMarker
+from gridTable import markdownToGrid, isGridTableStart, handleMultiLineGridTable, \
+	formatGridTable, colspanMarker, rowspanMarker
 
 class Style(IntEnum):
 	code = auto()
@@ -84,6 +85,7 @@ _print:Callable = print
 
 # Some predefined tags and attributes
 wns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
+w14ns = 'http://schemas.microsoft.com/office/word/2010/wordml'
 _val = f'{{{wns}}}val'
 
 class SectionNumbers(object):
@@ -352,12 +354,12 @@ def processDocuments(documents:list[str],
 							 'footnoteRef',
 							 'annotationRef',
 			)
-			
+			newParagraphs = 0
 
 			def _parseXML(element:ET.Element, inCell:Optional[bool] = False) -> str:
 				"""	Recursively parse a document paragraph.
 				"""
-				nonlocal _ignoredTags
+				nonlocal _ignoredTags, newParagraphs
 
 				_result = ''
 				tag = strippedTag(element.tag)	# remove namespaces for easier handlings
@@ -394,7 +396,7 @@ def processDocuments(documents:list[str],
 
 					case 'br':
 						_result += _linebreak
-						
+					
 					case 'bookmarkStart' | 'bookmarkEnd':		# TODO ?
 						pass
 
@@ -497,14 +499,16 @@ def processDocuments(documents:list[str],
 			# _print(ET.fromstring(elem._p.xml))
 			match elem:
 				case Paragraph():	# type: ignore[misc]
-					return _parseXML(ET.fromstring(elem._p.xml))
+					return _parseXML(ET.fromstring(elem._p.xml)).rstrip()
 				case _Cell():		# type: ignore[misc]
 					# Iterate over all paragraphs in the cell and parse them
 					# Create a list of parsed paragraphs and join them with linebreaks
-					return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip() 
+					return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip()
 										   for p in elem.paragraphs ])
 				case ET._Element():
-					return _parseXML(elem)
+					# return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip()
+					# 					   for p in elem.paragraphs ])
+					return '<br />'.join([ _parseXML(elem).rstrip()])
 				case _:
 					return ''
 
@@ -769,23 +773,45 @@ def processDocuments(documents:list[str],
 						nrRows = 0
 						colSpanDetected = False
 						for row in elem.rows:
+							_row = ET.fromstring(row._tr.xml)
 							cells:list[str] = []
-							colspanCounter = 0
-							for cell in row.cells:
+							for cell in _row.findall('.//w:tc', namespaces = { 'w' : wns }):
+
+								colspanCounter = 1  # Default value if no gridspan is specified
+								gridspanElem = cell.find('.//w:tcPr/w:gridSpan', namespaces={'w': wns})
+								if gridspanElem is not None and _val in gridspanElem.attrib:
+									colspanCounter = int(gridspanElem.attrib[_val])
+									colSpanDetected = True  # Set flag that colspan was found
+
+								# Vertical merge
+								gridspanElem = cell.find('.//w:tcPr/w:vMerge', namespaces={'w': wns})
+								if gridspanElem is not None and _val not in gridspanElem.attrib:
+									cells.append(rowspanMarker)
+								
+								else:
+
+									# Extract text from cell
+									# Find all paragraphs in the cell
+									_pl:list[str] = []
+									for p in cell.findall('.//w:p', namespaces={'w': wns}):
+										_pl.append(getTextFromXML(p))
+									# Add the text to the cell
+									if len(_pl) > 0:
+										cells.append(_linebreak.join(_pl))
+									else:
+										cells.append('')
+
+								# Handle colspan formatting
 								if not forceMarkdownTables:
-									if colspanCounter > 0:
-										cells.append(colspanMarker)	# add at least a space
+									if colspanCounter >= 1:
+										for _ in range(colspanCounter-1):
+											cells.append(colspanMarker)
 										colspanCounter -= 1
-										continue
-									if cell._tc.grid_span > 1:
-										colSpanDetected = True
-										colspanCounter = cell._tc.grid_span - 1
-								elif cell._tc.grid_span > 1:
-									colSpanDetected = True
-								cells.append(f'{getTextFromXML(cell)} ')	# add at least a space
 							rows.append(cells)
 							nrRows += 1
-						
+
+						# for r in rows:
+						# 	_print(r)
 						
 						# Warning if this is a single-row table
 						if nrRows == 1:
@@ -850,8 +876,6 @@ def processDocuments(documents:list[str],
 							line = line.replace(ch, f'<mark>Non-ASCII character {ch} / {hex(ord(ch))}</mark>')
 							lines[i] = line
 
-		
-
 			#
 			#	Remove multiple bold / italics on/off occurances
 			#	Sometimes word doesn't remove empty bold-on/bold-off (or italics) indicatros
@@ -861,6 +885,9 @@ def processDocuments(documents:list[str],
 				line = lines[i]
 				line = line.replace('__', '')
 				line = line.replace('****', '')
+				line = line.replace('**  ', '** ')
+				line = line.replace('_  ', '_ ')
+				line = line.replace('** **', ' ')
 				#line = line.replace('  ', ' ')
 				lines[i] = line
 
@@ -966,9 +993,6 @@ def processDocuments(documents:list[str],
 				for fid, text in footnotes.items():
 					lines.append(f'[^{fid}]: {text}')
 
-			#
-			#	List unresolved CAPTION markers
-			#
 			#
 			#	List unresolved CAPTION markers
 			#
@@ -976,11 +1000,10 @@ def processDocuments(documents:list[str],
 				if _captionMarker in line:
 					_print(f'[yellow]({linenumber(i)}) Unresolved / unreferenced figure caption: "{line}"[/yellow]')
 			
-
+			
 			#
 			#	Correct formatting of Grid tables after all other changes have been applied
 			#
-
 			if not forceMarkdownTables:
 				gridTable:list[str] = []
 				result:list[str] = []
@@ -1004,6 +1027,7 @@ def processDocuments(documents:list[str],
 					# not in grid table
 					result.append(line)
 				lines = result
+
 				
 			
 			#
-- 
GitLab