Compare revisions

Andreas Kraft · Andreas Kraft · 59f07be5 · 59f07be5 · 59f07be5 · 59f07be5
--- a/README.md
+++ b/README.md
@@ -20,6 +20,32 @@ python3 -m pip install -r requirements.txt
 python3 spec2md.py <path-to-word-document>
 ```

+### Command Line Options
+
+```
+usage: spec2md.py [-h] [--outdir <output directory>] [--skip-image-conversion] [--force-markdown-tables]
+                  document [document ...]
+
+positional arguments:
+  document              documents to parse
+
+options:
+  -h, --help            show this help message and exit
+  --outdir <output directory>, -o <output directory>
+                        specify output directory (default: out)
+  --skip-image-conversion, -sic
+                        skip image conversion step (default: False)
+  --force-markdown-tables, -mdt
+                        Force markdown instead of grid format for tables with colspans (default: False)
+
+```
+
+- `--outdir` or `-o` specifies the output directory. The default is `out`.
+- `--skip-image-conversion` or `-sic` skips the image conversion step. The default is to convert images, but this may not be necessary if the images have already been converted.
+- `--force-markdown-tables` or `-mdt` forces the converter to generate markdown tables instead of grid tables. The default is to generate grid tables for tables with colspans. This option is useful to generate a first version of the table that can be manually adjusted later.
+
+
+
 ## FAQ

 ### The converter doesn't seem to generate image files.
@@ -60,9 +86,16 @@ Lists in table cells are also not possible. One may use html lists for this, but
 ```


+### How to convert a table with colspans?
+
+The converter will try to convert tables with colspans to grid tables. If the `--force-markdown-tables` option is used, then the table will be converted to a normal markdown table. If the table has colspans, then the cells will just be repeated to fill a table row.
+
+This may not be the desired result, but markdown doesn't support colspans. A solution is to use grid tables instead.  
+

 ## Changes

+- **2025-01-15** - Improved handling of tables with colspans (converting them to simple grid tables). Improved error messages (added line numbers). Improved error detection for tables.
 - **2024-01-09** - Added support for merging consecutive code paragraphs into a single code block.
 - **2023-08-18** - Improved handling of sometimes broken inline formatting in table cells. Adding more default heading formats.
 - **2023-07-27** - Added converting bold and italic text in paragraphs, headers and tables.
\ No newline at end of file
--- a/config.ini
+++ b/config.ini
@@ -90,11 +90,26 @@ ignore = toc 1, toc 2, toc 3, toc 4, toc 5, toc 6, toc 7, toc 8, toc 9
 ; The characters to be replaced and the characters that make the
 ; replacement string must be specified as hex values
 ; To remove a character from the file set it to 00 (2 zeros)
+;
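+; Instead of hex values, the replacement may also be given as an HTML entity.
+; A value that starts with "&" is used as-is. The following are some common
+; characters and the HTML entities that can be used to replace them.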
+
+; Registered trademark (®) — (`&reg;`)
+; Trademark (™) — (`&trade;`)
+; Euro (€) — (`&euro;`)
+; Left arrow (←) — (`&larr;`)
+; Up arrow (↑) — (`&uarr;`)
+; Right arrow (→) — (`&rarr;`)
+; Down arrow (↓) — (`&darr;`)
+; Degree (°) — (`&#176;`)
+; Pi (π) — (`&#960;`)
+

 ; "(c)"
-a9 = 286329
+; a9 = 286329
+a9 = &copy;
 ; "(R)"
-ae = 285229
+; ae = 285229
+ae = &reg;
 ; space
 a0 = 20
 ; double quote

--- a/gridTable.py
+++ b/gridTable.py
+#
+#	gridTable.py
+#
+#	Grid Table support functions for markdown conversion.
+#
+#	(c) 2025 by Andreas Kraft
+#	License: BSD 3-Clause License. See the LICENSE file for further details.
+#
+import re
+
+colspanMarker = '~~COLSPAN~~'
+
+def markdownToGrid(markdownLines:list[str]) -> list[str]:
+	"""	Convert a markdown table to a grid table. 
+		Cells containing ~~XX~~ will be merged with the previous cell.
+
+		Args:
+			markdownLines: The markdown lines to convert.
+		
+		Return:
+			The converted grid table.
+	"""
+	
+	# Check if there are enough lines to create a table
+	if not markdownLines or len(markdownLines) < 3:
+		return markdownLines
+	
+	# Replace all <br> with <br /> in all lines
+	markdownLines = [ re.sub(r'<br\s*/?>', '<br />', line) for line in markdownLines ]
+	
+	# Split each line into cells and clean whitespace
+	rows = [
+		[cell.strip() for cell in line.strip('|').split('|')]
+		for line in markdownLines
+	]
+
+
+	# Get maximum width for each column 
+	colWidths = []
+	maxCols = max(len(row) for row in rows)
+	for col in range(maxCols):
+		width = max(len(str(row[col])) if col < len(row) else 0 for row in rows)
+		colWidths.append(width)
+
+
+	# Process merged cells - combine content with previous cell
+	for row in rows:
+		for i in range(len(row)-1, 0, -1): # Work backwards to avoid index issues
+			if row[i].strip() == colspanMarker:
+				row[i-1] = row[i-1] + ' '*(colWidths[i-1] - len(row[i-1]))+ ' '*(colWidths[i]+3) # Merge with empty content
+				# row[i] = None 	# type:ignore[call-overload] # Indicate removal
+
+	
+	# Pad any rows that are too short
+	for row in rows:
+		while len(row) < maxCols:
+			row.append('')
+	
+	# Generate grid table
+	result = []
+	
+	# Top border
+	result.append('+' + '+'.join('-' * (w + 2) for w in colWidths) + '+')
+	
+	# Header row
+	result.append('|' + '|'.join(
+		f' {rows[0][i]:<{colWidths[i]}} ' for i in range(len(rows[0])) if rows[0][i] is not None
+	) + '|')
+	
+	# Header separator
+	result.append('+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+')
+	
+	# Data rows
+	for row in rows[2:]:
+		result.append('|' + '|'.join(
+			f' {row[i]:<{colWidths[i]}} ' for i in range(len(row)) if row[i] is not None
+		) + '|')
+		result.append('+' + '+'.join('-' * (w + 2) for w in colWidths) + '+')
+	
+	return result
+
+
+def formatGridTable(lines: list[str]) -> list[str]:
+	"""Format a grid table by adjusting column widths and alignments.
+	Supports merged cells marked with ~~COLSPAN~~.
+	
+	Args:
+		lines: List of strings containing a grid table
+		
+	Returns:
+		Formatted grid table as list of strings
+	"""
+	if not lines or len(lines) < 3:
+		return lines
+
+	# Get column widths from first separator line
+	colWidths = [len(col.strip()) for col in lines[0].split('+')[1:-1]]
+	result = []
+
+	# Adjust column widths if any cell is longer
+	for row in lines:
+		if row.startswith('|'):
+			# Split cells and get their lengths
+			rowCells = row.strip().split('|')[1:-1]
+			for i, cell in enumerate(rowCells):
+				if i >= len(colWidths):
+					continue
+				cellWidth = len(cell.strip())
+				if cellWidth > colWidths[i]:
+					colWidths[i] = cellWidth
+
+	# Process each line
+	for line in lines:
+		if line.startswith('+-'):
+			# Separator line - rebuild with correct column widths
+			result.append('+' + '+'.join('-' * (w + 2) for w in colWidths) + '+')
+			continue
+		elif line.startswith('+='):
+			# Separator line - rebuild with correct column widths
+			result.append('+' + '+'.join('=' * (w + 2) for w in colWidths) + '+')
+			continue
+		elif line.startswith('+:='):
+			# Separator line - rebuild with correct column widths
+			# ATTN: This is a special casse. It assumes that all columns are left-aligned.
+			result.append('+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+')
+			continue
+
+		elif line.startswith('|'):
+			# Content line
+			cells = line.strip().split('|')[1:-1]
+			formattedCells = []
+			i = 0
+			while i < len(cells):
+				cell = cells[i].strip()
+				if cell == colspanMarker:
+					# Skip merged cells - they were handled with previous cell
+					i += 1
+					continue
+				
+				# Calculate width for potentially merged cells
+				width = colWidths[i]
+				nextIdx = i + 1
+				while nextIdx < len(cells) and cells[nextIdx].strip() == colspanMarker:
+					width += colWidths[nextIdx] + 3  # +3 for the cell borders
+					nextIdx += 1
+				
+				# Format the cell content
+				formattedCells.append(f' {cell:<{width}} ')
+				i += 1
+
+			result.append('|' + '|'.join(formattedCells) + '|')
+
+	return result
+
+
+def handleMultiLineGridTable(lines: list[str]) -> list[str]:
+	"""Handle multiline cells in a grid table by splitting cells with <br /> markers.
+	
+	Args:
+		lines: List of strings containing a grid table
+		
+	Returns:
+		List of strings with multiline cells properly formatted
+	"""
+	result = []
+	rowLines:dict[int, list[str]] = {}  # Map to store line fragments for each row
+
+	# Process each line
+	for i, line in enumerate(lines):
+		if line.startswith('|'):  # Content line
+			# Split the line into cells
+			cells = line.strip().split('|')[1:-1]
+			
+			# Process each cell for line breaks
+			maxLines = 1
+			splitCells = []
+			for cell in cells:
+				# Check if cell contains colspan marker
+				if cell.strip() == colspanMarker:
+					# For colspan cells, create same number of parts filled with marker
+					splitCells.append([colspanMarker])
+				else:
+					parts = cell.split('<br />')
+					if len(parts) > 1:
+						# Found line breaks in cell
+						# Add "\" to each part except the last
+						parts = [ p + '\\' if i < len(parts)-1 else p 
+								for i, p in enumerate(parts) ]
+
+					splitCells.append(parts)
+				maxLines = max(maxLines, len(parts))
+			
+			# If we found line breaks, create multiple content lines
+			if maxLines > 1:
+				for line_idx in range(maxLines):
+					newCells = []
+					for cellParts in splitCells:
+						if len(cellParts) == 1 and cellParts[0].strip() == colspanMarker:
+							# For colspan cells, always use the marker
+							text = colspanMarker
+						else:
+							# Use the part if available, otherwise empty string
+							text = cellParts[line_idx] if line_idx < len(cellParts) else ''
+						newCells.append(text.strip())
+					new_line = '|' + '|'.join(f' {cell} ' for cell in newCells) + '|'
+					# Store with original line index as key
+					rowLines[i] = rowLines.get(i, []) + [new_line]
+			else:
+				# No line breaks, keep original line
+				rowLines[i] = [line]
+		else:
+			# Border lines are kept as is
+			rowLines[i] = [line]
+	
+	# Reconstruct the table
+	for i in range(len(lines)):
+		result.extend(rowLines.get(i, []))
+
+	return result
+
+
+
+def isGridTableStart(line: str) -> bool:
+	"""Check if a line marks the start of a grid table.
+
+	Args:
+		line: The line to check.
+	
+	Returns:
+		True if this is a table start line, False otherwise.
+	"""
+	return line.startswith('+') and line.endswith('+') and '-' in line and not '=' in line
+
--- a/spec2md.py
+++ b/spec2md.py
@@ -28,6 +28,8 @@ from rich import inspect
 import configparser, zipfile
 from lxml import etree as ET

+from gridTable import markdownToGrid, isGridTableStart, handleMultiLineGridTable, formatGridTable, colspanMarker
+
 class Style(IntEnum):
 	code = auto()
 	example = auto()
@@ -192,17 +194,50 @@ class DocumentConfiguration(object):
 		self.generateToc = config.getboolean('toc', 'generateToc', fallback = False)

 		# characters
-		self.characters = { int(c, 16) : binascii.unhexlify(config.get('characters', c)).decode('utf-8')	# type: ignore [attr-defined]
-							for c in config['characters'] }
+		# self.characters = { int(c, 16) : binascii.unhexlify(config.get('characters', c)).decode('utf-8')	# type: ignore [attr-defined]
+		# 					for c in config['characters'] }
+		self.characters = {}
+		for c,v in config['characters'].items():
+			if v.startswith('&'):
+				# HTML entity
+				self.characters[int(c, 16)] = v
+			else:
+				# Unicode character
+				self.characters[int(c, 16)] = binascii.unhexlify(config.get('characters', c)).decode('utf-8')	# type: ignore [attr-defined]

 		# Media & Converter
 		self.emfConverterPng = config.get('media', 'emfConverterPng', fallback = None)
 		self.emfConverterSvg = config.get('media', 'emfConverterSvg', fallback = None)


+def richString(text:str) -> str:
+	"""	Return a rich string for the console output.
+
+		Args:
+			text: The text to convert to a rich string.
+		
+		Return:
+			The converted text.
+	"""
+	return text.replace('[', '\\[')
+

+def linenumber(idx:int) -> str:
+	"""	Return the formatted line number. 

-def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:bool) -> None:
+		Args:
+			idx: The index to get the line number for.
+		
+		Return:
+			The formatted line number with leading zeros.
+	"""
+	return f'{idx+1:0{5}}'	# currently 5 digits
+
+
+def processDocuments(documents:list[str], 
+					 outDirectory:str, 
+					 skipImageConversion:bool,
+					 forceMarkdownTables:bool) -> None:
 	docs:Dict[str, Tuple[Document, DocumentConfiguration, Any]]		= {}
 	ptasks 															= {}
 	mediaRelations:Dict[str, str] 									= {}
@@ -582,6 +617,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 			# 	Processing the document			
 			lines:list[str] = []
 			imageIndex = 1
+			lastTableCaption:str = '<unknown caption>'

 			for elem in docItems:
 				paragraphNr += 1
@@ -664,6 +700,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 							caption = replaceNL(text).strip()
 							anchor = f'<a name="table_{caption[6:].split(":")[0].strip()}"></a>' if caption.startswith('Table ') and ':' in caption else ''
 							lines.append(f'**{caption}**{anchor}')
+							lastTableCaption = caption

 						#	Image Caption
 						elif style in docConfig.imagecaption:
@@ -716,38 +753,76 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:

 						# Print Unhandled tokens also to the console
 						else:
-							_print(f'{paragraphNr} {style}: {elem.style}: {text}')
+							_print(f'[yellow]({linenumber(len(lines))}) Undefined paragraph style "{elem.style.name}":[/yellow] [grey39]{text}')
 							lines.append(text)


 					case 'Table':
 						rows:list[list[str]] = []
 						nrRows = 0
+						colSpanDetected = False
 						for row in elem.rows:
 							cells:list[str] = []
+							colspanCounter = 0
 							for cell in row.cells:
+								if not forceMarkdownTables:
+									if colspanCounter > 0:
+										cells.append(colspanMarker)	# add at least a space
+										colspanCounter -= 1
+										continue
+									if cell._tc.grid_span > 1:
+										colSpanDetected = True
+										colspanCounter = cell._tc.grid_span - 1
+								elif cell._tc.grid_span > 1:
+									colSpanDetected = True
 								cells.append(f'{getTextFromXML(cell)} ')	# add at least a space
 							rows.append(cells)
 							nrRows += 1
 						
+						
 						# Warning if this is a single-row table
 						if nrRows == 1:
-							_print(f'[red]Single-row table found. Such tables cannot be converted to markdown.[/red] Please consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False)
+							_print(f'[red]({linenumber(len(lines)+2)}) Single-row table found. Such tables cannot be converted to markdown.[/red]Consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False)

-						lines.append('')	# Add an empty line before a table
+						# Warning if a table with colspans is detected
+						if colSpanDetected:
+							if forceMarkdownTables:
+								_print(f'[yellow]({linenumber(len(lines)+2)}) Table with colspans found: [/yellow][grey39]{richString(lastTableCaption)}[/grey39]\nConsider to convert it manually to a grid table', highlight = False)
+
+						tableLines:list[str] = []
+
+						errorDetected:bool = False
 						for idx, row in enumerate(rows):

 							# Check for a table caption and add separator line
 							if idx == 1:
-								lines.append('-'.join('|' * (len(row) + 1) ))
+								tableLines.append('-'.join('|' * (len(row) + 1) ))
+
+							# # Check if the number of columns is the same as the previous row and add cells if smaller
+
+							if idx > 0 and len(row) != len(rows[idx-1]):
+								_print(f'[red]({linenumber(len(lines))}) Number of columns in table row {idx} does not match the previous row.[/red]\nTable may need extra attention', highlight = False)
+								errorDetected = True
 							
 							# Add table row
-							lines.append(f'|{"|".join(row)}|'
+							tableLines.append(f'|{"|".join(row)}|'
 										 .replace('\n', _linebreak))	# replace line breaks in cells
+						
+						# if colSpanDetected and gridTableForColspan then convert to grid table
+						if colSpanDetected and not forceMarkdownTables and not errorDetected:
+							lines.append('')	# Add an empty line before a table
+							lines.append('<mark>Table with colspans converted to grid table. Please check and adjust manually if necessary.</mark>')
+							tableLines = markdownToGrid(tableLines)
+						
+						lines.append('')	# Add an empty line before a table
+						if errorDetected:
+							lines.append('<mark>The table below caused an error during conversion and may need extra attention</mark>')
+							lines.append('')	# Add an empty line before a table
+						lines.extend(tableLines)
 						lines.append('')	# Add another empty line after a table
 					
 					case _:
-						_print('[blue] {type(elem).__name__}')
+						_print(f'[blue]({linenumber(len(lines))}) {type(elem).__name__}')

 			#
 			#	Replace non-ascii characters
@@ -764,7 +839,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 							line = line.replace(ch, rch)	# we need the line for further replacements
 							lines[i] = line
 						else:
-							_print(f'[yellow]Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}')
+							_print(f'[yellow]({linenumber(i)}) Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}')
 		

 			#
@@ -884,10 +959,42 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 			#
 			#	List unresolved CAPTION markers
 			#
-			for i in range(len(lines)):
-				line = lines[i]
+			#
+			#	List unresolved CAPTION markers
+			#
+			for i, line in enumerate(lines):
 				if _captionMarker in line:
-					_print(f'[yellow]Unresolved / unreferenced figure caption : \[{i}] "{line}"')
+					_print(f'[yellow]({linenumber(i)}) Unresolved / unreferenced figure caption: "{line}"[/yellow]')
+			
+
+			#
+			#	Correct formatting of Grid tables after all other changes have been applied
+			#
+
+			if not forceMarkdownTables:
+				gridTable:list[str] = []
+				result:list[str] = []
+				for i, line in enumerate(lines):
+
+					# Check for grid table start
+					if isGridTableStart(line) and not gridTable:
+						gridTable = [ line ]
+						continue
+					# Are we in a grid table?
+					if gridTable:
+						# Is the current line still part of the grid table?
+						if line.startswith(('|', '+')):
+							gridTable.append(line)
+							continue
+						# grid table finished. Assign and clear
+						gridTable = handleMultiLineGridTable(gridTable)
+						result.extend(formatGridTable(gridTable))
+						gridTable = []
+						continue
+					# not in grid table
+					result.append(line)
+				lines = result
+				
 			
 			#
 			#	Write produced Markdown file
@@ -946,6 +1053,7 @@ if __name__ == '__main__':
 	parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 	parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default = 'out', metavar = '<output directory>',  help = 'specify output directory')
 	parser.add_argument('--skip-image-conversion', '-sic', action='store_true', dest='skipImageConversion',  help = 'skip image conversion step')
+	parser.add_argument('--force-markdown-tables', '-mdt', action='store_true', dest='forceMarkdownTables',  help = 'Force markdown instead of grid format for tables with colspans')

 	parser.add_argument('document', nargs = '+', help = 'documents to parse')
 	args = parser.parse_args()
@@ -953,5 +1061,8 @@ if __name__ == '__main__':
 		# Process documents and print output
 	os.makedirs(args.outDirectory, exist_ok = True)

-	processDocuments(sorted(args.document), args.outDirectory, args.skipImageConversion)
+	processDocuments(sorted(args.document), 
+				  	 args.outDirectory, 
+					 args.skipImageConversion,
+					 args.forceMarkdownTables)
No results found