Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • tools/spec2md
1 result
Show changes
......@@ -20,6 +20,32 @@ python3 -m pip install -r requirements.txt
python3 spec2md.py <path-to-word-document>
```
### Command Line Options
```
usage: spec2md.py [-h] [--outdir <output directory>] [--skip-image-conversion] [--force-markdown-tables]
document [document ...]
positional arguments:
document documents to parse
options:
-h, --help show this help message and exit
--outdir <output directory>, -o <output directory>
specify output directory (default: out)
--skip-image-conversion, -sic
skip image conversion step (default: False)
--force-markdown-tables, -mdt
Force markdown instead of grid format for tables with colspans (default: False)
```
- `--outdir` or `-o` specifies the output directory. The default is `out`.
- `--skip-image-conversion` or `-sic` skips the image conversion step. The default is to convert images, but this may not be necessary if the images have already been converted.
- `--force-markdown-tables` or `-mdt` forces the converter to generate markdown tables instead of grid tables. The default is to generate grid tables for tables with colspans. This option is useful to generate a first version of the table that can be manually adjusted later.
## FAQ
### The converter doesn't seem to generate image files.
......@@ -60,9 +86,16 @@ Lists in table cells are also not possible. One may use html lists for this, but
```
### How to convert a table with colspans?
The converter will try to convert tables with colspans to grid tables. If the `--force-markdown-tables` option is used, then the table will be converted to a normal markdown table. If the table has colspans, then the cells will just be repeated to fill a table row.
This may not be the desired result, but markdown doesn't support colspans. A solution is to use grid tables instead.
## Changes
- **2025-01-15** - Improved handling of tables with colspans (converting them to simple grid tables). Improved error messages (added line numbers). Improved error detection for tables.
- **2024-01-09** - Added support for merging consecutive code paragraphs into a single code block.
- **2023-08-18** - Improved handling of sometimes broken inline formatting in table cells. Adding more default heading formats.
- **2023-07-27** - Added converting bold and italic text in paragraphs, headers and tables.
\ No newline at end of file
......@@ -90,11 +90,26 @@ ignore = toc 1, toc 2, toc 3, toc 4, toc 5, toc 6, toc 7, toc 8, toc 9
; The characters to be replaced and the characters that make the
; replacement string must be specified as hex values
; To remove a character from the file set it to 00 (2 zeros)
;
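; Alternatively, a replacement may be given as an HTML entity (a value that
; starts with `&`); such a value is inserted as is instead of being decoded
; from hex. Some common characters and their entities: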
; Registered trademark (®) — (`&reg;`)
; Trademark (™) — (`&trade;`)
; Euro (€) — (`&euro;`)
; Left arrow (←) — (`&larr;`)
; Up arrow (↑) — (`&uarr;`)
; Right arrow (→) — (`&rarr;`)
; Down arrow (↓) — (`&darr;`)
; Degree (°) — (`&#176;`)
; Pi (π) — (`&#960;`)
; "(c)"
a9 = 286329
; a9 = 286329
a9 = &copy;
; "(R)"
ae = 285229
; ae = 285229
ae = &reg;
; space
a0 = 20
; double quote
......
#
# gridTable.py
#
# Grid Table support functions for markdown conversion.
#
# (c) 2025 by Andreas Kraft
# License: BSD 3-Clause License. See the LICENSE file for further details.
#
import re
# Cell content that marks a cell as the continuation of the cell on its left.
colspanMarker = '~~COLSPAN~~'


def _splitMarkdownRow(line:str) -> list[str]:
    """ Split one markdown table line into its stripped cell texts.

        Args:
            line: A markdown table line, e.g. `|a|b|`.

        Return:
            The list of cell texts, without surrounding whitespace.
    """
    # Remove at most ONE bordering pipe per side. `str.strip('|')` would also
    # swallow the pipes of empty edge cells ("||x|" would lose its first column).
    inner = line.strip().removeprefix('|').removesuffix('|')
    return [ cell.strip() for cell in inner.split('|') ]


def markdownToGrid(markdownLines:list[str]) -> list[str]:
    """ Convert a markdown (pipe) table to a grid table.

        The first line is taken as the header row, the second line (the
        markdown header separator) is dropped, and all following lines become
        data rows that are each closed by a border line.

        A cell that only contains the colspan marker (`~~COLSPAN~~`) continues
        the cell on its left: the left cell is padded with spaces so that it
        covers both columns. The marker cell itself stays in the output.

        All `<br>` variants are normalized to `<br />`.

        Args:
            markdownLines: The markdown lines to convert.

        Return:
            The converted grid table. Inputs with fewer than three lines are
            returned unchanged.
    """
    # Header, separator and at least one data row are required
    if not markdownLines or len(markdownLines) < 3:
        return markdownLines

    # Normalize line breaks, then split every line into cells
    normalized = [ re.sub(r'<br\s*/?>', '<br />', line) for line in markdownLines ]
    rows = [ _splitMarkdownRow(line) for line in normalized ]

    # Widest cell per column. Rows that are too short do not contribute.
    maxCols = max(len(row) for row in rows)
    colWidths = [ max((len(row[col]) for row in rows if col < len(row)), default = 0)
                  for col in range(maxCols) ]

    # Stretch the cell left of every colspan marker over the marker's column.
    # Walk right-to-left so that chains of markers propagate to the first cell.
    for row in rows:
        for i in range(len(row)-1, 0, -1):
            if row[i].strip() == colspanMarker:
                fillOwnColumn = ' ' * (colWidths[i-1] - len(row[i-1]))
                coverNextColumn = ' ' * (colWidths[i] + 3)  # +3: ' | ' between the cells
                row[i-1] = row[i-1] + fillOwnColumn + coverNextColumn

    # Pad any rows that are too short
    for row in rows:
        row.extend([''] * (maxCols - len(row)))

    border = '+' + '+'.join('-' * (w + 2) for w in colWidths) + '+'

    def contentLine(row:list[str]) -> str:
        return '|' + '|'.join(f' {cell:<{width}} ' for cell, width in zip(row, colWidths)) + '|'

    # Top border, header row, header separator (every column gets a leading ':')
    result = [
        border,
        contentLine(rows[0]),
        '+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+',
    ]

    # Data rows. rows[1] is the markdown header separator and is skipped.
    for row in rows[2:]:
        result.append(contentLine(row))
        result.append(border)
    return result
def formatGridTable(lines: list[str]) -> list[str]:
    """ Format a grid table by aligning all borders and cells to common column widths.

        Cells that only contain the colspan marker (`~~COLSPAN~~`) are removed
        and the cell on their left is widened to cover their columns.

        A content row that has more cells than the first border line defines
        adds columns to the table. Lines that are neither a known border nor a
        content line are passed through unchanged.

        Args:
            lines: List of strings containing a grid table.

        Returns:
            Formatted grid table as list of strings. Inputs with fewer than
            three lines are returned unchanged.
    """
    if not lines or len(lines) < 3:
        return lines

    def splitCells(line:str) -> list[str]:
        # Cell texts between the outer pipes, without surrounding whitespace
        return [ cell.strip() for cell in line.strip().split('|')[1:-1] ]

    # Start with the column widths of the first border line.
    # NOTE(review): this counts the whole dash run of a segment, while the
    # borders written below use width + 2 dashes, so an already formatted
    # table grows by two characters per column. Kept as is; confirm intent.
    colWidths = [ len(col.strip()) for col in lines[0].split('+')[1:-1] ]

    # Widen the columns for any longer cell, and add columns for rows that
    # have more cells than the first border line. Without the latter the
    # formatting below would index past the end of colWidths.
    for line in lines:
        if not line.startswith('|'):
            continue
        for idx, cell in enumerate(splitCells(line)):
            if idx >= len(colWidths):
                colWidths.append(len(cell))
            elif len(cell) > colWidths[idx]:
                colWidths[idx] = len(cell)

    result = []
    for line in lines:
        if line.startswith('+-'):
            # Row border - rebuild with the final column widths
            result.append('+' + '+'.join('-' * (w + 2) for w in colWidths) + '+')

        elif line.startswith('+='):
            # Header separator - rebuild with the final column widths
            result.append('+' + '+'.join('=' * (w + 2) for w in colWidths) + '+')

        elif line.startswith('+:='):
            # Header separator with alignment colons - rebuild with the final column widths.
            # ATTN: This is a special case. It assumes that all columns are left-aligned.
            result.append('+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+')

        elif line.startswith('|'):
            # Content line
            cells = splitCells(line)
            formattedCells = []
            for idx, cell in enumerate(cells):
                if cell == colspanMarker:
                    # Merged cells are covered by the widened cell on their left
                    continue
                # Add the width of all directly following merged cells
                width = colWidths[idx]
                for follower in range(idx + 1, len(cells)):
                    if cells[follower] != colspanMarker:
                        break
                    width += colWidths[follower] + 3  # +3 for the cell borders
                formattedCells.append(f' {cell:<{width}} ')
            result.append('|' + '|'.join(formattedCells) + '|')

        else:
            # Unknown line type: keep it instead of silently dropping it
            result.append(line)

    return result
def handleMultiLineGridTable(lines: list[str]) -> list[str]:
    """ Expand grid table rows whose cells contain `<br />` into several content lines.

        Every `<br />` inside a cell starts a new content line for the row. All
        fragments of a cell except the last one get a trailing backslash.
        Cells with fewer fragments are filled with empty text, and colspan
        marker cells repeat the marker on every generated line. Border lines
        and rows without any `<br />` are copied unchanged.

        Args:
            lines: List of strings containing a grid table.

        Returns:
            List of strings with multiline cells properly formatted.
    """
    output:list[str] = []

    for line in lines:
        # Only content lines can carry line breaks
        if not line.startswith('|'):
            output.append(line)
            continue

        # Break every cell of the row into its fragments
        fragments:list[list[str]] = []
        for rawCell in line.strip().split('|')[1:-1]:
            if rawCell.strip() == colspanMarker:
                fragments.append([colspanMarker])
                continue
            pieces = rawCell.split('<br />')
            # All pieces but the last are terminated with a backslash
            fragments.append([ piece + '\\' for piece in pieces[:-1] ] + pieces[-1:])

        # The cell with the most fragments defines the number of content lines
        height = max((len(cellFragments) for cellFragments in fragments), default = 1)
        if height == 1:
            # No line break in this row, keep the line as it is
            output.append(line)
            continue

        for level in range(height):
            texts:list[str] = []
            for cellFragments in fragments:
                isSpanned = len(cellFragments) == 1 and cellFragments[0].strip() == colspanMarker
                if isSpanned:
                    # Colspan cells carry the marker on every line
                    text = colspanMarker
                elif level < len(cellFragments):
                    text = cellFragments[level]
                else:
                    # This cell has fewer lines than the tallest one
                    text = ''
                texts.append(text.strip())
            output.append('|' + '|'.join(f' {text} ' for text in texts) + '|')

    return output
def isGridTableStart(line: str) -> bool:
    """ Check if a line marks the start of a grid table.

        A start line is a row border: it begins and ends with `+`, contains at
        least one `-`, and consists only of `+`, `-` and `:` characters. This
        excludes header separators (which contain `=`) as well as ordinary text
        lines that merely start and end with a plus sign.

        Args:
            line: The line to check.

        Returns:
            True if this is a table start line, False otherwise.
    """
    if not (line.startswith('+') and line.endswith('+')):
        return False
    return '-' in line and all(ch in '+-:' for ch in line)
......@@ -28,6 +28,8 @@ from rich import inspect
import configparser, zipfile
from lxml import etree as ET
from gridTable import markdownToGrid, isGridTableStart, handleMultiLineGridTable, formatGridTable, colspanMarker
class Style(IntEnum):
code = auto()
example = auto()
......@@ -192,17 +194,50 @@ class DocumentConfiguration(object):
self.generateToc = config.getboolean('toc', 'generateToc', fallback = False)
# characters
self.characters = { int(c, 16) : binascii.unhexlify(config.get('characters', c)).decode('utf-8') # type: ignore [attr-defined]
for c in config['characters'] }
# self.characters = { int(c, 16) : binascii.unhexlify(config.get('characters', c)).decode('utf-8') # type: ignore [attr-defined]
# for c in config['characters'] }
self.characters = {}
for c,v in config['characters'].items():
if v.startswith('&'):
# HTML entity
self.characters[int(c, 16)] = v
else:
# Unicode character
self.characters[int(c, 16)] = binascii.unhexlify(config.get('characters', c)).decode('utf-8') # type: ignore [attr-defined]
# Media & Converter
self.emfConverterPng = config.get('media', 'emfConverterPng', fallback = None)
self.emfConverterSvg = config.get('media', 'emfConverterSvg', fallback = None)
def richString(text:str) -> str:
    """ Escape a text for the rich console output.

        Every opening square bracket is prefixed with a backslash.

        Args:
            text: The text to convert to a rich string.

        Return:
            The converted text.
    """
    bracketEscapes = { ord('['): '\\[' }
    return text.translate(bracketEscapes)
def linenumber(idx:int, digits:int = 5) -> str:
    """ Return the formatted line number for a zero-based line index.

        Args:
            idx: The zero-based index to get the line number for.
            digits: The minimum number of digits. Shorter numbers are
                padded with leading zeros.

        Return:
            The one-based line number (`idx + 1`) with leading zeros.
    """
    return f'{idx + 1:0{digits}d}'
def processDocuments(documents:list[str],
outDirectory:str,
skipImageConversion:bool,
forceMarkdownTables:bool) -> None:
docs:Dict[str, Tuple[Document, DocumentConfiguration, Any]] = {}
ptasks = {}
mediaRelations:Dict[str, str] = {}
......@@ -582,6 +617,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
# Processing the document
lines:list[str] = []
imageIndex = 1
lastTableCaption:str = '<unknown caption>'
for elem in docItems:
paragraphNr += 1
......@@ -664,6 +700,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
caption = replaceNL(text).strip()
anchor = f'<a name="table_{caption[6:].split(":")[0].strip()}"></a>' if caption.startswith('Table ') and ':' in caption else ''
lines.append(f'**{caption}**{anchor}')
lastTableCaption = caption
# Image Caption
elif style in docConfig.imagecaption:
......@@ -716,38 +753,76 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
# Print Unhandled tokens also to the console
else:
_print(f'{paragraphNr} {style}: {elem.style}: {text}')
_print(f'[yellow]({linenumber(len(lines))}) Undefined paragraph style "{elem.style.name}":[/yellow] [grey39]{text}')
lines.append(text)
case 'Table':
rows:list[list[str]] = []
nrRows = 0
colSpanDetected = False
for row in elem.rows:
cells:list[str] = []
colspanCounter = 0
for cell in row.cells:
if not forceMarkdownTables:
if colspanCounter > 0:
cells.append(colspanMarker) # add at least a space
colspanCounter -= 1
continue
if cell._tc.grid_span > 1:
colSpanDetected = True
colspanCounter = cell._tc.grid_span - 1
elif cell._tc.grid_span > 1:
colSpanDetected = True
cells.append(f'{getTextFromXML(cell)} ') # add at least a space
rows.append(cells)
nrRows += 1
# Warning if this is a single-row table
if nrRows == 1:
_print(f'[red]Single-row table found. Such tables cannot be converted to markdown.[/red] Please consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False)
_print(f'[red]({linenumber(len(lines)+2)}) Single-row table found. Such tables cannot be converted to markdown.[/red]Consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False)
lines.append('') # Add an empty line before a table
# Warning if a table with colspans is detected
if colSpanDetected:
if forceMarkdownTables:
_print(f'[yellow]({linenumber(len(lines)+2)}) Table with colspans found: [/yellow][grey39]{richString(lastTableCaption)}[/grey39]\nConsider to convert it manually to a grid table', highlight = False)
tableLines:list[str] = []
errorDetected:bool = False
for idx, row in enumerate(rows):
# Check for a table caption and add separator line
if idx == 1:
lines.append('-'.join('|' * (len(row) + 1) ))
tableLines.append('-'.join('|' * (len(row) + 1) ))
# # Check if the number of columns is the same as the previous row and add cells if smaller
if idx > 0 and len(row) != len(rows[idx-1]):
_print(f'[red]({linenumber(len(lines))}) Number of columns in table row {idx} does not match the previous row.[/red]\nTable may need extra attention', highlight = False)
errorDetected = True
# Add table row
lines.append(f'|{"|".join(row)}|'
tableLines.append(f'|{"|".join(row)}|'
.replace('\n', _linebreak)) # replace line breaks in cells
# if colSpanDetected and gridTableForColspan then convert to grid table
if colSpanDetected and not forceMarkdownTables and not errorDetected:
lines.append('') # Add an empty line before a table
lines.append('<mark>Table with colspans converted to grid table. Please check and adjust manually if necessary.</mark>')
tableLines = markdownToGrid(tableLines)
lines.append('') # Add an empty line before a table
if errorDetected:
lines.append('<mark>The table below caused an error during conversion and may need extra attention</mark>')
lines.append('') # Add an empty line before a table
lines.extend(tableLines)
lines.append('') # Add another empty line after a table
case _:
_print('[blue] {type(elem).__name__}')
_print(f'[blue]({linenumber(len(lines))}) {type(elem).__name__}')
#
# Replace non-ascii characters
......@@ -764,7 +839,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
line = line.replace(ch, rch) # we need the line for further replacements
lines[i] = line
else:
_print(f'[yellow]Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}')
_print(f'[yellow]({linenumber(i)}) Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}')
#
......@@ -884,10 +959,42 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
#
# List unresolved CAPTION markers
#
for i in range(len(lines)):
line = lines[i]
#
# List unresolved CAPTION markers
#
for i, line in enumerate(lines):
if _captionMarker in line:
_print(f'[yellow]Unresolved / unreferenced figure caption : \[{i}] "{line}"')
_print(f'[yellow]({linenumber(i)}) Unresolved / unreferenced figure caption: "{line}"[/yellow]')
#
# Correct formatting of Grid tables after all other changes have been applied
#
if not forceMarkdownTables:
gridTable:list[str] = []
result:list[str] = []
for i, line in enumerate(lines):
# Check for grid table start
if isGridTableStart(line) and not gridTable:
gridTable = [ line ]
continue
# Are we in a grid table?
if gridTable:
# Is the current line still part of the grid table?
if line.startswith(('|', '+')):
gridTable.append(line)
continue
# grid table finished. Assign and clear
gridTable = handleMultiLineGridTable(gridTable)
result.extend(formatGridTable(gridTable))
gridTable = []
continue
# not in grid table
result.append(line)
lines = result
#
# Write produced Markdown file
......@@ -946,6 +1053,7 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default = 'out', metavar = '<output directory>', help = 'specify output directory')
parser.add_argument('--skip-image-conversion', '-sic', action='store_true', dest='skipImageConversion', help = 'skip image conversion step')
parser.add_argument('--force-markdown-tables', '-mdt', action='store_true', dest='forceMarkdownTables', help = 'Force markdown instead of grid format for tables with colspans')
parser.add_argument('document', nargs = '+', help = 'documents to parse')
args = parser.parse_args()
......@@ -953,5 +1061,8 @@ if __name__ == '__main__':
# Process documents and print output
os.makedirs(args.outDirectory, exist_ok = True)
processDocuments(sorted(args.document), args.outDirectory, args.skipImageConversion)
processDocuments(sorted(args.document),
args.outDirectory,
args.skipImageConversion,
args.forceMarkdownTables)