Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • tools/spec2md
1 result
Show changes
Commits on Source (2)
...@@ -143,12 +143,14 @@ ff09 = 2920 ...@@ -143,12 +143,14 @@ ff09 = 2920
d7 = 78 d7 = 78
; Ligature "fi" ; Ligature "fi"
fb01 = 6669 fb01 = 6669
; "<=>"
; f0df = 3c3d3e
; "<="
f0df = 3c3d
; "<=" ; "<="
f0fd = 3c3d ;f0fd = 3c3d
; "=>" ; "=>"
f0e0 = 3d3e f0e0 = 3d3e
; "<=>"
f0df = 3c3d3e
; "<->" ; "<->"
f0f3 = 266c743b2d3e f0f3 = 266c743b2d3e
; subscript 2 ; subscript 2
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
import re import re
colspanMarker = '~~COLSPAN~~' colspanMarker = '~~COLSPAN~~'
rowspanMarker = '~~ROWSPAN~~'
def markdownToGrid(markdownLines:list[str]) -> list[str]: def markdownToGrid(markdownLines:list[str]) -> list[str]:
""" Convert a markdown table to a grid table. """ Convert a markdown table to a grid table.
...@@ -34,7 +35,6 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]: ...@@ -34,7 +35,6 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]:
for line in markdownLines for line in markdownLines
] ]
# Get maximum width for each column # Get maximum width for each column
colWidths = [] colWidths = []
maxCols = max(len(row) for row in rows) maxCols = max(len(row) for row in rows)
...@@ -45,12 +45,11 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]: ...@@ -45,12 +45,11 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]:
# Process merged cells - combine content with previous cell # Process merged cells - combine content with previous cell
for row in rows: for row in rows:
for i in range(len(row)-1, 0, -1): # Work backwards to avoid index issues for i in range(len(row)-1, -1, -1): # Work backwards to avoid index issues
if row[i].strip() == colspanMarker: if row[i].strip() == colspanMarker:
row[i-1] = row[i-1] + ' '*(colWidths[i-1] - len(row[i-1]))+ ' '*(colWidths[i]+3) # Merge with empty content row[i-1] = row[i-1] + ' '*(colWidths[i-1] - len(row[i-1]))+ ' '*(colWidths[i]+3) # Merge with empty content
# row[i] = None # type:ignore[call-overload] # Indicate removal # row[i] = None # type:ignore[call-overload] # Indicate removal
# Pad any rows that are too short # Pad any rows that are too short
for row in rows: for row in rows:
while len(row) < maxCols: while len(row) < maxCols:
...@@ -64,19 +63,41 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]: ...@@ -64,19 +63,41 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]:
# Header row # Header row
result.append('|' + '|'.join( result.append('|' + '|'.join(
f' {rows[0][i]:<{colWidths[i]}} ' for i in range(len(rows[0])) if rows[0][i] is not None f'{rows[0][i]:<{colWidths[i]}}' for i in range(len(rows[0])) if rows[0][i] is not None
) + '|') ) + '|')
# Header separator # Header separator
result.append('+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+') result.append('+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+')
# Data rows # Data rows
for row in rows[2:]: for rowIndex, row in enumerate(rows[2:]):
# The following code detects if cells in the next row have rowspan marker(s)
# If so, it will merge the cells with the current one and remove the rowspan marker
# from that cell
nextRowCellsMerged:list[bool] = []
if rowIndex < len(rows)-3:
for cellIndex, cell in enumerate(rows[rowIndex+3]):
if cell.strip() == rowspanMarker:
nextRowCellsMerged.append(True)
rows[rowIndex+3][cellIndex] = cell.replace(rowspanMarker, ' '*len(rowspanMarker))
else:
nextRowCellsMerged.append(False)
# nextRowCellsMerged = [ cell.strip() == rowspanMarker for cell in rows[rowIndex+3] ]
else:
nextRowCellsMerged = [ False for _ in rows[rowIndex+2] ]
result.append('|' + '|'.join( result.append('|' + '|'.join(
f' {row[i]:<{colWidths[i]}} ' for i in range(len(row)) if row[i] is not None f'{row[i]:<{colWidths[i]}}'
if row[i] != rowspanMarker else ''
for i in range(len(row))
if row[i] is not None
) + '|') ) + '|')
result.append('+' + '+'.join('-' * (w + 2) for w in colWidths) + '+')
# Add separator line, if not merged
result.append('+' + '+'.join('-' * (w + 2) if not nextRowCellsMerged[cellIndex] else ' ' * (w + 2)
for cellIndex, w in enumerate(colWidths)) + '+')
return result return result
...@@ -90,6 +111,49 @@ def formatGridTable(lines: list[str]) -> list[str]: ...@@ -90,6 +111,49 @@ def formatGridTable(lines: list[str]) -> list[str]:
Returns: Returns:
Formatted grid table as list of strings Formatted grid table as list of strings
""" """
def _getCellsFromRow(row:str) -> list[str]:
    """Split a grid table content row into its cells.

    Whitespace around the row is removed first, then the row is split
    at every '|' character. The pieces before the first and after the
    last '|' are discarded (for a well-formed row these are empty
    strings).

    Args:
        row: The table row to split.

    Returns:
        The cells found between the outer '|' delimiters. The whitespace
        inside each cell is left untouched.
    """
    pieces = row.strip().split('|')
    # Drop the fragments outside of the outer delimiters
    return pieces[1:-1]
def _guessColumnWidth(columnID:int) -> int:
    """Guess the width of a column from the content of its cells.

    Every content row (a row starting with '|') of the enclosing
    function's ``lines`` is inspected. The cell at position `columnID`
    is split into its individual text lines, and the longest one
    (ignoring trailing whitespace) determines the result. Cells that
    only contain the colspan marker do not contribute.

    The value may not be accurate if the column contains merged cells,
    but it is a good approximation.

    Args:
        columnID: The index of the column to check.

    Returns:
        The guessed width of the column, or 0 if no row has content
        in that column.
    """
    width = 0
    for row in lines:
        # Only content rows carry cell text; skip separator lines
        if not row.startswith('|'):
            continue
        rowCells = _getCellsFromRow(row)
        if columnID >= len(rowCells):
            continue
        for line in rowCells[columnID].rstrip().split('\\\n'):
            # Compare the stripped text, as done everywhere else when
            # detecting colspan markers. Otherwise a marker that is
            # surrounded by whitespace would be counted as content.
            if line.strip() != colspanMarker:
                # Leading whitespace is part of the content, trailing is not
                width = max(width, len(line.rstrip()))
    return width
if not lines or len(lines) < 3: if not lines or len(lines) < 3:
return lines return lines
...@@ -101,55 +165,81 @@ def formatGridTable(lines: list[str]) -> list[str]: ...@@ -101,55 +165,81 @@ def formatGridTable(lines: list[str]) -> list[str]:
for row in lines: for row in lines:
if row.startswith('|'): if row.startswith('|'):
# Split cells and get their lengths # Split cells and get their lengths
rowCells = row.strip().split('|')[1:-1] rowCells = _getCellsFromRow(row)
for i, cell in enumerate(rowCells): for i, cell in enumerate(rowCells):
if i >= len(colWidths): if i >= len(colWidths):
continue continue
# Calculate maximum width of each line in the cell. Lines could be multilines, so we need to split them. # Calculate maximum width of each line in the cell. Lines could be multilines, so we need to split them.
cellLines = cell.strip().split('\\\n') cellLines = cell.rstrip().split('\\\n')
cellWidth = max(len(line.strip()) if line != colspanMarker else 0 requiredCellWidth = max(len(line.rstrip()) if line != colspanMarker else 0
for line in cellLines) for line in cellLines)
if cellWidth > colWidths[i]:
colWidths[i] = cellWidth if requiredCellWidth > colWidths[i]:
# Check if the next cell or cells are colspan markers
# If so, then sum the widths of the current and next cells and increase the width
# only if the required size is still bigger than the current one
# Check for colspan markers
overAllCellWidth = colWidths[i]
nextIdx = i + 1
while nextIdx < len(rowCells) and rowCells[nextIdx].strip() == colspanMarker:
cw = colWidths[nextIdx]
if cw == 0:
cw = _guessColumnWidth(nextIdx)
overAllCellWidth += cw
nextIdx += 1
if requiredCellWidth > overAllCellWidth:
# Increase the width of the current cell
colWidths[i] += requiredCellWidth-overAllCellWidth
# Process each line # Process each line
for line in lines: for line in lines:
if line.startswith('+-'): # Normal separator line can either start with '+ ' or '+-'
if line.startswith('+-') or line.startswith('+ '):
# Get the kind of row separator for each column
_originalSeparator = [ l[0] for l in line.split('+')[1:-1] ]
# Separator line - rebuild with correct column widths # Separator line - rebuild with correct column widths
result.append('+' + '+'.join('-' * (w + 2) for w in colWidths) + '+') result.append('+' + '+'.join(_originalSeparator[colIndex] * (w)
for colIndex, w in enumerate(colWidths)
if colWidths[colIndex] > 0 ) + '+')
continue continue
elif line.startswith('+='): elif line.startswith('+='):
# Separator line - rebuild with correct column widths # Separator line - rebuild with correct column widths
result.append('+' + '+'.join('=' * (w + 2) for w in colWidths) + '+') result.append('+' + '+'.join('=' * (w)
for colIndex, w in enumerate(colWidths)
if colWidths[colIndex] > 0 ) + '+')
continue continue
elif line.startswith('+:='): elif line.startswith('+:='):
# Separator line - rebuild with correct column widths # Separator line - rebuild with correct column widths
# ATTN: This is a special case. It assumes that all columns are left-aligned. # ATTN: This is a special case. It assumes that all columns are left-aligned.
result.append('+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+') result.append('+:' + '+:'.join('=' * (w-1)
for colIndex, w in enumerate(colWidths)
if colWidths[colIndex] > 0 ) + '+')
continue continue
elif line.startswith('|'): elif line.startswith('|'):
# Content line # Content line
cells = line.strip().split('|')[1:-1] cells = line.rstrip().split('|')[1:-1]
formattedCells = [] formattedCells = []
i = 0 i = 0
while i < len(cells): while i < len(cells):
cell = cells[i].strip() cell = cells[i].rstrip()
if cell == colspanMarker: if cell.strip() == colspanMarker:
# Skip merged cells - they were handled with previous cell # Skip merged cells - they were handled with previous cell
i += 1 i += 1
continue continue
# Calculate width for potentially merged cells # Calculate width for potentially merged cells
width = colWidths[i] width = colWidths[i]
nextIdx = i + 1 nextIdx = i + 1
while nextIdx < len(cells) and cells[nextIdx].strip() == colspanMarker: while nextIdx < len(cells) and cells[nextIdx].strip() == colspanMarker:
width += colWidths[nextIdx] + 3 # +3 for the cell borders width += colWidths[nextIdx] + 1
nextIdx += 1 nextIdx += 1
# Format the cell content # Format the cell content
formattedCells.append(f' {cell:<{width}} ') formattedCells.append(f'{cell:<{width}}')
i += 1 i = nextIdx
result.append('|' + '|'.join(formattedCells) + '|') result.append('|' + '|'.join(formattedCells) + '|')
...@@ -204,10 +294,10 @@ def handleMultiLineGridTable(lines: list[str]) -> list[str]: ...@@ -204,10 +294,10 @@ def handleMultiLineGridTable(lines: list[str]) -> list[str]:
else: else:
# Use the part if available, otherwise empty string # Use the part if available, otherwise empty string
text = cellParts[line_idx] if line_idx < len(cellParts) else '' text = cellParts[line_idx] if line_idx < len(cellParts) else ''
newCells.append(text.strip()) newCells.append(text.rstrip())
new_line = '|' + '|'.join(f' {cell} ' for cell in newCells) + '|' newLine = '|' + '|'.join(f'{cell}' for cell in newCells) + '|'
# Store with original line index as key # Store with original line index as key
rowLines[i] = rowLines.get(i, []) + [new_line] rowLines[i] = rowLines.get(i, []) + [newLine]
else: else:
# No line breaks, keep original line # No line breaks, keep original line
rowLines[i] = [line] rowLines[i] = [line]
......
...@@ -28,7 +28,8 @@ from rich import inspect ...@@ -28,7 +28,8 @@ from rich import inspect
import configparser, zipfile import configparser, zipfile
from lxml import etree as ET from lxml import etree as ET
from gridTable import markdownToGrid, isGridTableStart, handleMultiLineGridTable, formatGridTable, colspanMarker from gridTable import markdownToGrid, isGridTableStart, handleMultiLineGridTable, \
formatGridTable, colspanMarker, rowspanMarker
class Style(IntEnum): class Style(IntEnum):
code = auto() code = auto()
...@@ -84,6 +85,7 @@ _print:Callable = print ...@@ -84,6 +85,7 @@ _print:Callable = print
# Some predefined tags and attributes # Some predefined tags and attributes
wns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' wns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
w14ns = 'http://schemas.microsoft.com/office/word/2010/wordml'
_val = f'{{{wns}}}val' _val = f'{{{wns}}}val'
class SectionNumbers(object): class SectionNumbers(object):
...@@ -352,12 +354,12 @@ def processDocuments(documents:list[str], ...@@ -352,12 +354,12 @@ def processDocuments(documents:list[str],
'footnoteRef', 'footnoteRef',
'annotationRef', 'annotationRef',
) )
newParagraphs = 0
def _parseXML(element:ET.Element, inCell:Optional[bool] = False) -> str: def _parseXML(element:ET.Element, inCell:Optional[bool] = False) -> str:
""" Recursively parse a document paragraph. """ Recursively parse a document paragraph.
""" """
nonlocal _ignoredTags nonlocal _ignoredTags, newParagraphs
_result = '' _result = ''
tag = strippedTag(element.tag) # remove namespaces for easier handlings tag = strippedTag(element.tag) # remove namespaces for easier handlings
...@@ -394,7 +396,7 @@ def processDocuments(documents:list[str], ...@@ -394,7 +396,7 @@ def processDocuments(documents:list[str],
case 'br': case 'br':
_result += _linebreak _result += _linebreak
case 'bookmarkStart' | 'bookmarkEnd': # TODO ? case 'bookmarkStart' | 'bookmarkEnd': # TODO ?
pass pass
...@@ -497,14 +499,16 @@ def processDocuments(documents:list[str], ...@@ -497,14 +499,16 @@ def processDocuments(documents:list[str],
# _print(ET.fromstring(elem._p.xml)) # _print(ET.fromstring(elem._p.xml))
match elem: match elem:
case Paragraph(): # type: ignore[misc] case Paragraph(): # type: ignore[misc]
return _parseXML(ET.fromstring(elem._p.xml)) return _parseXML(ET.fromstring(elem._p.xml)).rstrip()
case _Cell(): # type: ignore[misc] case _Cell(): # type: ignore[misc]
# Iterate over all paragraphs in the cell and parse them # Iterate over all paragraphs in the cell and parse them
# Create a list of parsed paragraphs and join them with linebreaks # Create a list of parsed paragraphs and join them with linebreaks
return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip() return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip()
for p in elem.paragraphs ]) for p in elem.paragraphs ])
case ET._Element(): case ET._Element():
return _parseXML(elem) # return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip()
# for p in elem.paragraphs ])
return '<br />'.join([ _parseXML(elem).rstrip()])
case _: case _:
return '' return ''
...@@ -769,23 +773,45 @@ def processDocuments(documents:list[str], ...@@ -769,23 +773,45 @@ def processDocuments(documents:list[str],
nrRows = 0 nrRows = 0
colSpanDetected = False colSpanDetected = False
for row in elem.rows: for row in elem.rows:
_row = ET.fromstring(row._tr.xml)
cells:list[str] = [] cells:list[str] = []
colspanCounter = 0 for cell in _row.findall('.//w:tc', namespaces = { 'w' : wns }):
for cell in row.cells:
colspanCounter = 1 # Default value if no gridspan is specified
gridspanElem = cell.find('.//w:tcPr/w:gridSpan', namespaces={'w': wns})
if gridspanElem is not None and _val in gridspanElem.attrib:
colspanCounter = int(gridspanElem.attrib[_val])
colSpanDetected = True # Set flag that colspan was found
# Vertical merge
gridspanElem = cell.find('.//w:tcPr/w:vMerge', namespaces={'w': wns})
if gridspanElem is not None and _val not in gridspanElem.attrib:
cells.append(rowspanMarker)
else:
# Extract text from cell
# Find all paragraphs in the cell
_pl:list[str] = []
for p in cell.findall('.//w:p', namespaces={'w': wns}):
_pl.append(getTextFromXML(p))
# Add the text to the cell
if len(_pl) > 0:
cells.append(_linebreak.join(_pl))
else:
cells.append('')
# Handle colspan formatting
if not forceMarkdownTables: if not forceMarkdownTables:
if colspanCounter > 0: if colspanCounter >= 1:
cells.append(colspanMarker) # add at least a space for _ in range(colspanCounter-1):
cells.append(colspanMarker)
colspanCounter -= 1 colspanCounter -= 1
continue
if cell._tc.grid_span > 1:
colSpanDetected = True
colspanCounter = cell._tc.grid_span - 1
elif cell._tc.grid_span > 1:
colSpanDetected = True
cells.append(f'{getTextFromXML(cell)} ') # add at least a space
rows.append(cells) rows.append(cells)
nrRows += 1 nrRows += 1
# for r in rows:
# _print(r)
# Warning if this is a single-row table # Warning if this is a single-row table
if nrRows == 1: if nrRows == 1:
...@@ -850,8 +876,6 @@ def processDocuments(documents:list[str], ...@@ -850,8 +876,6 @@ def processDocuments(documents:list[str],
line = line.replace(ch, f'<mark>Non-ASCII character {ch} / {hex(ord(ch))}</mark>') line = line.replace(ch, f'<mark>Non-ASCII character {ch} / {hex(ord(ch))}</mark>')
lines[i] = line lines[i] = line
# #
# Remove multiple bold / italics on/off occurrences # Remove multiple bold / italics on/off occurrences
# Sometimes word doesn't remove empty bold-on/bold-off (or italics) indicators # Sometimes word doesn't remove empty bold-on/bold-off (or italics) indicators
...@@ -861,6 +885,9 @@ def processDocuments(documents:list[str], ...@@ -861,6 +885,9 @@ def processDocuments(documents:list[str],
line = lines[i] line = lines[i]
line = line.replace('__', '') line = line.replace('__', '')
line = line.replace('****', '') line = line.replace('****', '')
line = line.replace('** ', '** ')
line = line.replace('_ ', '_ ')
line = line.replace('** **', ' ')
#line = line.replace(' ', ' ') #line = line.replace(' ', ' ')
lines[i] = line lines[i] = line
...@@ -966,9 +993,6 @@ def processDocuments(documents:list[str], ...@@ -966,9 +993,6 @@ def processDocuments(documents:list[str],
for fid, text in footnotes.items(): for fid, text in footnotes.items():
lines.append(f'[^{fid}]: {text}') lines.append(f'[^{fid}]: {text}')
#
# List unresolved CAPTION markers
#
# #
# List unresolved CAPTION markers # List unresolved CAPTION markers
# #
...@@ -976,11 +1000,10 @@ def processDocuments(documents:list[str], ...@@ -976,11 +1000,10 @@ def processDocuments(documents:list[str],
if _captionMarker in line: if _captionMarker in line:
_print(f'[yellow]({linenumber(i)}) Unresolved / unreferenced figure caption: "{line}"[/yellow]') _print(f'[yellow]({linenumber(i)}) Unresolved / unreferenced figure caption: "{line}"[/yellow]')
# #
# Correct formatting of Grid tables after all other changes have been applied # Correct formatting of Grid tables after all other changes have been applied
# #
if not forceMarkdownTables: if not forceMarkdownTables:
gridTable:list[str] = [] gridTable:list[str] = []
result:list[str] = [] result:list[str] = []
...@@ -1004,6 +1027,7 @@ def processDocuments(documents:list[str], ...@@ -1004,6 +1027,7 @@ def processDocuments(documents:list[str],
# not in grid table # not in grid table
result.append(line) result.append(line)
lines = result lines = result
# #
......