Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • tools/spec2md
1 result
Show changes
Commits on Source (2)
......@@ -143,12 +143,14 @@ ff09 = 2920
d7 = 78
; Ligature "fi"
fb01 = 6669
; "<=>"
; f0df = 3c3d3e
; "<="
f0df = 3c3d
; "<="
f0fd = 3c3d
;f0fd = 3c3d
; "=>"
f0e0 = 3d3e
; "<=>"
f0df = 3c3d3e
; "<->"
f0f3 = 266c743b2d3e
; subscript 2
......
......@@ -9,6 +9,7 @@
import re
colspanMarker = '~~COLSPAN~~'
rowspanMarker = '~~ROWSPAN~~'
def markdownToGrid(markdownLines:list[str]) -> list[str]:
""" Convert a markdown table to a grid table.
......@@ -34,7 +35,6 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]:
for line in markdownLines
]
# Get maximum width for each column
colWidths = []
maxCols = max(len(row) for row in rows)
......@@ -45,12 +45,11 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]:
# Process merged cells - combine content with previous cell
for row in rows:
for i in range(len(row)-1, 0, -1): # Work backwards to avoid index issues
for i in range(len(row)-1, -1, -1): # Work backwards to avoid index issues
if row[i].strip() == colspanMarker:
row[i-1] = row[i-1] + ' '*(colWidths[i-1] - len(row[i-1]))+ ' '*(colWidths[i]+3) # Merge with empty content
# row[i] = None # type:ignore[call-overload] # Indicate removal
# Pad any rows that are too short
for row in rows:
while len(row) < maxCols:
......@@ -64,19 +63,41 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]:
# Header row
result.append('|' + '|'.join(
f' {rows[0][i]:<{colWidths[i]}} ' for i in range(len(rows[0])) if rows[0][i] is not None
f'{rows[0][i]:<{colWidths[i]}}' for i in range(len(rows[0])) if rows[0][i] is not None
) + '|')
# Header separator
result.append('+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+')
# Data rows
for row in rows[2:]:
for rowIndex, row in enumerate(rows[2:]):
# The following code detects if cells in the next row have rowspan marker(s)
# If so, it will merge the cells with the current one and remove the rowspan marker
# from that cell
nextRowCellsMerged:list[bool] = []
if rowIndex < len(rows)-3:
for cellIndex, cell in enumerate(rows[rowIndex+3]):
if cell.strip() == rowspanMarker:
nextRowCellsMerged.append(True)
rows[rowIndex+3][cellIndex] = cell.replace(rowspanMarker, ' '*len(rowspanMarker))
else:
nextRowCellsMerged.append(False)
# nextRowCellsMerged = [ cell.strip() == rowspanMarker for cell in rows[rowIndex+3] ]
else:
nextRowCellsMerged = [ False for _ in rows[rowIndex+2] ]
result.append('|' + '|'.join(
f' {row[i]:<{colWidths[i]}} ' for i in range(len(row)) if row[i] is not None
f'{row[i]:<{colWidths[i]}}'
if row[i] != rowspanMarker else ''
for i in range(len(row))
if row[i] is not None
) + '|')
result.append('+' + '+'.join('-' * (w + 2) for w in colWidths) + '+')
# Add separator line, if not merged
result.append('+' + '+'.join('-' * (w + 2) if not nextRowCellsMerged[cellIndex] else ' ' * (w + 2)
for cellIndex, w in enumerate(colWidths)) + '+')
return result
......@@ -90,6 +111,49 @@ def formatGridTable(lines: list[str]) -> list[str]:
Returns:
Formatted grid table as list of strings
"""
def _getCellsFromRow(row:str) -> list[str]:
"""Helper function to extract the cells from a table row.
The row is stripped of surrounding whitespace and split at every '|'
character. The first and last elements of the split are dropped; for a
row that starts and ends with '|' these are the empty strings outside
the outer borders. Cell contents are returned unchanged, including any
padding whitespace inside the cell.
Args:
row: The row string to split.
Returns:
A list of cells extracted from the row.
"""
return row.strip().split('|')[1:-1]
def _guessColumnWidth(columnID:int) -> int:
"""Helper function to guess the width of a column.
This is done by checking the content of the cells in the column
and returning the maximum width found. This value may not be
accurate if the column contains merged cells, but it is a good
approximation.
The rows are read from the `lines` variable of the enclosing scope.
Args:
columnID: The column ID (index of the cell within a row) to check.
Returns:
The guessed width of the column. 0 if no row has content in that column.
"""
width = 0
for row in lines:
# Only rows starting with '|' are measured; all other lines are ignored
if row.startswith('|'):
rowCells = _getCellsFromRow(row)
# Rows that have no cell at this index contribute nothing
if columnID < len(rowCells):
# A cell may consist of several lines, separated by a backslash followed by a newline
cellLines = rowCells[columnID].rstrip().split('\\\n')
for line in cellLines:
# Colspan markers are placeholders and do not count towards the width.
# NOTE(review): this is an exact comparison; a marker with leading whitespace would be counted - confirm
if line != colspanMarker:
width = max(width, len(line.rstrip()))
return width
if not lines or len(lines) < 3:
return lines
......@@ -101,55 +165,81 @@ def formatGridTable(lines: list[str]) -> list[str]:
for row in lines:
if row.startswith('|'):
# Split cells and get their lengths
rowCells = row.strip().split('|')[1:-1]
rowCells = _getCellsFromRow(row)
for i, cell in enumerate(rowCells):
if i >= len(colWidths):
continue
# Calculate maximum width of each line in the cell. Lines could be multilines, so we need to split them.
cellLines = cell.strip().split('\\\n')
cellWidth = max(len(line.strip()) if line != colspanMarker else 0
cellLines = cell.rstrip().split('\\\n')
requiredCellWidth = max(len(line.rstrip()) if line != colspanMarker else 0
for line in cellLines)
if cellWidth > colWidths[i]:
colWidths[i] = cellWidth
if requiredCellWidth > colWidths[i]:
# Check if the next cell or cells are colspan markers
# If so, then sum the widths of the current and next cells and increase the width
# only if the required size is still bigger than the current one
# Check for colspan markers
overAllCellWidth = colWidths[i]
nextIdx = i + 1
while nextIdx < len(rowCells) and rowCells[nextIdx].strip() == colspanMarker:
cw = colWidths[nextIdx]
if cw == 0:
cw = _guessColumnWidth(nextIdx)
overAllCellWidth += cw
nextIdx += 1
if requiredCellWidth > overAllCellWidth:
# Increase the width of the current cell
colWidths[i] += requiredCellWidth-overAllCellWidth
# Process each line
for line in lines:
if line.startswith('+-'):
# Normal separator line can either start with '+ ' or '+-'
if line.startswith('+-') or line.startswith('+ '):
# Get the kind of row separator for each column
_originalSeparator = [ l[0] for l in line.split('+')[1:-1] ]
# Separator line - rebuild with correct column widths
result.append('+' + '+'.join('-' * (w + 2) for w in colWidths) + '+')
result.append('+' + '+'.join(_originalSeparator[colIndex] * (w)
for colIndex, w in enumerate(colWidths)
if colWidths[colIndex] > 0 ) + '+')
continue
elif line.startswith('+='):
# Separator line - rebuild with correct column widths
result.append('+' + '+'.join('=' * (w + 2) for w in colWidths) + '+')
result.append('+' + '+'.join('=' * (w)
for colIndex, w in enumerate(colWidths)
if colWidths[colIndex] > 0 ) + '+')
continue
elif line.startswith('+:='):
# Separator line - rebuild with correct column widths
# ATTN: This is a special case. It assumes that all columns are left-aligned.
result.append('+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+')
result.append('+:' + '+:'.join('=' * (w-1)
for colIndex, w in enumerate(colWidths)
if colWidths[colIndex] > 0 ) + '+')
continue
elif line.startswith('|'):
# Content line
cells = line.strip().split('|')[1:-1]
cells = line.rstrip().split('|')[1:-1]
formattedCells = []
i = 0
while i < len(cells):
cell = cells[i].strip()
if cell == colspanMarker:
cell = cells[i].rstrip()
if cell.strip() == colspanMarker:
# Skip merged cells - they were handled with previous cell
i += 1
continue
# Calculate width for potentially merged cells
width = colWidths[i]
nextIdx = i + 1
while nextIdx < len(cells) and cells[nextIdx].strip() == colspanMarker:
width += colWidths[nextIdx] + 3 # +3 for the cell borders
width += colWidths[nextIdx] + 1
nextIdx += 1
# Format the cell content
formattedCells.append(f' {cell:<{width}} ')
i += 1
formattedCells.append(f'{cell:<{width}}')
i = nextIdx
result.append('|' + '|'.join(formattedCells) + '|')
......@@ -204,10 +294,10 @@ def handleMultiLineGridTable(lines: list[str]) -> list[str]:
else:
# Use the part if available, otherwise empty string
text = cellParts[line_idx] if line_idx < len(cellParts) else ''
newCells.append(text.strip())
new_line = '|' + '|'.join(f' {cell} ' for cell in newCells) + '|'
newCells.append(text.rstrip())
newLine = '|' + '|'.join(f'{cell}' for cell in newCells) + '|'
# Store with original line index as key
rowLines[i] = rowLines.get(i, []) + [new_line]
rowLines[i] = rowLines.get(i, []) + [newLine]
else:
# No line breaks, keep original line
rowLines[i] = [line]
......
......@@ -28,7 +28,8 @@ from rich import inspect
import configparser, zipfile
from lxml import etree as ET
from gridTable import markdownToGrid, isGridTableStart, handleMultiLineGridTable, formatGridTable, colspanMarker
from gridTable import markdownToGrid, isGridTableStart, handleMultiLineGridTable, \
formatGridTable, colspanMarker, rowspanMarker
class Style(IntEnum):
code = auto()
......@@ -84,6 +85,7 @@ _print:Callable = print
# Some predefined tags and attributes
wns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
w14ns = 'http://schemas.microsoft.com/office/word/2010/wordml'
_val = f'{{{wns}}}val'
class SectionNumbers(object):
......@@ -352,12 +354,12 @@ def processDocuments(documents:list[str],
'footnoteRef',
'annotationRef',
)
newParagraphs = 0
def _parseXML(element:ET.Element, inCell:Optional[bool] = False) -> str:
""" Recursively parse a document paragraph.
"""
nonlocal _ignoredTags
nonlocal _ignoredTags, newParagraphs
_result = ''
tag = strippedTag(element.tag) # remove namespaces for easier handlings
......@@ -394,7 +396,7 @@ def processDocuments(documents:list[str],
case 'br':
_result += _linebreak
case 'bookmarkStart' | 'bookmarkEnd': # TODO ?
pass
......@@ -497,14 +499,16 @@ def processDocuments(documents:list[str],
# _print(ET.fromstring(elem._p.xml))
match elem:
case Paragraph(): # type: ignore[misc]
return _parseXML(ET.fromstring(elem._p.xml))
return _parseXML(ET.fromstring(elem._p.xml)).rstrip()
case _Cell(): # type: ignore[misc]
# Iterate over all paragraphs in the cell and parse them
# Create a list of parsed paragraphs and join them with linebreaks
return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip()
return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip()
for p in elem.paragraphs ])
case ET._Element():
return _parseXML(elem)
# return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip()
# for p in elem.paragraphs ])
return '<br />'.join([ _parseXML(elem).rstrip()])
case _:
return ''
......@@ -769,23 +773,45 @@ def processDocuments(documents:list[str],
nrRows = 0
colSpanDetected = False
for row in elem.rows:
_row = ET.fromstring(row._tr.xml)
cells:list[str] = []
colspanCounter = 0
for cell in row.cells:
for cell in _row.findall('.//w:tc', namespaces = { 'w' : wns }):
colspanCounter = 1 # Default value if no gridspan is specified
gridspanElem = cell.find('.//w:tcPr/w:gridSpan', namespaces={'w': wns})
if gridspanElem is not None and _val in gridspanElem.attrib:
colspanCounter = int(gridspanElem.attrib[_val])
colSpanDetected = True # Set flag that colspan was found
# Vertical merge
gridspanElem = cell.find('.//w:tcPr/w:vMerge', namespaces={'w': wns})
if gridspanElem is not None and _val not in gridspanElem.attrib:
cells.append(rowspanMarker)
else:
# Extract text from cell
# Find all paragraphs in the cell
_pl:list[str] = []
for p in cell.findall('.//w:p', namespaces={'w': wns}):
_pl.append(getTextFromXML(p))
# Add the text to the cell
if len(_pl) > 0:
cells.append(_linebreak.join(_pl))
else:
cells.append('')
# Handle colspan formatting
if not forceMarkdownTables:
if colspanCounter > 0:
cells.append(colspanMarker) # add at least a space
if colspanCounter >= 1:
for _ in range(colspanCounter-1):
cells.append(colspanMarker)
colspanCounter -= 1
continue
if cell._tc.grid_span > 1:
colSpanDetected = True
colspanCounter = cell._tc.grid_span - 1
elif cell._tc.grid_span > 1:
colSpanDetected = True
cells.append(f'{getTextFromXML(cell)} ') # add at least a space
rows.append(cells)
nrRows += 1
# for r in rows:
# _print(r)
# Warning if this is a single-row table
if nrRows == 1:
......@@ -850,8 +876,6 @@ def processDocuments(documents:list[str],
line = line.replace(ch, f'<mark>Non-ASCII character {ch} / {hex(ord(ch))}</mark>')
lines[i] = line
#
# Remove multiple bold / italics on/off occurrences
# Sometimes Word doesn't remove empty bold-on/bold-off (or italics) indicators
......@@ -861,6 +885,9 @@ def processDocuments(documents:list[str],
line = lines[i]
line = line.replace('__', '')
line = line.replace('****', '')
line = line.replace('** ', '** ')
line = line.replace('_ ', '_ ')
line = line.replace('** **', ' ')
#line = line.replace(' ', ' ')
lines[i] = line
......@@ -966,9 +993,6 @@ def processDocuments(documents:list[str],
for fid, text in footnotes.items():
lines.append(f'[^{fid}]: {text}')
#
# List unresolved CAPTION markers
#
#
# List unresolved CAPTION markers
#
......@@ -976,11 +1000,10 @@ def processDocuments(documents:list[str],
if _captionMarker in line:
_print(f'[yellow]({linenumber(i)}) Unresolved / unreferenced figure caption: "{line}"[/yellow]')
#
# Correct formatting of Grid tables after all other changes have been applied
#
if not forceMarkdownTables:
gridTable:list[str] = []
result:list[str] = []
......@@ -1004,6 +1027,7 @@ def processDocuments(documents:list[str],
# not in grid table
result.append(line)
lines = result
#
......