diff --git a/gridTable.py b/gridTable.py index c859bef90f429af7d826474494e6fb399e30dfa2..35d7f5dec978f2eb99eead2f22dfaa852dd3abfa 100644 --- a/gridTable.py +++ b/gridTable.py @@ -9,6 +9,7 @@ import re colspanMarker = '~~COLSPAN~~' +rowspanMarker = '~~ROWSPAN~~' def markdownToGrid(markdownLines:list[str]) -> list[str]: """ Convert a markdown table to a grid table. @@ -34,7 +35,6 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]: for line in markdownLines ] - # Get maximum width for each column colWidths = [] maxCols = max(len(row) for row in rows) @@ -45,12 +45,11 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]: # Process merged cells - combine content with previous cell for row in rows: - for i in range(len(row)-1, 0, -1): # Work backwards to avoid index issues + for i in range(len(row)-1, -1, -1): # Work backwards to avoid index issues if row[i].strip() == colspanMarker: row[i-1] = row[i-1] + ' '*(colWidths[i-1] - len(row[i-1]))+ ' '*(colWidths[i]+3) # Merge with empty content # row[i] = None # type:ignore[call-overload] # Indicate removal - # Pad any rows that are too short for row in rows: while len(row) < maxCols: @@ -64,19 +63,41 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]: # Header row result.append('|' + '|'.join( - f' {rows[0][i]:<{colWidths[i]}} ' for i in range(len(rows[0])) if rows[0][i] is not None + f'{rows[0][i]:<{colWidths[i]}}' for i in range(len(rows[0])) if rows[0][i] is not None ) + '|') # Header separator result.append('+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+') # Data rows - for row in rows[2:]: + for rowIndex, row in enumerate(rows[2:]): + + # The following code detects if cells in the next row have rowspan marker(s) + # If so, it will merge the cells with the current one and remove the rowspan marker + # from that cell + nextRowCellsMerged:list[bool] = [] + + if rowIndex < len(rows)-3: + for cellIndex, cell in enumerate(rows[rowIndex+3]): + if cell.strip() == rowspanMarker: + nextRowCellsMerged.append(True) + rows[rowIndex+3][cellIndex] = cell.replace(rowspanMarker, ' '*len(rowspanMarker)) + else: + nextRowCellsMerged.append(False) + # nextRowCellsMerged = [ cell.strip() == rowspanMarker for cell in rows[rowIndex+3] ] + else: + nextRowCellsMerged = [ False for _ in rows[rowIndex+2] ] + result.append('|' + '|'.join( - f' {row[i]:<{colWidths[i]}} ' for i in range(len(row)) if row[i] is not None + f'{row[i]:<{colWidths[i]}}' + if row[i] != rowspanMarker else '' + for i in range(len(row)) + if row[i] is not None ) + '|') - result.append('+' + '+'.join('-' * (w + 2) for w in colWidths) + '+') - + + # Add separator line, if not merged + result.append('+' + '+'.join('-' * (w + 2) if not nextRowCellsMerged[cellIndex] else ' ' * (w + 2) + for cellIndex, w in enumerate(colWidths)) + '+') return result @@ -90,6 +111,49 @@ def formatGridTable(lines: list[str]) -> list[str]: Returns: Formatted grid table as list of strings """ + + def _getCellsFromRow(row:str) -> list[str]: + """Helper function to extract cells from a row. + + This is done by splitting the row string by the '|' character + and returning the cells as a list. The first and last elements + are ignored as they are empty strings. + + Args: + row: The row string to split. + + Returns: + A list of cells extracted from the row. + """ + return row.strip().split('|')[1:-1] + + + def _guessColumnWidth(columnID:int) -> int: + """Helper function to guess the width of a column. + + This is done by checking the content of the cells in the column + and returning the maximum width found. This value may not be + accurate if the column contains merged cells, but it is a good + approximation. + + Args: + columnID: The column ID to check. + + Returns: + The guessed width of the column.s + """ + width = 0 + for row in lines: + if row.startswith('|'): + rowCells = _getCellsFromRow(row) + if columnID < len(rowCells): + cellLines = rowCells[columnID].rstrip().split('\\\n') + for line in cellLines: + if line != colspanMarker: + width = max(width, len(line.rstrip())) + return width + + if not lines or len(lines) < 3: return lines @@ -101,55 +165,81 @@ def formatGridTable(lines: list[str]) -> list[str]: for row in lines: if row.startswith('|'): # Split cells and get their lengths - rowCells = row.strip().split('|')[1:-1] + rowCells = _getCellsFromRow(row) for i, cell in enumerate(rowCells): if i >= len(colWidths): continue # Calculate maximum width of each line in the cell. Lines could be multilines, so we need to split them. - cellLines = cell.strip().split('\\\n') - cellWidth = max(len(line.strip()) if line != colspanMarker else 0 + cellLines = cell.rstrip().split('\\\n') + requiredCellWidth = max(len(line.rstrip()) if line != colspanMarker else 0 for line in cellLines) - if cellWidth > colWidths[i]: - colWidths[i] = cellWidth + + if requiredCellWidth > colWidths[i]: + # Check if the next cell or cells are colspan markers + # If so, then sum the widths of the current and next cells and increase the width + # only if the required size is still bigger than the current one + # Check for colspan markers + overAllCellWidth = colWidths[i] + nextIdx = i + 1 + while nextIdx < len(rowCells) and rowCells[nextIdx].strip() == colspanMarker: + cw = colWidths[nextIdx] + if cw == 0: + cw = _guessColumnWidth(nextIdx) + overAllCellWidth += cw + nextIdx += 1 + if requiredCellWidth > overAllCellWidth: + # Increase the width of the current cell + colWidths[i] += requiredCellWidth-overAllCellWidth + # Process each line for line in lines: - if line.startswith('+-'): + # Normal separator line can either start with '+ ' or '+-' + if line.startswith('+-') or line.startswith('+ '): + # Get the kind of row separator for each column + _originalSeparator = [ l[0] for l in line.split('+')[1:-1] ] # Separator line - rebuild with correct column widths - result.append('+' + '+'.join('-' * (w + 2) for w in colWidths) + '+') + result.append('+' + '+'.join(_originalSeparator[colIndex] * (w) + for colIndex, w in enumerate(colWidths) + if colWidths[colIndex] > 0 ) + '+') continue elif line.startswith('+='): # Separator line - rebuild with correct column widths - result.append('+' + '+'.join('=' * (w + 2) for w in colWidths) + '+') + result.append('+' + '+'.join('=' * (w) + for colIndex, w in enumerate(colWidths) + if colWidths[colIndex] > 0 ) + '+') continue elif line.startswith('+:='): # Separator line - rebuild with correct column widths # ATTN: This is a special casse. It assumes that all columns are left-aligned. - result.append('+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+') + result.append('+:' + '+:'.join('=' * (w-1) + for colIndex, w in enumerate(colWidths) + if colWidths[colIndex] > 0 ) + '+') continue + elif line.startswith('|'): # Content line - cells = line.strip().split('|')[1:-1] + cells = line.rstrip().split('|')[1:-1] formattedCells = [] i = 0 while i < len(cells): - cell = cells[i].strip() - if cell == colspanMarker: + cell = cells[i].rstrip() + if cell.strip() == colspanMarker: # Skip merged cells - they were handled with previous cell i += 1 continue - + # Calculate width for potentially merged cells width = colWidths[i] nextIdx = i + 1 while nextIdx < len(cells) and cells[nextIdx].strip() == colspanMarker: - width += colWidths[nextIdx] + 3 # +3 for the cell borders + width += colWidths[nextIdx] + 1 nextIdx += 1 - + # Format the cell content - formattedCells.append(f' {cell:<{width}} ') - i += 1 + formattedCells.append(f'{cell:<{width}}') + i = nextIdx result.append('|' + '|'.join(formattedCells) + '|') @@ -204,10 +294,10 @@ def handleMultiLineGridTable(lines: list[str]) -> list[str]: else: # Use the part if available, otherwise empty string text = cellParts[line_idx] if line_idx < len(cellParts) else '' - newCells.append(text.strip()) - new_line = '|' + '|'.join(f' {cell} ' for cell in newCells) + '|' + newCells.append(text.rstrip()) + newLine = '|' + '|'.join(f'{cell}' for cell in newCells) + '|' # Store with original line index as key - rowLines[i] = rowLines.get(i, []) + [new_line] + rowLines[i] = rowLines.get(i, []) + [newLine] else: # No line breaks, keep original line rowLines[i] = [line] diff --git a/spec2md.py b/spec2md.py index a1bcac8b8269ab4affee22722106f4cde4369b40..c2f9d581ec51e7f5d2fcbe129a63be814e6572ee 100644 --- a/spec2md.py +++ b/spec2md.py @@ -28,7 +28,8 @@ from rich import inspect import configparser, zipfile from lxml import etree as ET -from gridTable import markdownToGrid, isGridTableStart, handleMultiLineGridTable, formatGridTable, colspanMarker +from gridTable import markdownToGrid, isGridTableStart, handleMultiLineGridTable, \ + formatGridTable, colspanMarker, rowspanMarker class Style(IntEnum): code = auto() @@ -84,6 +85,7 @@ _print:Callable = print # Some predefined tags and attributes wns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' +w14ns = 'http://schemas.microsoft.com/office/word/2010/wordml' _val = f'{{{wns}}}val' class SectionNumbers(object): @@ -352,12 +354,12 @@ def processDocuments(documents:list[str], 'footnoteRef', 'annotationRef', ) - + newParagraphs = 0 def _parseXML(element:ET.Element, inCell:Optional[bool] = False) -> str: """ Recursively parse a document paragraph. """ - nonlocal _ignoredTags + nonlocal _ignoredTags, newParagraphs _result = '' tag = strippedTag(element.tag) # remove namespaces for easier handlings @@ -394,7 +396,7 @@ def processDocuments(documents:list[str], case 'br': _result += _linebreak - + case 'bookmarkStart' | 'bookmarkEnd': # TODO ? pass @@ -497,14 +499,16 @@ def processDocuments(documents:list[str], # _print(ET.fromstring(elem._p.xml)) match elem: case Paragraph(): # type: ignore[misc] - return _parseXML(ET.fromstring(elem._p.xml)) + return _parseXML(ET.fromstring(elem._p.xml)).rstrip() case _Cell(): # type: ignore[misc] # Iterate over all paragraphs in the cell and parse them # Create a list of parsed paragraphs and join them with linebreaks - return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip() + return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip() for p in elem.paragraphs ]) case ET._Element(): - return _parseXML(elem) + # return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip() + # for p in elem.paragraphs ]) + return '<br />'.join([ _parseXML(elem).rstrip()]) case _: return '' @@ -769,23 +773,45 @@ def processDocuments(documents:list[str], nrRows = 0 colSpanDetected = False for row in elem.rows: + _row = ET.fromstring(row._tr.xml) cells:list[str] = [] - colspanCounter = 0 - for cell in row.cells: + for cell in _row.findall('.//w:tc', namespaces = { 'w' : wns }): + + colspanCounter = 1 # Default value if no gridspan is specified + gridspanElem = cell.find('.//w:tcPr/w:gridSpan', namespaces={'w': wns}) + if gridspanElem is not None and _val in gridspanElem.attrib: + colspanCounter = int(gridspanElem.attrib[_val]) + colSpanDetected = True # Set flag that colspan was found + + # Vertical merge + gridspanElem = cell.find('.//w:tcPr/w:vMerge', namespaces={'w': wns}) + if gridspanElem is not None and _val not in gridspanElem.attrib: + cells.append(rowspanMarker) + + else: + + # Extract text from cell + # Find all paragraphs in the cell + _pl:list[str] = [] + for p in cell.findall('.//w:p', namespaces={'w': wns}): + _pl.append(getTextFromXML(p)) + # Add the text to the cell + if len(_pl) > 0: + cells.append(_linebreak.join(_pl)) + else: + cells.append('') + + # Handle colspan formatting if not forceMarkdownTables: - if colspanCounter > 0: - cells.append(colspanMarker) # add at least a space + if colspanCounter >= 1: + for _ in range(colspanCounter-1): + cells.append(colspanMarker) colspanCounter -= 1 - continue - if cell._tc.grid_span > 1: - colSpanDetected = True - colspanCounter = cell._tc.grid_span - 1 - elif cell._tc.grid_span > 1: - colSpanDetected = True - cells.append(f'{getTextFromXML(cell)} ') # add at least a space rows.append(cells) nrRows += 1 - + + # for r in rows: + # _print(r) # Warning if this is a single-row table if nrRows == 1: @@ -850,8 +876,6 @@ def processDocuments(documents:list[str], line = line.replace(ch, f'<mark>Non-ASCII character {ch} / {hex(ord(ch))}</mark>') lines[i] = line - - # # Remove multiple bold / italics on/off occurances # Sometimes word doesn't remove empty bold-on/bold-off (or italics) indicatros @@ -861,6 +885,9 @@ def processDocuments(documents:list[str], line = lines[i] line = line.replace('__', '') line = line.replace('****', '') + line = line.replace('** ', '** ') + line = line.replace('_ ', '_ ') + line = line.replace('** **', ' ') #line = line.replace(' ', ' ') lines[i] = line @@ -966,9 +993,6 @@ def processDocuments(documents:list[str], for fid, text in footnotes.items(): lines.append(f'[^{fid}]: {text}') - # - # List unresolved CAPTION markers - # # # List unresolved CAPTION markers # @@ -976,11 +1000,10 @@ def processDocuments(documents:list[str], if _captionMarker in line: _print(f'[yellow]({linenumber(i)}) Unresolved / unreferenced figure caption: "{line}"[/yellow]') - + # # Correct formatting of Grid tables after all other changes have been applied # - if not forceMarkdownTables: gridTable:list[str] = [] result:list[str] = [] @@ -1004,6 +1027,7 @@ def processDocuments(documents:list[str], # not in grid table result.append(line) lines = result + #