diff --git a/README.md b/README.md index df9a76c519ee2c5cf2bc72fb5f8b594436c0886a..1c5755bcc3cd5f03f06939955768e3ad00bb25b9 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,32 @@ python3 -m pip install -r requirements.txt python3 spec2md.py <path-to-word-document> ``` +### Command Line Options + +``` +usage: spec2md.py [-h] [--outdir <output directory>] [--skip-image-conversion] [--force-markdown-tables] + document [document ...] + +positional arguments: + document documents to parse + +options: + -h, --help show this help message and exit + --outdir <output directory>, -o <output directory> + specify output directory (default: out) + --skip-image-conversion, -sic + skip image conversion step (default: False) + --force-markdown-tables, -mdt + Force markdown instead of grid format for tables with colspans (default: False) + +``` + +- `--outdir` or `-o` specifies the output directory. The default is `out`. +- `--skip-image-conversion` or `-sic` skips the image conversion step. The default is to convert images, but this may not be necessary if the images have already been converted. +- `--force-markdown-tables` or `-mdt` forces the converter to generate markdown tables instead of grid tables. The default is to generate grid tables for tables with colspans. This option is useful to generate a first version of the table that can be manually adjusted later. + + + ## FAQ ### The converter doesn't seem to generate image files. @@ -60,9 +86,16 @@ Lists in table cells are also not possible. One may use html lists for this, but ``` +### How to convert a table with colspans? + +The converter will try to convert tables with colspans to grid tables. If the `--force-markdown-tables` option is used, then the table will be converted to a normal markdown table. If the table has colspans, then the cells will just be repeated to fill a table row. + +This may not be the desired result, but markdown doesn't support colspans. A solution is to use grid tables instead. 
#
#   gridTable.py
#
#   Grid Table support functions for markdown conversion.
#
#   (c) 2025 by Andreas Kraft
#   License: BSD 3-Clause License. See the LICENSE file for further details.
#
import re

# Placeholder cell text: a cell holding this marker is a continuation of the
# cell to its left (i.e. the left cell spans over this column).
colspanMarker = '~~COLSPAN~~'


def _splitRow(line: str) -> list[str]:
    """ Split a pipe-delimited table row into its raw (unstripped) cells.

        Exactly one leading and one trailing pipe are removed, so empty cells
        at either edge of the row are preserved.

        Args:
            line: The table row to split.

        Return:
            The list of raw cell texts.
    """
    line = line.strip()
    if line.startswith('|'):
        line = line[1:]
    if line.endswith('|'):
        line = line[:-1]
    return line.split('|')


def _border(colWidths: list[int], fill: str = '-') -> str:
    """ Build a plain grid table border line for the given column widths. """
    # Every column is drawn two characters wider than its content (cell padding)
    return '+' + '+'.join(fill * (w + 2) for w in colWidths) + '+'


def _headerBorder(colWidths: list[int]) -> str:
    """ Build the header separator line. All columns are marked as left-aligned. """
    return '+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+'


def markdownToGrid(markdownLines: list[str]) -> list[str]:
    """ Convert a markdown (pipe) table to a grid table.

        The first line is the header row, the second line the markdown
        separator row, and all following lines are data rows.

        Cells containing the colspan marker are kept in place, in their own
        column. They are resolved later by `formatGridTable`, after
        `handleMultiLineGridTable` had the chance to split multi-line cells
        while the cell index still equals the column index.

        Args:
            markdownLines: The markdown lines to convert.

        Return:
            The converted grid table. Input with fewer than three lines is
            returned unchanged.
    """

    # Check if there are enough lines to create a table
    if not markdownLines or len(markdownLines) < 3:
        return markdownLines

    # Normalize all variants of <br> to <br />
    markdownLines = [re.sub(r'<br\s*/?>', '<br />', line) for line in markdownLines]

    # Split each line into cells and clean whitespace
    rows = [[cell.strip() for cell in _splitRow(line)] for line in markdownLines]

    # Pad any rows that are too short
    maxCols = max(len(row) for row in rows)
    for row in rows:
        row.extend([''] * (maxCols - len(row)))

    # Get maximum width for each column
    colWidths = [max(len(row[col]) for row in rows) for col in range(maxCols)]

    def formatRow(row: list[str]) -> str:
        return '|' + '|'.join(f' {cell:<{width}} ' for cell, width in zip(row, colWidths)) + '|'

    # Top border, header row, header separator
    border = _border(colWidths)
    result = [border, formatRow(rows[0]), _headerBorder(colWidths)]

    # Data rows. rows[1] is the markdown separator row and is skipped.
    for row in rows[2:]:
        result.append(formatRow(row))
        result.append(border)

    return result


def formatGridTable(lines: list[str]) -> list[str]:
    """ Format a grid table by adjusting column widths and alignments.

        Supports merged cells: a cell followed by one or more colspan marker
        cells is rendered as a single cell spanning all of these columns.

        Args:
            lines: List of strings containing a grid table.

        Return:
            Formatted grid table as list of strings. Input with fewer than
            three lines is returned unchanged.
    """
    if not lines or len(lines) < 3:
        return lines

    # Initial content widths from the first border line. A border segment is
    # two characters wider than the cell content, so remove that padding again.
    # Otherwise every formatting pass would widen each column by two.
    colWidths: list[int] = []
    if lines[0].startswith('+'):
        colWidths = [max(len(segment.strip()) - 2, 0) for segment in lines[0].split('+')[1:-1]]

    # Grow the columns to fit the widest cell. Rows with more cells than the
    # border declares add further columns instead of failing later.
    for line in lines:
        if not line.startswith('|'):
            continue
        for i, cell in enumerate(_splitRow(line)):
            cellWidth = len(cell.strip())
            if i >= len(colWidths):
                colWidths.append(cellWidth)
            elif cellWidth > colWidths[i]:
                colWidths[i] = cellWidth

    result: list[str] = []
    for line in lines:
        if line.startswith('+-'):
            # Row separator - rebuild with correct column widths
            result.append(_border(colWidths))

        elif line.startswith('+='):
            # Header separator without alignment - rebuild with correct column widths
            result.append(_border(colWidths, '='))

        elif line.startswith('+:='):
            # ATTN: This is a special case. It assumes that all columns are left-aligned.
            result.append(_headerBorder(colWidths))

        elif line.startswith('|'):
            # Content line
            cells = [cell.strip() for cell in _splitRow(line)]
            # Fill up short rows, so that every row spans the full table width
            cells.extend([''] * (len(colWidths) - len(cells)))

            formattedCells = []
            for i, cell in enumerate(cells):
                if cell == colspanMarker:
                    # Merged cells were already accounted for by the cell to the left
                    continue

                # Calculate width for potentially merged cells
                width = colWidths[i]
                nextIdx = i + 1
                while nextIdx < len(cells) and cells[nextIdx] == colspanMarker:
                    width += colWidths[nextIdx] + 3  # +3 for the swallowed ' | '
                    nextIdx += 1

                formattedCells.append(f' {cell:<{width}} ')

            result.append('|' + '|'.join(formattedCells) + '|')

        else:
            # Unknown kind of line (e.g. another alignment border): keep it
            # unchanged instead of silently dropping it.
            result.append(line)

    return result


def handleMultiLineGridTable(lines: list[str]) -> list[str]:
    """ Handle multiline cells in a grid table by splitting cells with <br /> markers.

        A content line that contains at least one '<br />' is replaced by as
        many content lines as the cell with the most parts has. Every part
        but the last one of a cell gets a trailing backslash. Colspan marker
        cells are repeated on every generated line. All other lines are kept
        as they are.

        Args:
            lines: List of strings containing a grid table.

        Return:
            List of strings with multiline cells split into separate lines.
    """
    result: list[str] = []

    for line in lines:
        # Border lines are kept as is
        if not line.startswith('|'):
            result.append(line)
            continue

        # Split every cell at the line breaks. None represents a colspan marker cell.
        splitCells: list[list[str] | None] = []
        for cell in _splitRow(line):
            if cell.strip() == colspanMarker:
                splitCells.append(None)
                continue
            parts = cell.split('<br />')
            # Add "\" to each part except the last
            splitCells.append([part + '\\' for part in parts[:-1]] + parts[-1:])

        maxLines = max((len(parts) for parts in splitCells if parts is not None), default=1)

        # No line breaks, keep original line
        if maxLines == 1:
            result.append(line)
            continue

        # Create one content line per part
        for lineIdx in range(maxLines):
            newCells = []
            for parts in splitCells:
                if parts is None:
                    # For colspan cells, always use the marker
                    newCells.append(colspanMarker)
                elif lineIdx < len(parts):
                    newCells.append(parts[lineIdx].strip())
                else:
                    # This cell has fewer parts than the longest one
                    newCells.append('')
            result.append('|' + '|'.join(f' {cell} ' for cell in newCells) + '|')

    return result


def isGridTableStart(line: str) -> bool:
    """ Check if a line marks the start of a grid table.

        This is the case for a border line that starts and ends with '+',
        and that contains '-' but no '='.

        Args:
            line: The line to check.

        Return:
            True if this is a table start line, False otherwise.
    """
    return line.startswith('+') and line.endswith('+') and '-' in line and '=' not in line


#
#   Console output helpers
#

def richString(text: str) -> str:
    """ Return a rich string for the console output.

        Every opening square bracket is escaped with a backslash.

        Args:
            text: The text to convert to a rich string.

        Return:
            The converted text.
    """
    return text.replace('[', '\\[')


def linenumber(idx: int) -> str:
    """ Return the formatted line number.

        Args:
            idx: The 0-based index to get the line number for.

        Return:
            The 1-based line number with leading zeros.
    """
    return f'{idx + 1:05}'  # currently 5 digits
this is a single-row table if nrRows == 1: - _print(f'[red]Single-row table found. Such tables cannot be converted to markdown.[/red] Please consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False) + _print(f'[red]({linenumber(len(lines)+2)}) Single-row table found. Such tables cannot be converted to markdown.[/red]Consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False) - lines.append('') # Add an empty line before a table + # Warning if a table with colspans is detected + if colSpanDetected: + if forceMarkdownTables: + _print(f'[yellow]({linenumber(len(lines)+2)}) Table with colspans found: [/yellow][grey39]{richString(lastTableCaption)}[/grey39]\nConsider to convert it manually to a grid table', highlight = False) + + tableLines:list[str] = [] + + errorDetected:bool = False for idx, row in enumerate(rows): # Check for a table caption and add separator line if idx == 1: - lines.append('-'.join('|' * (len(row) + 1) )) + tableLines.append('-'.join('|' * (len(row) + 1) )) + + # # Check if the number of columns is the same as the previous row and add cells if smaller + + if idx > 0 and len(row) != len(rows[idx-1]): + _print(f'[red]({linenumber(len(lines))}) Number of columns in table row {idx} does not match the previous row.[/red]\nTable may need extra attention', highlight = False) + errorDetected = True # Add table row - lines.append(f'|{"|".join(row)}|' + tableLines.append(f'|{"|".join(row)}|' .replace('\n', _linebreak)) # replace line breaks in cells + + # if colSpanDetected and gridTableForColspan then convert to grid table + if colSpanDetected and not forceMarkdownTables and not errorDetected: + lines.append('') # Add an empty line before a table + lines.append('<mark>Table with colspans converted to grid table. 
Please check and adjust manually if necessary.</mark>') + tableLines = markdownToGrid(tableLines) + + lines.append('') # Add an empty line before a table + if errorDetected: + lines.append('<mark>The table below caused an error during conversion and may need extra attention</mark>') + lines.append('') # Add an empty line before a table + lines.extend(tableLines) lines.append('') # Add another empty line after a table case _: - _print('[blue] {type(elem).__name__}') + _print(f'[blue]({linenumber(len(lines))}) {type(elem).__name__}') # # Replace non-ascii characters @@ -772,7 +839,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: line = line.replace(ch, rch) # we need the line for further replacements lines[i] = line else: - _print(f'[yellow]Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}') + _print(f'[yellow]({linenumber(i)}) Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}') # @@ -892,10 +959,42 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: # # List unresolved CAPTION markers # - for i in range(len(lines)): - line = lines[i] + # + # List unresolved CAPTION markers + # + for i, line in enumerate(lines): if _captionMarker in line: - _print(f'[yellow]Unresolved / unreferenced figure caption : \[{i}] "{line}"') + _print(f'[yellow]({linenumber(i)}) Unresolved / unreferenced figure caption: "{line}"[/yellow]') + + + # + # Correct formatting of Grid tables after all other changes have been applied + # + + if not forceMarkdownTables: + gridTable:list[str] = [] + result:list[str] = [] + for i, line in enumerate(lines): + + # Check for grid table start + if isGridTableStart(line) and not gridTable: + gridTable = [ line ] + continue + # Are we in a grid table? + if gridTable: + # Is the current line still part of the grid table? 
+ if line.startswith(('|', '+')): + gridTable.append(line) + continue + # grid table finished. Assign and clear + gridTable = handleMultiLineGridTable(gridTable) + result.extend(formatGridTable(gridTable)) + gridTable = [] + continue + # not in grid table + result.append(line) + lines = result + # # Write produced Markdown file @@ -954,6 +1053,7 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default = 'out', metavar = '<output directory>', help = 'specify output directory') parser.add_argument('--skip-image-conversion', '-sic', action='store_true', dest='skipImageConversion', help = 'skip image conversion step') + parser.add_argument('--force-markdown-tables', '-mdt', action='store_true', dest='forceMarkdownTables', help = 'Force markdown instead of grid format for tables with colspans') parser.add_argument('document', nargs = '+', help = 'documents to parse') args = parser.parse_args() @@ -961,5 +1061,8 @@ if __name__ == '__main__': # Process documents and print output os.makedirs(args.outDirectory, exist_ok = True) - processDocuments(sorted(args.document), args.outDirectory, args.skipImageConversion) + processDocuments(sorted(args.document), + args.outDirectory, + args.skipImageConversion, + args.forceMarkdownTables)