From 9d0a1d239c6c21b5d69c29eeb98c09ccb54531f0 Mon Sep 17 00:00:00 2001 From: ankraft <an.kraft@gmail.com> Date: Thu, 20 Feb 2025 12:48:33 +0100 Subject: [PATCH 01/29] Moved functions in different modules. Some optimizations --- LICENSE | 2 +- toMkdocs/gridTableTools.py | 503 +++++++++++++++++ toMkdocs/makrdownTools.py | 494 ++++++++++++++++ toMkdocs/regexMatches.py | 40 ++ toMkdocs/toMkdocs.py | 1095 ++++-------------------------------- 5 files changed, 1132 insertions(+), 1002 deletions(-) create mode 100644 toMkdocs/gridTableTools.py create mode 100644 toMkdocs/makrdownTools.py create mode 100644 toMkdocs/regexMatches.py diff --git a/LICENSE b/LICENSE index 642df8a..11dd0df 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ BSD 3-Clause License -Copyright (c) 2024, Miguel Angel Reina Ortega +Copyright (c) 2024, Miguel Angel Reina Ortega & Andreas Kraft Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py new file mode 100644 index 0000000..170c3a5 --- /dev/null +++ b/toMkdocs/gridTableTools.py @@ -0,0 +1,503 @@ +# +# gridTableTools.py +# +# (c) 2025 by Miguel Angel Reina Ortega & Andreas Kraft +# License: BSD 3-Clause License. See the LICENSE file for further details. +# +""" Tools for working with grid tables in markdown files. """ + +from typing import Optional +from regexMatches import * + + + +class GridCell: + """ Represents a grid table cell. """ + + def __init__(self) -> None: + """ Initialize a new grid table cell. 
+ """ + self.content:Optional[str] = None + self.rowspan:int = 0 + self.colspan:int = 0 + self.colspanAdjusted:bool = False + self.alignment:str = 'align="center"' + self.position:Optional[int] = None + self.listFlag:bool = False + self.auxiliarIndex:int = 0 + + + def calculateAndSetAlignment(self, headerDelimiterPositions:list[int], defaultAlignments:list[str]) -> None: + """ Set the alignment of the cell based on the position of the delimiter. + """ + if self.position is None: + raise ValueError('Cell position must be set before calculating alignment.') + + headerDelimiterIndex = 0 + while headerDelimiterIndex < len(defaultAlignments) and self.position > headerDelimiterPositions[headerDelimiterIndex]: + headerDelimiterIndex += 1 + if headerDelimiterIndex < len(defaultAlignments): + if self.position < headerDelimiterPositions[headerDelimiterIndex]: + self.alignment = defaultAlignments[headerDelimiterIndex] + elif self.position == headerDelimiterPositions[headerDelimiterIndex]: + self.alignment = defaultAlignments[headerDelimiterIndex] + headerDelimiterIndex += 1 + else: + raise ValueError('Invalid table formatting') + + + def __str__(self): + return f'(Content: {self.content}, Rowspan: {self.rowspan}, Colspan: {self.colspan}, Alignment: {self.alignment}, Position: {self.position}, ListFlag: {self.listFlag}, AuxiliarIndex: {self.auxiliarIndex})' + + + def __repr__(self): + return self.__str__() + + +class GridRow(): + """ Represents a row in a grid table. """ + cells:list[GridCell] = [] + + + def __init__(self, length: int = 1) -> None: + self.cells = [GridCell() for _ in range(length)] + + + def __getitem__(self, item): + return self.cells[item] + + + def __setitem__(self, key, value): + self.cells[key] = value + + + def __str__(self): + return str(self.cells) + + + def __repr__(self): + return self.__str__() + + +class GridRowsTracker(): + """ Represents the document object. 
""" + def __init__(self, size:int) -> None: + self.gridRowTracker = [0 for _ in range(size)] + + + def __getitem__(self, item:int) -> int: + return self.gridRowTracker[item] + + + def __setitem__(self, key:int, value:int) -> None: + self.gridRowTracker[key] = value + + + def __str__(self): + return str(self.gridRowTracker) + + + def __repr__(self): + return self.__str__() + + +# Some type aliases +GridTableRow = list[GridCell] +GridTableRowList = list[GridTableRow] + +def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableRowList]: + """ + Parse a Pandoc-style grid table into a structure for HTML conversion with rowspan and colspan. + + :param pandoc_table: String of the Pandoc-style grid table. + :return: List of lists representing the table with metadata for spans. + """ + + # Split the input into lines + lines:list[str] = [line.strip() for line in gridTable.strip().split('\n')] + + + # Detect separator lines by pattern (it does not take into account partial separators + def isSeparator(line:str) -> bool: + return matchGridTableSeparator.match(line) is not None + + + # Set content on the cell - concatenating multilines, flagging lists + def handleCellContent(cell:GridCell, content:str) -> None: + _c = content.strip() + + if cell.content is None: # Previous empty cell + cell.rowspan += 1 + cell.colspan += 1 + if _c.startswith('- '): # List in a cell + cell.listFlag = True + cell.content = _c + '\n' # Add newline to know when the list element ends + + elif cell.listFlag and len(_c) > 0: # any other content when handling list is concatenated to the last list element + cell.content = _c + '\n' + + elif not _c: # separation between list and other paragraph + cell.listFlag = False + cell.content = '\n' #if not cell['content'].endswith("\n") else "" + else: + cell.content = re.sub(r'\\\s*$', '\n', _c) + else: # Cell has content + if _c.startswith('- '): # List + if not cell.listFlag: + cell.content += '\n' + #cell['content'] = 
cell['content'].strip("\n") + cell.listFlag = True + cell.content += _c + '\n' # Add newline to know when the list element ends + elif cell.listFlag and _c: # any other content when handling list is concatenated to the last list element + cell.content = cell.content.strip('\n') + ' ' + _c + '\n' + elif len(_c) == 0: # separation between list and other paragraph + cell.listFlag = False + #content = re.sub(r'\\\s*$', "\n", content.strip()) + cell.content += '\n' if not cell.content.endswith('\n') else '' + else: + cell.content += ' ' + re.sub(r'\\\s*$', '\n', _c) + + # Adjust colspan of a cell + def adjustColspan(row:GridRow, columnIndex:int, numberOfParts:int, line, numberOfColumns:int, delimiterPositions:list[int]) -> None: + for j in range(columnIndex, numberOfParts): + delimiterStart:Optional[int] = None + colI = columnIndex + while delimiterStart == None: + delimiterStart = row[colI - 1].position if colI > 0 else 0 + colI -= 1 + positions = [line.find(delimiter, delimiterStart + 1) for delimiter in "|+" if delimiter in line[delimiterStart + 1:]] + position = min(positions) if positions else -1 + if position > delimiterPositions[j]: # Colspan to be increased + row[columnIndex].colspan += 1 + if position == delimiterPositions[len(delimiterPositions) - 1]: # last cell in row, adjust colspan to get max number columns + colspan_allocated = row[columnIndex].colspan + #for cell_index in range(number_of_parts): + # colspan_allocated += row[cell_index].colspan + row[columnIndex].colspan += numberOfColumns - colspan_allocated - columnIndex + elif position < delimiterPositions[j]: + raise ValueError("Wrong cell formatting") + else: + break + + row[columnIndex].colspanAdjusted = True # Mark cell as adjusted + + + separatorIndices = [i for i, line in enumerate(lines) if isSeparator(line)] + + if not separatorIndices: + raise ValueError('No valid separators found in the provided grid table.') + + # Calculate max number of columns + delimiterPositions:list[int] = [] + 
numberOfColumns = 0 + + for separatorIndex in separatorIndices: + if (_cnt := lines[separatorIndex].count('+') - 1) > numberOfColumns: + numberOfColumns = _cnt + delimiterPositions = [] + for rowIndex in range(numberOfColumns): + delimiterPositionsStart = delimiterPositions[rowIndex - 1] if rowIndex != 0 else 0 + delPositions = [lines[separatorIndex].find(delimiter, delimiterPositionsStart + 1) for delimiter in '+' if delimiter in lines[separatorIndex][delimiterPositionsStart + 1:]] + delimiterPositions.append(min(delPositions) if delPositions else -1) + + + # Determine delimiter positions and alignments + hasHeader = False + headerDelimiterPositions:list[int] = [] + headerRows:GridTableRowList = [] + dataRows:GridTableRowList = [] + defaultAlignments:list[str] = [] + + for index in separatorIndices: + if matchGridTableHeaderSeparator.match(lines[index]): + hasHeader = True + headerSeparatorIndex = index + parts = re.split(r'\+', lines[index].strip('+')) + #Calculate default alignments and positions of delimiters + for partIndex in range(len(parts)): + if parts[partIndex].startswith(':') and not parts[partIndex].endswith(':'): # Left alignment + defaultAlignments.append('align="left"') + elif not parts[partIndex].startswith(":") and parts[partIndex].endswith(":"): # Right alignment + defaultAlignments.append('align="right"') + else: + defaultAlignments.append('align="center"') # Center alignment + # Delimiter position + delimiterPositionsStart = delimiterPositions[partIndex - 1] if partIndex != 0 else 0 + delPositions = [lines[index].find(delimiter, delimiterPositionsStart + 1) for delimiter in '+' if delimiter in lines[index][delimiterPositionsStart + 1:]] + headerDelimiterPositions.append(min(delPositions) if delPositions else -1) + + + for rowNumber in range(len(separatorIndices) - 1): + rows:list[GridRow] = [] + rowsTracker:GridRowsTracker + inDataRow = False + start, end = separatorIndices[rowNumber], separatorIndices[rowNumber + 1] + rowLines = 
lines[start:end] # Lines between separators including separator line start as it gives information about the number of columns of the row + if rowLines: + # Combine multiline content into single strings for each cell + for line in rowLines: + if isSeparator(line) and not inDataRow: + inDataRow = True + parts = re.split(r'\s*\+\s*', line.strip('+')) + delimiterIndex = 0 + + rows.append(GridRow(numberOfColumns)) + rowsTracker = GridRowsTracker(numberOfColumns) + columnIndex = 0 + + for rowIndex in range(len(parts)): + if columnIndex in range(numberOfColumns): + delimiterIndex += len(parts[rowIndex]) + 1 + cell = rows[-1][columnIndex] + + # Set position + cell.position = delimiterIndex # Position of cell delimiter + + + # Set alignment as defined by header separator line + cell.calculateAndSetAlignment(headerDelimiterPositions, defaultAlignments) + + while delimiterIndex > delimiterPositions[columnIndex]: + columnIndex += 1 + columnIndex += 1 + + elif inDataRow: + # Regular data row or partial separator + if matchGridTableBodySeparator.match(line): # Partial separator + cellsContent = re.split(r"[\|\+]", line.strip("|").strip("+")) # (?<!\\)[\|\+] + #Add another row, set delimiters for each cell + rows.append(GridRow(numberOfColumns)) + auxDelimiterIndex = 0 + auxiliarCellIndex = 0 + + for columnIndex, content in enumerate(cellsContent): + if auxiliarCellIndex in range(numberOfColumns): + auxDelimiterIndex += len(content) + 1 + cell = rows[-1][auxiliarCellIndex] + cell.position = auxDelimiterIndex # Position of cell delimiter + + cell.calculateAndSetAlignment(headerDelimiterPositions, defaultAlignments) + while auxDelimiterIndex > delimiterPositions[auxiliarCellIndex]: + auxiliarCellIndex += 1 + auxiliarCellIndex += 1 + + if len(cellsContent) <= numberOfColumns: # Colspan: Positions of | with respect to + need to be determined + columnCellIndex = 0 + + # Go through all cells in a column + for columnIndex, content in enumerate(cellsContent): + rowIndex = 
rowsTracker[columnCellIndex] + cell = rows[rowIndex][columnCellIndex] + + # Check whether a cell contains a header separator + if matchGridTableBodySeparatorLine.match(content): # A new row is to be added + rowsTracker[columnCellIndex] += 1 # That actual row will have more than one row + cell.listFlag = False + columnForward = 0 + + for delIndex in range(columnCellIndex, len(delimiterPositions)): + rowIndex = rowsTracker[columnCellIndex] # Correcting the rowIndex. Might have been changed by a previous iteration + if rows[rowIndex][columnCellIndex].position >= delimiterPositions[delIndex]: + columnForward += 1 + rowsTracker[columnCellIndex + columnForward - 1] += 1 if columnForward > 1 else 0 + columnCellIndex += columnForward + + continue + + else: + # Handle content of the cell + handleCellContent(cell, cellsContent[columnIndex]) + cell.rowspan += 1 + if not cell.colspanAdjusted: + # TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator + adjustColspan(rows[rowIndex], columnCellIndex, numberOfColumns, line, numberOfColumns, delimiterPositions) + + if cell.position >= delimiterPositions[columnCellIndex]: + columnCellIndex += cell.colspan if cell.colspan != 0 else 1 + continue + + else: + raise ValueError("More cells than columns found") + + else: # Data row + cellsContent = re.split(r'\s*\|\s*', line.strip('|')) + columnCellIndex = 0 + if len(cellsContent) < numberOfColumns: # Colspan: Positions of | with respect to + need to be determined + for columnIndex, content in enumerate(cellsContent): + row = rows[rowsTracker[columnCellIndex]] + cell = row[columnCellIndex] + # Handle content of the cell + handleCellContent(cell, content) + if not cell.colspanAdjusted: + #TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator + adjustColspan(row, columnCellIndex, numberOfColumns, line, numberOfColumns, 
delimiterPositions) + if cell.position >= delimiterPositions[columnCellIndex]: + columnCellIndex += cell.colspan # Move forward index i + + elif len(cellsContent) == numberOfColumns: # Simple row + for columnIndex, content in enumerate(cellsContent): + rowIndex = rowsTracker[columnIndex] + handleCellContent(rows[rowIndex][columnIndex], content) + else: + raise ValueError("More cells than columns found") + else: + raise ValueError("No separator line found for row starting") + + if hasHeader and start >= headerSeparatorIndex: # table_row and auxiliar_row are part of data_rows + for row in rows: + dataRows.append(row.cells) + elif hasHeader and start < headerSeparatorIndex: # table_row and auxiliar_row are part of header_rows + for row in rows: # header rows + headerRows.append(row.cells) + + # Check if there are any data rows + if not dataRows and not headerRows: + raise ValueError('No valid rows found in the provided grid table.') + + # Format text + for gridRows in [headerRows, dataRows]: + for gridRow in gridRows: + for cell in gridRow: + if cell.content is not None: + # Replacing "<" by < + cell.content = cell.content.replace("<", "<") + + # Bold replacements + # Regex to detect markdown bold formatting in cell content + if cell.content is not None: + cell.content = matchBold.sub(r'<strong>\g<text></strong>', cell.content) + + # Italic replacements + # Regex to detect markdown italic formatting in cell content + if cell.content is not None: + cell.content = matchItalic.sub(r'<i>\g<text></i>', cell.content) + + + # Correct newlines characters + for headerRow in headerRows: + for cell in headerRow: + cell.content = cell.content.replace('\n', '<br />') if cell.content is not None else None + for dataRow in dataRows: + for cell in dataRow: + cell.content = cell.content.replace('\n', '<br />') if cell.content is not None else None + + # + # Checking that the grid is correct Not too much tested - need to take into account rowspan of previous rows + # + + # Checking the 
header rows + forwardRowspan:list[int] = [] + for idx, headerRow in enumerate(headerRows): + if len(forwardRowspan) == 0: + forwardRowspan = [0] * len(headerRows[idx]) + sum = 0 + + for cellIndex, cell in enumerate(headerRow): + sum += cell.colspan + if idx > 0 and cell.colspan == 0: + if forwardRowspan[cellIndex] > 0: + sum += 1 + forwardRowspan[cellIndex] -= 1 + if forwardRowspan[cellIndex] == 0 and cell.rowspan > 1: + forwardRowspan[cellIndex] = cell.rowspan -1 + + if not sum == numberOfColumns: + raise ValueError('Grid table not converted properly') + + # Checking the data rows + forwardRowspan = [] + for idx, dataRow in enumerate(dataRows): + if len(forwardRowspan) == 0: + forwardRowspan = [0] * len(dataRows[idx]) + sum = 0 + + for cellIndex, cell in enumerate(dataRows[idx]): + sum += cell.colspan + if idx > 0 and cell.colspan == 0: + if forwardRowspan[cellIndex] > 0: + sum += 1 + forwardRowspan[cellIndex] -= 1 + if forwardRowspan[cellIndex] == 0 and cell.rowspan > 1: + forwardRowspan[cellIndex] = cell.rowspan - 1 + if not sum == numberOfColumns: + raise ValueError('Grid table not converted properly') + + return headerRows, dataRows + + +def generateHtmlTableWithSpans(gridTable:str) -> str: + """ Generate an HTML table from a Pandoc-style grid table with row and column spans. + + Args: + gridTable: The Pandoc-style grid table. + + Returns: + The HTML table in string format. + """ + try: + gridHeader, gridBody = parseGridTableWithSpans(gridTable) + except Exception as e: + import traceback + traceback.print_exc() + return f'HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS. 
{e}' + + html = '<table>\n' + hasHeader = False + + for row in gridHeader: + for cell in row: + if cell.rowspan != 0 and cell.colspan != 0: + hasHeader = True + break + + if hasHeader: + html += ' <thead>\n' + for row in gridHeader: + html += " <tr>\n" + for cell in row: + if cell.rowspan == 0 or cell.colspan == 0: + continue + else: + # Prepare content, in case there's a list + if cell.content is not None and (matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell.content)): # Update cell in new row + list = "<ul>" + # Build list the matches + for match in matches: + list += "<li>" + match[1] + "</li>" + list += "</ul>" + cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell.content) + # Enforce left alignment if cell contains a list + cell.alignment = "align=\"left\"" + + rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else "" + colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else "" + html += f" <th{rowspan}{colspan} {cell.alignment}>{cell.content}</th>\n" + html += " </tr>\n" + html += " </thead>\n" + + + html += " <tbody>\n" + for row in gridBody: + html += " <tr>\n" + for cell in row: + if cell.rowspan == 0 or cell.colspan == 0: + continue + else: + #Prepare content, in case there's a list + if cell.content is not None and (matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell.content)): # Update cell in new row + list = "<ul>" + # Build list the matches + for match in matches: + list += "<li>" + match[1] + "</li>" + list += "</ul>" + cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell.content) + # Enforce left alignment if cell contains a list + cell.alignment = "align=\"left\"" + + rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else "" + colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else "" + html += f" <td{rowspan}{colspan} {cell.alignment}>{cell.content}</td>\n" + html += " </tr>\n" + + html += " </tbody>\n" + html += "</table>" + 
return html + diff --git a/toMkdocs/makrdownTools.py b/toMkdocs/makrdownTools.py new file mode 100644 index 0000000..169b634 --- /dev/null +++ b/toMkdocs/makrdownTools.py @@ -0,0 +1,494 @@ +# +# markdownTools.py +# +# (c) 2025 by Andreas Kraft & Miguel Angel Reina Ortega +# License: BSD 3-Clause License. See the LICENSE file for further details. + + + +""" Various tools for markdown processing +""" +from __future__ import annotations + +from dataclasses import dataclass +import base64, hashlib +from enum import Enum, auto + +from gridTableTools import generateHtmlTableWithSpans +from regexMatches import * + +# TODO use a verbosity level instead +verbose = False +veryVerbose = False + +printInfo = print +printDebug = print + +def setScreenPrinters(info:callable = print, debug:callable = print) -> None: + global printInfo, printDebug + + printInfo = info + printDebug = debug + + +def _shortHash(value:str, length:int) -> str: + """ Generate a short hash of a string value. + + Args: + value: The value to hash. + length: The length of the hash. + + Returns: + The hash. + """ + return base64.b64encode( + hashlib.sha256( + value.encode() + ).digest() + ).decode()[:length] + + +class LineType(Enum): + """ Represents the type of a line in the markdown file. """ + HEADING = auto() + TEXT = auto() + CODEFENCESTART = auto() + CODE = auto() + CODEFENCEEND = auto() + LIST = auto() + NOTE = auto() + STANDALONEIMAGE = auto() + TABLEHEADER = auto() + TABLESEPARATOR = auto() + TABLEROW = auto() + TABLELASTROW = auto() + + +@dataclass +class Line: + """ Represents a line in the markdown file. """ + text:str = '\n' + lineType:LineType = LineType.TEXT + + +@dataclass +class Clause: + """ Represents a clause in the markdown file. """ + _level:int + _clauseNumber:str + _title:str + _lines:list[Line] + + + @property + def level(self) -> int: + """ Return the level of the clause. """ + return self._level + + + @property + def clauseNumber(self) -> str: + """ Return the clause number. 
""" + return self._clauseNumber if self._clauseNumber else '0' + + + @clauseNumber.setter + def clauseNumber(self, value:str) -> None: + """ Set the clause number. """ + self._clauseNumber = value + + + @property + def title(self) -> str: + """ Return the title of the clause. """ + return self._title + + + @title.setter + def title(self, value:str) -> None: + """ Set the title of the clause. """ + self._title = value + + + @property + def lines(self) -> list[Line]: + """ Return the lines of the clause. """ + return self._lines + + + @lines.setter + def lines(self, value:list[Line]) -> None: + """ Set the lines of the clause. """ + self._lines = value + + + @property + def linesCount(self) -> int: + """ Return the number of lines in the clause. + + Returns: + The number of lines in the clause. + """ + return len(self.lines) + + + def append(self, line:Line) -> None: + """ Append a line to the clause. + + Args: + line: The line to append. + """ + self.lines.append(line) + + + def extend(self, clause:Clause) -> None: + """ Extend the clause with the lines of another clause. + + Args: + clause: The clause to extend with. + """ + self.lines.extend(clause.lines) + + + def asStringList(self, paddings:int = 0) -> list[str]: + """ Return the clause as a list of strings. + + Args: + paddings: The number of empty lines to add before the clause. + Returns: + The clause's lines as a list of strings. + """ + return [ '\n' for _ in range(paddings) ] + [ l.text for l in self.lines ] + + + def __len__(self) -> int: + """ Return the number of characters in the clause. This does not include + empty lines or lines that contain only whitespace. + + Returns: + The number of characters in the clause. + """ + return sum([ len(l.text.strip()) for l in self.lines ]) + +class Footnote: + """ Represents a footnote in the markdown file. """ + def __init__(self, id:str, line:Line) -> None: + """ Constructor. + + Args: + id: The id of the footnote. + line: The line of the footnote. 
+ """ + self.id = id + """ The id of the footnote. """ + + self.line = line + """ The line of the footnote. """ + +class Document: + """ Represents the document object. """ + clauses:list[Clause] = [] + footnotes:list[Footnote] = [] + + def __init__(self, clauses:list[Clause], footnotes:list[Footnote] = []) -> None: + self.clauses = clauses + self.footnotes = footnotes + + + def splitMarkdownDocument(self, + ignoreTitles:list[str] = [], + splitLevel:int = 1, + ignoreUntilFirstHeading:bool = False) -> None: + """ Split the clauses at a certain level. This is used to create the separate + markdown files for MkDocs. + + After the split, the clauses are stored in the document object. + + Args: + ignoreTitles: A list of titles that should be ignored. They are not included in the output. + splitLevel: The level at which the clauses should be split. + ignoreUntilFirstHeader: Ignore all clauses until the first heading. + + """ + result:list[Clause] = [] + + ignoreTitles = [ t.casefold() for t in ignoreTitles ] # convert to lower case + + for clause in self.clauses: + level = clause.level + + # Check if the current clause should be ignored + if clause.title.casefold() in ignoreTitles: + continue + + # Add a new output clause if the current clause's level is + # equal or less than the split level + if clause.level <= splitLevel: + result.append(Clause(level, clause.clauseNumber, clause.title, [])) + + # Add the lines to the output clause + result[-1].extend(clause) + + # Remove the first clause if it has no title + if ignoreUntilFirstHeading: + while len(result[0].title) == 0: + result.pop(0) + + self.clauses = result + + + def insertFootnotes(self) -> None: + """ Insert footnotes into the clauses. + + After the insertion, the clauses are stored in the document object. 
+ + """ + printInfo('Adding footnotes to clauses') + + for clause in self.clauses: + foundFootnotes:list[Footnote] = [] + for line in clause.lines: + # ATTN: Only footnotes in normal text lines are checked + + if line.lineType == LineType.TEXT and (fn := MatchInlineFootnote.search(line.text)): + # Find the footnote in the list of footnotes + for f in self.footnotes: + if f.id == fn.groups()[0]: + foundFootnotes.append(f) + + # Insert the footnotes at the end of the clause + if len(foundFootnotes) > 0: + clause.append(Line('\n', LineType.TEXT)) + for f in foundFootnotes: + clause.append(f.line) + + + def updateLinks(self) -> None: + """ Update the links in the clauses to the new structure. This is done by + creating a dictionary of all links and their targets and then replacing + the links in the clauses. + + After the update, the clauses are stored in the document object. + """ + printInfo('Updating links in clauses') + + # Build the link target dictionary. Mapping anchor -> clause + linkTargets:dict[str, Clause] = {} + + # Find all Markdown headers in the clauses and convert them to anchor format + for i, clause in enumerate(self.clauses): + # Find all headers in the clause + for line in clause.lines: + if (m := matchHeader.match(line.text)): + + # convert the header to anchor format and add it to the dictionary + # Remove special characters + # TODO move perhaps to an own function + anchor = m.groups()[1].strip().casefold().replace(' ', '-') + for c in ( '.', '(', ')', '[', ']', ':', ',', "'", '"'): + anchor = anchor.replace(c, '') + # remove html tags from the anchor + anchor = re.sub(matchHtmlTag, '', anchor) + + linkTargets[f'#{anchor}'] = clause + if veryVerbose: + printDebug(f'Added Markdown anchor "{anchor}"') + + # Find all HTML anchors in the clauses and add them to the dictionary + for i, clause in enumerate(self.clauses): + for line in clause.lines: + if (anchors := matchHtmlAnchorLink.findall(line.text)): + for a in anchors: + linkTargets[f'#{a}'] = 
clause + if veryVerbose: + printDebug(f'Found HTML anchor "{a}" in clause "{clause.title}"') + + # Replace the html links + for clause in self.clauses: + for i, line in enumerate(clause.lines): + if (links := matchHtmlLink.findall(line.text)): + for lnk in links: + if lnk in linkTargets: + line.text = clause.lines[i].text = line.text.replace(lnk, f'../{linkTargets[lnk].clauseNumber}/#{lnk[1:]}') # Update the current line as well + if veryVerbose: + printDebug(f'Updated HTML link "{lnk}" in clause "{clause.title}"') + + # Replace the markdown links + for clause in self.clauses: + for i, line in enumerate(clause.lines): + if (links := markdownLink.findall(line.text)): + # Replace the old link targets with converted + # (lower case) versions that point to the output files + for lnk in links: + _lnk =lnk.casefold() + if _lnk in linkTargets: + line.text = clause.lines[i].text = line.text.replace(lnk, f'../{linkTargets[_lnk].clauseNumber}/#{lnk[1:]}') # Update the current line as well + if veryVerbose: + printDebug(f'Updated Markdown link "{lnk}" in clause "{clause.title}"') + + + def updateNotes(self) -> None: + """ Update the notes in the clauses to the mkDocs notes version. + + After the update, the clauses are stored in the document object. + """ + printInfo('Updating notes in clauses') + + for clause in self.clauses: + lines:list[Line] = [] + inNote = False + for line in clause.lines: + if line.lineType == LineType.NOTE: + if not inNote: + lines.append(Line('\n', LineType.TEXT)) + lines.append(Line('!!! note\n', LineType.NOTE)) + inNote = True + lines.append(Line(f"\t{re.sub(matchNoteStart, '', line.text)}", LineType.NOTE)) + if verbose: + printDebug(f'Converted note in clause "{clause.title}"') + else: + if inNote: + lines.append(Line('\n', LineType.TEXT)) + inNote = False + lines.append(line) + clause.lines = lines + + + +def analyseMarkdown(filename:str) -> Document: + """ Analyse the markdown file and split it into clauses. 
+ + Args: + filename: The name of the markdown file. + + Returns: + The document object. + """ + + printInfo(f'Analyzing "{filename}"') + + # Read the file. + # Note: We use utf-8 and replace errors to avoid problems with special or unknown characters. + with open(filename, 'r', encoding = 'utf-8', errors = 'replace') as file: + inLines = file.readlines() + + # The list of clauses. The first clause contains the text before the first heading. + outClauses:list[Clause] = [Clause(0, '', '', [])] + footnotes:list[Footnote] = [] + + # Go through the lines and detect headers and codefences + inCodefence = False + inTable = False + tableHasSeparator = False + inGridTable = False + gridTableHasSeparator = False + gridTable = "" + for line in inLines: + + # Detect and handle codefences + # For the moment we support only codefences that start and end + # with 3 backticks. This is the most common way to define codefences. + # Note, that longer codefences are allowed by the markdown specification. + + if matchCodefenceStart.match(line) and not inCodefence: + inCodefence = True + outClauses[-1].append(Line(line, LineType.CODEFENCESTART)) + continue + if matchCodefenceEnd.match(line): + inCodefence = False + outClauses[-1].append(Line(line, LineType.CODEFENCEEND)) + continue + if inCodefence: + outClauses[-1].append(Line(line, LineType.CODE)) + continue + + # Detect and handle tables + if matchTable.match(line) and not inTable and not inGridTable: + inTable = True + outClauses[-1].append(Line(line, LineType.TABLEHEADER)) + continue + if inTable: + if matchTableSeparator.match(line) and not tableHasSeparator: + outClauses[-1].append(Line(line, LineType.TABLESEPARATOR)) + tableHasSeparator = True + continue + elif matchTable.match(line): + outClauses[-1].append(Line(line, LineType.TABLEROW)) + continue + else: + inTable = False + tableHasSeparator = False + # Mark the previous line as the last row in the table + outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW + # 
continue with other matches + + #Detect grid tables and convert them to html table + if matchGridTable.match(line) and not inGridTable: + inGridTable = True + #outClauses[-1].append(Line(line, LineType.TABLEHEADER)) + gridTable += line + continue + if inGridTable: + if matchGridTableHeaderSeparator.match(line) or matchGridTableBodySeparator.match(line): + #outClauses[-1].append(Line(line, LineType.TABLESEPARATOR)) + gridTable += line + continue + elif matchTable.match(line): + #outClauses[-1].append(Line(line, LineType.TABLEROW)) + gridTable += line + continue + else: + inGridTable = False + # Mark the previous line as the last row in the table + #outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW + # print(gridTable) + try: + htmltable = generateHtmlTableWithSpans(gridTable) + print(htmltable) + except Exception as e: + print(f"Error: {e}") + # TODO move this outside of the analyseMarkdown function !!! + for row in htmltable: + outClauses[-1].append(Line(row, LineType.TABLEROW)) + gridTable = "" + # continue with other matches + + # Detect notes + # Notes are lines that start with a '>'. 
+ if matchNote.match(line): + outClauses[-1].append(Line(line, LineType.NOTE)) + continue + + # Detect footnotes + # Footnotes are lines that start with a '^' + if (_fn := matchFootnote.match(line)): + footnotes.append(Footnote(_fn.groups()[0], Line(line, LineType.TEXT))) + continue + + # Detect images on a single line + if (m := matchStandAloneImage.match(line)): + outClauses[-1].append(Line(line, LineType.STANDALONEIMAGE)) + continue + + # Detect headers + _lineType = LineType.TEXT + if (m := matchHeader.match(line)): + # Add a new clause + clauseTitle = m.groups()[1].strip() + clauseTitle = re.sub(matchHtmlTag, '', clauseTitle) + headerNumber = matchHeaderNumber.search(clauseTitle) + outClauses.append(Clause(len(m.groups()[0]), # level + headerNumber.group() if headerNumber else _shortHash(clauseTitle, 6), + clauseTitle, + [])) + _lineType = LineType.HEADING + + # Just add the line to the current clause as text + outClauses[-1].append(Line(line, _lineType)) + + return Document(outClauses, footnotes) + + + + + diff --git a/toMkdocs/regexMatches.py b/toMkdocs/regexMatches.py new file mode 100644 index 0000000..7b784c1 --- /dev/null +++ b/toMkdocs/regexMatches.py @@ -0,0 +1,40 @@ +# +# regexMatches.py +# +# (c) 2025 by Andreas Kraft & Miguel Angel Reina Ortega +# License: BSD 3-Clause License. See the LICENSE file for further details. + +# +""" This module contains the regular expressions used in the markdown processing. 
+""" + +import re + + +# Regular expressions +match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE) +matchFootnote = re.compile(r'\[\^([^\]]*)\]:', re.IGNORECASE) +matchHtmlAnchorLink = re.compile(r'<a\s+name="([^"]*)">[^<]*</a>', re.IGNORECASE) +matchHtmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE) +matchHtmlTag = re.compile(r'<[^>]*>', re.IGNORECASE) +MatchInlineFootnote = re.compile(r'\[\^([^\]]*)\]', re.IGNORECASE) +markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE) +matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE) +matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE) +matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE) +matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE) +matchGridTableBodySeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE) +matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE) +matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE) +matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE) +matchHeader = re.compile(r'(#+)\s+(.*)', re.IGNORECASE) +matchHeaderNumber = re.compile(r'\b[A-Za-z0-9]\d*(\.\d+)*\b', re.IGNORECASE) +matchListInContent = re.compile(r'^(?:\s*(P<marker>[-*+]|\s*\d+\.))\s+(P<content>.+)$', re.IGNORECASE) +matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE) +matchNoteStart = re.compile(r'^\s*>\s*(note)?\s*[:]?\s*', re.IGNORECASE) +matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECASE) +matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE) +matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE) + +matchBold = re.compile(r'(?<!\S)(\*\*|__)(?P<text>.+?)(?<!\\)\1(?!\S)') +matchItalic = re.compile(r'(?<!\S)(\*|_)(?P<text>.+?)(?<!\\)\1(?!\S)') diff --git a/toMkdocs/toMkdocs.py b/toMkdocs/toMkdocs.py index ae0be2a..69037d1 100644 --- a/toMkdocs/toMkdocs.py +++ b/toMkdocs/toMkdocs.py @@ -1,1036 +1,121 
@@ # # toMkdocs.py # -# (c) 2024 by Andreas Kraft +# (c) 2024 by Andreas Kraft & Miguel Angel Reina Ortega +# License: BSD 3-Clause License. See the LICENSE file for further details. + # # This script converts oneM2M spec markdown file to a mkdocs compatible # directory structure. # from __future__ import annotations -import logging -from enum import Enum, auto -import argparse, re, os, shutil, hashlib, base64 -from dataclasses import dataclass +import argparse, os, shutil from rich import print +from makrdownTools import Line, Document, analyseMarkdown, setScreenPrinters +from regexMatches import match2spaceListIndention verbose = False veryVerbose = False -class LineType(Enum): - """ Represents the type of a line in the markdown file. """ - HEADING = auto() - TEXT = auto() - CODEFENCESTART = auto() - CODE = auto() - CODEFENCEEND = auto() - LIST = auto() - NOTE = auto() - STANDALONEIMAGE = auto() - TABLEHEADER = auto() - TABLESEPARATOR = auto() - TABLEROW = auto() - TABLELASTROW = auto() - - -@dataclass -class Line: - """ Represents a line in the markdown file. """ - text:str = '\n' - lineType:LineType = LineType.TEXT - - - -@dataclass -class Clause: - """ Represents a clause in the markdown file. """ - _level:int - _clauseNumber:str - _title:str - _lines:list[Line] - - - @property - def level(self) -> int: - """ Return the level of the clause. """ - return self._level - - - @property - def clauseNumber(self) -> str: - """ Return the clause number. """ - return self._clauseNumber if self._clauseNumber else '0' - - - @clauseNumber.setter - def clauseNumber(self, value:str) -> None: - """ Set the clause number. """ - self._clauseNumber = value - - - @property - def title(self) -> str: - """ Return the title of the clause. """ - return self._title - - - @title.setter - def title(self, value:str) -> None: - """ Set the title of the clause. """ - self._title = value - - - @property - def lines(self) -> list[Line]: - """ Return the lines of the clause. 
""" - return self._lines - - - @lines.setter - def lines(self, value:list[Line]) -> None: - """ Set the lines of the clause. """ - self._lines = value - - - @property - def linesCount(self) -> int: - """ Return the number of lines in the clause. - - Returns: - The number of lines in the clause. - """ - return len(self.lines) - - - def append(self, line:Line) -> None: - """ Append a line to the clause. - - Args: - line: The line to append. - """ - self.lines.append(line) - - - def extend(self, clause:Clause) -> None: - """ Extend the clause with the lines of another clause. - - Args: - clause: The clause to extend with. - """ - self.lines.extend(clause.lines) - - - def asStringList(self, paddings:int = 0) -> list[str]: - """ Return the clause as a list of strings. - - Args: - paddings: The number of empty lines to add before the clause. - Returns: - The clause's lines as a list of strings. - """ - return [ '\n' for _ in range(paddings) ] + [ l.text for l in self.lines ] - - - def __len__(self) -> int: - """ Return the number of characters in the clause. This does not include - empty lines or lines that contain only whitespace. - - Returns: - The number of characters in the clause. - """ - return sum([ len(l.text.strip()) for l in self.lines ]) - - -class Footnote: - """ Represents a footnote in the markdown file. """ - def __init__(self, id:str, line:Line) -> None: - self.id = id - self.line = line - - -class Document: - """ Represents the document object. """ - clauses:list[Clause] = [] - footnotes:list[Footnote] = [] - - def __init__(self, clauses:list[Clause], footnotes:list[Footnote]) -> None: - self.clauses = clauses - self.footnotes = footnotes - - - def splitMarkdownDocument(self, - ignoreTitles:list[str] = [], - splitLevel:int = 1, - ignoreUntilFirstHeading:bool = False) -> None: - """ Split the clauses at a certain level. This is used to create the separate - markdown files for MkDocs. - - After the split, the clauses are stored in the document object. 
- - Args: - ignoreTitles: A list of titles that should be ignored. They are not included in the output. - splitLevel: The level at which the clauses should be split. - ignoreUntilFirstHeader: Ignore all clauses until the first heading. - - """ - result:list[Clause] = [] - - ignoreTitles = [ t.casefold() for t in ignoreTitles ] # convert to lower case - - for clause in self.clauses: - level = clause.level - - # Check if the current clause should be ignored - if clause.title.casefold() in ignoreTitles: - continue - - # Add a new output clause if the current clause's level is - # equal or less than the split level - if clause.level <= splitLevel: - result.append(Clause(level, clause.clauseNumber, clause.title, [])) - - # Add the lines to the output clause - result[-1].extend(clause) - - # Remove the first clause if it has no title - if ignoreUntilFirstHeading: - while len(result[0].title) == 0: - result.pop(0) - - self.clauses = result - - - def insertFootnotes(self) -> None: - """ Insert footnotes into the clauses. - - After the insertion, the clauses are stored in the document object. - - """ - print(f'[green]Adding footnotes to clauses') - - for clause in self.clauses: - foundFootnotes:list[Footnote] = [] - for line in clause.lines: - # ATTN: Only footnotes in normal text lines are checked - - if line.lineType == LineType.TEXT and (fn := _inlineFootnote.search(line.text)): - # Find the footnote in the list of footnotes - for f in self.footnotes: - if f.id == fn.groups()[0]: - foundFootnotes.append(f) - - # Insert the footnotes at the end of the clause - if len(foundFootnotes) > 0: - clause.append(Line('\n', LineType.TEXT)) - for f in foundFootnotes: - clause.append(f.line) - - - def updateLinks(self) -> None: - """ Update the links in the clauses to the new structure. This is done by - creating a dictionary of all links and their targets and then replacing - the links in the clauses. - - After the update, the clauses are stored in the document object. 
- """ - print(f'[green]Updating links in clauses') - - # Build the link target dictionary. Mapping anchor -> clause - linkTargets:dict[str, Clause] = {} - - # Find all Markdown headers in the clauses and convert them to anchor format - for i, clause in enumerate(self.clauses): - # Find all headers in the clause - for line in clause.lines: - if (m := _matchHeader.match(line.text)): - - # convert the header to anchor format and add it to the dictionary - # Remove special characters - # TODO move perhaps to an own function - anchor = m.groups()[1].strip().casefold().replace(' ', '-') - for c in ( '.', '(', ')', '[', ']', ':', ',', "'", '"'): - anchor = anchor.replace(c, '') - # remove html tags from the anchor - anchor = re.sub(_htmlTag, '', anchor) - - linkTargets[f'#{anchor}'] = clause - if veryVerbose: - print(f'[dim]Added Markdown anchor "{anchor}"') - - # Find all HTML anchors in the clauses and add them to the dictionary - for i, clause in enumerate(self.clauses): - for line in clause.lines: - if (anchors := _htmlAnchorLink.findall(line.text)): - for a in anchors: - linkTargets[f'#{a}'] = clause - if veryVerbose: - print(f'[dim]Found HTML anchor "{a}" in clause "{clause.title}"') - - # Replace the html links - for clause in self.clauses: - for i, line in enumerate(clause.lines): - if (links := _htmlLink.findall(line.text)): - for lnk in links: - if lnk in linkTargets: - line.text = clause.lines[i].text = line.text.replace(lnk, f'../{linkTargets[lnk].clauseNumber}/#{lnk[1:]}') # Update the current line as well - if veryVerbose: - print(f'[dim]Updated HTML link "{lnk}" in clause "{clause.title}"') - - # Replace the markdown links - for clause in self.clauses: - for i, line in enumerate(clause.lines): - if (links := _markdownLink.findall(line.text)): - # Replace the old link targets with converted - # (lower case) versions that point to the output files - for lnk in links: - _lnk =lnk.casefold() - if _lnk in linkTargets: - line.text = clause.lines[i].text = 
line.text.replace(lnk, f'../{linkTargets[_lnk].clauseNumber}/#{lnk[1:]}') # Update the current line as well - if veryVerbose: - print(f'[dim]Updated Markdown link "{lnk}" in clause "{clause.title}"') - - - def updateNotes(self) -> None: - """ Update the notes in the clauses to the mkDocs notes version. - - After the update, the clauses are stored in the document object. - """ - print(f'[green]Updating notes in clauses') - - for clause in self.clauses: - lines:list[Line] = [] - inNote = False - for line in clause.lines: - if line.lineType == LineType.NOTE: - if not inNote: - lines.append(Line('\n', LineType.TEXT)) - lines.append(Line('!!! note\n', LineType.NOTE)) - inNote = True - lines.append(Line(f"\t{re.sub(_matchNoteStart, '', line.text)}", LineType.NOTE)) - if verbose: - print(f'[dim]Converted note in clause "{clause.title}"') - else: - if inNote: - lines.append(Line('\n', LineType.TEXT)) - inNote = False - lines.append(line) - clause.lines = lines - - - def prepareForMkdocs(self, includeHangingParagraphs:bool = False) -> None: - """ Prepare the clauses for MkDocs. This includes removing the heading - from the clauses and marking the clauses that are only for navigation. - - After the preparation, the clauses are stored in the document object. - - Args: - includeHangingParagraphs: Include hanging paragraphs in the output. - """ - - # Remove the heading from the lines. The heading is the first line - # in the clause. This is done because MkDocs repeats the heading when - # displaying the page. - for clause in self.clauses: - if clause.linesCount > 0: - clause.lines.pop(0) - # Also, remove the first empty lines if they exist - while clause.linesCount > 0 and clause.lines[0].text.strip() == '': - clause.lines.pop(0) - - # Detect and handle hanging paragraphs. This is extra text in a clause, which - # has sub-clauses. This text is not allowed in oneM2M specifications. 
- for i, clause in enumerate(self.clauses): - if clause.level > 0 and clause.linesCount > 0: - # Check if there is a sub-clause in the next clause - if i + 1 < len(self.clauses) and self.clauses[i+1].level > clause.level: - # This is a hanging paragraph. Remove the text from the current clause. - print(f'[yellow]Hanging paragraph in clause "{clause.title}" {"(removed)" if not includeHangingParagraphs else "(kept)"}') - if not includeHangingParagraphs: - self.clauses[i].lines = [] - else: - self.clauses[i].lines = [Line("<mark>Editor note: This is a hanging paragraph and it must be moved to its own clause</mark>")] + [Line()] + self.clauses[i].lines - - # Repair wrong markdown for indented lines. - # Add 2 spaces to existing 2-space indentions - for clause in self.clauses: - for i, line in enumerate(clause.lines): - if _match2spaceListIndention.match(line.text): - clause.lines[i].text = ' ' + line.text - - - def writeClausesMkDocs(self, filename:str, navTitle:str, addNavTitle:bool = False) -> None: - """ Write the clauses to separate files and create a navigation file. - - Args: - filename: The name of the original markdown file. - navTitle: The title of the navigation entry. This is used to determine the directories. - addNavTitle: Add the title as an extra navigation level to the navigation file. - """ - - print(f'[green]Writing clauses to files') - # create directory first - os.makedirs(f'{os.path.dirname(filename)}/{navTitle}', exist_ok = True) - - # Write the files - for i, f in enumerate(self.clauses): - # write to single files, even empty ones - if verbose: - print(f'[dim]Writing "{f.clauseNumber}.md" - "{f.title}"') - with open(f'{os.path.dirname(filename)}/{navTitle}/{f.clauseNumber}.md', 'w') as file: - # Add one empty line before the clause. This is done to avoid - # a bug in MkDocs that does not display the first line of a clause - # if it contains a colon. It does not matter otherwise if the line - # is empty or not. 
- file.writelines(f.asStringList(1)) - - # write nav.yml file - print(f'[green]Writing "_nav.yml"') - indentation = ' ' if addNavTitle else '' # TODO make number of spaces configurable - with open(f'{os.path.dirname(filename)}/_nav.yml', 'w') as file: - if veryVerbose: - print(f'[dim]Writing navigation file') - if addNavTitle: - file.write(f'{indentation}- {navTitle}:\n') - for i, f in enumerate(self.clauses): +def prepareForMkdocs(document:Document, includeHangingParagraphs:bool = False) -> None: + """ Prepare the clauses for MkDocs. This includes removing the heading + from the clauses and marking the clauses that are only for navigation. - if not f.title: - print("continue") - continue - - # TODO handle if the next clause is more than one level deeper - - _title = f.title.replace("'", '"') - nextClause = self.clauses[i+1] if i+1 < len(self.clauses) else None - if nextClause is None or nextClause.level <= f.level: - file.write(f"{indentation}{' '*f.level}- '{_title}': '{navTitle}/{f.clauseNumber}.md'\n") - else: - file.write(f"{indentation}{' '*f.level}- '{_title}':\n") - if len(f) > 0: - file.write(f"{indentation}{' '*nextClause.level}- 'Hanging paragraph': '{navTitle}/{f.clauseNumber}.md'\n") - - - - -_matchHeader = re.compile(r'(#+)\s+(.*)', re.IGNORECASE) -_matchHeaderNumber = re.compile(r'\b[A-Za-z0-9]\d*(\.\d+)*\b', re.IGNORECASE) -_matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE) -_matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE) -_matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE) -_matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECASE) -_matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE) -_matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE) -_matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE) -_matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE) -_matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE) 
-_matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE) -_matchGridTableBodySeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE) -_match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE) -_matchListInContent = re.compile(r'^(?:\s*(P<marker>[-*+]|\s*\d+\.))\s+(P<content>.+)$', re.IGNORECASE) -_markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE) -_htmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE) -_htmlAnchorLink = re.compile(r'<a\s+name="([^"]*)">[^<]*</a>', re.IGNORECASE) -_htmlTag = re.compile(r'<[^>]*>', re.IGNORECASE) -_matchNoteStart = re.compile(r'^\s*>\s*(note)?\s*[:]?\s*', re.IGNORECASE) -_footnote = re.compile(r'\[\^([^\]]*)\]:', re.IGNORECASE) -_inlineFootnote = re.compile(r'\[\^([^\]]*)\]', re.IGNORECASE) - - -# TODO handle multiple nav levels (left bar) better (make conifgurable) - - -def shortHash(value:str, length:int) -> str: - """ Generate a short hash of a string value. + After the preparation, the clauses are stored in the document object. Args: - value: The value to hash. - length: The length of the hash. - - Returns: - The hash. - """ - return base64.b64encode( - hashlib.sha256( - value.encode() - ).digest() - ).decode()[:length] - -def parse_pandoc_table_with_spans(pandoc_table): + document: The document object. + includeHangingParagraphs: Include hanging paragraphs in the output. """ - Parse a Pandoc-style grid table into a structure for HTML conversion with rowspan and colspan. - - :param pandoc_table: String of the Pandoc-style grid table. - :return: List of lists representing the table with metadata for spans. - """ - # Split the input into lines - lines = [line.strip() for line in pandoc_table.strip().split("\n")] - - class Cell: - """ Represents the document object. 
""" - content: str - rowspan: int - colspan: int - colspan_adjusted: bool - alignment: str - position: int - list_flag: bool - auxiliar_index: int - - def __init__(self): - self.content = None - self.rowspan = 0 - self.colspan = 0 - self.colspan_adjusted = False - self.alignment = "align=\"center\"" - self.position = None - self.list_flag = False - - def set_alignment(self): - header_delimiter_index = 0 - while header_delimiter_index in range(len(default_alignments)) and self.position > header_delimiter_positions[header_delimiter_index]: - header_delimiter_index += 1 - if header_delimiter_index in range(len(default_alignments)): - if self.position < header_delimiter_positions[header_delimiter_index]: - self.alignment = default_alignments[header_delimiter_index] - elif self.position == header_delimiter_positions[header_delimiter_index]: - self.alignment = default_alignments[header_delimiter_index] - header_delimiter_index += 1 - else: - raise ValueError("Invalid table formatting") - - class Row(): - """ Represents a row in the markdown file. """ - cells:list[Cell] = [] - - def __init__(self, length: int = 1) -> None: - self.cells = [Cell() for _ in range(length)] - - def __getitem__(self, item): - return self.cells[item] - - def __setitem__(self, key, value): - self.cells[key] = value - - class RowTracker(): - """ Represents the document object. 
""" - def __init__(self, items): - self.rowTracker = [0 for _ in range(items)] - - def __getitem__(self, item): - return self.rowTracker[item] - - def __setitem__(self, key, value): - self.rowTracker[key] = value - - # Detect separator lines by pattern (it does not take into account partial separators - def is_separator(line): - return _matchGridTableSeparator.match(line) - - # Set content on the cell - concatenating multilines, flagging lists - def handling_content(cell, content): - if cell.content is None: - cell.rowspan += 1 - cell.colspan += 1 - if content.strip().startswith("- "): # List - cell.list_flag = True - #print(content) - cell.content = content.strip() + "\n" # Add newline to know when the list element ends - elif cell.list_flag and content.strip() != "": # any other content when handling list is concatenated to the last list element - cell.content += content.strip() + "\n" - elif content.strip == "": # separation between list and other paragraph - cell.list_flag = False - cell.content += "\n" #if not cell['content'].endswith("\n") else "" - else: - cell.content = re.sub(r'\\\s*$', "\n", content.strip()) - else: - if content.strip().startswith("- "): # List - if not cell.list_flag: - cell.content += "\n" - #cell['content'] = cell['content'].strip("\n") - cell.list_flag = True - cell.content += content.strip() + "\n" # Add newline to know when the list element ends - elif cell.list_flag and content.strip() != "": # any other content when handling list is concatenated to the last list element - cell.content = cell.content.strip("\n") - cell.content += " " + content.strip() + "\n" - elif content.strip() == "": # separation between list and other paragraph - cell.list_flag = False - #content = re.sub(r'\\\s*$', "\n", content.strip()) - cell.content += "\n" if not cell.content.endswith("\n") else "" - else: - content = re.sub(r'\\\s*$', "\n", content.strip()) - cell.content += " " + content - #print(cell['content']) - return cell - - # Adjust colspan of a 
cell - def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions): - for j in range(column_index, number_of_parts): - delimiter_start = None - col_i= column_index - while delimiter_start == None: - delimiter_start = row[col_i - 1].position if col_i > 0 else 0 - col_i -= 1 - positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "|+" if delimiter in line[delimiter_start + 1:]] - position = min(positions) if positions else -1 - if position > delimiter_positions[j]: # Colspan to be increased - row[column_index].colspan += 1 - if position == delimiter_positions[len(delimiter_positions) - 1]: # last cell in row, adjust colspan to get max number columns - colspan_allocated = row[column_index].colspan - #for cell_index in range(number_of_parts): - # colspan_allocated += row[cell_index].colspan - row[column_index].colspan += number_of_columns - colspan_allocated - column_index - elif position < delimiter_positions[j]: - raise ValueError("Wrong cell formatting") - else: - break - return row[column_index] - - separator_indices = [i for i, line in enumerate(lines) if is_separator(line)] - - print(separator_indices) - if not separator_indices: - raise ValueError("No valid separators found in the provided Pandoc table.") - - # Calculate max number of columns - delimiter_positions = [] - number_of_columns = 0 - for separator_index in separator_indices: - if lines[separator_index].count("+") - 1 > number_of_columns: - number_of_columns = lines[separator_index].count("+") - 1 - delimiter_positions = [] - for j in range(number_of_columns): - delimiter_positions_start = delimiter_positions[j - 1] if j != 0 else 0 - del_positions = [lines[separator_index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[separator_index][delimiter_positions_start + 1:]] - delimiter_positions.append(min(del_positions) if del_positions else -1) - has_header = False - header_delimiter_positions = [] - for 
index in separator_indices: - if _matchGridTableHeaderSeparator.match(lines[index]): - has_header = True - header_separator_index = index - header_rows = [] - parts = re.split(r"\+", lines[index].strip("+")) - default_alignments = [] - #Calculate default alignments and positions of delimiters - for part_index in range(len(parts)): - if parts[part_index].startswith(":") and not parts[part_index].endswith(":"): - default_alignments.append("align=\"left\"") - elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"): - default_alignments.append("align=\"right\"") - else: - default_alignments.append("align=\"center\"") - # Delimiter position - delimiter_positions_start = delimiter_positions[part_index - 1] if part_index != 0 else 0 - del_positions = [lines[index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[index][delimiter_positions_start + 1:]] - header_delimiter_positions.append(min(del_positions) if del_positions else -1) - - data_rows = [] - for row in range(len(separator_indices) - 1): - rows = [] - rows_tracker = [] - in_data_row = False - start, end = separator_indices[row], separator_indices[row + 1] - row_lines = lines[start:end] # Lines between separators including separator line start as it gives information about the number of columns of the row - if row_lines: - # Combine multiline content into single strings for each cell - for line in row_lines: - if is_separator(line) and not in_data_row: - in_data_row = True - parts = re.split(r"\s*\+\s*", line.strip("+")) - delimiter_index = 0 - # Determine the alignment of the cell - In order to replicate Pandoc's behaviour (do not support of alignment colons on separator lines (just header separator) - # we need to assign the default alignment as defined in the header separator line - # We may not need the code below, as that supports alignment per cell and row - #alignments = [] - #for part_index in range(len(parts)): - # if 
parts[part_index].startswith(":") and not parts[part_index].endswith(":"): - # alignments.append("align=\"left\"") - # elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"): - # alignments.append("align=\"right\"") - # else: - # alignments.append("align=\"center\"") - rows.append(Row(number_of_columns)) - #rows_tracker = [RowTracker() for _ in range(number_of_columns)] - rows_tracker = RowTracker(number_of_columns) - i = 0 - for j in range(len(parts)): - if i in range(number_of_columns): - delimiter_index += len(parts[j]) + 1 - # Set position - rows[-1][i].position = delimiter_index # Position of cell delimiter + - # Set alignment as defined by header separator line - rows[-1][i].set_alignment() - while delimiter_index > delimiter_positions[i]: - i += 1 - i += 1 - - elif in_data_row: - # Regular data row or partial separator - if _matchGridTableBodySeparator.match(line): # Partial separator - cells_content = re.split(r"[\|\+]", line.strip("|").strip("+")) # (?<!\\)[\|\+] - #Add another row, set delimiters for each cell - rows.append(Row(number_of_columns)) - aux_delimiter_index = 0 - auxiliar_cell_index = 0 - for i in range(len(cells_content)): - if auxiliar_cell_index in range(number_of_columns): - aux_delimiter_index += len(cells_content[i]) + 1 - rows[-1][auxiliar_cell_index].position = aux_delimiter_index # Position of cell delimiter + - rows[-1][auxiliar_cell_index].set_alignment() - while aux_delimiter_index > delimiter_positions[auxiliar_cell_index]: - auxiliar_cell_index += 1 - auxiliar_cell_index += 1 - if len(cells_content) <= number_of_columns: # Colspan: Positions of | with respect to + need to be determined - column_index = 0 - for i in range(len(cells_content)): - if _matchGridTableBodySeparatorLine.match(cells_content[i]): # A new row is to be added - rows_tracker[column_index] += 1 - rows[rows_tracker[column_index]][column_index].list_flag = False - #auxiliar_rows[-1]['use_auxiliar_row'][i] = True - #if 
cells[i].startswith(":") and not cells[i].endswith(":"): - # auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"left\"" - #elif not cells[i].startswith(":") and cells[i].endswith(":"): - # auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"right\"" - #else: - # auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"center\"" - column_forward = 0 - for del_index in range(column_index, len(delimiter_positions)): - if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[del_index]: - column_forward += 1 - rows_tracker[column_index + column_forward - 1] += 1 if column_forward > 1 else 0 - column_index += column_forward - continue - else: - # Handle content of the cell - rows[rows_tracker[column_index]][column_index] = handling_content(rows[rows_tracker[column_index]][column_index], cells_content[i]) - rows[rows_tracker[column_index]][column_index].rowspan += 1 - if not rows[rows_tracker[column_index]][column_index].colspan_adjusted: - rows[rows_tracker[column_index]][column_index].colspan_adjusted = True - # TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator - rows[rows_tracker[column_index]][column_index] = adjust_colspan(rows[rows_tracker[column_index]], column_index, number_of_columns, line, number_of_columns, delimiter_positions) - - if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[column_index]: - column_index += rows[rows_tracker[column_index]][column_index].colspan if rows[rows_tracker[column_index]][column_index].colspan != 0 else 1 - continue - - else: - raise ValueError("More cells than columns found") - else: # Data row - cells_content = re.split(r"\s*\|\s*", line.strip("|")) - column_index = 0 - if len(cells_content) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined - for i in range(len(cells_content)): - # Handle content of the cell - 
rows[rows_tracker[column_index]][column_index] = handling_content(rows[rows_tracker[column_index]][column_index], cells_content[i]) - if not rows[rows_tracker[column_index]][column_index].colspan_adjusted: - rows[rows_tracker[column_index]][column_index].colspan_adjusted = True - #TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator - rows[rows_tracker[column_index]][column_index] = adjust_colspan(rows[rows_tracker[column_index]], column_index, number_of_columns, line, number_of_columns, delimiter_positions) - if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[column_index]: - column_index += rows[rows_tracker[column_index]][column_index].colspan # Move forward index i + # Remove the heading from the lines. The heading is the first line + # in the clause. This is done because MkDocs repeats the heading when + # displaying the page. + for clause in document.clauses: + if clause.linesCount > 0: + clause.lines.pop(0) + # Also, remove the first empty lines if they exist + while clause.linesCount > 0 and clause.lines[0].text.strip() == '': + clause.lines.pop(0) - elif len(cells_content) == number_of_columns: # Simple row - for i in range(len(cells_content)): - rows[rows_tracker[i]][i] = handling_content(rows[rows_tracker[i]][i], cells_content[i]) - else: - raise ValueError("More cells than columns found") + # Detect and handle hanging paragraphs. This is extra text in a clause, which + # has sub-clauses. This text is not allowed in oneM2M specifications. + for i, clause in enumerate(document.clauses): + if clause.level > 0 and clause.linesCount > 0: + # Check if there is a sub-clause in the next clause + if i + 1 < len(document.clauses) and document.clauses[i+1].level > clause.level: + # This is a hanging paragraph. Remove the text from the current clause. 
+ print(f'[yellow]Hanging paragraph in clause "{clause.title}" {"(removed)" if not includeHangingParagraphs else "(kept)"}') + if not includeHangingParagraphs: + document.clauses[i].lines = [] else: - raise ValueError("No separator line found for row starting") - - if has_header and start >= header_separator_index: # table_row and auxiliar_row are part of data_rows - for body_row in rows: - data_rows.append(body_row.cells) - elif has_header and start < header_separator_index: # table_row and auxiliar_row are part of header_rows - for header_row in rows: - header_rows.append(header_row.cells) - - #print(header_rows) - #print(data_rows) - # Check if there are any data rows - if not data_rows and not header_rows: - raise ValueError("No valid rows found in the provided Pandoc table.") - - # Format text - for rows in [header_rows, data_rows]: - bold = "<strong>" - italic = "<i>" - for row in rows: - for cell in row: - if cell.content is not None: - # Replacing "<" by < - #cell.content = cell.content.replace("<", "<") - - #Bold - for bold_characters in ["**", "__"]: - while cell.content.find(bold_characters) != -1: - cell.content = cell.content.replace(bold_characters, bold, 1) - if bold == "<strong>": - bold = "</strong>" - else: - bold = "<strong>" - #Italic - while cell.content.find("_") != -1 and cell.content.find("\_") == -1: - cell.content = cell.content.rstrip() .replace("_", italic, 1) - if italic == "<i>": - italic = "</i>" - else: - italic = "<i>" - while cell.content.find("\_") != -1: - cell.content = cell.content.rstrip().replace("\_", "_", 1) - - # Correct newlines characters - for row in header_rows: - for cell in row: - cell.content = cell.content.replace("\n", "<br />") if cell.content is not None else None - for row in data_rows: - for cell in row: - cell.content = cell.content.replace("\n", "<br />") if cell.content is not None else None - - # Checking that the grid is correct Not too much tested - need to take into account rowspan of previous rows - 
forward_rowspan = [] - for row_index in range(len(header_rows)): - if len(forward_rowspan) == 0: - forward_rowspan = [0 for _ in range(len(header_rows[row_index]))] - sum = 0 - for cell_index in range(len(header_rows[row_index])): - sum += header_rows[row_index][cell_index].colspan - if row_index > 0 and header_rows[row_index][cell_index].colspan == 0: - if forward_rowspan[cell_index] > 0: - sum += 1 - forward_rowspan[cell_index] -= 1 - if forward_rowspan[cell_index] == 0 and header_rows[row_index][cell_index].rowspan > 1: - forward_rowspan[cell_index] = header_rows[row_index][cell_index].rowspan -1 - if not sum == number_of_columns: - raise ValueError("Grid table not converted properly") - forward_rowspan = [] - for row_index in range(len(data_rows)): - if len(forward_rowspan) == 0: - forward_rowspan = [0 for _ in range(len(data_rows[row_index]))] - sum = 0 - for cell_index in range(len(data_rows[row_index])): - sum += data_rows[row_index][cell_index].colspan - if row_index > 0 and data_rows[row_index][cell_index].colspan == 0: - if forward_rowspan[cell_index] > 0: - sum += 1 - forward_rowspan[cell_index] -= 1 - if forward_rowspan[cell_index] == 0 and data_rows[row_index][cell_index].rowspan > 1: - forward_rowspan[cell_index] = data_rows[row_index][cell_index].rowspan - 1 - if not sum == number_of_columns: - raise ValueError("Grid table not converted properly") - - return header_rows, data_rows + # Add a note to the hanging paragraph + document.clauses[i].lines = [Line("<mark>Editor note: This is a hanging paragraph and it must be moved to its own clause</mark>")] + [Line()] + document.clauses[i].lines -def generate_html_table_with_spans(pandoc_table): - """ - Generate an HTML table from a Pandoc-style grid table with row and column spans. - - :param pandoc_table: String of the Pandoc-style grid table. - :return: HTML string. 
- """ - try: - grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table) - except: - logging.ERROR("Grid table could not be generated") - return "HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS" - else: - html = "<table>\n" - has_header = False - - for row in grid_header: - for cell in row: - if cell.rowspan != 0 and cell.colspan != 0: - has_header = True - if has_header: - html += " <thead>\n" - for row in grid_header: - html += " <tr>\n" - for cell in row: - if cell.rowspan == 0 or cell.colspan == 0: - continue - else: - # Prepare content, in case there's a list - #print(cell.content) - if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", - cell.content): # Update cell in new row - #print("MATCHING") - list = "<ul>" - # Build list the matches - for match in matches: - list += "<li>" + match[1] + "</li>" - list += "</ul>" - cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell.content) - # Enforce left alignment if cell contains a list - cell.alignment = "align=\"left\"" - #else: - # print("NOT MATCHING") + # Repair wrong markdown for indented lines. 
+ # Add 2 spaces to existing 2-space indentions + for clause in document.clauses: + for i, line in enumerate(clause.lines): + if match2spaceListIndention.match(line.text): + clause.lines[i].text = ' ' + line.text - rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else "" - colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else "" - html += f" <th{rowspan}{colspan} {cell.alignment}>{cell.content}</th>\n" - html += " </tr>\n" - html += " </thead>\n" - html += " <tbody>\n" - for row in grid_body: - html += " <tr>\n" - for cell in row: - if cell.rowspan == 0 or cell.colspan == 0: - continue - else: - #Prepare content, in case there's a list - #print(cell.content) - if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell.content): # Update cell in new row - #print("MATCHING") - #print(cell.content) - list = "<ul>" - # Build list the matches - for match in matches: - list += "<li>" + match[1] + "</li>" - list += "</ul>" - cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell.content) - # Enforce left alignment if cell contains a list - cell.alignment = "align=\"left\"" - #else: - #print("NOT MATCHING") - rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else "" - colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else "" - html += f" <td{rowspan}{colspan} {cell.alignment}>{cell.content}</td>\n" - html += " </tr>\n" - - html += " </tbody>\n" - html += "</table>" - return html - -def analyseMarkdown(filename:str) -> Document: - """ Analyse the markdown file and split it into clauses. +def writeClausesMkDocs(document:Document, filename:str, navTitle:str, addNavTitle:bool = False) -> None: + """ Write the clauses to separate files and create a navigation file. Args: - filename: The name of the markdown file. - - Returns: - The document object. + document: The document object. + filename: The name of the original markdown file. + navTitle: The title of the navigation entry. 
This is used to determine the directories. + addNavTitle: Add the title as an extra navigation level to the navigation file. """ - print(f'[green]Analyzing "{filename}"') + print(f'[green]Writing clauses to files') + # create directory first + os.makedirs(f'{os.path.dirname(filename)}/{navTitle}', exist_ok = True) + + # Write the files + for i, f in enumerate(document.clauses): + # write to single files, even empty ones + if verbose: + print(f'[dim]Writing "{f.clauseNumber}.md" - "{f.title}"') + with open(f'{os.path.dirname(filename)}/{navTitle}/{f.clauseNumber}.md', 'w') as file: + # Add one empty line before the clause. This is done to avoid + # a bug in MkDocs that does not display the first line of a clause + # if it contains a colon. It does not matter otherwise if the line + # is empty or not. + file.writelines(f.asStringList(1)) - # Read the file. - # Note: We use utf-8 and replace errors to avoid problems with special or unknown characters. - with open(filename, 'r', encoding = 'utf-8', errors = 'replace') as file: - inLines = file.readlines() - # The list of clauses. The first clause contains the text before the first heading. - outClauses:list[Clause] = [Clause(0, '', '', [])] - footnotes:list[Footnote] = [] - - # Go through the lines and detect headers and codefences - inCodefence = False - inTable = False - tableHasSeparator = False - inGridTable = False - gridTableHasSeparator = False - gridTable = "" - for line in inLines: - - # Detect and handle codefences - # For the moment we support only codefences that start and end - # with 3 backticks. This is the most common way to define codefences. - # Note, that longer codefences are allowed by the markdown specification. 
- - if _matchCodefenceStart.match(line) and not inCodefence: - inCodefence = True - outClauses[-1].append(Line(line, LineType.CODEFENCESTART)) - continue - if _matchCodefenceEnd.match(line): - inCodefence = False - outClauses[-1].append(Line(line, LineType.CODEFENCEEND)) - continue - if inCodefence: - outClauses[-1].append(Line(line, LineType.CODE)) - continue - - # Detect and handle tables - if _matchTable.match(line) and not inTable and not inGridTable: - inTable = True - outClauses[-1].append(Line(line, LineType.TABLEHEADER)) - continue - if inTable: - if _matchTableSeparator.match(line) and not tableHasSeparator: - outClauses[-1].append(Line(line, LineType.TABLESEPARATOR)) - tableHasSeparator = True - continue - elif _matchTable.match(line): - outClauses[-1].append(Line(line, LineType.TABLEROW)) - continue - else: - inTable = False - tableHasSeparator = False - # Mark the previous line as the last row in the table - outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW - # continue with other matches - - #Detect grid tables and convert them to html table - if _matchGridTable.match(line) and not inGridTable: - inGridTable = True - #outClauses[-1].append(Line(line, LineType.TABLEHEADER)) - gridTable += line - continue - if inGridTable: - if _matchGridTableHeaderSeparator.match(line) or _matchGridTableBodySeparator.match(line): - #outClauses[-1].append(Line(line, LineType.TABLESEPARATOR)) - gridTable += line - continue - elif _matchTable.match(line): - #outClauses[-1].append(Line(line, LineType.TABLEROW)) - gridTable += line + # write nav.yml file + print(f'[green]Writing "_nav.yml"') + indentation = ' ' if addNavTitle else '' # TODO make number of spaces configurable + with open(f'{os.path.dirname(filename)}/_nav.yml', 'w') as file: + if veryVerbose: + print(f'[dim]Writing navigation file') + if addNavTitle: + file.write(f'{indentation}- {navTitle}:\n') + for i, f in enumerate(document.clauses): + + if not f.title: + # print("continue") continue - else: - 
inGridTable = False - # Mark the previous line as the last row in the table - #outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW - print(gridTable) - htmltable = "" - htmltable = generate_html_table_with_spans(gridTable) - print(htmltable) - for row in htmltable: - outClauses[-1].append(Line(row, LineType.TABLEROW)) - gridTable = "" - # continue with other matches - - # Detect notes - # Notes are lines that start with a '>'. - if _matchNote.match(line): - outClauses[-1].append(Line(line, LineType.NOTE)) - continue - # Detect footnotes - # Footnotes are lines that start with a '^' - if (_fn := _footnote.match(line)): - footnotes.append(Footnote(_fn.groups()[0], Line(line, LineType.TEXT))) - continue + # TODO handle if the next clause is more than one level deeper - # Detect images on a single line - if (m := _matchStandAloneImage.match(line)): - outClauses[-1].append(Line(line, LineType.STANDALONEIMAGE)) - continue - - # Detect headers - _lineType = LineType.TEXT - if (m := _matchHeader.match(line)): - # Add a new clause - clauseTitle = m.groups()[1].strip() - clauseTitle = re.sub(_htmlTag, '', clauseTitle) - headerNumber = _matchHeaderNumber.search(clauseTitle) - outClauses.append(Clause(len(m.groups()[0]), # level - headerNumber.group() if headerNumber else shortHash(clauseTitle, 6), - clauseTitle, - [])) - _lineType = LineType.HEADING + _title = f.title.replace("'", '"') + nextClause = document.clauses[i+1] if i+1 < len(document.clauses) else None + if nextClause is None or nextClause.level <= f.level: + file.write(f"{indentation}{' '*f.level}- '{_title}': '{navTitle}/{f.clauseNumber}.md'\n") + else: + file.write(f"{indentation}{' '*f.level}- '{_title}':\n") + if len(f) > 0: + file.write(f"{indentation}{' '*nextClause.level}- 'Hanging paragraph': '{navTitle}/{f.clauseNumber}.md'\n") - # Just add the line to the current clause as text - outClauses[-1].append(Line(line, _lineType)) - return Document(outClauses, footnotes) +# TODO handle multiple nav levels 
(left bar) better (make conifgurable) def copyMediaFiles(filename:str, navTitle:str, mediaDirectory:str = 'media') -> None: @@ -1065,20 +150,22 @@ def processDocument(args:argparse.Namespace) -> None: document.insertFootnotes() document.updateLinks() document.updateNotes() - document.prepareForMkdocs(args.include_hanging_paragraphs) + + prepareForMkdocs(document, args.include_hanging_paragraphs) # Write the clauses to files - document.writeClausesMkDocs(inDocumentFilename, args.title, args.nav_add_title) + writeClausesMkDocs(document, inDocumentFilename, args.title, args.nav_add_title) # Copy the media files copyMediaFiles(inDocumentFilename, args.title, args.media_directory) -if __name__ == '__main__': +def main() -> None: parser = argparse.ArgumentParser(description = 'Convert oneM2M markdown specificatios to MkDocs format', formatter_class = argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--verbose', '-v', action = 'store_true', help = 'verbose output during processing') + parser.add_argument('--out', '-o', metavar='outfile', help = 'write output to file instead of stdout') parser.add_argument('--very-verbose', '-vv', action = 'store_true', help = 'very verbose output during processing') parser.add_argument('--ignore-clause', '-ic', metavar = 'clause', nargs = '+', default = [ 'Contents', 'History' ], help = 'ignore headers in the markdown document') parser.add_argument('--include-hanging-paragraphs', '-ihp', action = 'store_true', default = False, help = 'include hanging paragraphs (text in clauses with sub-clauses) in the output files') @@ -1088,8 +175,14 @@ if __name__ == '__main__': parser.add_argument('--title', '-t', metavar = 'title', required = True, help = 'mkdocs navigation tile') parser.add_argument('--nav-add-title', '-nat', action = 'store_true', default = False, help = 'add the title as an extra navigation level to the navigation file') - parser.add_argument('document', type = str, help = 'a oneM2M markdown specification document to 
process') args = parser.parse_args() + setScreenPrinters(info = lambda text: print(f'[green]{text}'), + debug = lambda text: print(f'[dim]{text}')) processDocument(args) + + +if __name__ == '__main__': + main() + -- GitLab From 6c8a9ddc8c0a0996d56276d3f6c09d78da12a152 Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Fri, 21 Feb 2025 11:05:46 +0100 Subject: [PATCH 02/29] Some improvements for grid tables conversion --- toMkdocs/gridTableTools.py | 184 +++++++++++++++++++++++++++++-------- 1 file changed, 146 insertions(+), 38 deletions(-) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index 170c3a5..95fde45 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -27,24 +27,37 @@ class GridCell: self.auxiliarIndex:int = 0 - def calculateAndSetAlignment(self, headerDelimiterPositions:list[int], defaultAlignments:list[str]) -> None: + def calculateAndSetAlignment(self) -> None: """ Set the alignment of the cell based on the position of the delimiter. 
""" if self.position is None: raise ValueError('Cell position must be set before calculating alignment.') - headerDelimiterIndex = 0 - while headerDelimiterIndex < len(defaultAlignments) and self.position > headerDelimiterPositions[headerDelimiterIndex]: - headerDelimiterIndex += 1 - if headerDelimiterIndex < len(defaultAlignments): - if self.position < headerDelimiterPositions[headerDelimiterIndex]: - self.alignment = defaultAlignments[headerDelimiterIndex] - elif self.position == headerDelimiterPositions[headerDelimiterIndex]: - self.alignment = defaultAlignments[headerDelimiterIndex] + if hasHeader: + headerDelimiterIndex = 0 + while headerDelimiterIndex < len(defaultAlignments) and self.position > headerDelimiterPositions[headerDelimiterIndex]: headerDelimiterIndex += 1 + if headerDelimiterIndex < len(defaultAlignments): + if self.position < headerDelimiterPositions[headerDelimiterIndex]: + self.alignment = defaultAlignments[headerDelimiterIndex] + elif self.position == headerDelimiterPositions[headerDelimiterIndex]: + self.alignment = defaultAlignments[headerDelimiterIndex] + headerDelimiterIndex += 1 + else: + raise ValueError('Invalid table formatting') else: - raise ValueError('Invalid table formatting') - + body_delimiter_index = 0 + while body_delimiter_index in range(len(defaultAlignments)) and self.position > delimiterPositions[body_delimiter_index]: + body_delimiter_index += 1 + if body_delimiter_index in range(len(defaultAlignments)): + if self.position < delimiterPositions[body_delimiter_index]: + self.alignment = defaultAlignments[body_delimiter_index] + elif self.position == delimiterPositions[body_delimiter_index]: + self.alignment = defaultAlignments[body_delimiter_index] + body_delimiter_index += 1 + else: + raise ValueError("Invalid table formatting") + def __str__(self): return f'(Content: {self.content}, Rowspan: {self.rowspan}, Colspan: {self.colspan}, Alignment: {self.alignment}, Position: {self.position}, ListFlag: {self.listFlag}, 
AuxiliarIndex: {self.auxiliarIndex})' @@ -78,6 +91,48 @@ class GridRow(): def __repr__(self): return self.__str__() + def check_delimiter_alignment(line: str, delimiters: str = "|+") -> bool: + """ + Check if delimiters in a row align with expected positions. + + Args: + line: The line of text to check + delimiter_positions: List of expected positions (based on + characters) + delimiters: String containing valid delimiter characters (default: "|+") + + Returns: + bool: True if delimiters align correctly, False otherwise + """ + if not line or not delimiterPositions: + return False + + print(f"\nChecking line: '{line}'") + print(f"Expected delimiter positions: {delimiterPositions}") + + # For full separator lines (only +) + if '+' in line and '|' not in line: + currentPositions = [i for i, char in enumerate(line) if (char == '+' and i != 0)] + print(f"Full separator line - Found + at positions: {currentPositions}") + return all(delimiterPositions[-1] in currentPositions and + line.startswith("+") and + pos in delimiterPositions for pos in currentPositions) + + # For data lines (only |) + if '|' in line and '+' not in line: + currentPositions = [i for i, char in enumerate(line) if (char == '|' and i != 0)] + print(f"Data line - Found | at positions: {current_positions}") + return all(delimiterPositions[-1] in currentPositions and + line.startswith("|") and + pos in delimiterPositions for pos in currentPositions) + + # For partial separators (mix of + and |) + currentPositions = [i for i, char in enumerate(line) if (char in delimiters and i != 0)] + print(f"Partial separator - Found delimiters at positions: {currentPositions}") + print(f"Characters at those positions: {[line[pos] for pos in currentPositions]}") + return all(delimiterPositions[-1] in currentPositions and + (line.startswith("+") or line.startswith("|")) and + pos in delimiterPositions for pos in currentPositions) + class GridRowsTracker(): """ Represents the document object. 
""" @@ -112,7 +167,15 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR :param pandoc_table: String of the Pandoc-style grid table. :return: List of lists representing the table with metadata for spans. """ - + global hasHeader, defaultAlignments, headerDelimiterPositions, delimiterPositions, nextListElementMark + + # Initialize globals + hasHeader = False + defaultAlignments:list[str] = [] + headerDelimiterPositions:list[int] = [] + delimiterPositions:list[int] = [] + nextListElementMark = '@' + # Split the input into lines lines:list[str] = [line.strip() for line in gridTable.strip().split('\n')] @@ -131,14 +194,13 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR cell.colspan += 1 if _c.startswith('- '): # List in a cell cell.listFlag = True - cell.content = _c + '\n' # Add newline to know when the list element ends - + _c = re.sub(r'\\\s*$', '\n', _c) + cell.content = _c + nextListElementMark # Add list element end mark to know when the list element ends elif cell.listFlag and len(_c) > 0: # any other content when handling list is concatenated to the last list element - cell.content = _c + '\n' - + _c = re.sub(r'\\\s*$', '\n', _c) + cell.content += _c + nextListElementMark #add the list element end mark elif not _c: # separation between list and other paragraph - cell.listFlag = False - cell.content = '\n' #if not cell['content'].endswith("\n") else "" + cell.content += '\n' if not cell['content'].endswith('\n') else "" else: cell.content = re.sub(r'\\\s*$', '\n', _c) else: # Cell has content @@ -147,11 +209,16 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR cell.content += '\n' #cell['content'] = cell['content'].strip("\n") cell.listFlag = True - cell.content += _c + '\n' # Add newline to know when the list element ends - elif cell.listFlag and _c: # any other content when handling list is concatenated to the last list element - cell.content = 
cell.content.strip('\n') + ' ' + _c + '\n' + _c = re.sub(r'\\\s*$', '\n', _c) + cell.content += _c + nextListElementMark # Add list element end mark to know when the list element ends + elif cell.listFlag and len(_c) > 0: # any other content when handling list is concatenated to the last list element + cell.content = cell.content.strip(nextListElementMark) #remove list element end mark + _c = re.sub(r'\\\s*$', '\n', _c) + cell.content += " " + _c + nextListElementMark #add list element end mark elif len(_c) == 0: # separation between list and other paragraph - cell.listFlag = False + if cell.list_flag: + cell.list_flag = False + cell.content += '\n\n' #end list by \n #content = re.sub(r'\\\s*$', "\n", content.strip()) cell.content += '\n' if not cell.content.endswith('\n') else '' else: @@ -202,11 +269,8 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR # Determine delimter positions and alignments - hasHeader = False - headerDelimiterPositions:list[int] = [] headerRows:GridTableRowList = [] dataRows:GridTableRowList = [] - defaultAlignments:list[str] = [] for index in separatorIndices: if matchGridTableHeaderSeparator.match(lines[index]): @@ -217,7 +281,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR for partIndex in range(len(parts)): if parts[partIndex].startswith(':') and not parts[partIndex].endswith(':'): # Left alignment defaultAlignments.append('align="left"') - elif not parts[partIndex].startswith(":") and parts[partIndex].endswith(":"): # Right alignment + elif not parts[partIndex].startswith(':') and parts[partIndex].endswith(':'): # Right alignment defaultAlignments.append('align="right"') else: defaultAlignments.append('align="center"') # Center alignment @@ -226,6 +290,18 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR delPositions = [lines[index].find(delimiter, delimiterPositionsStart + 1) for delimiter in '+' if delimiter in 
lines[index][delimiterPositionsStart + 1:]] headerDelimiterPositions.append(min(delPositions) if delPositions else -1) + if not hasHeader: + #Set default alignments from the first separator + parts = re.split(r'\+', lines[0].strip('+')) + default_alignments = [] + # Calculate default alignments and positions of delimiters + for part_index in range(len(parts)): + if parts[part_index].startswith(':') and not parts[part_index].endswith(':'): + default_alignments.append('align="left"') + elif not parts[part_index].startswith(':') and parts[part_index].endswith(':'): + default_alignments.append('align="right"') + else: + default_alignments.append('align="center"') for rowNumber in range(len(separatorIndices) - 1): rows:list[GridRow] = [] @@ -238,6 +314,10 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR for line in rowLines: if isSeparator(line) and not inDataRow: inDataRow = True + # Add delimiter alignment check for separator lines + if not check_delimiter_alignment(line, delimiterPositions): + raise ValueError(f"Misaligned delimiters in separator row: {line}") + parts = re.split(r'\s*\+\s*', line.strip('+')) delimiterIndex = 0 @@ -254,7 +334,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR cell.position = delimiterIndex # Position of cell delimiter + # Set alignment as defined by header separator line - cell.calculateAndSetAlignment(headerDelimiterPositions, defaultAlignments) + cell.calculateAndSetAlignment() while delimiterIndex > delimiterPositions[columnIndex]: columnIndex += 1 @@ -263,7 +343,11 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR elif inDataRow: # Regular data row or partial separator if matchGridTableBodySeparator.match(line): # Partial separator - cellsContent = re.split(r"[\|\+]", line.strip("|").strip("+")) # (?<!\\)[\|\+] + # Add delimiter alignment check for partial separators + if not check_delimiter_alignment(line, delimiterPositions): 
+ raise ValueError(f"Misaligned delimiters in partial separator: {line}") + + cellsContent = re.split(r"[\|\+]", line.strip('|').strip('+')) # (?<!\\)[\|\+] #Add another row, set delimiters for each cell rows.append(GridRow(numberOfColumns)) auxDelimiterIndex = 0 @@ -274,7 +358,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR auxDelimiterIndex += len(content) + 1 cell = rows[-1][auxiliarCellIndex] cell.position = auxDelimiterIndex # Position of cell delimiter + - cell.calculateAndSetAlignment(headerDelimiterPositions, defaultAlignments) + cell.calculateAndSetAlignment() while auxDelimiterIndex > delimiterPositions[auxiliarCellIndex]: auxiliarCellIndex += 1 auxiliarCellIndex += 1 @@ -318,7 +402,13 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR raise ValueError("More cells than columns found") else: # Data row - cellsContent = re.split(r'\s*\|\s*', line.strip('|')) + cellsContent = line.strip() + cellsContent = re.split(r"\|", line.strip('|')) + + # Add delimiter alignment check + if not check_delimiter_alignment(line, delimiterPositions): + raise ValueError(f"Misaligned delimiters in row: {line}") + columnCellIndex = 0 if len(cellsContent) < numberOfColumns: # Colspan: Positions of | with respect to + need to be determined for columnIndex, content in enumerate(cellsContent): @@ -347,6 +437,10 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR elif hasHeader and start < headerSeparatorIndex: # table_row and auxiliar_row are part of header_rows for row in rows: # header rows headerRows.append(row.cells) + else: + #only body + for row in rows: + dataRows.append(row.cells) # Check if there are any data rows if not dataRows and not headerRows: @@ -432,13 +526,27 @@ def generateHtmlTableWithSpans(gridTable:str) -> str: Returns: The HTML table in string format. 
""" + debug_output = [] + def debug_print(msg): + debug_output.append(str(msg)) # Convert message to string + try: + # Redirect print statements to our debug collector + global print + original_print = print + print = debug_print + gridHeader, gridBody = parseGridTableWithSpans(gridTable) - except Exception as e: - import traceback - traceback.print_exc() - return f'HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS. {e}' + + # Restore original print + print = original_print + except Exception as e: + debug_print("Grid table could not be generated") + debug_text = "<br>".join(debug_output) # Now all items are strings + return f'HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE.<br><pre>{debug_text}</pre>' + + # Generate table HTML... html = '<table>\n' hasHeader = False @@ -457,13 +565,13 @@ def generateHtmlTableWithSpans(gridTable:str) -> str: continue else: # Prepare content, in case there's a list - if cell.content is not None and (matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell.content)): # Update cell in new row + if cell.content is not None and (matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+)@", cell.content)): # Update cell in new row list = "<ul>" # Build list the matches for match in matches: list += "<li>" + match[1] + "</li>" list += "</ul>" - cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell.content) + cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+(?:(?!@).)+@)+", list, cell.content) # Enforce left alignment if cell contains a list cell.alignment = "align=\"left\"" @@ -482,13 +590,13 @@ def generateHtmlTableWithSpans(gridTable:str) -> str: continue else: #Prepare content, in case there's a list - if cell.content is not None and (matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell.content)): # Update cell in new row + if cell.content is not None and (matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+)@", cell.content)): # Update cell in new row 
list = "<ul>" # Build list the matches for match in matches: list += "<li>" + match[1] + "</li>" list += "</ul>" - cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell.content) + cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+(?:(?!@).)+@)+",list, cell.content) # Enforce left alignment if cell contains a list cell.alignment = "align=\"left\"" -- GitLab From 06c05a87a9d54f452950c8211b95bd6fe08cd939 Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Fri, 21 Feb 2025 12:02:05 +0100 Subject: [PATCH 03/29] Adding parameter in functions instead of global variables --- toMkdocs/gridTableTools.py | 95 +++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 48 deletions(-) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index 95fde45..c690c2c 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -27,7 +27,7 @@ class GridCell: self.auxiliarIndex:int = 0 - def calculateAndSetAlignment(self) -> None: + def calculateAndSetAlignment(self, headerDelimiterPositions:list[int], delimiterPositions:list[int], defaultAlignments:list[str], hasHeader:bool) -> None: """ Set the alignment of the cell based on the position of the delimiter. """ if self.position is None: @@ -90,50 +90,7 @@ class GridRow(): def __repr__(self): return self.__str__() - - def check_delimiter_alignment(line: str, delimiters: str = "|+") -> bool: - """ - Check if delimiters in a row align with expected positions. 
- - Args: - line: The line of text to check - delimiter_positions: List of expected positions (based on + characters) - delimiters: String containing valid delimiter characters (default: "|+") - - Returns: - bool: True if delimiters align correctly, False otherwise - """ - if not line or not delimiterPositions: - return False - - print(f"\nChecking line: '{line}'") - print(f"Expected delimiter positions: {delimiterPositions}") - - # For full separator lines (only +) - if '+' in line and '|' not in line: - currentPositions = [i for i, char in enumerate(line) if (char == '+' and i != 0)] - print(f"Full separator line - Found + at positions: {currentPositions}") - return all(delimiterPositions[-1] in currentPositions and - line.startswith("+") and - pos in delimiterPositions for pos in currentPositions) - - # For data lines (only |) - if '|' in line and '+' not in line: - currentPositions = [i for i, char in enumerate(line) if (char == '|' and i != 0)] - print(f"Data line - Found | at positions: {current_positions}") - return all(delimiterPositions[-1] in currentPositions and - line.startswith("|") and - pos in delimiterPositions for pos in currentPositions) - - # For partial separators (mix of + and |) - currentPositions = [i for i, char in enumerate(line) if (char in delimiters and i != 0)] - print(f"Partial separator - Found delimiters at positions: {currentPositions}") - print(f"Characters at those positions: {[line[pos] for pos in currentPositions]}") - return all(delimiterPositions[-1] in currentPositions and - (line.startswith("+") or line.startswith("|")) and - pos in delimiterPositions for pos in currentPositions) - - + class GridRowsTracker(): """ Represents the document object. """ def __init__(self, size:int) -> None: @@ -167,7 +124,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR :param pandoc_table: String of the Pandoc-style grid table. :return: List of lists representing the table with metadata for spans. 
""" - global hasHeader, defaultAlignments, headerDelimiterPositions, delimiterPositions, nextListElementMark + #global hasHeader, defaultAlignments, headerDelimiterPositions, delimiterPositions, nextListElementMark # Initialize globals hasHeader = False @@ -248,6 +205,48 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR row[columnIndex].colspanAdjusted = True # Mark cell as adjusted + def check_delimiter_alignment(line: str, delimiterPositions:list[int], delimiters: str = "|+") -> bool: + """ + Check if delimiters in a row align with expected positions. + + Args: + line: The line of text to check + delimiter_positions: List of expected positions (based on + characters) + delimiters: String containing valid delimiter characters (default: "|+") + + Returns: + bool: True if delimiters align correctly, False otherwise + """ + if not line or not delimiterPositions: + return False + + print(f"\nChecking line: '{line}'") + print(f"Expected delimiter positions: {delimiterPositions}") + + # For full separator lines (only +) + if '+' in line and '|' not in line: + currentPositions = [i for i, char in enumerate(line) if (char == '+' and i != 0)] + print(f"Full separator line - Found + at positions: {currentPositions}") + return all(delimiterPositions[-1] in currentPositions and + line.startswith("+") and + pos in delimiterPositions for pos in currentPositions) + + # For data lines (only |) + if '|' in line and '+' not in line: + currentPositions = [i for i, char in enumerate(line) if (char == '|' and i != 0)] + print(f"Data line - Found | at positions: {currentPositions}") + return all(delimiterPositions[-1] in currentPositions and + line.startswith("|") and + pos in delimiterPositions for pos in currentPositions) + + # For partial separators (mix of + and |) + currentPositions = [i for i, char in enumerate(line) if (char in delimiters and i != 0)] + print(f"Partial separator - Found delimiters at positions: {currentPositions}") + 
print(f"Characters at those positions: {[line[pos] for pos in currentPositions]}") + return all(delimiterPositions[-1] in currentPositions and + (line.startswith("+") or line.startswith("|")) and + pos in delimiterPositions for pos in currentPositions) + separatorIndices = [i for i, line in enumerate(lines) if isSeparator(line)] @@ -334,7 +333,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR cell.position = delimiterIndex # Position of cell delimiter + # Set alignment as defined by header separator line - cell.calculateAndSetAlignment() + cell.calculateAndSetAlignment(headerDelimiterPositions, delimiterPositions, defaultAlignments, hasHeader) while delimiterIndex > delimiterPositions[columnIndex]: columnIndex += 1 @@ -358,7 +357,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR auxDelimiterIndex += len(content) + 1 cell = rows[-1][auxiliarCellIndex] cell.position = auxDelimiterIndex # Position of cell delimiter + - cell.calculateAndSetAlignment() + cell.calculateAndSetAlignment(headerDelimiterPositions, delimiterPositions, defaultAlignments, hasHeader) while auxDelimiterIndex > delimiterPositions[auxiliarCellIndex]: auxiliarCellIndex += 1 auxiliarCellIndex += 1 -- GitLab From 62a4c4a4e182ff641fda5afc32eb2977fce963f9 Mon Sep 17 00:00:00 2001 From: ankraft <an.kraft@gmail.com> Date: Fri, 21 Feb 2025 14:33:30 +0100 Subject: [PATCH 04/29] Adapted debug output. Corrections in the main script. 
Renamed wrong filename --- .gitignore | 1 + toMkdocs/gridTableTools.py | 229 ++++++++++-------- .../{makrdownTools.py => markdownTools.py} | 58 +++-- toMkdocs/toMkdocs.py | 59 ++++- 4 files changed, 208 insertions(+), 139 deletions(-) rename toMkdocs/{makrdownTools.py => markdownTools.py} (92%) diff --git a/.gitignore b/.gitignore index 6d0869a..4fb7084 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ */ts-* */.python-version .python-version +toMkdocs/__pycache__ diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index c690c2c..22bc895 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -6,10 +6,25 @@ # """ Tools for working with grid tables in markdown files. """ -from typing import Optional +from typing import Optional, Callable from regexMatches import * +_alignLeft = 'align="left"' +_alignRight = 'align="right"' +_alignCenter = 'align="center"' + +printInfo = print +printDebug = print +printError = print + +def setLoggers(info:Callable=print, debug:Callable=print, error:Callable=print) -> None: + global printInfo, printDebug, printError + + printInfo = info + printDebug = debug + printError = error + class GridCell: """ Represents a grid table cell. """ @@ -27,8 +42,18 @@ class GridCell: self.auxiliarIndex:int = 0 - def calculateAndSetAlignment(self, headerDelimiterPositions:list[int], delimiterPositions:list[int], defaultAlignments:list[str], hasHeader:bool) -> None: + def calculateAndSetAlignment(self, + headerDelimiterPositions:list[int], + delimiterPositions:list[int], + defaultAlignments:list[str], + hasHeader:bool) -> None: """ Set the alignment of the cell based on the position of the delimiter. + + Args: + headerDelimiterPositions: The positions of the header delimiters. + delimiterPositions: The positions of the delimiters. + defaultAlignments: The default alignments. + hasHeader: True if the table has a header, False otherwise. 
""" if self.position is None: raise ValueError('Cell position must be set before calculating alignment.') @@ -46,17 +71,17 @@ class GridCell: else: raise ValueError('Invalid table formatting') else: - body_delimiter_index = 0 - while body_delimiter_index in range(len(defaultAlignments)) and self.position > delimiterPositions[body_delimiter_index]: - body_delimiter_index += 1 - if body_delimiter_index in range(len(defaultAlignments)): - if self.position < delimiterPositions[body_delimiter_index]: - self.alignment = defaultAlignments[body_delimiter_index] - elif self.position == delimiterPositions[body_delimiter_index]: - self.alignment = defaultAlignments[body_delimiter_index] - body_delimiter_index += 1 + bodyDelimiterIndex = 0 + while bodyDelimiterIndex < len(defaultAlignments) and self.position > delimiterPositions[bodyDelimiterIndex]: + bodyDelimiterIndex += 1 + if bodyDelimiterIndex < len(defaultAlignments): + if self.position < delimiterPositions[bodyDelimiterIndex]: + self.alignment = defaultAlignments[bodyDelimiterIndex] + elif self.position == delimiterPositions[bodyDelimiterIndex]: + self.alignment = defaultAlignments[bodyDelimiterIndex] + bodyDelimiterIndex += 1 else: - raise ValueError("Invalid table formatting") + raise ValueError('Invalid table formatting') def __str__(self): @@ -91,6 +116,7 @@ class GridRow(): def __repr__(self): return self.__str__() + class GridRowsTracker(): """ Represents the document object. 
""" def __init__(self, size:int) -> None: @@ -155,9 +181,10 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR cell.content = _c + nextListElementMark # Add list element end mark to know when the list element ends elif cell.listFlag and len(_c) > 0: # any other content when handling list is concatenated to the last list element _c = re.sub(r'\\\s*$', '\n', _c) - cell.content += _c + nextListElementMark #add the list element end mark - elif not _c: # separation between list and other paragraph - cell.content += '\n' if not cell['content'].endswith('\n') else "" + cell.content = _c + nextListElementMark #add the list element end mark + elif not _c: # empty line. separation between list and other paragraph + # cell.content = '\n' if not cell.content.endswith('\n') else "" + cell.content = '\n' # cell content is always empty / None here. else: cell.content = re.sub(r'\\\s*$', '\n', _c) else: # Cell has content @@ -173,8 +200,8 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR _c = re.sub(r'\\\s*$', '\n', _c) cell.content += " " + _c + nextListElementMark #add list element end mark elif len(_c) == 0: # separation between list and other paragraph - if cell.list_flag: - cell.list_flag = False + if cell.listFlag: + cell.listFlag = False cell.content += '\n\n' #end list by \n #content = re.sub(r'\\\s*$', "\n", content.strip()) cell.content += '\n' if not cell.content.endswith('\n') else '' @@ -205,7 +232,8 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR row[columnIndex].colspanAdjusted = True # Mark cell as adjusted - def check_delimiter_alignment(line: str, delimiterPositions:list[int], delimiters: str = "|+") -> bool: + + def checkDelimiterAlignment(line: str, delimiterPositions:list[int], delimiters: str = "|+") -> bool: """ Check if delimiters in a row align with expected positions. 
@@ -220,33 +248,29 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR if not line or not delimiterPositions: return False - print(f"\nChecking line: '{line}'") - print(f"Expected delimiter positions: {delimiterPositions}") + printDebug(f'\nChecking line: "{line}"') + printDebug(f'Expected delimiter positions: {delimiterPositions}') # For full separator lines (only +) if '+' in line and '|' not in line: - currentPositions = [i for i, char in enumerate(line) if (char == '+' and i != 0)] - print(f"Full separator line - Found + at positions: {currentPositions}") - return all(delimiterPositions[-1] in currentPositions and - line.startswith("+") and - pos in delimiterPositions for pos in currentPositions) + currentPositions = [i for i, char in enumerate(line) if (char == '+' and i > 0)] + printDebug(f'Full separator line - Found + at positions: {currentPositions}') + return all(delimiterPositions[-1] in currentPositions and line.startswith('+') and pos in delimiterPositions + for pos in currentPositions) # For data lines (only |) if '|' in line and '+' not in line: - currentPositions = [i for i, char in enumerate(line) if (char == '|' and i != 0)] - print(f"Data line - Found | at positions: {currentPositions}") - return all(delimiterPositions[-1] in currentPositions and - line.startswith("|") and - pos in delimiterPositions for pos in currentPositions) + currentPositions = [i for i, char in enumerate(line) if (char == '|' and i > 0)] + printDebug(f'Data line - Found | at positions: {currentPositions}') + return all(delimiterPositions[-1] in currentPositions and line.startswith("|") and pos in delimiterPositions + for pos in currentPositions) # For partial separators (mix of + and |) - currentPositions = [i for i, char in enumerate(line) if (char in delimiters and i != 0)] - print(f"Partial separator - Found delimiters at positions: {currentPositions}") - print(f"Characters at those positions: {[line[pos] for pos in currentPositions]}") - 
return all(delimiterPositions[-1] in currentPositions and - (line.startswith("+") or line.startswith("|")) and - pos in delimiterPositions for pos in currentPositions) - + currentPositions = [i for i, char in enumerate(line) if (char in delimiters and i > 0)] + printDebug(f'Partial separator - Found delimiters at positions: {currentPositions}') + printDebug(f'Characters at those positions: {[line[pos] for pos in currentPositions]}') + return all(delimiterPositions[-1] in currentPositions and line.startswith(('+', '|')) and pos in delimiterPositions + for pos in currentPositions) separatorIndices = [i for i, line in enumerate(lines) if isSeparator(line)] @@ -254,8 +278,8 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR raise ValueError('No valid separators found in the provided grid table.') # Calculate max number of columns - delimiterPositions:list[int] = [] - numberOfColumns = 0 + delimiterPositions = [] + numberOfColumns:int = 0 for separatorIndex in separatorIndices: if (_cnt := lines[separatorIndex].count('+') - 1) > numberOfColumns: @@ -263,10 +287,10 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR delimiterPositions = [] for rowIndex in range(numberOfColumns): delimiterPositionsStart = delimiterPositions[rowIndex - 1] if rowIndex != 0 else 0 - delPositions = [lines[separatorIndex].find(delimiter, delimiterPositionsStart + 1) for delimiter in '+' if delimiter in lines[separatorIndex][delimiterPositionsStart + 1:]] + delPositions = [lines[separatorIndex].find(delimiter, delimiterPositionsStart + 1) + for delimiter in '+' if delimiter in lines[separatorIndex][delimiterPositionsStart + 1:]] delimiterPositions.append(min(delPositions) if delPositions else -1) - # Determine delimter positions and alignments headerRows:GridTableRowList = [] dataRows:GridTableRowList = [] @@ -278,15 +302,22 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR parts = re.split(r'\+', 
lines[index].strip('+')) #Calculate default alignments and positions of delimiters for partIndex in range(len(parts)): - if parts[partIndex].startswith(':') and not parts[partIndex].endswith(':'): # Left alignment - defaultAlignments.append('align="left"') - elif not parts[partIndex].startswith(':') and parts[partIndex].endswith(':'): # Right alignment - defaultAlignments.append('align="right"') + # Left alignment + if parts[partIndex].startswith(':') and not parts[partIndex].endswith(':'): + defaultAlignments.append(_alignLeft) + + # Right alignment + elif not parts[partIndex].startswith(':') and parts[partIndex].endswith(':'): + defaultAlignments.append(_alignRight) + + # Center alignment else: - defaultAlignments.append('align="center"') # Center alignment + defaultAlignments.append(_alignCenter) + # Delimiter position delimiterPositionsStart = delimiterPositions[partIndex - 1] if partIndex != 0 else 0 - delPositions = [lines[index].find(delimiter, delimiterPositionsStart + 1) for delimiter in '+' if delimiter in lines[index][delimiterPositionsStart + 1:]] + delPositions = [lines[index].find(delimiter, delimiterPositionsStart + 1) + for delimiter in '+' if delimiter in lines[index][delimiterPositionsStart + 1:]] headerDelimiterPositions.append(min(delPositions) if delPositions else -1) if not hasHeader: @@ -296,11 +327,13 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR # Calculate default alignments and positions of delimiters for part_index in range(len(parts)): if parts[part_index].startswith(':') and not parts[part_index].endswith(':'): - default_alignments.append('align="left"') + default_alignments.append(_alignLeft) + elif not parts[part_index].startswith(':') and parts[part_index].endswith(':'): - default_alignments.append('align="right"') + default_alignments.append(_alignRight) + else: - default_alignments.append('align="center"') + default_alignments.append(_alignCenter) for rowNumber in range(len(separatorIndices) - 
1): rows:list[GridRow] = [] @@ -314,8 +347,8 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR if isSeparator(line) and not inDataRow: inDataRow = True # Add delimiter alignment check for separator lines - if not check_delimiter_alignment(line, delimiterPositions): - raise ValueError(f"Misaligned delimiters in separator row: {line}") + if not checkDelimiterAlignment(line, delimiterPositions): + raise ValueError(f'Misaligned delimiters in separator row: {line}') parts = re.split(r'\s*\+\s*', line.strip('+')) delimiterIndex = 0 @@ -343,17 +376,17 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR # Regular data row or partial separator if matchGridTableBodySeparator.match(line): # Partial separator # Add delimiter alignment check for partial separators - if not check_delimiter_alignment(line, delimiterPositions): - raise ValueError(f"Misaligned delimiters in partial separator: {line}") + if not checkDelimiterAlignment(line, delimiterPositions): + raise ValueError(f'Misaligned delimiters in partial separator: {line}') - cellsContent = re.split(r"[\|\+]", line.strip('|').strip('+')) # (?<!\\)[\|\+] + cellsContent = re.split(r'[\|\+]', line.strip('|').strip('+')) # (?<!\\)[\|\+] #Add another row, set delimiters for each cell rows.append(GridRow(numberOfColumns)) auxDelimiterIndex = 0 auxiliarCellIndex = 0 for columnIndex, content in enumerate(cellsContent): - if auxiliarCellIndex in range(numberOfColumns): + if auxiliarCellIndex < numberOfColumns: auxDelimiterIndex += len(content) + 1 cell = rows[-1][auxiliarCellIndex] cell.position = auxDelimiterIndex # Position of cell delimiter + @@ -398,15 +431,14 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR continue else: - raise ValueError("More cells than columns found") + raise ValueError('More cells than columns found') else: # Data row - cellsContent = line.strip() - cellsContent = re.split(r"\|", line.strip('|')) + 
cellsContent = re.split(r'\|', line.strip('|')) # Add delimiter alignment check - if not check_delimiter_alignment(line, delimiterPositions): - raise ValueError(f"Misaligned delimiters in row: {line}") + if not checkDelimiterAlignment(line, delimiterPositions): + raise ValueError(f'Misaligned delimiters in row: {line}') columnCellIndex = 0 if len(cellsContent) < numberOfColumns: # Colspan: Positions of | with respect to + need to be determined @@ -426,9 +458,9 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR rowIndex = rowsTracker[columnIndex] handleCellContent(rows[rowIndex][columnIndex], content) else: - raise ValueError("More cells than columns found") + raise ValueError('More cells than columns found') else: - raise ValueError("No separator line found for row starting") + raise ValueError('No separator line found for row starting') if hasHeader and start >= headerSeparatorIndex: # table_row and auxiliar_row are part of data_rows for row in rows: @@ -451,7 +483,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR for cell in gridRow: if cell.content is not None: # Replacing "<" by < - cell.content = cell.content.replace("<", "<") + cell.content = cell.content.replace('<', '<') # Bold replacements # Regex to detect markdown bold formatting in cell content @@ -463,7 +495,6 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR if cell.content is not None: cell.content = matchItalic.sub(r'<i>\g<text></i>', cell.content) - # Correct newlines characters for headerRow in headerRows: for cell in headerRow: @@ -525,25 +556,11 @@ def generateHtmlTableWithSpans(gridTable:str) -> str: Returns: The HTML table in string format. 
""" - debug_output = [] - def debug_print(msg): - debug_output.append(str(msg)) # Convert message to string - try: - # Redirect print statements to our debug collector - global print - original_print = print - print = debug_print - gridHeader, gridBody = parseGridTableWithSpans(gridTable) - - # Restore original print - print = original_print - except Exception as e: - debug_print("Grid table could not be generated") - debug_text = "<br>".join(debug_output) # Now all items are strings - return f'HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE.<br><pre>{debug_text}</pre>' + printDebug('Grid table could not be generated') + return f'HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE' # Generate table HTML... html = '<table>\n' @@ -564,47 +581,47 @@ def generateHtmlTableWithSpans(gridTable:str) -> str: continue else: # Prepare content, in case there's a list - if cell.content is not None and (matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+)@", cell.content)): # Update cell in new row - list = "<ul>" + if cell.content is not None and (matches := re.findall(r'\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+)@', cell.content)): # Update cell in new row + list = '<ul>' # Build list the matches for match in matches: - list += "<li>" + match[1] + "</li>" - list += "</ul>" - cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+(?:(?!@).)+@)+", list, cell.content) + list += '<li>' + match[1] + '</li>' + list += '</ul>' + cell.content = re.sub(r'(\s*([-*+]|\s*\d+\.)\s+(?:(?!@).)+@)+', list, cell.content) # Enforce left alignment if cell contains a list - cell.alignment = "align=\"left\"" + cell.alignment = _alignLeft - rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else "" - colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else "" - html += f" <th{rowspan}{colspan} {cell.alignment}>{cell.content}</th>\n" - html += " </tr>\n" - html += " </thead>\n" + rowspan = f' rowspan="{cell.rowspan}"' if cell.rowspan > 1 else '' + colspan = f' 
colspan="{cell.colspan}"' if cell.colspan > 1 else '' + html += f' <th{rowspan}{colspan} {cell.alignment}>{cell.content}</th>\n' + html += ' </tr>\n' + html += ' </thead>\n' - html += " <tbody>\n" + html += ' <tbody>\n' for row in gridBody: - html += " <tr>\n" + html += ' <tr>\n' for cell in row: if cell.rowspan == 0 or cell.colspan == 0: continue else: #Prepare content, in case there's a list - if cell.content is not None and (matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+)@", cell.content)): # Update cell in new row - list = "<ul>" + if cell.content is not None and (matches := re.findall(r'\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+)@', cell.content)): # Update cell in new row + list = '<ul>' # Build list the matches for match in matches: - list += "<li>" + match[1] + "</li>" - list += "</ul>" - cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+(?:(?!@).)+@)+",list, cell.content) + list += f'<li>{match[1]}</li>' + list += '</ul>' + cell.content = re.sub(r'(\s*([-*+]|\s*\d+\.)\s+(?:(?!@).)+@)+', list, cell.content) # Enforce left alignment if cell contains a list - cell.alignment = "align=\"left\"" + cell.alignment = _alignLeft - rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else "" - colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else "" - html += f" <td{rowspan}{colspan} {cell.alignment}>{cell.content}</td>\n" - html += " </tr>\n" + rowspan = f' rowspan="{cell.rowspan}"' if cell.rowspan > 1 else '' + colspan = f' colspan="{cell.colspan}"' if cell.colspan > 1 else '' + html += f' <td{rowspan}{colspan} {cell.alignment}>{cell.content}</td>\n' + html += ' </tr>\n' - html += " </tbody>\n" - html += "</table>" + html += ' </tbody>\n' + html += '</table>' return html diff --git a/toMkdocs/makrdownTools.py b/toMkdocs/markdownTools.py similarity index 92% rename from toMkdocs/makrdownTools.py rename to toMkdocs/markdownTools.py index 169b634..56ea86e 100644 --- a/toMkdocs/makrdownTools.py +++ b/toMkdocs/markdownTools.py @@ -9,12 +9,13 @@ """ 
Various tools for markdown processing """ from __future__ import annotations +from typing import Callable from dataclasses import dataclass import base64, hashlib from enum import Enum, auto -from gridTableTools import generateHtmlTableWithSpans +from gridTableTools import generateHtmlTableWithSpans, setLoggers as setGridTableLoggers from regexMatches import * # TODO use a verbosity level instead @@ -23,12 +24,18 @@ veryVerbose = False printInfo = print printDebug = print +printError = print -def setScreenPrinters(info:callable = print, debug:callable = print) -> None: - global printInfo, printDebug +def setLoggers(info:Callable = print, debug:Callable = print, error:Callable= print) -> None: + global printInfo, printDebug, printError printInfo = info printDebug = debug + printError = error + + # Set the loggers for the grid table tools + setGridTableLoggers(info, debug, error) + def _shortHash(value:str, length:int) -> str: @@ -365,6 +372,28 @@ def analyseMarkdown(filename:str) -> Document: The document object. """ + gridTable:str = '' + + def processGridTable() -> None: + """ Process a grid table and convert it to an html table. + + This function adds the html table to the output clauses and + clears the gridTable variable. + """ + nonlocal gridTable + + htmltable:str = '' + try: + htmltable = generateHtmlTableWithSpans(gridTable) + printDebug(htmltable) + except Exception as e: + printError(f"Error: {e}") + # TODO move this outside of the analyseMarkdown function !!! + for row in htmltable: + outClauses[-1].append(Line(row, LineType.TABLEROW)) + gridTable = '' + + printInfo(f'Analyzing "{filename}"') # Read the file. @@ -381,10 +410,7 @@ def analyseMarkdown(filename:str) -> Document: inTable = False tableHasSeparator = False inGridTable = False - gridTableHasSeparator = False - gridTable = "" for line in inLines: - # Detect and handle codefences # For the moment we support only codefences that start and end # with 3 backticks. 
This is the most common way to define codefences. @@ -439,18 +465,7 @@ def analyseMarkdown(filename:str) -> Document: continue else: inGridTable = False - # Mark the previous line as the last row in the table - #outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW - # print(gridTable) - try: - htmltable = generateHtmlTableWithSpans(gridTable) - print(htmltable) - except Exception as e: - print(f"Error: {e}") - # TODO move this outside of the analyseMarkdown function !!! - for row in htmltable: - outClauses[-1].append(Line(row, LineType.TABLEROW)) - gridTable = "" + processGridTable() # continue with other matches # Detect notes @@ -486,9 +501,10 @@ def analyseMarkdown(filename:str) -> Document: # Just add the line to the current clause as text outClauses[-1].append(Line(line, _lineType)) - return Document(outClauses, footnotes) - - + # Process still unfinished cases + if gridTable: + processGridTable() + return Document(outClauses, footnotes) diff --git a/toMkdocs/toMkdocs.py b/toMkdocs/toMkdocs.py index 69037d1..af4a286 100644 --- a/toMkdocs/toMkdocs.py +++ b/toMkdocs/toMkdocs.py @@ -12,13 +12,49 @@ from __future__ import annotations import argparse, os, shutil from rich import print -from makrdownTools import Line, Document, analyseMarkdown, setScreenPrinters +from markdownTools import Line, Document, analyseMarkdown, setLoggers from regexMatches import match2spaceListIndention verbose = False veryVerbose = False +def printDebug(text:str) -> None: + """ Print a debug message. + + Args: + text: The text of the debug message. + """ + if verbose: + print(f'[dim]{text}') + + +def printInfo(text:str) -> None: + """ Print an information message. + + Args: + text: The text of the information message. + """ + print(f'[green]{text}') + + +def printWarning(text:str) -> None: + """ Print a warning message. + + Args: + text: The text of the warning message. + """ + print(f'[yellow]{text}') + + +def printError(text:str) -> None: + """ Print an error message. 
+ + Args: + text: The text of the error message. + """ + print(f'[red]{text}') + def prepareForMkdocs(document:Document, includeHangingParagraphs:bool = False) -> None: """ Prepare the clauses for MkDocs. This includes removing the heading from the clauses and marking the clauses that are only for navigation. @@ -47,7 +83,7 @@ def prepareForMkdocs(document:Document, includeHangingParagraphs:bool = False) - # Check if there is a sub-clause in the next clause if i + 1 < len(document.clauses) and document.clauses[i+1].level > clause.level: # This is a hanging paragraph. Remove the text from the current clause. - print(f'[yellow]Hanging paragraph in clause "{clause.title}" {"(removed)" if not includeHangingParagraphs else "(kept)"}') + printWarning(f'Hanging paragraph in clause "{clause.title}" {"(removed)" if not includeHangingParagraphs else "(kept)"}') if not includeHangingParagraphs: document.clauses[i].lines = [] else: @@ -72,15 +108,14 @@ def writeClausesMkDocs(document:Document, filename:str, navTitle:str, addNavTitl addNavTitle: Add the title as an extra navigation level to the navigation file. """ - print(f'[green]Writing clauses to files') + printInfo(f'Writing clauses to files') # create directory first os.makedirs(f'{os.path.dirname(filename)}/{navTitle}', exist_ok = True) # Write the files for i, f in enumerate(document.clauses): # write to single files, even empty ones - if verbose: - print(f'[dim]Writing "{f.clauseNumber}.md" - "{f.title}"') + printDebug(f'Writing "{f.clauseNumber}.md" - "{f.title}"') with open(f'{os.path.dirname(filename)}/{navTitle}/{f.clauseNumber}.md', 'w') as file: # Add one empty line before the clause. 
This is done to avoid # a bug in MkDocs that does not display the first line of a clause @@ -90,11 +125,10 @@ def writeClausesMkDocs(document:Document, filename:str, navTitle:str, addNavTitl # write nav.yml file - print(f'[green]Writing "_nav.yml"') + printInfo(f'Writing "_nav.yml"') indentation = ' ' if addNavTitle else '' # TODO make number of spaces configurable with open(f'{os.path.dirname(filename)}/_nav.yml', 'w') as file: - if veryVerbose: - print(f'[dim]Writing navigation file') + printDebug(f'Writing navigation file') if addNavTitle: file.write(f'{indentation}- {navTitle}:\n') for i, f in enumerate(document.clauses): @@ -130,10 +164,10 @@ def copyMediaFiles(filename:str, navTitle:str, mediaDirectory:str = 'media') -> targetDirectory = f'{os.path.dirname(filename)}/{navTitle}/{mediaDirectory}' if os.path.exists(sourceDirectory): - print(f'[green]Copying media files from "{sourceDirectory}" to "{targetDirectory}"') + printInfo(f'Copying media files from "{sourceDirectory}" to "{targetDirectory}"') shutil.copytree(sourceDirectory, targetDirectory, dirs_exist_ok = True) else: - print(f'[red]Media directory "{sourceDirectory}" does not exist') + printError(f'Media directory "{sourceDirectory}" does not exist') def processDocument(args:argparse.Namespace) -> None: @@ -177,8 +211,9 @@ def main() -> None: parser.add_argument('document', type = str, help = 'a oneM2M markdown specification document to process') args = parser.parse_args() - setScreenPrinters(info = lambda text: print(f'[green]{text}'), - debug = lambda text: print(f'[dim]{text}')) + setLoggers(info = printInfo, + debug = printDebug, + error = printError) processDocument(args) -- GitLab From b8c12235e879cd971186b0e362ad9e7b838ea051 Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Fri, 21 Feb 2025 15:52:34 +0100 Subject: [PATCH 05/29] Raising runtime error --- toMkdocs/gridTableTools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index 22bc895..492b92d 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -560,7 +560,7 @@ def generateHtmlTableWithSpans(gridTable:str) -> str: gridHeader, gridBody = parseGridTableWithSpans(gridTable) except Exception as e: printDebug('Grid table could not be generated') - return f'HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE' + raise RuntimeError(f'HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE:{str(e)}') # Generate table HTML... html = '<table>\n' -- GitLab From 0f7c67d095e6c336a75b9a7a9ab34479bc993c41 Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Fri, 21 Feb 2025 16:12:28 +0100 Subject: [PATCH 06/29] Add breakline in error output message --- toMkdocs/gridTableTools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index 492b92d..071ecae 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -560,7 +560,7 @@ def generateHtmlTableWithSpans(gridTable:str) -> str: gridHeader, gridBody = parseGridTableWithSpans(gridTable) except Exception as e: printDebug('Grid table could not be generated') - raise RuntimeError(f'HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE:{str(e)}') + raise RuntimeError(f'HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE:\n{str(e)}') # Generate table HTML... 
html = '<table>\n' -- GitLab From 0e14f4fdfa6415cfd1b6f1c083dac5a614aa8f8d Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Fri, 21 Feb 2025 18:32:00 +0100 Subject: [PATCH 07/29] Fix for only body grid tables --- toMkdocs/gridTableTools.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index 071ecae..305c945 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -323,17 +323,17 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR if not hasHeader: #Set default alignments from the first separator parts = re.split(r'\+', lines[0].strip('+')) - default_alignments = [] + defaultAlignments = [] # Calculate default alignments and positions of delimiters for part_index in range(len(parts)): if parts[part_index].startswith(':') and not parts[part_index].endswith(':'): - default_alignments.append(_alignLeft) + defaultAlignments.append(_alignLeft) elif not parts[part_index].startswith(':') and parts[part_index].endswith(':'): - default_alignments.append(_alignRight) + defaultAlignments.append(_alignRight) else: - default_alignments.append(_alignCenter) + defaultAlignments.append(_alignCenter) for rowNumber in range(len(separatorIndices) - 1): rows:list[GridRow] = [] -- GitLab From 0d9f75e0ecc4652aea985de1f08944ec033d8cba Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Fri, 21 Feb 2025 18:48:24 +0100 Subject: [PATCH 08/29] Fix to take into account spaces at the beginning of the line --- toMkdocs/gridTableTools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index 305c945..2b1f2f7 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -160,7 +160,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR nextListElementMark 
= '@' # Split the input into lines - lines:list[str] = [line.strip() for line in gridTable.strip().split('\n')] + lines:list[str] = [line for line in gridTable.split('\n')] # Detect separator lines by pattern (it does not take into account partial separators -- GitLab From 0fc4d7e4afb5ff7aaf21200719a098f5d6e00fdb Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Mon, 24 Feb 2025 13:34:42 +0100 Subject: [PATCH 09/29] Checking alignment of end table separator --- toMkdocs/gridTableTools.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index 2b1f2f7..1d35fd4 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -335,6 +335,10 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR else: defaultAlignments.append(_alignCenter) + #Check end table delimiter alignment (not checked during the lines processing) + if not checkDelimiterAlignment(lines[-1], delimiterPositions): + raise ValueError(f'Misaligned delimiters in end table separator: {lines[-1]}') + for rowNumber in range(len(separatorIndices) - 1): rows:list[GridRow] = [] rowsTracker:GridRowsTracker -- GitLab From 582116cd26cb0880194670f100fc924b24e10ef0 Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Mon, 24 Feb 2025 15:35:03 +0100 Subject: [PATCH 10/29] Strip trailing spaces from grid table lines --- toMkdocs/gridTableTools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index 1d35fd4..e9070e8 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -160,7 +160,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR nextListElementMark = '@' # Split the input into lines - lines:list[str] = [line for line in gridTable.split('\n')] + lines:list[str] = [line for line in 
gridTable.rstrip().split('\n')] # Detect separator lines by pattern (it does not take into account partial separators -- GitLab From be71662cdcee3ca88269ab42093b6bab0740ac0d Mon Sep 17 00:00:00 2001 From: ankraft <an.kraft@gmail.com> Date: Mon, 24 Feb 2025 20:14:50 +0100 Subject: [PATCH 11/29] Fixed some regex --- toMkdocs/gridTableTools.py | 37 +++++++++++++++++++++---------------- toMkdocs/regexMatches.py | 4 ++-- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index e9070e8..1c60e40 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -13,6 +13,8 @@ from regexMatches import * _alignLeft = 'align="left"' _alignRight = 'align="right"' _alignCenter = 'align="center"' +_nextListElementMark = '∆' # Marks a continuing list in the line before. !!! Must be a single character + printInfo = print printDebug = print @@ -157,7 +159,6 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR defaultAlignments:list[str] = [] headerDelimiterPositions:list[int] = [] delimiterPositions:list[int] = [] - nextListElementMark = '@' # Split the input into lines lines:list[str] = [line for line in gridTable.rstrip().split('\n')] @@ -178,10 +179,10 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR if _c.startswith('- '): # List in a cell cell.listFlag = True _c = re.sub(r'\\\s*$', '\n', _c) - cell.content = _c + nextListElementMark # Add list element end mark to know when the list element ends + cell.content = _c + _nextListElementMark # Add list element end mark to know when the list element ends elif cell.listFlag and len(_c) > 0: # any other content when handling list is concatenated to the last list element _c = re.sub(r'\\\s*$', '\n', _c) - cell.content = _c + nextListElementMark #add the list element end mark + cell.content = _c + _nextListElementMark #add the list element end mark elif not _c: # empty line. 
separation between list and other paragraph # cell.content = '\n' if not cell.content.endswith('\n') else "" cell.content = '\n' # cell content is always empty / None here. @@ -194,11 +195,13 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR #cell['content'] = cell['content'].strip("\n") cell.listFlag = True _c = re.sub(r'\\\s*$', '\n', _c) - cell.content += _c + nextListElementMark # Add list element end mark to know when the list element ends + cell.content += _c + _nextListElementMark # Add list element end mark to know when the list element ends elif cell.listFlag and len(_c) > 0: # any other content when handling list is concatenated to the last list element - cell.content = cell.content.strip(nextListElementMark) #remove list element end mark + # cell.content = cell.content.strip(nextListElementMark) #remove list element end mark + cell.content = cell.content.removesuffix(_nextListElementMark) #remove list element end mark + _c = re.sub(r'\\\s*$', '\n', _c) - cell.content += " " + _c + nextListElementMark #add list element end mark + cell.content += ' ' + _c + _nextListElementMark #add list element end mark elif len(_c) == 0: # separation between list and other paragraph if cell.listFlag: cell.listFlag = False @@ -222,8 +225,6 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR row[columnIndex].colspan += 1 if position == delimiterPositions[len(delimiterPositions) - 1]: # last cell in row, adjust colspan to get max number columns colspan_allocated = row[columnIndex].colspan - #for cell_index in range(number_of_parts): - # colspan_allocated += row[cell_index].colspan row[columnIndex].colspan += numberOfColumns - colspan_allocated - columnIndex elif position < delimiterPositions[j]: raise ValueError("Wrong cell formatting") @@ -435,7 +436,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR continue else: - raise ValueError('More cells than columns found') + raise 
ValueError(f'More cells than columns found ({len(cellsContent)} {numberOfColumns})') else: # Data row cellsContent = re.split(r'\|', line.strip('|')) @@ -462,7 +463,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR rowIndex = rowsTracker[columnIndex] handleCellContent(rows[rowIndex][columnIndex], content) else: - raise ValueError('More cells than columns found') + raise ValueError(f'More cells than columns found ({len(cellsContent)} {numberOfColumns})') else: raise ValueError('No separator line found for row starting') @@ -489,11 +490,12 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR # Replacing "<" by < cell.content = cell.content.replace('<', '<') + # Bold replacements # Regex to detect markdown bold formatting in cell content if cell.content is not None: - cell.content = matchBold.sub(r'<strong>\g<text></strong>', cell.content) - + cell.content = matchBold.sub(r'\1<strong>\g<text></strong>', cell.content) + # Italic replacements # Regex to detect markdown italic formatting in cell content if cell.content is not None: @@ -560,6 +562,9 @@ def generateHtmlTableWithSpans(gridTable:str) -> str: Returns: The HTML table in string format. """ + regex1 = r'\s*([-*+]|\s*\d+\.)\s+((?:(?!' 
+ re.escape(_nextListElementMark) + r').)+)' + re.escape(_nextListElementMark) + regex2 = r'(\s*([-*+]|\s*\d+\.)\s+(?:(?!∆).)+' + re.escape(_nextListElementMark) + r')+' + try: gridHeader, gridBody = parseGridTableWithSpans(gridTable) except Exception as e: @@ -585,13 +590,13 @@ def generateHtmlTableWithSpans(gridTable:str) -> str: continue else: # Prepare content, in case there's a list - if cell.content is not None and (matches := re.findall(r'\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+)@', cell.content)): # Update cell in new row + if cell.content is not None and (matches := re.findall(regex1, cell.content)): # Update cell in new row list = '<ul>' # Build list the matches for match in matches: list += '<li>' + match[1] + '</li>' list += '</ul>' - cell.content = re.sub(r'(\s*([-*+]|\s*\d+\.)\s+(?:(?!@).)+@)+', list, cell.content) + cell.content = re.sub(regex2, list, cell.content) # Enforce left alignment if cell contains a list cell.alignment = _alignLeft @@ -610,13 +615,13 @@ def generateHtmlTableWithSpans(gridTable:str) -> str: continue else: #Prepare content, in case there's a list - if cell.content is not None and (matches := re.findall(r'\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+)@', cell.content)): # Update cell in new row + if cell.content is not None and (matches := re.findall(regex1, cell.content)): # Update cell in new row list = '<ul>' # Build list the matches for match in matches: list += f'<li>{match[1]}</li>' list += '</ul>' - cell.content = re.sub(r'(\s*([-*+]|\s*\d+\.)\s+(?:(?!@).)+@)+', list, cell.content) + cell.content = re.sub(regex2, list, cell.content) # Enforce left alignment if cell contains a list cell.alignment = _alignLeft diff --git a/toMkdocs/regexMatches.py b/toMkdocs/regexMatches.py index 7b784c1..ba1fc15 100644 --- a/toMkdocs/regexMatches.py +++ b/toMkdocs/regexMatches.py @@ -36,5 +36,5 @@ matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECAS matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE) matchTableSeparator = 
re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE) -matchBold = re.compile(r'(?<!\S)(\*\*|__)(?P<text>.+?)(?<!\\)\1(?!\S)') -matchItalic = re.compile(r'(?<!\S)(\*|_)(?P<text>.+?)(?<!\\)\1(?!\S)') +matchBold = re.compile(r'(^|\s)(\*\*|__)(?P<text>.+?)\2(?!\w)') +matchItalic = re.compile(r'(^|\s)(\*|_)(?P<text>.+?)(?<!\\)\3(\s|$)') -- GitLab From 4f4e21feba31874d4dbd7e99c07485e1d198e61a Mon Sep 17 00:00:00 2001 From: ankraft <an.kraft@gmail.com> Date: Mon, 24 Feb 2025 20:15:27 +0100 Subject: [PATCH 12/29] Further improved dangling spaces after last grid table rows --- toMkdocs/gridTableTools.py | 1 + 1 file changed, 1 insertion(+) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index 1c60e40..3738dd5 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -349,6 +349,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR if rowLines: # Combine multiline content into single strings for each cell for line in rowLines: + line = line.rstrip() if isSeparator(line) and not inDataRow: inDataRow = True # Add delimiter alignment check for separator lines -- GitLab From ea4853f4293236cfa3bbfc92c7c799a6c4771167 Mon Sep 17 00:00:00 2001 From: ankraft <an.kraft@gmail.com> Date: Mon, 24 Feb 2025 20:16:42 +0100 Subject: [PATCH 13/29] Improved support for stringify documents. 
Added support for stdin/stdout file processing --- toMkdocs/markdownTools.py | 51 +++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/toMkdocs/markdownTools.py b/toMkdocs/markdownTools.py index 56ea86e..6c9dc41 100644 --- a/toMkdocs/markdownTools.py +++ b/toMkdocs/markdownTools.py @@ -9,7 +9,7 @@ """ Various tools for markdown processing """ from __future__ import annotations -from typing import Callable +from typing import Callable, Optional from dataclasses import dataclass import base64, hashlib @@ -69,6 +69,7 @@ class LineType(Enum): TABLESEPARATOR = auto() TABLEROW = auto() TABLELASTROW = auto() + RAWHTML = auto() @dataclass @@ -78,6 +79,16 @@ class Line: lineType:LineType = LineType.TEXT + def __str__(self) -> str: + """ Return the line as a string. """ + return self.text + + + def __repr__(self) -> str: + """ Return the line as a string. """ + return self.__str__() + + @dataclass class Clause: """ Represents a clause in the markdown file. """ @@ -176,6 +187,18 @@ class Clause: The number of characters in the clause. """ return sum([ len(l.text.strip()) for l in self.lines ]) + + + def __str__(self) -> str: + """ Return the clause as a string. """ + return ''.join([str(l) for l in self.lines ]) + + + def __repr__(self) -> str: + """ Return the clause as a string. """ + return self.__str__() + + class Footnote: """ Represents a footnote in the markdown file. """ @@ -361,12 +384,23 @@ class Document: clause.lines = lines + def __str__(self) -> str: + """ Return the document as a string. """ + return '\n'.join([ str(c) for c in self.clauses ]) + + + def __repr__(self) -> str: + """ Return the document as a string. """ + return self.__str__() + -def analyseMarkdown(filename:str) -> Document: +def analyseMarkdown(filename:Optional[str]=None, inLines:Optional[list[str]]=None) -> Document: """ Analyse the markdown file and split it into clauses. + Either the filename or the inLines must be provided. 
Args: filename: The name of the markdown file. + inLines: The lines of the markdown file. Returns: The document object. @@ -388,9 +422,7 @@ def analyseMarkdown(filename:str) -> Document: printDebug(htmltable) except Exception as e: printError(f"Error: {e}") - # TODO move this outside of the analyseMarkdown function !!! - for row in htmltable: - outClauses[-1].append(Line(row, LineType.TABLEROW)) + outClauses[-1].append(Line(htmltable, LineType.RAWHTML)) gridTable = '' @@ -398,8 +430,13 @@ def analyseMarkdown(filename:str) -> Document: # Read the file. # Note: We use utf-8 and replace errors to avoid problems with special or unknown characters. - with open(filename, 'r', encoding = 'utf-8', errors = 'replace') as file: - inLines = file.readlines() + if filename and not inLines: + with open(filename, 'r', encoding = 'utf-8', errors = 'replace') as file: + inLines = file.readlines() + elif not filename and inLines: + pass + else: + raise ValueError('Either the filename or the lines must be provided.') # The list of clauses. The first clause contains the text before the first heading. outClauses:list[Clause] = [Clause(0, '', '', [])] -- GitLab From 5b50a6f75de1436d2e8637e2f720b2d0fc39628d Mon Sep 17 00:00:00 2001 From: ankraft <an.kraft@gmail.com> Date: Mon, 24 Feb 2025 20:17:23 +0100 Subject: [PATCH 14/29] Added stdin/stdout filter to convert grid tables to html --- toMkdocs/gridTableFilter.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 toMkdocs/gridTableFilter.py diff --git a/toMkdocs/gridTableFilter.py b/toMkdocs/gridTableFilter.py new file mode 100644 index 0000000..c630ae1 --- /dev/null +++ b/toMkdocs/gridTableFilter.py @@ -0,0 +1,36 @@ +# +# gridTableFilter.py +# +# (c) 2025 by Andreas Kraft & Miguel Angel Reina Ortega +# License: BSD 3-Clause License. See the LICENSE file for further details. +# +""" This script replaces the grid tables in the markdown files with the equivalent + html tables. 
Other markdown elements are not affected and are passed through. + + The script expects the markdown file to be converted from stdin and writes the + result to stdout. +""" + +import argparse, sys +from rich import print +from markdownTools import analyseMarkdown, setLoggers + +def main() -> None: + + # Parse the command line arguments + parser = argparse.ArgumentParser(description='Convert grid tables to html tables. This script reads the markdown file from stdin and writes the result to stdout.') + parser.add_argument('-v', '--verbose', action='store_true', help='Print debug information to stderr.') + args = parser.parse_args() + + # Set the loggers + setLoggers(info=lambda m: print(f'[green]{m}', file=sys.stderr) if args.verbose else None, + debug=lambda m: print(f'[dim]{m}', file=sys.stderr) if args.verbose else None, + error=lambda m: print(f'[red]{m}', file=sys.stderr) if args.verbose else None) + + # Read the input from stdin and write the result to stdout + print(analyseMarkdown(inLines=sys.stdin.readlines()), file=sys.stdout) + + +if __name__ == '__main__': + main() + -- GitLab From 2bf10c2644bdcf9749d00dc2cc7db0129bd25250 Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Wed, 26 Feb 2025 10:47:16 +0100 Subject: [PATCH 15/29] Some fixes for tables without header --- toMkdocs/gridTableTools.py | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index 3738dd5..487ae31 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -72,19 +72,6 @@ class GridCell: headerDelimiterIndex += 1 else: raise ValueError('Invalid table formatting') - else: - bodyDelimiterIndex = 0 - while bodyDelimiterIndex < len(defaultAlignments) and self.position > delimiterPositions[bodyDelimiterIndex]: - bodyDelimiterIndex += 1 - if bodyDelimiterIndex < len(defaultAlignments): - if self.position < 
delimiterPositions[bodyDelimiterIndex]: - self.alignment = defaultAlignments[bodyDelimiterIndex] - elif self.position == delimiterPositions[bodyDelimiterIndex]: - self.alignment = defaultAlignments[bodyDelimiterIndex] - bodyDelimiterIndex += 1 - else: - raise ValueError('Invalid table formatting') - def __str__(self): return f'(Content: {self.content}, Rowspan: {self.rowspan}, Colspan: {self.colspan}, Alignment: {self.alignment}, Position: {self.position}, ListFlag: {self.listFlag}, AuxiliarIndex: {self.auxiliarIndex})' @@ -322,20 +309,28 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR headerDelimiterPositions.append(min(delPositions) if delPositions else -1) if not hasHeader: - #Set default alignments from the first separator + # Set default alignments from the first separator which takes the role of header + hasHeader = True + headerSeparatorIndex = 0 parts = re.split(r'\+', lines[0].strip('+')) - defaultAlignments = [] - # Calculate default alignments and positions of delimiters - for part_index in range(len(parts)): - if parts[part_index].startswith(':') and not parts[part_index].endswith(':'): + + # Calculate default alignments and positions of delimiters + for partIndex in range(len(parts)): + if parts[partIndex].startswith(':') and not parts[partIndex].endswith(':'): defaultAlignments.append(_alignLeft) - elif not parts[part_index].startswith(':') and parts[part_index].endswith(':'): + elif not parts[partIndex].startswith(':') and parts[partIndex].endswith(':'): defaultAlignments.append(_alignRight) else: defaultAlignments.append(_alignCenter) + # Delimiter position + delimiterPositionsStart = delimiterPositions[partIndex - 1] if partIndex != 0 else 0 + delPositions = [lines[index].find(delimiter, delimiterPositionsStart + 1) + for delimiter in '+' if delimiter in lines[index][delimiterPositionsStart + 1:]] + headerDelimiterPositions.append(min(delPositions) if delPositions else -1) + #Check end table delimiter alignment 
(not checked during the lines processing) if not checkDelimiterAlignment(lines[-1], delimiterPositions): raise ValueError(f'Misaligned delimiters in end table separator: {lines[-1]}') -- GitLab From a88ef036f506b99c3af873d703158536fd9ab264 Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Wed, 26 Feb 2025 17:09:32 +0100 Subject: [PATCH 16/29] Fixing setting of cell alignment --- toMkdocs/gridTableTools.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index 487ae31..2dd122a 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -39,6 +39,7 @@ class GridCell: self.colspan:int = 0 self.colspanAdjusted:bool = False self.alignment:str = 'align="center"' + self.positionStart:Optional[int] = None self.position:Optional[int] = None self.listFlag:bool = False self.auxiliarIndex:int = 0 @@ -57,19 +58,15 @@ class GridCell: defaultAlignments: The default alignments. hasHeader: True if the table has a header, False otherwise. 
""" - if self.position is None: + if self.position is None or self.positionStart is None: raise ValueError('Cell position must be set before calculating alignment.') if hasHeader: headerDelimiterIndex = 0 - while headerDelimiterIndex < len(defaultAlignments) and self.position > headerDelimiterPositions[headerDelimiterIndex]: + while headerDelimiterIndex < len(defaultAlignments) and self.positionStart > headerDelimiterPositions[headerDelimiterIndex]: headerDelimiterIndex += 1 if headerDelimiterIndex < len(defaultAlignments): - if self.position < headerDelimiterPositions[headerDelimiterIndex]: - self.alignment = defaultAlignments[headerDelimiterIndex] - elif self.position == headerDelimiterPositions[headerDelimiterIndex]: - self.alignment = defaultAlignments[headerDelimiterIndex] - headerDelimiterIndex += 1 + self.alignment = defaultAlignments[headerDelimiterIndex] else: raise ValueError('Invalid table formatting') @@ -364,6 +361,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR cell = rows[-1][columnIndex] # Set position + cell.positionStart = delimiterIndex - len(parts[rowIndex]) cell.position = delimiterIndex # Position of cell delimiter + # Set alignment as defined by header separator line @@ -390,6 +388,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR if auxiliarCellIndex < numberOfColumns: auxDelimiterIndex += len(content) + 1 cell = rows[-1][auxiliarCellIndex] + cell.positionStart = auxDelimiterIndex - len(content) # Position of cell delimiter + cell.position = auxDelimiterIndex # Position of cell delimiter + cell.calculateAndSetAlignment(headerDelimiterPositions, delimiterPositions, defaultAlignments, hasHeader) while auxDelimiterIndex > delimiterPositions[auxiliarCellIndex]: -- GitLab From 22f67aca686a2d51a8243f221cd491a9ee3c661e Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Thu, 27 Feb 2025 15:17:16 +0100 Subject: [PATCH 17/29] Fix 
for the italic conversion --- toMkdocs/gridTableTools.py | 2 +- toMkdocs/regexMatches.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index 2dd122a..ea3217f 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -494,7 +494,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR # Italic replacements # Regex to detect markdown italic formatting in cell content if cell.content is not None: - cell.content = matchItalic.sub(r'<i>\g<text></i>', cell.content) + cell.content = matchItalic.sub(r'\1<i>\g<text></i>', cell.content) # Correct newlines characters for headerRow in headerRows: diff --git a/toMkdocs/regexMatches.py b/toMkdocs/regexMatches.py index ba1fc15..cbac6c3 100644 --- a/toMkdocs/regexMatches.py +++ b/toMkdocs/regexMatches.py @@ -37,4 +37,4 @@ matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE) matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE) matchBold = re.compile(r'(^|\s)(\*\*|__)(?P<text>.+?)\2(?!\w)') -matchItalic = re.compile(r'(^|\s)(\*|_)(?P<text>.+?)(?<!\\)\3(\s|$)') +matchItalic = re.compile(r'(^|\s)(\*|_)(?P<text>.+?)\2(?!\w)') \ No newline at end of file -- GitLab From e79c72d37c483a24065d6a8b014bc82cc0870eb6 Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Wed, 5 Mar 2025 12:32:48 +0100 Subject: [PATCH 18/29] Fix for merged rows --- toMkdocs/gridTableTools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index ea3217f..f56d4a7 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -405,7 +405,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR # Check whether a cell contains a header separator if matchGridTableBodySeparatorLine.match(content): # A new row is to be added - rowsTracker[columnCellIndex] += 1 # 
That actual row will have more than one row + rowsTracker[columnCellIndex] = max(rowsTracker) + 1 # That actual row will have more than one row cell.listFlag = False columnForward = 0 -- GitLab From dffceed3d2e1d31981ff3c7ded9ddd6616f79995 Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Wed, 5 Mar 2025 12:42:41 +0100 Subject: [PATCH 19/29] Fix for merged rows --- toMkdocs/gridTableTools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index f56d4a7..e22c733 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -413,7 +413,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR rowIndex = rowsTracker[columnCellIndex] # Correcting the rowIndex. Might have been changed by a previous iteration if rows[rowIndex][columnCellIndex].position >= delimiterPositions[delIndex]: columnForward += 1 - rowsTracker[columnCellIndex + columnForward - 1] += 1 if columnForward > 1 else 0 + rowsTracker[columnCellIndex + columnForward - 1] = max(rowsTracker) + 1 if columnForward > 1 else 0 columnCellIndex += columnForward continue -- GitLab From 4bfce4d012dcd1fc02a130737d30d2bd3c90e7b8 Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Wed, 5 Mar 2025 13:30:25 +0100 Subject: [PATCH 20/29] Fixes for merged rows --- toMkdocs/gridTableTools.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index e22c733..d729347 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -398,6 +398,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR if len(cellsContent) <= numberOfColumns: # Colspan: Positions of | with respect to + need to be determined columnCellIndex = 0 + maxRowsTracker = max(rowsTracker) # Go through all cells in a columnt for 
columnIndex, content in enumerate(cellsContent): rowIndex = rowsTracker[columnCellIndex] @@ -405,7 +406,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR # Check whether a cell contains a header separator if matchGridTableBodySeparatorLine.match(content): # A new row is to be added - rowsTracker[columnCellIndex] = max(rowsTracker) + 1 # That actual row will have more than one row + rowsTracker[columnCellIndex] = maxRowsTracker + 1 # That actual row will have more than one row cell.listFlag = False columnForward = 0 @@ -413,7 +414,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR rowIndex = rowsTracker[columnCellIndex] # Correcting the rowIndex. Might have been changed by a previous iteration if rows[rowIndex][columnCellIndex].position >= delimiterPositions[delIndex]: columnForward += 1 - rowsTracker[columnCellIndex + columnForward - 1] = max(rowsTracker) + 1 if columnForward > 1 else 0 + rowsTracker[columnCellIndex + columnForward - 1] = maxRowsTracker + 1 if columnForward > 1 else 0 columnCellIndex += columnForward continue -- GitLab From ad41b056365308db14d2cbe845adeccfbece99b3 Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Wed, 5 Mar 2025 13:39:18 +0100 Subject: [PATCH 21/29] Fixes for merged rows --- toMkdocs/gridTableTools.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index d729347..6b4ca62 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -407,6 +407,9 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR # Check whether a cell contains a header separator if matchGridTableBodySeparatorLine.match(content): # A new row is to be added rowsTracker[columnCellIndex] = maxRowsTracker + 1 # That actual row will have more than one row + rowIndex = rowsTracker[columnCellIndex] + cell = rows[rowIndex][columnCellIndex] + 
cell.listFlag = False columnForward = 0 -- GitLab From 852ecc728dff9fa0ef16e7f0ee8aabb8307e3b8d Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Wed, 5 Mar 2025 13:49:14 +0100 Subject: [PATCH 22/29] Fix for merged rows --- toMkdocs/gridTableTools.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index 6b4ca62..7a71f5f 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -124,6 +124,10 @@ class GridRowsTracker(): def __repr__(self): return self.__str__() + def max(self) -> int: + return max(self.gridRowTracker) + + # Some type aliases GridTableRow = list[GridCell] -- GitLab From 0fbabb1acf4453f71ea3b28a88acafd1c2dfc6d0 Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Wed, 5 Mar 2025 13:51:37 +0100 Subject: [PATCH 23/29] Fix for merged rows --- toMkdocs/gridTableTools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index 7a71f5f..f6825ae 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -402,7 +402,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR if len(cellsContent) <= numberOfColumns: # Colspan: Positions of | with respect to + need to be determined columnCellIndex = 0 - maxRowsTracker = max(rowsTracker) + maxRowsTracker = rowsTracker.max() # Go through all cells in a columnt for columnIndex, content in enumerate(cellsContent): rowIndex = rowsTracker[columnCellIndex] -- GitLab From 3cac165eea571993f7635d1843411183900ac98d Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Wed, 5 Mar 2025 14:09:13 +0100 Subject: [PATCH 24/29] Fix for merged rows --- toMkdocs/gridTableTools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index 
f6825ae..19ec017 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -421,7 +421,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR rowIndex = rowsTracker[columnCellIndex] # Correcting the rowIndex. Might have been changed by a previous iteration if rows[rowIndex][columnCellIndex].position >= delimiterPositions[delIndex]: columnForward += 1 - rowsTracker[columnCellIndex + columnForward - 1] = maxRowsTracker + 1 if columnForward > 1 else 0 + #rowsTracker[columnCellIndex + columnForward - 1] = maxRowsTracker + 1 if columnForward > 1 else 0 columnCellIndex += columnForward continue -- GitLab From a1b466b0336eea1283c3d150288fde7a261d54c3 Mon Sep 17 00:00:00 2001 From: ankraft <an.kraft@gmail.com> Date: Wed, 5 Mar 2025 17:30:49 +0100 Subject: [PATCH 25/29] Added comment --- toMkdocs/gridTableTools.py | 1 + 1 file changed, 1 insertion(+) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index 19ec017..ab2585a 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -402,6 +402,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR if len(cellsContent) <= numberOfColumns: # Colspan: Positions of | with respect to + need to be determined columnCellIndex = 0 + # Put the value in a variable here because we need the initial value maxRowsTracker = rowsTracker.max() # Go through all cells in a columnt for columnIndex, content in enumerate(cellsContent): -- GitLab From 98b64bd67f22909b22da31bc6fd1595c4f7cb9c6 Mon Sep 17 00:00:00 2001 From: ankraft <an.kraft@gmail.com> Date: Wed, 26 Mar 2025 18:05:01 +0100 Subject: [PATCH 26/29] New script to process markdown includes and front matter. Might be extended. 
--- processMDSpec.py | 198 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 processMDSpec.py diff --git a/processMDSpec.py b/processMDSpec.py new file mode 100644 index 0000000..f065f95 --- /dev/null +++ b/processMDSpec.py @@ -0,0 +1,198 @@ +# +# processMDSpec.py +# +# (c) 2025 by Andreas Kraft +# License: BSD 3-Clause License. See the LICENSE file for further details. +# +""" This script processes markdown specification files. It handles the + include statements and the front matter. It can also render the markdown + content on console or output the front matter only. +""" + +from __future__ import annotations +from typing import Tuple, Generator +import argparse +from rich import print, markdown +import re, sys, yaml +from contextlib import contextmanager + + +_frontMatter:dict = {} +_includeStack:list[str] = [] + +@contextmanager +def includeStack(filename:str) -> Generator [None, None, None]: + """ Handle the include stack. + + This is used to detect circular includes and to keep track of the + include stack. + + Args: + filename: The name of the file being processed. + + Raises: + Exception: If a circular include is detected. + + Returns: + Generator: A generator that yields nothing. + """ + if filename in _includeStack: + print(f'[red]Circular include detected: {filename}') + raise Exception('Circular include detected') + _includeStack.append(filename) + yield + _includeStack.pop() + + +def processFrontMatter(lines:list[str], args:argparse.Namespace) -> Tuple[dict, list[str]]: + """ Process the front matter of a markdown file. This includes extracting + the front matter information and returning it as a dictionary. + + Currently only YAML front matter is supported. It can be extended later. + + Args: + lines: The lines of the markdown file. + args: The command line arguments. + + Raises: + yaml.YAMLError: If the front matter cannot be parsed as YAML. 
+ + Returns: + dict: The front matter information as a dictionary. + list[str]: The lines of the markdown file without the front matter. + """ + + if not lines or not lines[0].startswith('---'): + return {}, lines + + frontMatterLines:list[str] = [] + for line in lines[1:]: + if re.match(r'^---\s*', line): + break + frontMatterLines.append(line) + + # Remove the front matter from the lines + lines = lines[len(frontMatterLines)+2:] + + # Parse the front matter as YAML + try: + return yaml.safe_load(''.join(frontMatterLines)), lines + except yaml.YAMLError as e: + print(f'[red]Error parsing front matter: {e}') + raise + + +def processFile(args:argparse.Namespace) -> str: + """ Handle the include statements in the markdown files. This includes + processing the include statements and removing the include statements + from the markdown files. + + Args: + args: The command line arguments. + + Raises: + Exception: If the file cannot be processed. + + Returns: + The processed markdown content as a string. + """ + + def handleIncludesForFile(filename:str) -> str: + """ Read a single markdown file and return its content. + + Args: + filename: The name of the file to read. + + Raises: + FileNotFoundError: If the file cannot be found. + + Returns: + The content of the file. 
+ """ + + with includeStack(filename): + try: + with open(filename, 'r') as f: + lines = f.readlines() + except FileNotFoundError: + print(f'[red]File not found: {filename}') + raise + + # extract front matter information + fm, lines = processFrontMatter(lines, args) + if fm: + _frontMatter[filename] = fm + + if not args.doInclude: + return ''.join(lines) + + inCodeFence = False + for line in lines: + + # Ignore code fences + if re.match(r'^\s*```.*', line): + inCodeFence = not inCodeFence + continue + if inCodeFence: + continue + + # Check for ::include{file=...} pattern using regex at the beginning of a line + match = re.search(r'^::include\{\s*file=(.*?)\s*\}', line.strip()) + if match: + include_filename = match.group(1) + # Read the included file and replace the include statement with its content + include_content = handleIncludesForFile(include_filename) + lines[lines.index(line)] = include_content + + return ''.join(lines) + + return handleIncludesForFile(args.document) + + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='Process markdown specification files.') + parser.add_argument('--no-include', dest='doInclude', action='store_false', default=True, help="don't process include statements") + parser.add_argument('--render-markdown', '-md', dest='renderAsMarkdown', action='store_true', help='render output as markdown') + parser.add_argument('--process-frontmatter', '-fm', dest='outputFrontMatter', action='store_true', help='output front matter only') + parser.add_argument('--frontmatter-only', '-fmo', dest='onlyFrontMatter', action='store_true', help='output only front matter') + parser.add_argument('--verbose', '-v', action='store_true', help='print debug information to stderr.') + parser.add_argument('document', type = str, help = 'a markdown specification document to process') + args = parser.parse_args() + + if args.verbose: + if not args.doInclude: + print(f'[yellow]Skipping processing include statements', file=sys.stderr) 
+ else: + print(f'[green]Processing include statements', file=sys.stderr) + + try: + lines = processFile(args) + except Exception as e: + print(f'[red]Error processing file: {e}', file=sys.stderr) + quit(1) + + if args.outputFrontMatter or args.onlyFrontMatter: + # Collect front matter information in the output + if not args.onlyFrontMatter: + print('---') + + # The following is a workaround to keep the order of the dictionary + # see https://stackoverflow.com/a/52621703 + yaml.add_representer(dict, lambda self, data: yaml.representer.SafeRepresenter.represent_dict(self, data.items())) + print(yaml.dump(_frontMatter, default_flow_style=False), end='') + + if not args.onlyFrontMatter: + print('---') + + if not args.onlyFrontMatter: + if args.renderAsMarkdown: + # Render the markdown content + print(markdown.Markdown(lines)) + else: + # Print the raw markdown content + print(lines) + + + -- GitLab From a6bd358c10ef2c7e622ccbb0fa953198a07a86f9 Mon Sep 17 00:00:00 2001 From: ankraft <an.kraft@gmail.com> Date: Fri, 28 Mar 2025 14:34:50 +0100 Subject: [PATCH 27/29] Support directory-relative imports (change paths of links, images, includes) in sub-folders --- processMDSpec.py | 108 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 93 insertions(+), 15 deletions(-) diff --git a/processMDSpec.py b/processMDSpec.py index f065f95..9b3d93c 100644 --- a/processMDSpec.py +++ b/processMDSpec.py @@ -10,10 +10,13 @@ """ from __future__ import annotations + +_print = print # save the original print function + from typing import Tuple, Generator import argparse -from rich import print, markdown -import re, sys, yaml +from rich import markdown, print +import re, sys, yaml, os from contextlib import contextmanager @@ -37,13 +40,79 @@ def includeStack(filename:str) -> Generator [None, None, None]: Generator: A generator that yields nothing. 
""" if filename in _includeStack: - print(f'[red]Circular include detected: {filename}') - raise Exception('Circular include detected') + raise Exception(f'Circular include detected: {" -> ".join(_includeStack)} -> {filename}') _includeStack.append(filename) yield _includeStack.pop() +def expandPaths(lines:list[str], currentPath:str, childPath:str) -> list[str]: + """ Expand the paths in the markdown file. This means that all paths in links, + images, and include statements are extended so that they would be valid paths + from the root document. + + Args: + lines: The lines of the markdown file. + currentPath: The current path of the file being processed. + childPath: The path of the child file being processed. + + Returns: + list[str]: The lines of the markdown file with expanded paths. + """ + + # Replace all relative paths in the markdown with the new path + # add a path to the current path + if currentPath[-1] != '/': + currentPath += '/' + newPath = currentPath + childPath + # Remove the leading './' from the path + while newPath.startswith('./'): + newPath = newPath[2:] + + inCodeFence = False + for index, line in enumerate(lines): + + # Ignore stuff in code fences + if re.match(r'^\s*```.*', line): + inCodeFence = not inCodeFence + continue + if inCodeFence: + continue + + # handle the links in a line (there could be multiple links in a line) + links = re.findall(r'\[([^\]]+)\]\(([^\)]+)\)', line) + for linkText, linkPath in links: + # Skip URLs and absolute paths + if linkPath.startswith(('http://', 'https://', '/')): + continue + + # Construct the new path by adding addedPath to the original path + newLinkPath = linkPath[2:] if linkPath.startswith('./') else linkPath + + # Create the updated path + updatedPath = f"{newPath}{linkPath}" if newPath.endswith('/') else f"{newPath}/{newLinkPath}" + + # Replace the original link with the updated one in the markdown + line = line.replace(f'[{linkText}]({linkPath})', f'[{linkText}]({updatedPath})') + + # handle the 
include statements (there should only be one per line) + includes = re.findall(r'^\s*::include{file=([^\}]+)}', line) + for includePath in includes: + + # Construct the new path by adding addedPath to the original path + includePath = includePath[2:] if includePath.startswith('./') else includePath + + # Create the updated path + updatedPath = f'{newPath}{includePath}' if newPath.endswith('/') else f'{newPath}/{includePath}' + + # Replace the original include with the updated one in the markdown + line = line.replace(f'::include{{file={includePath}}}', f'::include{{file={updatedPath}}}') + + lines[index] = line + + return lines + + def processFrontMatter(lines:list[str], args:argparse.Namespace) -> Tuple[dict, list[str]]: """ Process the front matter of a markdown file. This includes extracting the front matter information and returning it as a dictionary. @@ -97,7 +166,7 @@ def processFile(args:argparse.Namespace) -> str: The processed markdown content as a string. """ - def handleIncludesForFile(filename:str) -> str: + def handleIncludesForFile(filename:str, currentPath:str) -> str: """ Read a single markdown file and return its content. Args: @@ -109,6 +178,14 @@ def processFile(args:argparse.Namespace) -> str: Returns: The content of the file. """ + # Get the directory path from the filename + dirname = os.path.dirname(filename) + if dirname and not dirname.endswith('/'): + dirname = dirname + '/' + + dirname = dirname if dirname else '.' + currentPath = currentPath if currentPath else '.' 
+ filename = os.path.normpath(filename) with includeStack(filename): try: @@ -117,8 +194,11 @@ def processFile(args:argparse.Namespace) -> str: except FileNotFoundError: print(f'[red]File not found: {filename}') raise + + # Expand the paths in the markdown file # extract front matter information + lines = expandPaths(lines, currentPath, dirname) fm, lines = processFrontMatter(lines, args) if fm: _frontMatter[filename] = fm @@ -129,7 +209,7 @@ def processFile(args:argparse.Namespace) -> str: inCodeFence = False for line in lines: - # Ignore code fences + # Ignore stuff code fences if re.match(r'^\s*```.*', line): inCodeFence = not inCodeFence continue @@ -139,17 +219,15 @@ def processFile(args:argparse.Namespace) -> str: # Check for ::include{file=...} pattern using regex at the beginning of a line match = re.search(r'^::include\{\s*file=(.*?)\s*\}', line.strip()) if match: - include_filename = match.group(1) + includeFilename = match.group(1) # Read the included file and replace the include statement with its content - include_content = handleIncludesForFile(include_filename) - lines[lines.index(line)] = include_content - + lines[lines.index(line)] = handleIncludesForFile(includeFilename, os.path.dirname(filename)) + return ''.join(lines) - return handleIncludesForFile(args.document) + return handleIncludesForFile(args.document, os.path.dirname(args.document)) - if __name__ == '__main__': parser = argparse.ArgumentParser(description='Process markdown specification files.') @@ -158,7 +236,7 @@ if __name__ == '__main__': parser.add_argument('--process-frontmatter', '-fm', dest='outputFrontMatter', action='store_true', help='output front matter only') parser.add_argument('--frontmatter-only', '-fmo', dest='onlyFrontMatter', action='store_true', help='output only front matter') parser.add_argument('--verbose', '-v', action='store_true', help='print debug information to stderr.') - parser.add_argument('document', type = str, help = 'a markdown specification document to 
process') + parser.add_argument('document', type=str, help='a markdown specification document to process') args = parser.parse_args() if args.verbose: @@ -170,7 +248,7 @@ if __name__ == '__main__': try: lines = processFile(args) except Exception as e: - print(f'[red]Error processing file: {e}', file=sys.stderr) + print(f'[red]Error while processing {args.document}\n{e}', file=sys.stderr) quit(1) if args.outputFrontMatter or args.onlyFrontMatter: @@ -192,7 +270,7 @@ if __name__ == '__main__': print(markdown.Markdown(lines)) else: # Print the raw markdown content - print(lines) + _print(lines) -- GitLab From ea9475eb75f041729d4e41c609019464226c8c26 Mon Sep 17 00:00:00 2001 From: ankraft <an.kraft@gmail.com> Date: Sun, 30 Mar 2025 13:43:08 +0200 Subject: [PATCH 28/29] Corrected printing of footnotes. Also, no more linebreaks at (pseudo)terminal width. This broke pipe table outputs --- toMkdocs/gridTableFilter.py | 1 - toMkdocs/markdownTools.py | 53 ++++++++++++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/toMkdocs/gridTableFilter.py b/toMkdocs/gridTableFilter.py index c630ae1..027c69f 100644 --- a/toMkdocs/gridTableFilter.py +++ b/toMkdocs/gridTableFilter.py @@ -12,7 +12,6 @@ """ import argparse, sys -from rich import print from markdownTools import analyseMarkdown, setLoggers def main() -> None: diff --git a/toMkdocs/markdownTools.py b/toMkdocs/markdownTools.py index 6c9dc41..67fafa5 100644 --- a/toMkdocs/markdownTools.py +++ b/toMkdocs/markdownTools.py @@ -214,6 +214,13 @@ class Footnote: self.line = line """ The line of the footnote. """ + + def __str__(self) -> str: + return self.line.text + + + def __repr__(self) -> str: + return self.__str__() class Document: """ Represents the document object. """ @@ -386,7 +393,7 @@ class Document: def __str__(self) -> str: """ Return the document as a string. 
""" - return '\n'.join([ str(c) for c in self.clauses ]) + return '\n'.join([ str(c) for c in self.clauses + self.footnotes ]) def __repr__(self) -> str: @@ -545,3 +552,47 @@ def analyseMarkdown(filename:Optional[str]=None, inLines:Optional[list[str]]=Non return Document(outClauses, footnotes) +def main() -> None: + """Hauptfunktion zur Verarbeitung von Markdown-Dateien über die Kommandozeile.""" + import argparse + + parser = argparse.ArgumentParser(description='Markdown-Dateien verarbeiten, um Gittertabellen zu konvertieren und andere Formatierungen zu handhaben') + parser.add_argument('eingabe', help='Eingabe-Markdown-Datei') + parser.add_argument('-v', '--verbose', action='store_true', help='Ausführliche Ausgabe aktivieren') + parser.add_argument('-vv', '--sehr-verbose', action='store_true', help='Sehr ausführliche Ausgabe aktivieren') + parser.add_argument('-i', '--ignoriere-titel', nargs='+', default=[], help='Liste der zu ignorierenden Titel') + parser.add_argument('-s', '--teilungs-ebene', type=int, default=1, help='Ebene, auf der das Dokument geteilt werden soll (Standard: 1)') + parser.add_argument('-f', '--ignoriere-erste', action='store_true', help='Inhalt bis zur ersten Überschrift ignorieren') + + args = parser.parse_args() + + # Verbositätsebenen setzen + global verbose, veryVerbose + verbose = args.verbose + veryVerbose = args.sehr_verbose + + # Markdown-Datei verarbeiten + doc = analyseMarkdown(args.eingabe) + + # Dokument teilen und verarbeiten + doc.splitMarkdownDocument( + ignoreTitles=args.ignoriere_titel, + splitLevel=args.teilungs_ebene, + ignoreUntilFirstHeading=args.ignoriere_erste + ) + + # Dokumentenelemente aktualisieren + doc.insertFootnotes() + doc.updateLinks() + doc.updateNotes() + + # Verarbeitetes Dokument ausgeben + for clause in doc.clauses: + print(f"\n{'#' * clause.level} {clause.title}") + for line in clause.lines: + print(line.text, end='') + +if __name__ == '__main__': + main() + + -- GitLab From 
45b89e6abf9480f62f5dd0433d2c86f37f18959f Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Tue, 1 Apr 2025 13:56:03 +0000 Subject: [PATCH 29/29] Fix when checking generated table rows --- toMkdocs/gridTableTools.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/toMkdocs/gridTableTools.py b/toMkdocs/gridTableTools.py index ab2585a..6b4161c 100644 --- a/toMkdocs/gridTableTools.py +++ b/toMkdocs/gridTableTools.py @@ -529,9 +529,13 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR if idx > 0 and cell.colspan == 0: if forwardRowspan[cellIndex] > 0: sum += 1 - forwardRowspan[cellIndex] -= 1 + forwardRowspan[cellIndex] -= 1 if forwardRowspan[cellIndex] == 0 and cell.rowspan > 1: forwardRowspan[cellIndex] = cell.rowspan -1 + colspan=1 + while cell.colspan > colspan: + forwardRowspan[cellIndex + colspan] = cell.rowspan - 1 + colspan += 1 if not sum == numberOfColumns: raise ValueError('Grid table not converted properly') @@ -548,9 +552,14 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR if idx > 0 and cell.colspan == 0: if forwardRowspan[cellIndex] > 0: sum += 1 - forwardRowspan[cellIndex] -= 1 + forwardRowspan[cellIndex] -= 1 if forwardRowspan[cellIndex] == 0 and cell.rowspan > 1: forwardRowspan[cellIndex] = cell.rowspan - 1 + colspan=1 + while cell.colspan > colspan: + forwardRowspan[cellIndex + colspan] = cell.rowspan - 1 + colspan += 1 + if not sum == numberOfColumns: raise ValueError('Grid table not converted properly') -- GitLab