class GridCell:
    """ Represents a single cell of a grid table.

        A cell starts out empty (no content, zero spans). The parser fills in
        the content, the spans and the position of the cell's right delimiter.
    """

    def __init__(self) -> None:
        """ Initialize a new, empty grid table cell. """
        self.content:Optional[str] = None       # Text of the cell, None as long as nothing was assigned
        self.rowspan:int = 0                    # Number of rows covered by the cell
        self.colspan:int = 0                    # Number of columns covered by the cell
        self.colspanAdjusted:bool = False       # Set once the colspan has been adjusted
        self.alignment:str = 'align="center"'   # HTML alignment attribute
        self.position:Optional[int] = None      # Position of the cell's delimiter in the source line
        self.listFlag:bool = False              # True while list content is being collected
        self.auxiliarIndex:int = 0


    def calculateAndSetAlignment(self, headerDelimiterPositions:list[int], defaultAlignments:list[str]) -> None:
        """ Set the alignment of the cell based on the position of its delimiter.

            The cell takes the alignment of the first header column whose
            delimiter position is not smaller than the cell's own position.

            Args:
                headerDelimiterPositions: Delimiter positions of the header separator line.
                defaultAlignments: Alignment attribute for each header column.

            Raises:
                ValueError: If the cell's position is not set, or if the position
                    lies behind the last known header delimiter.
        """
        if self.position is None:
            raise ValueError('Cell position must be set before calculating alignment.')

        # zip() stops at the shorter list, so mismatched inputs end up in the
        # ValueError below instead of an IndexError
        for delimiterPosition, alignment in zip(headerDelimiterPositions, defaultAlignments):
            if self.position <= delimiterPosition:
                self.alignment = alignment
                return
        raise ValueError('Invalid table formatting')


    def __str__(self) -> str:
        return f'(Content: {self.content}, Rowspan: {self.rowspan}, Colspan: {self.colspan}, Alignment: {self.alignment}, Position: {self.position}, ListFlag: {self.listFlag}, AuxiliarIndex: {self.auxiliarIndex})'


    def __repr__(self) -> str:
        return self.__str__()


class GridRow():
    """ Represents a row in a grid table. """

    # Annotation only: the list is created per instance in __init__, there is
    # no shared class-level list.
    cells:list[GridCell]


    def __init__(self, length: int = 1) -> None:
        """ Create a row with `length` empty cells.

            Args:
                length: The number of cells in the row.
        """
        self.cells = [GridCell() for _ in range(length)]


    def __getitem__(self, item):
        return self.cells[item]


    def __setitem__(self, key, value):
        self.cells[key] = value


    def __str__(self) -> str:
        return str(self.cells)


    def __repr__(self) -> str:
        return self.__str__()


class GridRowsTracker():
    """ Keeps one integer counter per table column, all starting at 0.

        parseGridTableWithSpans uses the counter of a column as the index
        of the row that currently receives content for that column.
    """

    def __init__(self, size:int) -> None:
        """ Create a tracker with `size` counters.

            Args:
                size: The number of columns to track.
        """
        self.gridRowTracker = [0] * size


    def __getitem__(self, item:int) -> int:
        return self.gridRowTracker[item]


    def __setitem__(self, key:int, value:int) -> None:
        self.gridRowTracker[key] = value


    def __str__(self) -> str:
        return str(self.gridRowTracker)


    def __repr__(self) -> str:
        return self.__str__()
# Some type aliases
GridTableRow = list[GridCell]
GridTableRowList = list[GridTableRow]

def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableRowList]:
    """ Parse a Pandoc-style grid table into a structure for HTML conversion with rowspan and colspan.

        Args:
            gridTable: String of the Pandoc-style grid table.

        Returns:
            A tuple (headerRows, dataRows). Each element is a list of rows, and each
            row is a list of GridCell objects that carry content, spans and alignment.

        Raises:
            ValueError: If no separator line is found, if a row is malformed, if no
                rows could be collected, or if the final column count check fails.
    """

    # Split the input into lines
    lines:list[str] = [line.strip() for line in gridTable.strip().split('\n')]


    # Detect separator lines by pattern (it does not take partial separators into account)
    def isSeparator(line:str) -> bool:
        return matchGridTableSeparator.match(line) is not None


    # Set content on the cell - concatenating multilines, flagging lists
    def handleCellContent(cell:GridCell, content:str) -> None:
        _c = content.strip()

        if cell.content is None:    # Previous empty cell
            # First content for this cell: the cell now covers at least one row and one column
            cell.rowspan += 1
            cell.colspan += 1
            if _c.startswith('- '):    # List in a cell
                cell.listFlag = True
                cell.content = _c + '\n'    # Add newline to know when the list element ends

            elif cell.listFlag and len(_c) > 0:    # any other content when handling list is concatenated to the last list element
                cell.content = _c + '\n'

            elif not _c:    # separation between list and other paragraph
                cell.listFlag = False
                cell.content = '\n'
            else:
                # A trailing backslash is converted to a newline
                cell.content = re.sub(r'\\\s*$', '\n', _c)
        else:    # Cell has content
            if _c.startswith('- '):    # List
                if not cell.listFlag:
                    cell.content += '\n'
                cell.listFlag = True
                cell.content += _c + '\n'    # Add newline to know when the list element ends
            elif cell.listFlag and _c:    # any other content when handling list is concatenated to the last list element
                cell.content = cell.content.strip('\n') + ' ' + _c + '\n'
            elif len(_c) == 0:    # separation between list and other paragraph
                cell.listFlag = False
                cell.content += '\n' if not cell.content.endswith('\n') else ''
            else:
                # Continuation of a paragraph: join with a space, trailing backslash becomes a newline
                cell.content += ' ' + re.sub(r'\\\s*$', '\n', _c)

    # Adjust colspan of a cell
    def adjustColspan(row:GridRow, columnIndex:int, numberOfParts:int, line, numberOfColumns:int, delimiterPositions:list[int]) -> None:
        for j in range(columnIndex, numberOfParts):
            delimiterStart:Optional[int] = None
            colI = columnIndex
            # Search start is the position of the previous cell's delimiter (0 for the first column)
            while delimiterStart == None:
                delimiterStart = row[colI - 1].position if colI > 0 else 0
                colI -= 1
            # Next '|' or '+' after the search start, -1 if there is none
            positions = [line.find(delimiter, delimiterStart + 1) for delimiter in "|+" if delimiter in line[delimiterStart + 1:]]
            position = min(positions) if positions else -1
            if position > delimiterPositions[j]:    # Colspan to be increased
                row[columnIndex].colspan += 1
                if position == delimiterPositions[len(delimiterPositions) - 1]:    # last cell in row, adjust colspan to get max number columns
                    colspan_allocated = row[columnIndex].colspan
                    row[columnIndex].colspan += numberOfColumns - colspan_allocated - columnIndex
            elif position < delimiterPositions[j]:
                raise ValueError("Wrong cell formatting")
            else:
                break

        row[columnIndex].colspanAdjusted = True    # Mark cell as adjusted


    separatorIndices = [i for i, line in enumerate(lines) if isSeparator(line)]

    if not separatorIndices:
        raise ValueError('No valid separators found in the provided grid table.')

    # Calculate max number of columns.
    # The separator line with the most '+' characters defines the column delimiters.
    delimiterPositions:list[int] = []
    numberOfColumns = 0

    for separatorIndex in separatorIndices:
        if (_cnt := lines[separatorIndex].count('+') - 1) > numberOfColumns:
            numberOfColumns = _cnt
            delimiterPositions = []
            for rowIndex in range(numberOfColumns):
                delimiterPositionsStart = delimiterPositions[rowIndex - 1] if rowIndex != 0 else 0
                delPositions = [lines[separatorIndex].find(delimiter, delimiterPositionsStart + 1) for delimiter in '+' if delimiter in lines[separatorIndex][delimiterPositionsStart + 1:]]
                delimiterPositions.append(min(delPositions) if delPositions else -1)


    # Determine delimiter positions and alignments
    hasHeader = False
    headerDelimiterPositions:list[int] = []
    headerRows:GridTableRowList = []
    dataRows:GridTableRowList = []
    defaultAlignments:list[str] = []

    for index in separatorIndices:
        if matchGridTableHeaderSeparator.match(lines[index]):
            hasHeader = True
            headerSeparatorIndex = index
            parts = re.split(r'\+', lines[index].strip('+'))
            # Calculate default alignments and positions of delimiters
            for partIndex in range(len(parts)):
                if parts[partIndex].startswith(':') and not parts[partIndex].endswith(':'):    # Left alignment
                    defaultAlignments.append('align="left"')
                elif not parts[partIndex].startswith(":") and parts[partIndex].endswith(":"):    # Right alignment
                    defaultAlignments.append('align="right"')
                else:
                    defaultAlignments.append('align="center"')    # Center alignment
                # Delimiter position
                delimiterPositionsStart = delimiterPositions[partIndex - 1] if partIndex != 0 else 0
                delPositions = [lines[index].find(delimiter, delimiterPositionsStart + 1) for delimiter in '+' if delimiter in lines[index][delimiterPositionsStart + 1:]]
                headerDelimiterPositions.append(min(delPositions) if delPositions else -1)


    # Process the lines between each pair of consecutive separator lines
    for rowNumber in range(len(separatorIndices) - 1):
        rows:list[GridRow] = []
        rowsTracker:GridRowsTracker
        inDataRow = False
        start, end = separatorIndices[rowNumber], separatorIndices[rowNumber + 1]
        rowLines = lines[start:end]    # Lines between separators including separator line start as it gives information about the number of columns of the row
        if rowLines:
            # Combine multiline content into single strings for each cell
            for line in rowLines:
                if isSeparator(line) and not inDataRow:
                    inDataRow = True
                    parts = re.split(r'\s*\+\s*', line.strip('+'))
                    delimiterIndex = 0

                    rows.append(GridRow(numberOfColumns))
                    rowsTracker = GridRowsTracker(numberOfColumns)
                    columnIndex = 0

                    for rowIndex in range(len(parts)):
                        if columnIndex in range(numberOfColumns):
                            delimiterIndex += len(parts[rowIndex]) + 1
                            cell = rows[-1][columnIndex]

                            # Set position
                            cell.position = delimiterIndex    # Position of cell delimiter


                            # Set alignment as defined by header separator line
                            cell.calculateAndSetAlignment(headerDelimiterPositions, defaultAlignments)

                            # Skip the columns that are covered by this cell
                            while delimiterIndex > delimiterPositions[columnIndex]:
                                columnIndex += 1
                            columnIndex += 1

                elif inDataRow:
                    # Regular data row or partial separator
                    if matchGridTableBodySeparator.match(line):    # Partial separator
                        cellsContent = re.split(r"[\|\+]", line.strip("|").strip("+"))
                        # Add another row, set delimiters for each cell
                        rows.append(GridRow(numberOfColumns))
                        auxDelimiterIndex = 0
                        auxiliarCellIndex = 0

                        for columnIndex, content in enumerate(cellsContent):
                            if auxiliarCellIndex in range(numberOfColumns):
                                auxDelimiterIndex += len(content) + 1
                                cell = rows[-1][auxiliarCellIndex]
                                cell.position = auxDelimiterIndex    # Position of cell delimiter

                                cell.calculateAndSetAlignment(headerDelimiterPositions, defaultAlignments)
                                while auxDelimiterIndex > delimiterPositions[auxiliarCellIndex]:
                                    auxiliarCellIndex += 1
                                auxiliarCellIndex += 1

                        if len(cellsContent) <= numberOfColumns:    # Colspan: Positions of | with respect to + need to be determined
                            columnCellIndex = 0

                            # Go through all cells in a column
                            for columnIndex, content in enumerate(cellsContent):
                                rowIndex = rowsTracker[columnCellIndex]
                                cell = rows[rowIndex][columnCellIndex]

                                # Check whether a cell contains a separator
                                if matchGridTableBodySeparatorLine.match(content):    # A new row is to be added
                                    rowsTracker[columnCellIndex] += 1    # That actual row will have more than one row
                                    cell.listFlag = False
                                    columnForward = 0

                                    for delIndex in range(columnCellIndex, len(delimiterPositions)):
                                        rowIndex = rowsTracker[columnCellIndex]    # Correcting the rowIndex. Might have been changed by a previous iteration
                                        if rows[rowIndex][columnCellIndex].position >= delimiterPositions[delIndex]:
                                            columnForward += 1
                                            rowsTracker[columnCellIndex + columnForward - 1] += 1 if columnForward > 1 else 0
                                    columnCellIndex += columnForward

                                    continue

                                else:
                                    # Handle content of the cell
                                    handleCellContent(cell, cellsContent[columnIndex])
                                    cell.rowspan += 1
                                    if not cell.colspanAdjusted:
                                        # TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
                                        adjustColspan(rows[rowIndex], columnCellIndex, numberOfColumns, line, numberOfColumns, delimiterPositions)

                                    if cell.position >= delimiterPositions[columnCellIndex]:
                                        columnCellIndex += cell.colspan if cell.colspan != 0 else 1
                                    continue

                        else:
                            raise ValueError("More cells than columns found")

                    else:    # Data row
                        cellsContent = re.split(r'\s*\|\s*', line.strip('|'))
                        columnCellIndex = 0
                        if len(cellsContent) < numberOfColumns:    # Colspan: Positions of | with respect to + need to be determined
                            for columnIndex, content in enumerate(cellsContent):
                                row = rows[rowsTracker[columnCellIndex]]
                                cell = row[columnCellIndex]
                                # Handle content of the cell
                                handleCellContent(cell, content)
                                if not cell.colspanAdjusted:
                                    # TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
                                    adjustColspan(row, columnCellIndex, numberOfColumns, line, numberOfColumns, delimiterPositions)
                                if cell.position >= delimiterPositions[columnCellIndex]:
                                    columnCellIndex += cell.colspan    # Move forward to the next column not covered by this cell

                        elif len(cellsContent) == numberOfColumns:    # Simple row
                            for columnIndex, content in enumerate(cellsContent):
                                rowIndex = rowsTracker[columnIndex]
                                handleCellContent(rows[rowIndex][columnIndex], content)
                        else:
                            raise ValueError("More cells than columns found")
                else:
                    raise ValueError("No separator line found for row starting")

            # NOTE(review): rows are only collected when the table has a header separator.
            # A table without one ends in the 'No valid rows found' error below - confirm
            # that headerless tables are intentionally unsupported.
            if hasHeader and start >= headerSeparatorIndex:    # rows are part of the data rows
                for row in rows:
                    dataRows.append(row.cells)
            elif hasHeader and start < headerSeparatorIndex:    # rows are part of the header rows
                for row in rows:
                    headerRows.append(row.cells)

    # Check if there are any rows at all
    if not dataRows and not headerRows:
        raise ValueError('No valid rows found in the provided grid table.')

    # Format text
    for gridRows in [headerRows, dataRows]:
        for gridRow in gridRows:
            for cell in gridRow:
                if cell.content is not None:
                    # NOTE(review): as written this replaces "<" by "<" and therefore does
                    # nothing. Presumably an HTML entity was intended here - confirm.
                    cell.content = cell.content.replace("<", "<")

                # Bold replacements
                # Regex to detect markdown bold formatting in cell content
                if cell.content is not None:
                    cell.content = matchBold.sub(r'<strong>\g<text></strong>', cell.content)

                # Italic replacements
                # Regex to detect markdown italic formatting in cell content
                if cell.content is not None:
                    cell.content = matchItalic.sub(r'<i>\g<text></i>', cell.content)


    # Convert newline characters to HTML line breaks
    for headerRow in headerRows:
        for cell in headerRow:
            cell.content = cell.content.replace('\n', '<br />') if cell.content is not None else None
    for dataRow in dataRows:
        for cell in dataRow:
            cell.content = cell.content.replace('\n', '<br />') if cell.content is not None else None

    #
    # Checking that the grid is correct. Not too much tested - need to take into account rowspan of previous rows
    #

    # Checking the header rows
    # NOTE(review): the local name 'sum' shadows the builtin sum() inside this function.
    forwardRowspan:list[int] = []
    for idx, headerRow in enumerate(headerRows):
        if len(forwardRowspan) == 0:
            forwardRowspan = [0] * len(headerRows[idx])
        sum = 0

        for cellIndex, cell in enumerate(headerRow):
            sum += cell.colspan
            if idx > 0 and cell.colspan == 0:
                # An empty cell counts as a column when a rowspan from a previous row still covers it
                if forwardRowspan[cellIndex] > 0:
                    sum += 1
                forwardRowspan[cellIndex] -= 1
            if forwardRowspan[cellIndex] == 0 and cell.rowspan > 1:
                forwardRowspan[cellIndex] = cell.rowspan -1

        if not sum == numberOfColumns:
            raise ValueError('Grid table not converted properly')

    # Checking the data rows
    forwardRowspan = []
    for idx, dataRow in enumerate(dataRows):
        if len(forwardRowspan) == 0:
            forwardRowspan = [0] * len(dataRows[idx])
        sum = 0

        for cellIndex, cell in enumerate(dataRows[idx]):
            sum += cell.colspan
            if idx > 0 and cell.colspan == 0:
                if forwardRowspan[cellIndex] > 0:
                    sum += 1
                forwardRowspan[cellIndex] -= 1
            if forwardRowspan[cellIndex] == 0 and cell.rowspan > 1:
                forwardRowspan[cellIndex] = cell.rowspan - 1
        if not sum == numberOfColumns:
            raise ValueError('Grid table not converted properly')

    return headerRows, dataRows
def _renderGridRow(row, cellTag:str) -> str:
    """ Render one table row as HTML.

        Cells with a rowspan or colspan of 0 are covered by another cell and
        are skipped. Markdown list items inside a cell are converted to an
        HTML list, in which case the cell is left-aligned. The cell's content
        and alignment are updated in place.

        Args:
            row: The cells of the row.
            cellTag: The HTML tag to use for the cells, 'th' or 'td'.

        Returns:
            The HTML for the row.
    """
    parts = [" <tr>\n"]
    for cell in row:
        if cell.rowspan == 0 or cell.colspan == 0:
            continue

        # Prepare content, in case there's a list
        if cell.content is not None and (matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell.content)):
            htmlList = '<ul>' + ''.join(f'<li>{match[1]}</li>' for match in matches) + '</ul>'
            # Use a function as replacement so that backslashes in the list
            # text are inserted literally and not interpreted as escapes
            cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", lambda _m: htmlList, cell.content)
            # Enforce left alignment if cell contains a list
            cell.alignment = "align=\"left\""

        rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else ""
        colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else ""
        # A cell without content is rendered empty, not as the text "None"
        content = cell.content if cell.content is not None else ''
        parts.append(f" <{cellTag}{rowspan}{colspan} {cell.alignment}>{content}</{cellTag}>\n")
    parts.append(" </tr>\n")
    return ''.join(parts)


def generateHtmlTableWithSpans(gridTable:str) -> str:
    """ Generate an HTML table from a Pandoc-style grid table with row and column spans.

        Args:
            gridTable: The Pandoc-style grid table.

        Returns:
            The HTML table in string format. If the grid table cannot be parsed
            then an error text is returned instead, and the traceback is printed.
    """
    try:
        gridHeader, gridBody = parseGridTableWithSpans(gridTable)
    except Exception as e:
        # Deliberate best effort: a broken table must not stop the conversion
        import traceback
        traceback.print_exc()
        return f'HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS. {e}'

    parts = ['<table>\n']

    # Only emit a header section if at least one header cell is actually rendered
    hasHeader = any(cell.rowspan != 0 and cell.colspan != 0
                    for row in gridHeader
                    for cell in row)
    if hasHeader:
        parts.append(' <thead>\n')
        parts.extend(_renderGridRow(row, 'th') for row in gridHeader)
        parts.append(" </thead>\n")

    parts.append(" <tbody>\n")
    parts.extend(_renderGridRow(row, 'td') for row in gridBody)
    parts.append(" </tbody>\n")
    parts.append("</table>")
    return ''.join(parts)
# TODO use a verbosity level instead
verbose = False
veryVerbose = False

# Output functions. They can be replaced via setScreenPrinters().
printInfo = print
printDebug = print


def setScreenPrinters(info:callable = print, debug:callable = print) -> None:
    """ Replace the functions that are used for info and debug output.

        Args:
            info: The function to call for info messages.
            debug: The function to call for debug messages.
    """
    global printInfo, printDebug
    printInfo, printDebug = info, debug


def _shortHash(value:str, length:int) -> str:
    """ Generate a short hash of a string value.

        Args:
            value: The value to hash.
            length: The length of the hash.

        Returns:
            The first `length` characters of the base64-encoded SHA-256 digest of the value.
    """
    digest = hashlib.sha256(value.encode()).digest()
    encoded = base64.b64encode(digest).decode()
    return encoded[:length]


class LineType(Enum):
    """ Represents the type of a line in the markdown file. """
    HEADING = auto()
    TEXT = auto()
    CODEFENCESTART = auto()
    CODE = auto()
    CODEFENCEEND = auto()
    LIST = auto()
    NOTE = auto()
    STANDALONEIMAGE = auto()
    TABLEHEADER = auto()
    TABLESEPARATOR = auto()
    TABLEROW = auto()
    TABLELASTROW = auto()


@dataclass
class Line:
    """ Represents a line in the markdown file. """
    text:str = '\n'
    lineType:LineType = LineType.TEXT


@dataclass
class Clause:
    """ Represents a clause in the markdown file. """
    _level:int
    _clauseNumber:str
    _title:str
    _lines:list[Line]


    @property
    def level(self) -> int:
        """ The level of the clause. """
        return self._level


    @property
    def clauseNumber(self) -> str:
        """ The clause number, or '0' if the clause has no number. """
        if self._clauseNumber:
            return self._clauseNumber
        return '0'


    @clauseNumber.setter
    def clauseNumber(self, value:str) -> None:
        """ Set the clause number. """
        self._clauseNumber = value


    @property
    def title(self) -> str:
        """ The title of the clause. """
        return self._title


    @title.setter
    def title(self, value:str) -> None:
        """ Set the title of the clause. """
        self._title = value


    @property
    def lines(self) -> list[Line]:
        """ The lines of the clause. """
        return self._lines


    @lines.setter
    def lines(self, value:list[Line]) -> None:
        """ Set the lines of the clause. """
        self._lines = value


    @property
    def linesCount(self) -> int:
        """ The number of lines in the clause. """
        return len(self.lines)


    def append(self, line:Line) -> None:
        """ Add a single line at the end of the clause.

            Args:
                line: The line to add.
        """
        self.lines.append(line)


    def extend(self, clause:'Clause') -> None:
        """ Add all lines of another clause at the end of this clause.

            Args:
                clause: The clause whose lines are added.
        """
        self.lines.extend(clause.lines)


    def asStringList(self, paddings:int = 0) -> list[str]:
        """ Return the text of the clause's lines.

            Args:
                paddings: The number of empty lines to put before the clause's lines.

            Returns:
                The padding lines followed by the text of each line.
        """
        padding = ['\n'] * paddings
        texts = [ each.text for each in self.lines ]
        return padding + texts


    def __len__(self) -> int:
        """ Return the number of characters in the clause. Leading and trailing
            whitespace of each line is not counted, so empty lines and lines that
            only contain whitespace do not contribute.

            Returns:
                The number of characters in the clause.
        """
        return sum(len(each.text.strip()) for each in self.lines)


class Footnote:
    """ Represents a footnote in the markdown file. """

    def __init__(self, id:str, line:Line) -> None:
        """ Constructor.

            Args:
                id: The id of the footnote.
                line: The line of the footnote.
        """
        # The id of the footnote
        self.id = id
        # The line of the footnote
        self.line = line
class Document:
    """ Represents the document object. It holds the clauses of the document
        and the footnotes that were found in it.
    """

    def __init__(self, clauses:'list[Clause]', footnotes:'list[Footnote]|None' = None) -> None:
        """ Constructor.

            Args:
                clauses: The clauses of the document.
                footnotes: The footnotes of the document. A new empty list is used if omitted.
        """
        self.clauses = clauses
        # Create a new list per instance. A shared default list would leak
        # footnotes from one document into another.
        self.footnotes = footnotes if footnotes is not None else []


    def splitMarkdownDocument(self,
                              ignoreTitles:'list[str]|None' = None,
                              splitLevel:int = 1,
                              ignoreUntilFirstHeading:bool = False) -> None:
        """ Split the clauses at a certain level. This is used to create the separate
            markdown files for MkDocs.

            After the split, the clauses are stored in the document object.

            Args:
                ignoreTitles: A list of titles that should be ignored. They are not included in the output.
                splitLevel: The level at which the clauses should be split.
                ignoreUntilFirstHeading: Remove all leading clauses that have no title.
        """
        result:list[Clause] = []

        # Compare titles case-insensitively
        ignored = { t.casefold() for t in (ignoreTitles or []) }

        for clause in self.clauses:

            # Check if the current clause should be ignored
            if clause.title.casefold() in ignored:
                continue

            # Add a new output clause if the current clause's level is
            # equal or less than the split level. Also do this when there is
            # no output clause yet to which the lines could be added.
            if clause.level <= splitLevel or not result:
                result.append(Clause(clause.level, clause.clauseNumber, clause.title, []))

            # Add the lines to the output clause
            result[-1].extend(clause)

        # Remove the leading clauses that have no title
        if ignoreUntilFirstHeading:
            while result and len(result[0].title) == 0:
                result.pop(0)

        self.clauses = result


    def insertFootnotes(self) -> None:
        """ Insert footnotes into the clauses. The footnotes that are referenced
            in a clause are added at the end of that clause.

            After the insertion, the clauses are stored in the document object.
        """
        printInfo('Adding footnotes to clauses')

        for clause in self.clauses:
            foundFootnotes:list[Footnote] = []
            for line in clause.lines:
                # ATTN: Only footnotes in normal text lines are checked
                if line.lineType != LineType.TEXT:
                    continue
                # Look at all references in the line, not only the first one
                for fn in MatchInlineFootnote.finditer(line.text):
                    # Find the footnote in the list of footnotes
                    for f in self.footnotes:
                        # Add each footnote only once per clause
                        if f.id == fn.groups()[0] and f not in foundFootnotes:
                            foundFootnotes.append(f)

            # Insert the footnotes at the end of the clause
            if len(foundFootnotes) > 0:
                clause.append(Line('\n', LineType.TEXT))
                for f in foundFootnotes:
                    clause.append(f.line)


    def updateLinks(self) -> None:
        """ Update the links in the clauses to the new structure. This is done by
            creating a dictionary of all links and their targets and then replacing
            the links in the clauses.

            After the update, the clauses are stored in the document object.
        """
        printInfo('Updating links in clauses')

        # Build the link target dictionary. Mapping anchor -> clause
        linkTargets:dict[str, Clause] = {}

        # Find all Markdown headers in the clauses and convert them to anchor format
        for clause in self.clauses:
            for line in clause.lines:
                if (m := matchHeader.match(line.text)):

                    # convert the header to anchor format and add it to the dictionary
                    # Remove special characters
                    # TODO move perhaps to an own function
                    anchor = m.groups()[1].strip().casefold().replace(' ', '-')
                    for c in ( '.', '(', ')', '[', ']', ':', ',', "'", '"'):
                        anchor = anchor.replace(c, '')
                    # remove html tags from the anchor
                    anchor = re.sub(matchHtmlTag, '', anchor)

                    linkTargets[f'#{anchor}'] = clause
                    if veryVerbose:
                        printDebug(f'Added Markdown anchor "{anchor}"')

        # Find all HTML anchors in the clauses and add them to the dictionary
        for clause in self.clauses:
            for line in clause.lines:
                if (anchors := matchHtmlAnchorLink.findall(line.text)):
                    for a in anchors:
                        linkTargets[f'#{a}'] = clause
                        if veryVerbose:
                            printDebug(f'Found HTML anchor "{a}" in clause "{clause.title}"')

        def _retarget(pattern, text:str, kind:str, clauseTitle:str, foldKey:bool) -> str:
            """ Rewrite the target of every link in `text` that is matched by `pattern`
                and that points to a known anchor.

                Only the matched target itself is replaced. A plain text replacement
                would rewrite a link twice when it occurs more than once in a line,
                and would also change links that merely start with the same target.
            """
            def _replace(m) -> str:
                lnk = m.group(1)
                target = linkTargets.get(lnk.casefold() if foldKey else lnk)
                if target is None:
                    return m.group(0)
                if veryVerbose:
                    printDebug(f'Updated {kind} link "{lnk}" in clause "{clauseTitle}"')
                whole = m.group(0)
                offset = m.start(0)
                return f'{whole[:m.start(1) - offset]}../{target.clauseNumber}/#{lnk[1:]}{whole[m.end(1) - offset:]}'

            return pattern.sub(_replace, text)

        for clause in self.clauses:
            for line in clause.lines:
                # Replace the html links
                line.text = _retarget(matchHtmlLink, line.text, 'HTML', clause.title, False)
                # Replace the markdown links. The anchors in the dictionary are
                # lower case, so the lookup is done with the converted target.
                line.text = _retarget(markdownLink, line.text, 'Markdown', clause.title, True)


    def updateNotes(self) -> None:
        """ Update the notes in the clauses to the mkDocs notes version.

            After the update, the clauses are stored in the document object.
        """
        printInfo('Updating notes in clauses')

        for clause in self.clauses:
            lines:list[Line] = []
            inNote = False
            for line in clause.lines:
                if line.lineType == LineType.NOTE:
                    if not inNote:
                        # Start of a note: add an empty line and the note marker
                        lines.append(Line('\n', LineType.TEXT))
                        lines.append(Line('!!! note\n', LineType.NOTE))
                        inNote = True
                    # The note text is indented by a tab, the note prefix is removed
                    lines.append(Line(f"\t{re.sub(matchNoteStart, '', line.text)}", LineType.NOTE))
                    if verbose:
                        printDebug(f'Converted note in clause "{clause.title}"')
                else:
                    if inNote:
                        # End of a note: add an empty line
                        lines.append(Line('\n', LineType.TEXT))
                        inNote = False
                    lines.append(line)
            clause.lines = lines
def _gridTableToLines(gridTable:str) -> list[Line]:
    """ Convert a grid table to an HTML table and return it as a list of lines.

        Args:
            gridTable: The collected source lines of the grid table.

        Returns:
            One Line of type TABLEROW for each line of the generated HTML.
    """
    html = generateHtmlTableWithSpans(gridTable)
    if veryVerbose:
        printDebug(html)
    # Make sure that the text that follows the table starts on a new line
    if not html.endswith('\n'):
        html += '\n'
    return [ Line(row, LineType.TABLEROW) for row in html.splitlines(keepends = True) ]


def analyseMarkdown(filename:str) -> Document:
    """ Analyse the markdown file and split it into clauses.

        Args:
            filename: The name of the markdown file.

        Returns:
            The document object.
    """

    printInfo(f'Analyzing "{filename}"')

    # Read the file.
    # Note: We use utf-8 and replace errors to avoid problems with special or unknown characters.
    with open(filename, 'r', encoding = 'utf-8', errors = 'replace') as file:
        inLines = file.readlines()

    # The list of clauses. The first clause contains the text before the first heading.
    outClauses:list[Clause] = [Clause(0, '', '', [])]
    footnotes:list[Footnote] = []

    # Go through the lines and detect headers and codefences
    inCodefence = False
    inTable = False
    tableHasSeparator = False
    inGridTable = False
    gridTable = ''
    for line in inLines:

        # Detect and handle codefences
        # For the moment we support only codefences that start and end
        # with 3 backticks. This is the most common way to define codefences.
        # Note, that longer codefences are allowed by the markdown specification.

        if matchCodefenceStart.match(line) and not inCodefence:
            inCodefence = True
            outClauses[-1].append(Line(line, LineType.CODEFENCESTART))
            continue
        if matchCodefenceEnd.match(line):
            inCodefence = False
            outClauses[-1].append(Line(line, LineType.CODEFENCEEND))
            continue
        if inCodefence:
            outClauses[-1].append(Line(line, LineType.CODE))
            continue

        # Detect and handle tables
        if matchTable.match(line) and not inTable and not inGridTable:
            inTable = True
            outClauses[-1].append(Line(line, LineType.TABLEHEADER))
            continue
        if inTable:
            if matchTableSeparator.match(line) and not tableHasSeparator:
                outClauses[-1].append(Line(line, LineType.TABLESEPARATOR))
                tableHasSeparator = True
                continue
            elif matchTable.match(line):
                outClauses[-1].append(Line(line, LineType.TABLEROW))
                continue
            else:
                inTable = False
                tableHasSeparator = False
                # Mark the previous line as the last row in the table
                outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW
                # continue with other matches

        # Detect grid tables and convert them to html table
        if matchGridTable.match(line) and not inGridTable:
            inGridTable = True
            gridTable += line
            continue
        if inGridTable:
            if matchGridTableHeaderSeparator.match(line) or matchGridTableBodySeparator.match(line):
                gridTable += line
                continue
            elif matchTable.match(line):
                gridTable += line
                continue
            else:
                # The grid table is complete. Convert it and add the result line by line.
                # TODO move this outside of the analyseMarkdown function !!!
                inGridTable = False
                for tableLine in _gridTableToLines(gridTable):
                    outClauses[-1].append(tableLine)
                gridTable = ''
                # continue with other matches

        # Detect notes
        # Notes are lines that start with a '>'.
        if matchNote.match(line):
            outClauses[-1].append(Line(line, LineType.NOTE))
            continue

        # Detect footnotes
        # Footnotes are lines of the form '[^id]:'. They are collected separately.
        if (_fn := matchFootnote.match(line)):
            footnotes.append(Footnote(_fn.groups()[0], Line(line, LineType.TEXT)))
            continue

        # Detect images on a single line
        if matchStandAloneImage.match(line):
            outClauses[-1].append(Line(line, LineType.STANDALONEIMAGE))
            continue

        # Detect headers
        _lineType = LineType.TEXT
        if (m := matchHeader.match(line)):
            # Add a new clause
            clauseTitle = m.groups()[1].strip()
            clauseTitle = re.sub(matchHtmlTag, '', clauseTitle)
            headerNumber = matchHeaderNumber.search(clauseTitle)
            outClauses.append(Clause(len(m.groups()[0]),    # level
                                     headerNumber.group() if headerNumber else _shortHash(clauseTitle, 6),
                                     clauseTitle,
                                     []))
            _lineType = LineType.HEADING

        # Just add the line to the current clause as text
        outClauses[-1].append(Line(line, _lineType))

    # The file may end inside a table. In that case no following line has
    # closed the table, so this needs to be done here.
    if inGridTable and gridTable:
        for tableLine in _gridTableToLines(gridTable):
            outClauses[-1].append(tableLine)
    if inTable and outClauses[-1].lines:
        outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW

    return Document(outClauses, footnotes)
import re


# Compiled regular expressions shared by the markdown processing code.
# Every pattern is compiled once, at import time, and exported by name.

# --- Lists -------------------------------------------------------------------
# A line that starts with exactly two whitespace characters followed by '-'.
match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
# A list item: optional indentation, a bullet ('-', '*', '+') or a number
# followed by '.', whitespace, then the item text.
# Named groups: 'marker' (the bullet/number) and 'content' (the item text).
# The named-group syntax is '(?P<name>...)'; without the '?' the group would
# match the literal text 'P<name>' and the pattern could never match a list.
matchListInContent = re.compile(r'^(?:\s*(?P<marker>[-*+]|\s*\d+\.))\s+(?P<content>.+)$', re.IGNORECASE)

# --- Footnotes ---------------------------------------------------------------
# Footnote definition '[^id]:'; group 1 is the footnote id.
matchFootnote = re.compile(r'\[\^([^\]]*)\]:', re.IGNORECASE)
# Footnote reference '[^id]'; group 1 is the footnote id.
MatchInlineFootnote = re.compile(r'\[\^([^\]]*)\]', re.IGNORECASE)

# --- Links and HTML ----------------------------------------------------------
# '<a name="...">...</a>'; group 1 is the value of the name attribute.
matchHtmlAnchorLink = re.compile(r'<a\s+name="([^"]*)">[^<]*</a>', re.IGNORECASE)
# '<a href="...">...</a>'; group 1 is the value of the href attribute.
matchHtmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE)
# Any '<...>' tag.
matchHtmlTag = re.compile(r'<[^>]*>', re.IGNORECASE)
# Markdown link '[text](#target)' that is not preceded by '!' (i.e. not an
# image); group 1 is the '#target' part. Note that the pattern consumes the
# character in front of the '[' as well.
markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)

# --- Code fences -------------------------------------------------------------
matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE)
matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE)

# --- Grid tables ('+---+---+' style) -----------------------------------------
# First border line of a grid table. Any amount of trailing whitespace is
# accepted (previously exactly one character was required, so a border with
# trailing blanks or without a final newline was not recognised).
matchGridTable = re.compile(r'^\s*\+-.*\+\s*$', re.IGNORECASE)
# A line containing a '+---+' / '+:--+' run (full or partial body separator).
matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE)
# A cell fragment that consists only of '-' and ':' characters.
matchGridTableBodySeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
# A line containing a '+===+' / '+:==+' run (header separator).
matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE)
# A complete separator line built only from '+', '-', ':' and '='.
matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE)

# --- Pipe tables ('| a | b |' style) -----------------------------------------
matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE)
matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE)

# --- Headers -----------------------------------------------------------------
# Group 1 is the run of '#' characters, group 2 the rest of the line.
matchHeader = re.compile(r'(#+)\s+(.*)', re.IGNORECASE)
# A letter or digit, optionally followed by digits and '.<digits>' groups.
matchHeaderNumber = re.compile(r'\b[A-Za-z0-9]\d*(\.\d+)*\b', re.IGNORECASE)

# --- Notes -------------------------------------------------------------------
# A line starting with '>'.
matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE)
# The '>' prefix together with an optional 'note' keyword and optional ':'.
matchNoteStart = re.compile(r'^\s*>\s*(note)?\s*[:]?\s*', re.IGNORECASE)

# --- Images ------------------------------------------------------------------
# A line starting with '![alt](target)'; group 1 is the target.
matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECASE)

# --- Inline emphasis ---------------------------------------------------------
# The delimiters must not be adjacent to non-whitespace on the outside, and the
# closing delimiter must not be escaped with a backslash. The emphasised text
# is available as the named group 'text'.
matchBold = re.compile(r'(?<!\S)(\*\*|__)(?P<text>.+?)(?<!\\)\1(?!\S)')
matchItalic = re.compile(r'(?<!\S)(\*|_)(?P<text>.+?)(?<!\\)\1(?!\S)')
a/toMkdocs/toMkdocs.py +++ b/toMkdocs/toMkdocs.py @@ -1,1036 +1,121 @@ # # toMkdocs.py # -# (c) 2024 by Andreas Kraft +# (c) 2024 by Andreas Kraft & Miguel Angel Reina Ortega +# License: BSD 3-Clause License. See the LICENSE file for further details. + # # This script converts oneM2M spec markdown file to a mkdocs compatible # directory structure. # from __future__ import annotations -import logging -from enum import Enum, auto -import argparse, re, os, shutil, hashlib, base64 -from dataclasses import dataclass +import argparse, os, shutil from rich import print +from makrdownTools import Line, Document, analyseMarkdown, setScreenPrinters +from regexMatches import match2spaceListIndention verbose = False veryVerbose = False -class LineType(Enum): - """ Represents the type of a line in the markdown file. """ - HEADING = auto() - TEXT = auto() - CODEFENCESTART = auto() - CODE = auto() - CODEFENCEEND = auto() - LIST = auto() - NOTE = auto() - STANDALONEIMAGE = auto() - TABLEHEADER = auto() - TABLESEPARATOR = auto() - TABLEROW = auto() - TABLELASTROW = auto() - - -@dataclass -class Line: - """ Represents a line in the markdown file. """ - text:str = '\n' - lineType:LineType = LineType.TEXT - - - -@dataclass -class Clause: - """ Represents a clause in the markdown file. """ - _level:int - _clauseNumber:str - _title:str - _lines:list[Line] - - - @property - def level(self) -> int: - """ Return the level of the clause. """ - return self._level - - - @property - def clauseNumber(self) -> str: - """ Return the clause number. """ - return self._clauseNumber if self._clauseNumber else '0' - - - @clauseNumber.setter - def clauseNumber(self, value:str) -> None: - """ Set the clause number. """ - self._clauseNumber = value - - - @property - def title(self) -> str: - """ Return the title of the clause. """ - return self._title - - - @title.setter - def title(self, value:str) -> None: - """ Set the title of the clause. 
""" - self._title = value - - - @property - def lines(self) -> list[Line]: - """ Return the lines of the clause. """ - return self._lines - - - @lines.setter - def lines(self, value:list[Line]) -> None: - """ Set the lines of the clause. """ - self._lines = value - - - @property - def linesCount(self) -> int: - """ Return the number of lines in the clause. - - Returns: - The number of lines in the clause. - """ - return len(self.lines) - - - def append(self, line:Line) -> None: - """ Append a line to the clause. - - Args: - line: The line to append. - """ - self.lines.append(line) - - - def extend(self, clause:Clause) -> None: - """ Extend the clause with the lines of another clause. - - Args: - clause: The clause to extend with. - """ - self.lines.extend(clause.lines) - - - def asStringList(self, paddings:int = 0) -> list[str]: - """ Return the clause as a list of strings. - - Args: - paddings: The number of empty lines to add before the clause. - Returns: - The clause's lines as a list of strings. - """ - return [ '\n' for _ in range(paddings) ] + [ l.text for l in self.lines ] - - - def __len__(self) -> int: - """ Return the number of characters in the clause. This does not include - empty lines or lines that contain only whitespace. - - Returns: - The number of characters in the clause. - """ - return sum([ len(l.text.strip()) for l in self.lines ]) - - -class Footnote: - """ Represents a footnote in the markdown file. """ - def __init__(self, id:str, line:Line) -> None: - self.id = id - self.line = line - - -class Document: - """ Represents the document object. """ - clauses:list[Clause] = [] - footnotes:list[Footnote] = [] - - def __init__(self, clauses:list[Clause], footnotes:list[Footnote]) -> None: - self.clauses = clauses - self.footnotes = footnotes - - - def splitMarkdownDocument(self, - ignoreTitles:list[str] = [], - splitLevel:int = 1, - ignoreUntilFirstHeading:bool = False) -> None: - """ Split the clauses at a certain level. 
This is used to create the separate - markdown files for MkDocs. - - After the split, the clauses are stored in the document object. - - Args: - ignoreTitles: A list of titles that should be ignored. They are not included in the output. - splitLevel: The level at which the clauses should be split. - ignoreUntilFirstHeader: Ignore all clauses until the first heading. - - """ - result:list[Clause] = [] - - ignoreTitles = [ t.casefold() for t in ignoreTitles ] # convert to lower case - - for clause in self.clauses: - level = clause.level - - # Check if the current clause should be ignored - if clause.title.casefold() in ignoreTitles: - continue - - # Add a new output clause if the current clause's level is - # equal or less than the split level - if clause.level <= splitLevel: - result.append(Clause(level, clause.clauseNumber, clause.title, [])) - - # Add the lines to the output clause - result[-1].extend(clause) - - # Remove the first clause if it has no title - if ignoreUntilFirstHeading: - while len(result[0].title) == 0: - result.pop(0) - - self.clauses = result - - - def insertFootnotes(self) -> None: - """ Insert footnotes into the clauses. - - After the insertion, the clauses are stored in the document object. - - """ - print(f'[green]Adding footnotes to clauses') - - for clause in self.clauses: - foundFootnotes:list[Footnote] = [] - for line in clause.lines: - # ATTN: Only footnotes in normal text lines are checked - - if line.lineType == LineType.TEXT and (fn := _inlineFootnote.search(line.text)): - # Find the footnote in the list of footnotes - for f in self.footnotes: - if f.id == fn.groups()[0]: - foundFootnotes.append(f) - - # Insert the footnotes at the end of the clause - if len(foundFootnotes) > 0: - clause.append(Line('\n', LineType.TEXT)) - for f in foundFootnotes: - clause.append(f.line) - - - def updateLinks(self) -> None: - """ Update the links in the clauses to the new structure. 
This is done by - creating a dictionary of all links and their targets and then replacing - the links in the clauses. - - After the update, the clauses are stored in the document object. - """ - print(f'[green]Updating links in clauses') - - # Build the link target dictionary. Mapping anchor -> clause - linkTargets:dict[str, Clause] = {} - - # Find all Markdown headers in the clauses and convert them to anchor format - for i, clause in enumerate(self.clauses): - # Find all headers in the clause - for line in clause.lines: - if (m := _matchHeader.match(line.text)): - - # convert the header to anchor format and add it to the dictionary - # Remove special characters - # TODO move perhaps to an own function - anchor = m.groups()[1].strip().casefold().replace(' ', '-') - for c in ( '.', '(', ')', '[', ']', ':', ',', "'", '"'): - anchor = anchor.replace(c, '') - # remove html tags from the anchor - anchor = re.sub(_htmlTag, '', anchor) - - linkTargets[f'#{anchor}'] = clause - if veryVerbose: - print(f'[dim]Added Markdown anchor "{anchor}"') - - # Find all HTML anchors in the clauses and add them to the dictionary - for i, clause in enumerate(self.clauses): - for line in clause.lines: - if (anchors := _htmlAnchorLink.findall(line.text)): - for a in anchors: - linkTargets[f'#{a}'] = clause - if veryVerbose: - print(f'[dim]Found HTML anchor "{a}" in clause "{clause.title}"') - - # Replace the html links - for clause in self.clauses: - for i, line in enumerate(clause.lines): - if (links := _htmlLink.findall(line.text)): - for lnk in links: - if lnk in linkTargets: - line.text = clause.lines[i].text = line.text.replace(lnk, f'../{linkTargets[lnk].clauseNumber}/#{lnk[1:]}') # Update the current line as well - if veryVerbose: - print(f'[dim]Updated HTML link "{lnk}" in clause "{clause.title}"') - - # Replace the markdown links - for clause in self.clauses: - for i, line in enumerate(clause.lines): - if (links := _markdownLink.findall(line.text)): - # Replace the old link 
targets with converted - # (lower case) versions that point to the output files - for lnk in links: - _lnk =lnk.casefold() - if _lnk in linkTargets: - line.text = clause.lines[i].text = line.text.replace(lnk, f'../{linkTargets[_lnk].clauseNumber}/#{lnk[1:]}') # Update the current line as well - if veryVerbose: - print(f'[dim]Updated Markdown link "{lnk}" in clause "{clause.title}"') - - - def updateNotes(self) -> None: - """ Update the notes in the clauses to the mkDocs notes version. - - After the update, the clauses are stored in the document object. - """ - print(f'[green]Updating notes in clauses') - - for clause in self.clauses: - lines:list[Line] = [] - inNote = False - for line in clause.lines: - if line.lineType == LineType.NOTE: - if not inNote: - lines.append(Line('\n', LineType.TEXT)) - lines.append(Line('!!! note\n', LineType.NOTE)) - inNote = True - lines.append(Line(f"\t{re.sub(_matchNoteStart, '', line.text)}", LineType.NOTE)) - if verbose: - print(f'[dim]Converted note in clause "{clause.title}"') - else: - if inNote: - lines.append(Line('\n', LineType.TEXT)) - inNote = False - lines.append(line) - clause.lines = lines - - - def prepareForMkdocs(self, includeHangingParagraphs:bool = False) -> None: - """ Prepare the clauses for MkDocs. This includes removing the heading - from the clauses and marking the clauses that are only for navigation. - - After the preparation, the clauses are stored in the document object. - - Args: - includeHangingParagraphs: Include hanging paragraphs in the output. - """ - - # Remove the heading from the lines. The heading is the first line - # in the clause. This is done because MkDocs repeats the heading when - # displaying the page. - for clause in self.clauses: - if clause.linesCount > 0: - clause.lines.pop(0) - # Also, remove the first empty lines if they exist - while clause.linesCount > 0 and clause.lines[0].text.strip() == '': - clause.lines.pop(0) - - # Detect and handle hanging paragraphs. 
This is extra text in a clause, which - # has sub-clauses. This text is not allowed in oneM2M specifications. - for i, clause in enumerate(self.clauses): - if clause.level > 0 and clause.linesCount > 0: - # Check if there is a sub-clause in the next clause - if i + 1 < len(self.clauses) and self.clauses[i+1].level > clause.level: - # This is a hanging paragraph. Remove the text from the current clause. - print(f'[yellow]Hanging paragraph in clause "{clause.title}" {"(removed)" if not includeHangingParagraphs else "(kept)"}') - if not includeHangingParagraphs: - self.clauses[i].lines = [] - else: - self.clauses[i].lines = [Line("<mark>Editor note: This is a hanging paragraph and it must be moved to its own clause</mark>")] + [Line()] + self.clauses[i].lines - - # Repair wrong markdown for indented lines. - # Add 2 spaces to existing 2-space indentions - for clause in self.clauses: - for i, line in enumerate(clause.lines): - if _match2spaceListIndention.match(line.text): - clause.lines[i].text = ' ' + line.text - - - def writeClausesMkDocs(self, filename:str, navTitle:str, addNavTitle:bool = False) -> None: - """ Write the clauses to separate files and create a navigation file. - - Args: - filename: The name of the original markdown file. - navTitle: The title of the navigation entry. This is used to determine the directories. - addNavTitle: Add the title as an extra navigation level to the navigation file. - """ - - print(f'[green]Writing clauses to files') - # create directory first - os.makedirs(f'{os.path.dirname(filename)}/{navTitle}', exist_ok = True) - - # Write the files - for i, f in enumerate(self.clauses): - # write to single files, even empty ones - if verbose: - print(f'[dim]Writing "{f.clauseNumber}.md" - "{f.title}"') - with open(f'{os.path.dirname(filename)}/{navTitle}/{f.clauseNumber}.md', 'w') as file: - # Add one empty line before the clause. 
This is done to avoid - # a bug in MkDocs that does not display the first line of a clause - # if it contains a colon. It does not matter otherwise if the line - # is empty or not. - file.writelines(f.asStringList(1)) - - # write nav.yml file - print(f'[green]Writing "_nav.yml"') - indentation = ' ' if addNavTitle else '' # TODO make number of spaces configurable - with open(f'{os.path.dirname(filename)}/_nav.yml', 'w') as file: - if veryVerbose: - print(f'[dim]Writing navigation file') - if addNavTitle: - file.write(f'{indentation}- {navTitle}:\n') - for i, f in enumerate(self.clauses): +def prepareForMkdocs(document:Document, includeHangingParagraphs:bool = False) -> None: + """ Prepare the clauses for MkDocs. This includes removing the heading + from the clauses and marking the clauses that are only for navigation. - if not f.title: - print("continue") - continue - - # TODO handle if the next clause is more than one level deeper - - _title = f.title.replace("'", '"') - nextClause = self.clauses[i+1] if i+1 < len(self.clauses) else None - if nextClause is None or nextClause.level <= f.level: - file.write(f"{indentation}{' '*f.level}- '{_title}': '{navTitle}/{f.clauseNumber}.md'\n") - else: - file.write(f"{indentation}{' '*f.level}- '{_title}':\n") - if len(f) > 0: - file.write(f"{indentation}{' '*nextClause.level}- 'Hanging paragraph': '{navTitle}/{f.clauseNumber}.md'\n") - - - - -_matchHeader = re.compile(r'(#+)\s+(.*)', re.IGNORECASE) -_matchHeaderNumber = re.compile(r'\b[A-Za-z0-9]\d*(\.\d+)*\b', re.IGNORECASE) -_matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE) -_matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE) -_matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE) -_matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECASE) -_matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE) -_matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE) -_matchGridTable = re.compile(r'^\s*\+-.*\+\s$', 
re.IGNORECASE) -_matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE) -_matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE) -_matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE) -_matchGridTableBodySeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE) -_match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE) -_matchListInContent = re.compile(r'^(?:\s*(P<marker>[-*+]|\s*\d+\.))\s+(P<content>.+)$', re.IGNORECASE) -_markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE) -_htmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE) -_htmlAnchorLink = re.compile(r'<a\s+name="([^"]*)">[^<]*</a>', re.IGNORECASE) -_htmlTag = re.compile(r'<[^>]*>', re.IGNORECASE) -_matchNoteStart = re.compile(r'^\s*>\s*(note)?\s*[:]?\s*', re.IGNORECASE) -_footnote = re.compile(r'\[\^([^\]]*)\]:', re.IGNORECASE) -_inlineFootnote = re.compile(r'\[\^([^\]]*)\]', re.IGNORECASE) - - -# TODO handle multiple nav levels (left bar) better (make conifgurable) - - -def shortHash(value:str, length:int) -> str: - """ Generate a short hash of a string value. + After the preparation, the clauses are stored in the document object. Args: - value: The value to hash. - length: The length of the hash. - - Returns: - The hash. - """ - return base64.b64encode( - hashlib.sha256( - value.encode() - ).digest() - ).decode()[:length] - -def parse_pandoc_table_with_spans(pandoc_table): + document: The document object. + includeHangingParagraphs: Include hanging paragraphs in the output. """ - Parse a Pandoc-style grid table into a structure for HTML conversion with rowspan and colspan. - - :param pandoc_table: String of the Pandoc-style grid table. - :return: List of lists representing the table with metadata for spans. - """ - # Split the input into lines - lines = [line.strip() for line in pandoc_table.strip().split("\n")] - - class Cell: - """ Represents the document object. 
""" - content: str - rowspan: int - colspan: int - colspan_adjusted: bool - alignment: str - position: int - list_flag: bool - auxiliar_index: int - - def __init__(self): - self.content = None - self.rowspan = 0 - self.colspan = 0 - self.colspan_adjusted = False - self.alignment = "align=\"center\"" - self.position = None - self.list_flag = False - - def set_alignment(self): - header_delimiter_index = 0 - while header_delimiter_index in range(len(default_alignments)) and self.position > header_delimiter_positions[header_delimiter_index]: - header_delimiter_index += 1 - if header_delimiter_index in range(len(default_alignments)): - if self.position < header_delimiter_positions[header_delimiter_index]: - self.alignment = default_alignments[header_delimiter_index] - elif self.position == header_delimiter_positions[header_delimiter_index]: - self.alignment = default_alignments[header_delimiter_index] - header_delimiter_index += 1 - else: - raise ValueError("Invalid table formatting") - - class Row(): - """ Represents a row in the markdown file. """ - cells:list[Cell] = [] - - def __init__(self, length: int = 1) -> None: - self.cells = [Cell() for _ in range(length)] - - def __getitem__(self, item): - return self.cells[item] - - def __setitem__(self, key, value): - self.cells[key] = value - - class RowTracker(): - """ Represents the document object. 
""" - def __init__(self, items): - self.rowTracker = [0 for _ in range(items)] - - def __getitem__(self, item): - return self.rowTracker[item] - - def __setitem__(self, key, value): - self.rowTracker[key] = value - - # Detect separator lines by pattern (it does not take into account partial separators - def is_separator(line): - return _matchGridTableSeparator.match(line) - - # Set content on the cell - concatenating multilines, flagging lists - def handling_content(cell, content): - if cell.content is None: - cell.rowspan += 1 - cell.colspan += 1 - if content.strip().startswith("- "): # List - cell.list_flag = True - #print(content) - cell.content = content.strip() + "\n" # Add newline to know when the list element ends - elif cell.list_flag and content.strip() != "": # any other content when handling list is concatenated to the last list element - cell.content += content.strip() + "\n" - elif content.strip == "": # separation between list and other paragraph - cell.list_flag = False - cell.content += "\n" #if not cell['content'].endswith("\n") else "" - else: - cell.content = re.sub(r'\\\s*$', "\n", content.strip()) - else: - if content.strip().startswith("- "): # List - if not cell.list_flag: - cell.content += "\n" - #cell['content'] = cell['content'].strip("\n") - cell.list_flag = True - cell.content += content.strip() + "\n" # Add newline to know when the list element ends - elif cell.list_flag and content.strip() != "": # any other content when handling list is concatenated to the last list element - cell.content = cell.content.strip("\n") - cell.content += " " + content.strip() + "\n" - elif content.strip() == "": # separation between list and other paragraph - cell.list_flag = False - #content = re.sub(r'\\\s*$', "\n", content.strip()) - cell.content += "\n" if not cell.content.endswith("\n") else "" - else: - content = re.sub(r'\\\s*$', "\n", content.strip()) - cell.content += " " + content - #print(cell['content']) - return cell - - # Adjust colspan of a 
cell - def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions): - for j in range(column_index, number_of_parts): - delimiter_start = None - col_i= column_index - while delimiter_start == None: - delimiter_start = row[col_i - 1].position if col_i > 0 else 0 - col_i -= 1 - positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "|+" if delimiter in line[delimiter_start + 1:]] - position = min(positions) if positions else -1 - if position > delimiter_positions[j]: # Colspan to be increased - row[column_index].colspan += 1 - if position == delimiter_positions[len(delimiter_positions) - 1]: # last cell in row, adjust colspan to get max number columns - colspan_allocated = row[column_index].colspan - #for cell_index in range(number_of_parts): - # colspan_allocated += row[cell_index].colspan - row[column_index].colspan += number_of_columns - colspan_allocated - column_index - elif position < delimiter_positions[j]: - raise ValueError("Wrong cell formatting") - else: - break - return row[column_index] - - separator_indices = [i for i, line in enumerate(lines) if is_separator(line)] - - print(separator_indices) - if not separator_indices: - raise ValueError("No valid separators found in the provided Pandoc table.") - - # Calculate max number of columns - delimiter_positions = [] - number_of_columns = 0 - for separator_index in separator_indices: - if lines[separator_index].count("+") - 1 > number_of_columns: - number_of_columns = lines[separator_index].count("+") - 1 - delimiter_positions = [] - for j in range(number_of_columns): - delimiter_positions_start = delimiter_positions[j - 1] if j != 0 else 0 - del_positions = [lines[separator_index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[separator_index][delimiter_positions_start + 1:]] - delimiter_positions.append(min(del_positions) if del_positions else -1) - has_header = False - header_delimiter_positions = [] - for 
index in separator_indices: - if _matchGridTableHeaderSeparator.match(lines[index]): - has_header = True - header_separator_index = index - header_rows = [] - parts = re.split(r"\+", lines[index].strip("+")) - default_alignments = [] - #Calculate default alignments and positions of delimiters - for part_index in range(len(parts)): - if parts[part_index].startswith(":") and not parts[part_index].endswith(":"): - default_alignments.append("align=\"left\"") - elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"): - default_alignments.append("align=\"right\"") - else: - default_alignments.append("align=\"center\"") - # Delimiter position - delimiter_positions_start = delimiter_positions[part_index - 1] if part_index != 0 else 0 - del_positions = [lines[index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[index][delimiter_positions_start + 1:]] - header_delimiter_positions.append(min(del_positions) if del_positions else -1) - - data_rows = [] - for row in range(len(separator_indices) - 1): - rows = [] - rows_tracker = [] - in_data_row = False - start, end = separator_indices[row], separator_indices[row + 1] - row_lines = lines[start:end] # Lines between separators including separator line start as it gives information about the number of columns of the row - if row_lines: - # Combine multiline content into single strings for each cell - for line in row_lines: - if is_separator(line) and not in_data_row: - in_data_row = True - parts = re.split(r"\s*\+\s*", line.strip("+")) - delimiter_index = 0 - # Determine the alignment of the cell - In order to replicate Pandoc's behaviour (do not support of alignment colons on separator lines (just header separator) - # we need to assign the default alignment as defined in the header separator line - # We may not need the code below, as that supports alignment per cell and row - #alignments = [] - #for part_index in range(len(parts)): - # if 
parts[part_index].startswith(":") and not parts[part_index].endswith(":"): - # alignments.append("align=\"left\"") - # elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"): - # alignments.append("align=\"right\"") - # else: - # alignments.append("align=\"center\"") - rows.append(Row(number_of_columns)) - #rows_tracker = [RowTracker() for _ in range(number_of_columns)] - rows_tracker = RowTracker(number_of_columns) - i = 0 - for j in range(len(parts)): - if i in range(number_of_columns): - delimiter_index += len(parts[j]) + 1 - # Set position - rows[-1][i].position = delimiter_index # Position of cell delimiter + - # Set alignment as defined by header separator line - rows[-1][i].set_alignment() - while delimiter_index > delimiter_positions[i]: - i += 1 - i += 1 - - elif in_data_row: - # Regular data row or partial separator - if _matchGridTableBodySeparator.match(line): # Partial separator - cells_content = re.split(r"[\|\+]", line.strip("|").strip("+")) # (?<!\\)[\|\+] - #Add another row, set delimiters for each cell - rows.append(Row(number_of_columns)) - aux_delimiter_index = 0 - auxiliar_cell_index = 0 - for i in range(len(cells_content)): - if auxiliar_cell_index in range(number_of_columns): - aux_delimiter_index += len(cells_content[i]) + 1 - rows[-1][auxiliar_cell_index].position = aux_delimiter_index # Position of cell delimiter + - rows[-1][auxiliar_cell_index].set_alignment() - while aux_delimiter_index > delimiter_positions[auxiliar_cell_index]: - auxiliar_cell_index += 1 - auxiliar_cell_index += 1 - if len(cells_content) <= number_of_columns: # Colspan: Positions of | with respect to + need to be determined - column_index = 0 - for i in range(len(cells_content)): - if _matchGridTableBodySeparatorLine.match(cells_content[i]): # A new row is to be added - rows_tracker[column_index] += 1 - rows[rows_tracker[column_index]][column_index].list_flag = False - #auxiliar_rows[-1]['use_auxiliar_row'][i] = True - #if 
cells[i].startswith(":") and not cells[i].endswith(":"): - # auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"left\"" - #elif not cells[i].startswith(":") and cells[i].endswith(":"): - # auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"right\"" - #else: - # auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"center\"" - column_forward = 0 - for del_index in range(column_index, len(delimiter_positions)): - if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[del_index]: - column_forward += 1 - rows_tracker[column_index + column_forward - 1] += 1 if column_forward > 1 else 0 - column_index += column_forward - continue - else: - # Handle content of the cell - rows[rows_tracker[column_index]][column_index] = handling_content(rows[rows_tracker[column_index]][column_index], cells_content[i]) - rows[rows_tracker[column_index]][column_index].rowspan += 1 - if not rows[rows_tracker[column_index]][column_index].colspan_adjusted: - rows[rows_tracker[column_index]][column_index].colspan_adjusted = True - # TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator - rows[rows_tracker[column_index]][column_index] = adjust_colspan(rows[rows_tracker[column_index]], column_index, number_of_columns, line, number_of_columns, delimiter_positions) - - if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[column_index]: - column_index += rows[rows_tracker[column_index]][column_index].colspan if rows[rows_tracker[column_index]][column_index].colspan != 0 else 1 - continue - - else: - raise ValueError("More cells than columns found") - else: # Data row - cells_content = re.split(r"\s*\|\s*", line.strip("|")) - column_index = 0 - if len(cells_content) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined - for i in range(len(cells_content)): - # Handle content of the cell - 
rows[rows_tracker[column_index]][column_index] = handling_content(rows[rows_tracker[column_index]][column_index], cells_content[i]) - if not rows[rows_tracker[column_index]][column_index].colspan_adjusted: - rows[rows_tracker[column_index]][column_index].colspan_adjusted = True - #TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator - rows[rows_tracker[column_index]][column_index] = adjust_colspan(rows[rows_tracker[column_index]], column_index, number_of_columns, line, number_of_columns, delimiter_positions) - if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[column_index]: - column_index += rows[rows_tracker[column_index]][column_index].colspan # Move forward index i + # Remove the heading from the lines. The heading is the first line + # in the clause. This is done because MkDocs repeats the heading when + # displaying the page. + for clause in document.clauses: + if clause.linesCount > 0: + clause.lines.pop(0) + # Also, remove the first empty lines if they exist + while clause.linesCount > 0 and clause.lines[0].text.strip() == '': + clause.lines.pop(0) - elif len(cells_content) == number_of_columns: # Simple row - for i in range(len(cells_content)): - rows[rows_tracker[i]][i] = handling_content(rows[rows_tracker[i]][i], cells_content[i]) - else: - raise ValueError("More cells than columns found") + # Detect and handle hanging paragraphs. This is extra text in a clause, which + # has sub-clauses. This text is not allowed in oneM2M specifications. + for i, clause in enumerate(document.clauses): + if clause.level > 0 and clause.linesCount > 0: + # Check if there is a sub-clause in the next clause + if i + 1 < len(document.clauses) and document.clauses[i+1].level > clause.level: + # This is a hanging paragraph. Remove the text from the current clause. 
+ print(f'[yellow]Hanging paragraph in clause "{clause.title}" {"(removed)" if not includeHangingParagraphs else "(kept)"}') + if not includeHangingParagraphs: + document.clauses[i].lines = [] else: - raise ValueError("No separator line found for row starting") - - if has_header and start >= header_separator_index: # table_row and auxiliar_row are part of data_rows - for body_row in rows: - data_rows.append(body_row.cells) - elif has_header and start < header_separator_index: # table_row and auxiliar_row are part of header_rows - for header_row in rows: - header_rows.append(header_row.cells) - - #print(header_rows) - #print(data_rows) - # Check if there are any data rows - if not data_rows and not header_rows: - raise ValueError("No valid rows found in the provided Pandoc table.") - - # Format text - for rows in [header_rows, data_rows]: - bold = "<strong>" - italic = "<i>" - for row in rows: - for cell in row: - if cell.content is not None: - # Replacing "<" by < - #cell.content = cell.content.replace("<", "<") - - #Bold - for bold_characters in ["**", "__"]: - while cell.content.find(bold_characters) != -1: - cell.content = cell.content.replace(bold_characters, bold, 1) - if bold == "<strong>": - bold = "</strong>" - else: - bold = "<strong>" - #Italic - while cell.content.find("_") != -1 and cell.content.find("\_") == -1: - cell.content = cell.content.rstrip() .replace("_", italic, 1) - if italic == "<i>": - italic = "</i>" - else: - italic = "<i>" - while cell.content.find("\_") != -1: - cell.content = cell.content.rstrip().replace("\_", "_", 1) - - # Correct newlines characters - for row in header_rows: - for cell in row: - cell.content = cell.content.replace("\n", "<br />") if cell.content is not None else None - for row in data_rows: - for cell in row: - cell.content = cell.content.replace("\n", "<br />") if cell.content is not None else None - - # Checking that the grid is correct Not too much tested - need to take into account rowspan of previous rows - 
forward_rowspan = [] - for row_index in range(len(header_rows)): - if len(forward_rowspan) == 0: - forward_rowspan = [0 for _ in range(len(header_rows[row_index]))] - sum = 0 - for cell_index in range(len(header_rows[row_index])): - sum += header_rows[row_index][cell_index].colspan - if row_index > 0 and header_rows[row_index][cell_index].colspan == 0: - if forward_rowspan[cell_index] > 0: - sum += 1 - forward_rowspan[cell_index] -= 1 - if forward_rowspan[cell_index] == 0 and header_rows[row_index][cell_index].rowspan > 1: - forward_rowspan[cell_index] = header_rows[row_index][cell_index].rowspan -1 - if not sum == number_of_columns: - raise ValueError("Grid table not converted properly") - forward_rowspan = [] - for row_index in range(len(data_rows)): - if len(forward_rowspan) == 0: - forward_rowspan = [0 for _ in range(len(data_rows[row_index]))] - sum = 0 - for cell_index in range(len(data_rows[row_index])): - sum += data_rows[row_index][cell_index].colspan - if row_index > 0 and data_rows[row_index][cell_index].colspan == 0: - if forward_rowspan[cell_index] > 0: - sum += 1 - forward_rowspan[cell_index] -= 1 - if forward_rowspan[cell_index] == 0 and data_rows[row_index][cell_index].rowspan > 1: - forward_rowspan[cell_index] = data_rows[row_index][cell_index].rowspan - 1 - if not sum == number_of_columns: - raise ValueError("Grid table not converted properly") - - return header_rows, data_rows + # Add a note to the hanging paragraph + document.clauses[i].lines = [Line("<mark>Editor note: This is a hanging paragraph and it must be moved to its own clause</mark>")] + [Line()] + document.clauses[i].lines -def generate_html_table_with_spans(pandoc_table): - """ - Generate an HTML table from a Pandoc-style grid table with row and column spans. - - :param pandoc_table: String of the Pandoc-style grid table. - :return: HTML string. 
- """ - try: - grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table) - except: - logging.ERROR("Grid table could not be generated") - return "HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS" - else: - html = "<table>\n" - has_header = False - - for row in grid_header: - for cell in row: - if cell.rowspan != 0 and cell.colspan != 0: - has_header = True - if has_header: - html += " <thead>\n" - for row in grid_header: - html += " <tr>\n" - for cell in row: - if cell.rowspan == 0 or cell.colspan == 0: - continue - else: - # Prepare content, in case there's a list - #print(cell.content) - if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", - cell.content): # Update cell in new row - #print("MATCHING") - list = "<ul>" - # Build list the matches - for match in matches: - list += "<li>" + match[1] + "</li>" - list += "</ul>" - cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell.content) - # Enforce left alignment if cell contains a list - cell.alignment = "align=\"left\"" - #else: - # print("NOT MATCHING") + # Repair wrong markdown for indented lines. 
+ # Add 2 spaces to existing 2-space indentions + for clause in document.clauses: + for i, line in enumerate(clause.lines): + if match2spaceListIndention.match(line.text): + clause.lines[i].text = ' ' + line.text - rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else "" - colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else "" - html += f" <th{rowspan}{colspan} {cell.alignment}>{cell.content}</th>\n" - html += " </tr>\n" - html += " </thead>\n" - html += " <tbody>\n" - for row in grid_body: - html += " <tr>\n" - for cell in row: - if cell.rowspan == 0 or cell.colspan == 0: - continue - else: - #Prepare content, in case there's a list - #print(cell.content) - if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell.content): # Update cell in new row - #print("MATCHING") - #print(cell.content) - list = "<ul>" - # Build list the matches - for match in matches: - list += "<li>" + match[1] + "</li>" - list += "</ul>" - cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell.content) - # Enforce left alignment if cell contains a list - cell.alignment = "align=\"left\"" - #else: - #print("NOT MATCHING") - rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else "" - colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else "" - html += f" <td{rowspan}{colspan} {cell.alignment}>{cell.content}</td>\n" - html += " </tr>\n" - - html += " </tbody>\n" - html += "</table>" - return html - -def analyseMarkdown(filename:str) -> Document: - """ Analyse the markdown file and split it into clauses. +def writeClausesMkDocs(document:Document, filename:str, navTitle:str, addNavTitle:bool = False) -> None: + """ Write the clauses to separate files and create a navigation file. Args: - filename: The name of the markdown file. - - Returns: - The document object. + document: The document object. + filename: The name of the original markdown file. + navTitle: The title of the navigation entry. 
This is used to determine the directories. + addNavTitle: Add the title as an extra navigation level to the navigation file. """ - print(f'[green]Analyzing "{filename}"') + print(f'[green]Writing clauses to files') + # create directory first + os.makedirs(f'{os.path.dirname(filename)}/{navTitle}', exist_ok = True) + + # Write the files + for i, f in enumerate(document.clauses): + # write to single files, even empty ones + if verbose: + print(f'[dim]Writing "{f.clauseNumber}.md" - "{f.title}"') + with open(f'{os.path.dirname(filename)}/{navTitle}/{f.clauseNumber}.md', 'w') as file: + # Add one empty line before the clause. This is done to avoid + # a bug in MkDocs that does not display the first line of a clause + # if it contains a colon. It does not matter otherwise if the line + # is empty or not. + file.writelines(f.asStringList(1)) - # Read the file. - # Note: We use utf-8 and replace errors to avoid problems with special or unknown characters. - with open(filename, 'r', encoding = 'utf-8', errors = 'replace') as file: - inLines = file.readlines() - # The list of clauses. The first clause contains the text before the first heading. - outClauses:list[Clause] = [Clause(0, '', '', [])] - footnotes:list[Footnote] = [] - - # Go through the lines and detect headers and codefences - inCodefence = False - inTable = False - tableHasSeparator = False - inGridTable = False - gridTableHasSeparator = False - gridTable = "" - for line in inLines: - - # Detect and handle codefences - # For the moment we support only codefences that start and end - # with 3 backticks. This is the most common way to define codefences. - # Note, that longer codefences are allowed by the markdown specification. 
- - if _matchCodefenceStart.match(line) and not inCodefence: - inCodefence = True - outClauses[-1].append(Line(line, LineType.CODEFENCESTART)) - continue - if _matchCodefenceEnd.match(line): - inCodefence = False - outClauses[-1].append(Line(line, LineType.CODEFENCEEND)) - continue - if inCodefence: - outClauses[-1].append(Line(line, LineType.CODE)) - continue - - # Detect and handle tables - if _matchTable.match(line) and not inTable and not inGridTable: - inTable = True - outClauses[-1].append(Line(line, LineType.TABLEHEADER)) - continue - if inTable: - if _matchTableSeparator.match(line) and not tableHasSeparator: - outClauses[-1].append(Line(line, LineType.TABLESEPARATOR)) - tableHasSeparator = True - continue - elif _matchTable.match(line): - outClauses[-1].append(Line(line, LineType.TABLEROW)) - continue - else: - inTable = False - tableHasSeparator = False - # Mark the previous line as the last row in the table - outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW - # continue with other matches - - #Detect grid tables and convert them to html table - if _matchGridTable.match(line) and not inGridTable: - inGridTable = True - #outClauses[-1].append(Line(line, LineType.TABLEHEADER)) - gridTable += line - continue - if inGridTable: - if _matchGridTableHeaderSeparator.match(line) or _matchGridTableBodySeparator.match(line): - #outClauses[-1].append(Line(line, LineType.TABLESEPARATOR)) - gridTable += line - continue - elif _matchTable.match(line): - #outClauses[-1].append(Line(line, LineType.TABLEROW)) - gridTable += line + # write nav.yml file + print(f'[green]Writing "_nav.yml"') + indentation = ' ' if addNavTitle else '' # TODO make number of spaces configurable + with open(f'{os.path.dirname(filename)}/_nav.yml', 'w') as file: + if veryVerbose: + print(f'[dim]Writing navigation file') + if addNavTitle: + file.write(f'{indentation}- {navTitle}:\n') + for i, f in enumerate(document.clauses): + + if not f.title: + # print("continue") continue - else: - 
inGridTable = False - # Mark the previous line as the last row in the table - #outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW - print(gridTable) - htmltable = "" - htmltable = generate_html_table_with_spans(gridTable) - print(htmltable) - for row in htmltable: - outClauses[-1].append(Line(row, LineType.TABLEROW)) - gridTable = "" - # continue with other matches - - # Detect notes - # Notes are lines that start with a '>'. - if _matchNote.match(line): - outClauses[-1].append(Line(line, LineType.NOTE)) - continue - # Detect footnotes - # Footnotes are lines that start with a '^' - if (_fn := _footnote.match(line)): - footnotes.append(Footnote(_fn.groups()[0], Line(line, LineType.TEXT))) - continue + # TODO handle if the next clause is more than one level deeper - # Detect images on a single line - if (m := _matchStandAloneImage.match(line)): - outClauses[-1].append(Line(line, LineType.STANDALONEIMAGE)) - continue - - # Detect headers - _lineType = LineType.TEXT - if (m := _matchHeader.match(line)): - # Add a new clause - clauseTitle = m.groups()[1].strip() - clauseTitle = re.sub(_htmlTag, '', clauseTitle) - headerNumber = _matchHeaderNumber.search(clauseTitle) - outClauses.append(Clause(len(m.groups()[0]), # level - headerNumber.group() if headerNumber else shortHash(clauseTitle, 6), - clauseTitle, - [])) - _lineType = LineType.HEADING + _title = f.title.replace("'", '"') + nextClause = document.clauses[i+1] if i+1 < len(document.clauses) else None + if nextClause is None or nextClause.level <= f.level: + file.write(f"{indentation}{' '*f.level}- '{_title}': '{navTitle}/{f.clauseNumber}.md'\n") + else: + file.write(f"{indentation}{' '*f.level}- '{_title}':\n") + if len(f) > 0: + file.write(f"{indentation}{' '*nextClause.level}- 'Hanging paragraph': '{navTitle}/{f.clauseNumber}.md'\n") - # Just add the line to the current clause as text - outClauses[-1].append(Line(line, _lineType)) - return Document(outClauses, footnotes) +# TODO handle multiple nav levels 
(left bar) better (make conifgurable) def copyMediaFiles(filename:str, navTitle:str, mediaDirectory:str = 'media') -> None: @@ -1065,20 +150,22 @@ def processDocument(args:argparse.Namespace) -> None: document.insertFootnotes() document.updateLinks() document.updateNotes() - document.prepareForMkdocs(args.include_hanging_paragraphs) + + prepareForMkdocs(document, args.include_hanging_paragraphs) # Write the clauses to files - document.writeClausesMkDocs(inDocumentFilename, args.title, args.nav_add_title) + writeClausesMkDocs(document, inDocumentFilename, args.title, args.nav_add_title) # Copy the media files copyMediaFiles(inDocumentFilename, args.title, args.media_directory) -if __name__ == '__main__': +def main() -> None: parser = argparse.ArgumentParser(description = 'Convert oneM2M markdown specificatios to MkDocs format', formatter_class = argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--verbose', '-v', action = 'store_true', help = 'verbose output during processing') + parser.add_argument('--out', '-o', metavar='outfile', help = 'write output to file instead of stdout') parser.add_argument('--very-verbose', '-vv', action = 'store_true', help = 'very verbose output during processing') parser.add_argument('--ignore-clause', '-ic', metavar = 'clause', nargs = '+', default = [ 'Contents', 'History' ], help = 'ignore headers in the markdown document') parser.add_argument('--include-hanging-paragraphs', '-ihp', action = 'store_true', default = False, help = 'include hanging paragraphs (text in clauses with sub-clauses) in the output files') @@ -1088,8 +175,14 @@ if __name__ == '__main__': parser.add_argument('--title', '-t', metavar = 'title', required = True, help = 'mkdocs navigation tile') parser.add_argument('--nav-add-title', '-nat', action = 'store_true', default = False, help = 'add the title as an extra navigation level to the navigation file') - parser.add_argument('document', type = str, help = 'a oneM2M markdown specification document to 
process') args = parser.parse_args() + setScreenPrinters(info = lambda text: print(f'[green]{text}'), + debug = lambda text: print(f'[dim]{text}')) processDocument(args) + + +if __name__ == '__main__': + main() +