# Names of the files currently being processed, outermost document first.
# Maintained exclusively by `includeStack()`.
_includeStack:list[str] = []


@contextmanager
def includeStack(filename:str) -> Generator[None, None, None]:
    """ Track `filename` on the include stack while it is being processed.

        The stack is used to detect circular includes. The file name is pushed
        when the context is entered and removed again when the context is left,
        regardless of whether the body finished normally or raised.

        Args:
            filename: The name of the file being processed.

        Raises:
            Exception: If `filename` is already on the stack, i.e. a circular
                include is detected. The message lists the include chain.

        Returns:
            Generator: A generator that yields nothing.
    """
    if filename in _includeStack:
        raise Exception(f'Circular include detected: {" -> ".join(_includeStack)} -> {filename}')
    _includeStack.append(filename)
    try:
        yield
    finally:
        # Unwind even when the body raises (e.g. a missing include file).
        # Otherwise the stale entry would make a later, legitimate include of
        # the same file look like a circular include.
        _includeStack.pop()
def processFrontMatter(lines:list[str], args:argparse.Namespace) -> Tuple[dict, list[str]]:
    """ Split the front matter off a markdown file and parse it.

        The front matter is the block between a leading `---` line and the next
        `---` line. Currently only YAML front matter is supported. It can be
        extended later.

        A document that has no leading `---` line, or whose leading `---` line is
        never closed (e.g. a thematic break at the top of the file), is treated
        as having no front matter and is returned unchanged.

        Args:
            lines: The lines of the markdown file, including line endings.
            args: The command line arguments (currently unused).

        Raises:
            yaml.YAMLError: If the front matter cannot be parsed as YAML.

        Returns:
            dict: The front matter information as a dictionary. Empty if there is
                no front matter or the front matter block is empty.
            list[str]: The lines of the markdown file without the front matter.
    """
    # A delimiter line consists of exactly three dashes plus optional trailing
    # whitespace. Anchoring the pattern prevents lines such as "--- foo" or
    # "----" from being mistaken for a delimiter.
    delimiter = re.compile(r'^---\s*$')

    if not lines or not delimiter.match(lines[0]):
        return {}, lines

    closingIndex = next((index
                         for index, line in enumerate(lines[1:], start=1)
                         if delimiter.match(line)),
                        None)
    if closingIndex is None:
        # No closing delimiter: do not swallow the whole document
        return {}, lines

    frontMatterText = ''.join(lines[1:closingIndex])
    remainingLines = lines[closingIndex + 1:]

    # An empty block has nothing to parse
    if not frontMatterText.strip():
        return {}, remainingLines

    # Parse the front matter as YAML
    try:
        frontMatter = yaml.safe_load(frontMatterText)
    except yaml.YAMLError as e:
        print(f'[red]Error parsing front matter: {e}')
        raise

    # safe_load() yields None for e.g. a comment-only block
    return (frontMatter if frontMatter is not None else {}), remainingLines
if __name__ == '__main__':

    # Command line interface
    parser = argparse.ArgumentParser(description='Process markdown specification files.')
    parser.add_argument('--no-include', dest='doInclude', action='store_false', default=True, help="don't process include statements")
    parser.add_argument('--render-markdown', '-md', dest='renderAsMarkdown', action='store_true', help='render output as markdown')
    # This option prints the front matter AND the document, see below
    parser.add_argument('--process-frontmatter', '-fm', dest='outputFrontMatter', action='store_true', help='output the collected front matter followed by the document')
    parser.add_argument('--frontmatter-only', '-fmo', dest='onlyFrontMatter', action='store_true', help='output only front matter')
    parser.add_argument('--verbose', '-v', action='store_true', help='print debug information to stderr.')
    parser.add_argument('document', type=str, help='a markdown specification document to process')
    args = parser.parse_args()

    if args.verbose:
        if not args.doInclude:
            print('[yellow]Skipping processing include statements', file=sys.stderr)
        else:
            print('[green]Processing include statements', file=sys.stderr)

    # Process the document. Any failure is reported on stderr and turned into
    # a non-zero exit status.
    try:
        content = processFile(args)
    except Exception as e:
        print(f'[red]Error while processing {args.document}\n{e}', file=sys.stderr)
        # sys.exit() instead of quit(): quit() is injected by the site module
        # and is not guaranteed to be available.
        sys.exit(1)

    if args.outputFrontMatter or args.onlyFrontMatter:
        # Collect front matter information in the output. The plain built-in
        # print is used here because rich's print would interpret "[...]"
        # inside front matter values as console markup.
        if not args.onlyFrontMatter:
            _print('---')

        # The following is a workaround to keep the order of the dictionary
        # see https://stackoverflow.com/a/52621703
        yaml.add_representer(dict, lambda self, data: yaml.representer.SafeRepresenter.represent_dict(self, data.items()))
        _print(yaml.dump(_frontMatter, default_flow_style=False), end='')

        if not args.onlyFrontMatter:
            _print('---')

    if not args.onlyFrontMatter:
        if args.renderAsMarkdown:
            # Render the markdown content
            print(markdown.Markdown(content))
        else:
            # Print the raw markdown content
            _print(content)
def main() -> None:
    """ Filter entry point: read markdown from stdin, replace the grid tables
        with html tables and write the result to stdout.

        Log messages are written to stderr, and only if `--verbose` is given.
    """

    # Evaluate the command line
    argumentParser = argparse.ArgumentParser(description='Convert grid tables to html tables. This script reads the markdown file from stdin and writes the result to stdout.')
    argumentParser.add_argument('-v', '--verbose', action='store_true', help='Print debug information to stderr.')
    args = argumentParser.parse_args()

    def makeLogger(markup:str):
        """ Build a log function that prefixes each message with `markup`. """
        def log(message):
            return print(f'{markup}{message}', file=sys.stderr) if args.verbose else None
        return log

    # Install the log functions
    setLoggers(info=makeLogger('[green]'),
               debug=makeLogger('[dim]'),
               error=makeLogger('[red]'))

    # Convert stdin to stdout
    inputLines = sys.stdin.readlines()
    print(analyseMarkdown(inLines=inputLines), file=sys.stdout)


if __name__ == '__main__':
    main()
# Log functions used by this module. They default to the built-in print and
# can be replaced with setLoggers().
printInfo = print
printDebug = print
printError = print


def setLoggers(info:Callable=print, debug:Callable=print, error:Callable=print) -> None:
    """ Replace the module's log functions.

        Args:
            info: Function that is called with informational messages.
            debug: Function that is called with debug messages.
            error: Function that is called with error messages.
    """
    global printInfo, printDebug, printError
    printInfo, printDebug, printError = info, debug, error
class GridRow():
    """ Represents a row in a grid table.

        A row is a fixed-length sequence of `GridCell` objects. Single cells
        can be read and replaced with the index operator.
    """

    # The cells of the row. This is an annotation only: every instance gets
    # its own list in __init__, there is no list shared on class level.
    cells:list[GridCell]


    def __init__(self, length:int = 1) -> None:
        """ Initialize a new row with `length` empty cells.

            Args:
                length: The number of cells in the row.
        """
        self.cells = [GridCell() for _ in range(length)]


    def __getitem__(self, item):
        """ Return the cell (or slice of cells) at index `item`. """
        return self.cells[item]


    def __setitem__(self, key, value) -> None:
        """ Replace the cell at index `key` with `value`. """
        self.cells[key] = value


    def __str__(self) -> str:
        return str(self.cells)


    def __repr__(self) -> str:
        return self.__str__()
""" + def __init__(self, size:int) -> None: + self.gridRowTracker = [0 for _ in range(size)] + + + def __getitem__(self, item:int) -> int: + return self.gridRowTracker[item] + + + def __setitem__(self, key:int, value:int) -> None: + self.gridRowTracker[key] = value + + + def __str__(self): + return str(self.gridRowTracker) + + + def __repr__(self): + return self.__str__() + + def max(self) -> int: + return max(self.gridRowTracker) + + + +# Some type aliases +GridTableRow = list[GridCell] +GridTableRowList = list[GridTableRow] + +def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableRowList]: + """ + Parse a Pandoc-style grid table into a structure for HTML conversion with rowspan and colspan. + + :param pandoc_table: String of the Pandoc-style grid table. + :return: List of lists representing the table with metadata for spans. + """ + #global hasHeader, defaultAlignments, headerDelimiterPositions, delimiterPositions, nextListElementMark + + # Initialize globals + hasHeader = False + defaultAlignments:list[str] = [] + headerDelimiterPositions:list[int] = [] + delimiterPositions:list[int] = [] + + # Split the input into lines + lines:list[str] = [line for line in gridTable.rstrip().split('\n')] + + + # Detect separator lines by pattern (it does not take into account partial separators + def isSeparator(line:str) -> bool: + return matchGridTableSeparator.match(line) is not None + + + # Set content on the cell - concatenating multilines, flagging lists + def handleCellContent(cell:GridCell, content:str) -> None: + _c = content.strip() + + if cell.content is None: # Previous empty cell + cell.rowspan += 1 + cell.colspan += 1 + if _c.startswith('- '): # List in a cell + cell.listFlag = True + _c = re.sub(r'\\\s*$', '\n', _c) + cell.content = _c + _nextListElementMark # Add list element end mark to know when the list element ends + elif cell.listFlag and len(_c) > 0: # any other content when handling list is concatenated to the last list element + 
_c = re.sub(r'\\\s*$', '\n', _c) + cell.content = _c + _nextListElementMark #add the list element end mark + elif not _c: # empty line. separation between list and other paragraph + # cell.content = '\n' if not cell.content.endswith('\n') else "" + cell.content = '\n' # cell content is always empty / None here. + else: + cell.content = re.sub(r'\\\s*$', '\n', _c) + else: # Cell has content + if _c.startswith('- '): # List + if not cell.listFlag: + cell.content += '\n' + #cell['content'] = cell['content'].strip("\n") + cell.listFlag = True + _c = re.sub(r'\\\s*$', '\n', _c) + cell.content += _c + _nextListElementMark # Add list element end mark to know when the list element ends + elif cell.listFlag and len(_c) > 0: # any other content when handling list is concatenated to the last list element + # cell.content = cell.content.strip(nextListElementMark) #remove list element end mark + cell.content = cell.content.removesuffix(_nextListElementMark) #remove list element end mark + + _c = re.sub(r'\\\s*$', '\n', _c) + cell.content += ' ' + _c + _nextListElementMark #add list element end mark + elif len(_c) == 0: # separation between list and other paragraph + if cell.listFlag: + cell.listFlag = False + cell.content += '\n\n' #end list by \n + #content = re.sub(r'\\\s*$', "\n", content.strip()) + cell.content += '\n' if not cell.content.endswith('\n') else '' + else: + cell.content += ' ' + re.sub(r'\\\s*$', '\n', _c) + + # Adjust colspan of a cell + def adjustColspan(row:GridRow, columnIndex:int, numberOfParts:int, line, numberOfColumns:int, delimiterPositions:list[int]) -> None: + for j in range(columnIndex, numberOfParts): + delimiterStart:Optional[int] = None + colI = columnIndex + while delimiterStart == None: + delimiterStart = row[colI - 1].position if colI > 0 else 0 + colI -= 1 + positions = [line.find(delimiter, delimiterStart + 1) for delimiter in "|+" if delimiter in line[delimiterStart + 1:]] + position = min(positions) if positions else -1 + if position > 
delimiterPositions[j]: # Colspan to be increased + row[columnIndex].colspan += 1 + if position == delimiterPositions[len(delimiterPositions) - 1]: # last cell in row, adjust colspan to get max number columns + colspan_allocated = row[columnIndex].colspan + row[columnIndex].colspan += numberOfColumns - colspan_allocated - columnIndex + elif position < delimiterPositions[j]: + raise ValueError("Wrong cell formatting") + else: + break + + row[columnIndex].colspanAdjusted = True # Mark cell as adjusted + + + def checkDelimiterAlignment(line: str, delimiterPositions:list[int], delimiters: str = "|+") -> bool: + """ + Check if delimiters in a row align with expected positions. + + Args: + line: The line of text to check + delimiter_positions: List of expected positions (based on + characters) + delimiters: String containing valid delimiter characters (default: "|+") + + Returns: + bool: True if delimiters align correctly, False otherwise + """ + if not line or not delimiterPositions: + return False + + printDebug(f'\nChecking line: "{line}"') + printDebug(f'Expected delimiter positions: {delimiterPositions}') + + # For full separator lines (only +) + if '+' in line and '|' not in line: + currentPositions = [i for i, char in enumerate(line) if (char == '+' and i > 0)] + printDebug(f'Full separator line - Found + at positions: {currentPositions}') + return all(delimiterPositions[-1] in currentPositions and line.startswith('+') and pos in delimiterPositions + for pos in currentPositions) + + # For data lines (only |) + if '|' in line and '+' not in line: + currentPositions = [i for i, char in enumerate(line) if (char == '|' and i > 0)] + printDebug(f'Data line - Found | at positions: {currentPositions}') + return all(delimiterPositions[-1] in currentPositions and line.startswith("|") and pos in delimiterPositions + for pos in currentPositions) + + # For partial separators (mix of + and |) + currentPositions = [i for i, char in enumerate(line) if (char in delimiters and i > 
0)] + printDebug(f'Partial separator - Found delimiters at positions: {currentPositions}') + printDebug(f'Characters at those positions: {[line[pos] for pos in currentPositions]}') + return all(delimiterPositions[-1] in currentPositions and line.startswith(('+', '|')) and pos in delimiterPositions + for pos in currentPositions) + + separatorIndices = [i for i, line in enumerate(lines) if isSeparator(line)] + + if not separatorIndices: + raise ValueError('No valid separators found in the provided grid table.') + + # Calculate max number of columns + delimiterPositions = [] + numberOfColumns:int = 0 + + for separatorIndex in separatorIndices: + if (_cnt := lines[separatorIndex].count('+') - 1) > numberOfColumns: + numberOfColumns = _cnt + delimiterPositions = [] + for rowIndex in range(numberOfColumns): + delimiterPositionsStart = delimiterPositions[rowIndex - 1] if rowIndex != 0 else 0 + delPositions = [lines[separatorIndex].find(delimiter, delimiterPositionsStart + 1) + for delimiter in '+' if delimiter in lines[separatorIndex][delimiterPositionsStart + 1:]] + delimiterPositions.append(min(delPositions) if delPositions else -1) + + # Determine delimter positions and alignments + headerRows:GridTableRowList = [] + dataRows:GridTableRowList = [] + + for index in separatorIndices: + if matchGridTableHeaderSeparator.match(lines[index]): + hasHeader = True + headerSeparatorIndex = index + parts = re.split(r'\+', lines[index].strip('+')) + #Calculate default alignments and positions of delimiters + for partIndex in range(len(parts)): + # Left alignment + if parts[partIndex].startswith(':') and not parts[partIndex].endswith(':'): + defaultAlignments.append(_alignLeft) + + # Right alignment + elif not parts[partIndex].startswith(':') and parts[partIndex].endswith(':'): + defaultAlignments.append(_alignRight) + + # Center alignment + else: + defaultAlignments.append(_alignCenter) + + # Delimiter position + delimiterPositionsStart = delimiterPositions[partIndex - 1] if 
partIndex != 0 else 0 + delPositions = [lines[index].find(delimiter, delimiterPositionsStart + 1) + for delimiter in '+' if delimiter in lines[index][delimiterPositionsStart + 1:]] + headerDelimiterPositions.append(min(delPositions) if delPositions else -1) + + if not hasHeader: + # Set default alignments from the first separator which takes the role of header + hasHeader = True + headerSeparatorIndex = 0 + parts = re.split(r'\+', lines[0].strip('+')) + + # Calculate default alignments and positions of delimiters + for partIndex in range(len(parts)): + if parts[partIndex].startswith(':') and not parts[partIndex].endswith(':'): + defaultAlignments.append(_alignLeft) + + elif not parts[partIndex].startswith(':') and parts[partIndex].endswith(':'): + defaultAlignments.append(_alignRight) + + else: + defaultAlignments.append(_alignCenter) + + # Delimiter position + delimiterPositionsStart = delimiterPositions[partIndex - 1] if partIndex != 0 else 0 + delPositions = [lines[index].find(delimiter, delimiterPositionsStart + 1) + for delimiter in '+' if delimiter in lines[index][delimiterPositionsStart + 1:]] + headerDelimiterPositions.append(min(delPositions) if delPositions else -1) + + #Check end table delimiter alignment (not checked during the lines processing) + if not checkDelimiterAlignment(lines[-1], delimiterPositions): + raise ValueError(f'Misaligned delimiters in end table separator: {lines[-1]}') + + for rowNumber in range(len(separatorIndices) - 1): + rows:list[GridRow] = [] + rowsTracker:GridRowsTracker + inDataRow = False + start, end = separatorIndices[rowNumber], separatorIndices[rowNumber + 1] + rowLines = lines[start:end] # Lines between separators including separator line start as it gives information about the number of columns of the row + if rowLines: + # Combine multiline content into single strings for each cell + for line in rowLines: + line = line.rstrip() + if isSeparator(line) and not inDataRow: + inDataRow = True + # Add delimiter alignment 
check for separator lines + if not checkDelimiterAlignment(line, delimiterPositions): + raise ValueError(f'Misaligned delimiters in separator row: {line}') + + parts = re.split(r'\s*\+\s*', line.strip('+')) + delimiterIndex = 0 + + rows.append(GridRow(numberOfColumns)) + rowsTracker = GridRowsTracker(numberOfColumns) + columnIndex = 0 + + for rowIndex in range(len(parts)): + if columnIndex in range(numberOfColumns): + delimiterIndex += len(parts[rowIndex]) + 1 + cell = rows[-1][columnIndex] + + # Set position + cell.positionStart = delimiterIndex - len(parts[rowIndex]) + cell.position = delimiterIndex # Position of cell delimiter + + + # Set alignment as defined by header separator line + cell.calculateAndSetAlignment(headerDelimiterPositions, delimiterPositions, defaultAlignments, hasHeader) + + while delimiterIndex > delimiterPositions[columnIndex]: + columnIndex += 1 + columnIndex += 1 + + elif inDataRow: + # Regular data row or partial separator + if matchGridTableBodySeparator.match(line): # Partial separator + # Add delimiter alignment check for partial separators + if not checkDelimiterAlignment(line, delimiterPositions): + raise ValueError(f'Misaligned delimiters in partial separator: {line}') + + cellsContent = re.split(r'[\|\+]', line.strip('|').strip('+')) # (?<!\\)[\|\+] + #Add another row, set delimiters for each cell + rows.append(GridRow(numberOfColumns)) + auxDelimiterIndex = 0 + auxiliarCellIndex = 0 + + for columnIndex, content in enumerate(cellsContent): + if auxiliarCellIndex < numberOfColumns: + auxDelimiterIndex += len(content) + 1 + cell = rows[-1][auxiliarCellIndex] + cell.positionStart = auxDelimiterIndex - len(content) # Position of cell delimiter + + cell.position = auxDelimiterIndex # Position of cell delimiter + + cell.calculateAndSetAlignment(headerDelimiterPositions, delimiterPositions, defaultAlignments, hasHeader) + while auxDelimiterIndex > delimiterPositions[auxiliarCellIndex]: + auxiliarCellIndex += 1 + auxiliarCellIndex += 1 + + 
if len(cellsContent) <= numberOfColumns: # Colspan: Positions of | with respect to + need to be determined + columnCellIndex = 0 + + # Put the value in a variable here because we need the initial value + maxRowsTracker = rowsTracker.max() + # Go through all cells in a columnt + for columnIndex, content in enumerate(cellsContent): + rowIndex = rowsTracker[columnCellIndex] + cell = rows[rowIndex][columnCellIndex] + + # Check whether a cell contains a header separator + if matchGridTableBodySeparatorLine.match(content): # A new row is to be added + rowsTracker[columnCellIndex] = maxRowsTracker + 1 # That actual row will have more than one row + rowIndex = rowsTracker[columnCellIndex] + cell = rows[rowIndex][columnCellIndex] + + cell.listFlag = False + columnForward = 0 + + for delIndex in range(columnCellIndex, len(delimiterPositions)): + rowIndex = rowsTracker[columnCellIndex] # Correcting the rowIndex. Might have been changed by a previous iteration + if rows[rowIndex][columnCellIndex].position >= delimiterPositions[delIndex]: + columnForward += 1 + #rowsTracker[columnCellIndex + columnForward - 1] = maxRowsTracker + 1 if columnForward > 1 else 0 + columnCellIndex += columnForward + + continue + + else: + # Handle content of the cell + handleCellContent(cell, cellsContent[columnIndex]) + cell.rowspan += 1 + if not cell.colspanAdjusted: + # TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator + adjustColspan(rows[rowIndex], columnCellIndex, numberOfColumns, line, numberOfColumns, delimiterPositions) + + if cell.position >= delimiterPositions[columnCellIndex]: + columnCellIndex += cell.colspan if cell.colspan != 0 else 1 + continue + + else: + raise ValueError(f'More cells than columns found ({len(cellsContent)} {numberOfColumns})') + + else: # Data row + cellsContent = re.split(r'\|', line.strip('|')) + + # Add delimiter alignment check + if not checkDelimiterAlignment(line, 
delimiterPositions): + raise ValueError(f'Misaligned delimiters in row: {line}') + + columnCellIndex = 0 + if len(cellsContent) < numberOfColumns: # Colspan: Positions of | with respect to + need to be determined + for columnIndex, content in enumerate(cellsContent): + row = rows[rowsTracker[columnCellIndex]] + cell = row[columnCellIndex] + # Handle content of the cell + handleCellContent(cell, content) + if not cell.colspanAdjusted: + #TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator + adjustColspan(row, columnCellIndex, numberOfColumns, line, numberOfColumns, delimiterPositions) + if cell.position >= delimiterPositions[columnCellIndex]: + columnCellIndex += cell.colspan # Move forward index i + + elif len(cellsContent) == numberOfColumns: # Simple row + for columnIndex, content in enumerate(cellsContent): + rowIndex = rowsTracker[columnIndex] + handleCellContent(rows[rowIndex][columnIndex], content) + else: + raise ValueError(f'More cells than columns found ({len(cellsContent)} {numberOfColumns})') + else: + raise ValueError('No separator line found for row starting') + + if hasHeader and start >= headerSeparatorIndex: # table_row and auxiliar_row are part of data_rows + for row in rows: + dataRows.append(row.cells) + elif hasHeader and start < headerSeparatorIndex: # table_row and auxiliar_row are part of header_rows + for row in rows: # header rows + headerRows.append(row.cells) + else: + #only body + for row in rows: + dataRows.append(row.cells) + + # Check if there are any data rows + if not dataRows and not headerRows: + raise ValueError('No valid rows found in the provided grid table.') + + # Format text + for gridRows in [headerRows, dataRows]: + for gridRow in gridRows: + for cell in gridRow: + if cell.content is not None: + # Replacing "<" by < + cell.content = cell.content.replace('<', '<') + + + # Bold replacements + # Regex to detect markdown bold formatting in cell 
content + if cell.content is not None: + cell.content = matchBold.sub(r'\1<strong>\g<text></strong>', cell.content) + + # Italic replacements + # Regex to detect markdown italic formatting in cell content + if cell.content is not None: + cell.content = matchItalic.sub(r'\1<i>\g<text></i>', cell.content) + + # Correct newlines characters + for headerRow in headerRows: + for cell in headerRow: + cell.content = cell.content.replace('\n', '<br />') if cell.content is not None else None + for dataRow in dataRows: + for cell in dataRow: + cell.content = cell.content.replace('\n', '<br />') if cell.content is not None else None + + # + # Checking that the grid is correct Not too much tested - need to take into account rowspan of previous rows + # + + # Checking the header rows + forwardRowspan:list[int] = [] + for idx, headerRow in enumerate(headerRows): + if len(forwardRowspan) == 0: + forwardRowspan = [0] * len(headerRows[idx]) + sum = 0 + + for cellIndex, cell in enumerate(headerRow): + sum += cell.colspan + if idx > 0 and cell.colspan == 0: + if forwardRowspan[cellIndex] > 0: + sum += 1 + forwardRowspan[cellIndex] -= 1 + if forwardRowspan[cellIndex] == 0 and cell.rowspan > 1: + forwardRowspan[cellIndex] = cell.rowspan -1 + colspan=1 + while cell.colspan > colspan: + forwardRowspan[cellIndex + colspan] = cell.rowspan - 1 + colspan += 1 + + if not sum == numberOfColumns: + raise ValueError('Grid table not converted properly') + + # Checking the data rows + forwardRowspan = [] + for idx, dataRow in enumerate(dataRows): + if len(forwardRowspan) == 0: + forwardRowspan = [0] * len(dataRows[idx]) + sum = 0 + + for cellIndex, cell in enumerate(dataRows[idx]): + sum += cell.colspan + if idx > 0 and cell.colspan == 0: + if forwardRowspan[cellIndex] > 0: + sum += 1 + forwardRowspan[cellIndex] -= 1 + if forwardRowspan[cellIndex] == 0 and cell.rowspan > 1: + forwardRowspan[cellIndex] = cell.rowspan - 1 + colspan=1 + while cell.colspan > colspan: + forwardRowspan[cellIndex + 
def generateHtmlTableWithSpans(gridTable:str) -> str:
    """ Generate an HTML table from a Pandoc-style grid table with row and column spans.

        Cells whose `rowspan` or `colspan` is 0 are covered by a spanning
        neighbour and are not rendered. A `<thead>` section is only emitted
        when at least one header cell is rendered. Markdown list items inside
        a cell (each terminated by `_nextListElementMark`) are converted to
        an HTML `<ul>` and the cell is forced to left alignment.

        Args:
            gridTable: The Pandoc-style grid table.

        Returns:
            The HTML table in string format.

        Raises:
            RuntimeError: If the grid table could not be parsed. The original
                exception is chained as the cause.
    """
    mark = re.escape(_nextListElementMark)
    # A single list item: marker, whitespace, then the text up to the element mark.
    listItem = re.compile(r'\s*([-*+]|\s*\d+\.)\s+((?:(?!' + mark + r').)+)' + mark)
    # A run of consecutive list items. Both patterns are built from the same
    # constant so that they cannot drift apart.
    listRun = re.compile(r'(\s*([-*+]|\s*\d+\.)\s+(?:(?!' + mark + r').)+' + mark + r')+')

    def isRendered(cell) -> bool:
        """ Return whether the cell produces its own HTML element. """
        return cell.rowspan != 0 and cell.colspan != 0

    def renderRows(rows, tag:str) -> str:
        """ Render the rows as `<tr>` elements whose cells use the HTML element `tag`. """
        parts:list[str] = []
        for row in rows:
            parts.append(' <tr>\n')
            for cell in row:
                if not isRendered(cell):
                    continue

                # Prepare content, in case there's a list
                if cell.content is not None and (items := listItem.findall(cell.content)):
                    listHtml = '<ul>' + ''.join(f'<li>{item[1]}</li>' for item in items) + '</ul>'
                    # Use a function as replacement: a plain string would be
                    # treated as a template, and backslashes in the cell text
                    # would be interpreted as escapes or group references.
                    cell.content = listRun.sub(lambda _m: listHtml, cell.content)
                    # Enforce left alignment if cell contains a list
                    cell.alignment = _alignLeft

                rowspan = f' rowspan="{cell.rowspan}"' if cell.rowspan > 1 else ''
                colspan = f' colspan="{cell.colspan}"' if cell.colspan > 1 else ''
                # A cell without content is rendered empty, not as the text "None"
                content = cell.content if cell.content is not None else ''
                parts.append(f' <{tag}{rowspan}{colspan} {cell.alignment}>{content}</{tag}>\n')
            parts.append(' </tr>\n')
        return ''.join(parts)

    try:
        gridHeader, gridBody = parseGridTableWithSpans(gridTable)
    except Exception as e:
        printDebug('Grid table could not be generated')
        raise RuntimeError(f'HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE:\n{str(e)}') from e

    # Generate table HTML...
    html = '<table>\n'

    if any(isRendered(cell) for row in gridHeader for cell in row):
        html += ' <thead>\n'
        html += renderRows(gridHeader, 'th')
        html += ' </thead>\n'

    html += ' <tbody>\n'
    html += renderRows(gridBody, 'td')
    html += ' </tbody>\n'
    html += '</table>'
    return html
def _shortHash(value:str, length:int) -> str:
    """ Generate a short hash of a string value.

        The hash is the prefix of the base64 encoded SHA-256 digest of the
        value. The URL- and filename-safe base64 alphabet is used because the
        hash serves as a clause number, which in turn becomes a file name and
        a URL path segment. The standard alphabet contains `/` and `+`.

        Args:
            value: The value to hash.
            length: The length of the hash.

        Returns:
            The hash. It only contains ASCII letters, digits, `-` and `_`.
    """
    digest = hashlib.sha256(value.encode()).digest()
    return base64.urlsafe_b64encode(digest).decode()[:length]
""" + return self._level + + + @property + def clauseNumber(self) -> str: + """ Return the clause number. """ + return self._clauseNumber if self._clauseNumber else '0' + + + @clauseNumber.setter + def clauseNumber(self, value:str) -> None: + """ Set the clause number. """ + self._clauseNumber = value + + + @property + def title(self) -> str: + """ Return the title of the clause. """ + return self._title + + + @title.setter + def title(self, value:str) -> None: + """ Set the title of the clause. """ + self._title = value + + + @property + def lines(self) -> list[Line]: + """ Return the lines of the clause. """ + return self._lines + + + @lines.setter + def lines(self, value:list[Line]) -> None: + """ Set the lines of the clause. """ + self._lines = value + + + @property + def linesCount(self) -> int: + """ Return the number of lines in the clause. + + Returns: + The number of lines in the clause. + """ + return len(self.lines) + + + def append(self, line:Line) -> None: + """ Append a line to the clause. + + Args: + line: The line to append. + """ + self.lines.append(line) + + + def extend(self, clause:Clause) -> None: + """ Extend the clause with the lines of another clause. + + Args: + clause: The clause to extend with. + """ + self.lines.extend(clause.lines) + + + def asStringList(self, paddings:int = 0) -> list[str]: + """ Return the clause as a list of strings. + + Args: + paddings: The number of empty lines to add before the clause. + Returns: + The clause's lines as a list of strings. + """ + return [ '\n' for _ in range(paddings) ] + [ l.text for l in self.lines ] + + + def __len__(self) -> int: + """ Return the number of characters in the clause. This does not include + empty lines or lines that contain only whitespace. + + Returns: + The number of characters in the clause. + """ + return sum([ len(l.text.strip()) for l in self.lines ]) + + + def __str__(self) -> str: + """ Return the clause as a string. 
class Document:
    """ Represents the document object.

        A document consists of the list of clauses of a markdown file and the
        footnote definitions that belong to it.
    """
    # The clauses of the document, in document order
    clauses:list[Clause]
    # The footnote definitions of the document
    footnotes:list[Footnote]

    def __init__(self, clauses:list[Clause], footnotes:Optional[list[Footnote]] = None) -> None:
        """ Constructor.

            Args:
                clauses: The clauses of the document.
                footnotes: The footnotes of the document. An empty list is used if omitted.
        """
        self.clauses = clauses
        # Create a fresh list per instance. A mutable default argument would
        # be shared between all documents created without footnotes.
        self.footnotes = footnotes if footnotes is not None else []


    def splitMarkdownDocument(self,
                              ignoreTitles:Optional[list[str]] = None,
                              splitLevel:int = 1,
                              ignoreUntilFirstHeading:bool = False) -> None:
        """ Split the clauses at a certain level. This is used to create the separate
            markdown files for MkDocs.

            After the split, the clauses are stored in the document object.

            Args:
                ignoreTitles: A list of titles that should be ignored. They are not included in the output.
                splitLevel: The level at which the clauses should be split.
                ignoreUntilFirstHeading: Ignore all clauses until the first heading.
        """
        result:list[Clause] = []

        # Compare titles case-insensitively
        ignored = { t.casefold() for t in (ignoreTitles or []) }

        for clause in self.clauses:

            # Check if the current clause should be ignored
            if clause.title.casefold() in ignored:
                continue

            # Add a new output clause if the current clause's level is
            # equal or less than the split level. Also do this when there is
            # no output clause yet (e.g. the preceding clauses were ignored),
            # otherwise there would be nothing to add the lines to.
            if clause.level <= splitLevel or not result:
                result.append(Clause(clause.level, clause.clauseNumber, clause.title, []))

            # Add the lines to the output clause
            result[-1].extend(clause)

        # Remove the leading clauses that have no title
        if ignoreUntilFirstHeading:
            while result and len(result[0].title) == 0:
                result.pop(0)

        self.clauses = result


    def insertFootnotes(self) -> None:
        """ Insert footnotes into the clauses.

            Every footnote that is referenced in a text line of a clause is
            appended once to the end of that clause, in the order of the
            first reference.

            After the insertion, the clauses are stored in the document object.
        """
        printInfo('Adding footnotes to clauses')

        for clause in self.clauses:
            foundFootnotes:list[Footnote] = []
            for line in clause.lines:
                # ATTN: Only footnotes in normal text lines are checked
                if line.lineType != LineType.TEXT:
                    continue

                # A line may reference more than one footnote
                for footnoteId in MatchInlineFootnote.findall(line.text):
                    for footnote in self.footnotes:
                        if footnote.id == footnoteId and footnote not in foundFootnotes:
                            foundFootnotes.append(footnote)

            # Insert the footnotes at the end of the clause
            if foundFootnotes:
                clause.append(Line('\n', LineType.TEXT))
                for footnote in foundFootnotes:
                    clause.append(footnote.line)


    @staticmethod
    def _rewriteLinkTargets(pattern, text:str, linkTargets:dict[str, Clause], foldCase:bool) -> tuple[str, list[str]]:
        """ Rewrite the known link targets in a text.

            Only the target captured by group 1 of each match is replaced.
            A plain string replacement over the whole line would also touch
            other occurrences of the same text and would rewrite a target
            twice if the same link appears more than once.

            Args:
                pattern: The compiled regular expression. Group 1 must capture the link target.
                text: The text to update.
                linkTargets: Mapping of anchor to the clause that contains it.
                foldCase: Whether the target is case-folded before the lookup.

            Returns:
                A tuple of the updated text and the list of targets that were rewritten.
        """
        updated:list[str] = []

        def rewrite(m) -> str:
            lnk = m.group(1)
            key = lnk.casefold() if foldCase else lnk
            if key not in linkTargets:
                return m.group(0)
            updated.append(lnk)
            # Splice the new target into the match and keep the rest as it is
            whole = m.group(0)
            start = m.start(1) - m.start(0)
            end = m.end(1) - m.start(0)
            return f'{whole[:start]}../{linkTargets[key].clauseNumber}/#{lnk[1:]}{whole[end:]}'

        return pattern.sub(rewrite, text), updated


    def updateLinks(self) -> None:
        """ Update the links in the clauses to the new structure. This is done by
            creating a dictionary of all links and their targets and then replacing
            the links in the clauses.

            After the update, the clauses are stored in the document object.
        """
        printInfo('Updating links in clauses')

        # Build the link target dictionary. Mapping anchor -> clause
        linkTargets:dict[str, Clause] = {}

        # Find all Markdown headers in the clauses and convert them to anchor format
        for clause in self.clauses:
            for line in clause.lines:
                if (m := matchHeader.match(line.text)):

                    # convert the header to anchor format and add it to the dictionary
                    # Remove special characters
                    # TODO move perhaps to an own function
                    anchor = m.groups()[1].strip().casefold().replace(' ', '-')
                    for c in ( '.', '(', ')', '[', ']', ':', ',', "'", '"'):
                        anchor = anchor.replace(c, '')
                    # remove html tags from the anchor
                    anchor = matchHtmlTag.sub('', anchor)

                    linkTargets[f'#{anchor}'] = clause
                    if veryVerbose:
                        printDebug(f'Added Markdown anchor "{anchor}"')

        # Find all HTML anchors in the clauses and add them to the dictionary
        for clause in self.clauses:
            for line in clause.lines:
                for a in matchHtmlAnchorLink.findall(line.text):
                    linkTargets[f'#{a}'] = clause
                    if veryVerbose:
                        printDebug(f'Found HTML anchor "{a}" in clause "{clause.title}"')

        # Replace the html links
        for clause in self.clauses:
            for line in clause.lines:
                line.text, updated = self._rewriteLinkTargets(matchHtmlLink, line.text, linkTargets, False)
                if veryVerbose:
                    for lnk in updated:
                        printDebug(f'Updated HTML link "{lnk}" in clause "{clause.title}"')

        # Replace the markdown links
        # The link targets are looked up in their case-folded form. The
        # anchor itself is written as it appears in the link.
        for clause in self.clauses:
            for line in clause.lines:
                line.text, updated = self._rewriteLinkTargets(markdownLink, line.text, linkTargets, True)
                if veryVerbose:
                    for lnk in updated:
                        printDebug(f'Updated Markdown link "{lnk}" in clause "{clause.title}"')


    def updateNotes(self) -> None:
        """ Update the notes in the clauses to the mkDocs notes version.

            Consecutive note lines are grouped under one `!!! note` admonition
            and each note line is indented with a tab.

            After the update, the clauses are stored in the document object.
        """
        printInfo('Updating notes in clauses')

        for clause in self.clauses:
            lines:list[Line] = []
            inNote = False
            for line in clause.lines:
                if line.lineType == LineType.NOTE:
                    if not inNote:
                        # Start a new admonition, separated by an empty line
                        lines.append(Line('\n', LineType.TEXT))
                        lines.append(Line('!!! note\n', LineType.NOTE))
                        inNote = True
                    lines.append(Line(f"\t{matchNoteStart.sub('', line.text)}", LineType.NOTE))
                    if verbose:
                        printDebug(f'Converted note in clause "{clause.title}"')
                else:
                    if inNote:
                        # Close the admonition with an empty line
                        lines.append(Line('\n', LineType.TEXT))
                        inNote = False
                    lines.append(line)
            clause.lines = lines


    def __str__(self) -> str:
        """ Return the document as a string. """
        return '\n'.join(str(element) for element in [ *self.clauses, *self.footnotes ])


    def __repr__(self) -> str:
        """ Return the document as a string. """
        return self.__str__()
+ """ + nonlocal gridTable + + htmltable:str = '' + try: + htmltable = generateHtmlTableWithSpans(gridTable) + printDebug(htmltable) + except Exception as e: + printError(f"Error: {e}") + outClauses[-1].append(Line(htmltable, LineType.RAWHTML)) + gridTable = '' + + + printInfo(f'Analyzing "{filename}"') + + # Read the file. + # Note: We use utf-8 and replace errors to avoid problems with special or unknown characters. + if filename and not inLines: + with open(filename, 'r', encoding = 'utf-8', errors = 'replace') as file: + inLines = file.readlines() + elif not filename and inLines: + pass + else: + raise ValueError('Either the filename or the lines must be provided.') + + # The list of clauses. The first clause contains the text before the first heading. + outClauses:list[Clause] = [Clause(0, '', '', [])] + footnotes:list[Footnote] = [] + + # Go through the lines and detect headers and codefences + inCodefence = False + inTable = False + tableHasSeparator = False + inGridTable = False + for line in inLines: + # Detect and handle codefences + # For the moment we support only codefences that start and end + # with 3 backticks. This is the most common way to define codefences. + # Note, that longer codefences are allowed by the markdown specification. 
+ + if matchCodefenceStart.match(line) and not inCodefence: + inCodefence = True + outClauses[-1].append(Line(line, LineType.CODEFENCESTART)) + continue + if matchCodefenceEnd.match(line): + inCodefence = False + outClauses[-1].append(Line(line, LineType.CODEFENCEEND)) + continue + if inCodefence: + outClauses[-1].append(Line(line, LineType.CODE)) + continue + + # Detect and handle tables + if matchTable.match(line) and not inTable and not inGridTable: + inTable = True + outClauses[-1].append(Line(line, LineType.TABLEHEADER)) + continue + if inTable: + if matchTableSeparator.match(line) and not tableHasSeparator: + outClauses[-1].append(Line(line, LineType.TABLESEPARATOR)) + tableHasSeparator = True + continue + elif matchTable.match(line): + outClauses[-1].append(Line(line, LineType.TABLEROW)) + continue + else: + inTable = False + tableHasSeparator = False + # Mark the previous line as the last row in the table + outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW + # continue with other matches + + #Detect grid tables and convert them to html table + if matchGridTable.match(line) and not inGridTable: + inGridTable = True + #outClauses[-1].append(Line(line, LineType.TABLEHEADER)) + gridTable += line + continue + if inGridTable: + if matchGridTableHeaderSeparator.match(line) or matchGridTableBodySeparator.match(line): + #outClauses[-1].append(Line(line, LineType.TABLESEPARATOR)) + gridTable += line + continue + elif matchTable.match(line): + #outClauses[-1].append(Line(line, LineType.TABLEROW)) + gridTable += line + continue + else: + inGridTable = False + processGridTable() + # continue with other matches + + # Detect notes + # Notes are lines that start with a '>'. 
def main() -> None:
    """ Command line entry point for processing a markdown file.

        The input file is analysed, split at the requested heading level,
        and footnotes, links and notes are updated. The resulting clauses
        are printed to the console.
    """
    import argparse

    # The arguments are declared as data and registered in this order
    argumentSpecs = (
        (('eingabe',), {'help': 'Eingabe-Markdown-Datei'}),
        (('-v', '--verbose'), {'action': 'store_true', 'help': 'Ausführliche Ausgabe aktivieren'}),
        (('-vv', '--sehr-verbose'), {'action': 'store_true', 'help': 'Sehr ausführliche Ausgabe aktivieren'}),
        (('-i', '--ignoriere-titel'), {'nargs': '+', 'default': [], 'help': 'Liste der zu ignorierenden Titel'}),
        (('-s', '--teilungs-ebene'), {'type': int, 'default': 1, 'help': 'Ebene, auf der das Dokument geteilt werden soll (Standard: 1)'}),
        (('-f', '--ignoriere-erste'), {'action': 'store_true', 'help': 'Inhalt bis zur ersten Überschrift ignorieren'}),
    )
    cli = argparse.ArgumentParser(description='Markdown-Dateien verarbeiten, um Gittertabellen zu konvertieren und andere Formatierungen zu handhaben')
    for names, options in argumentSpecs:
        cli.add_argument(*names, **options)
    settings = cli.parse_args()

    # Set the verbosity levels
    global verbose, veryVerbose
    verbose, veryVerbose = settings.verbose, settings.sehr_verbose

    # Process the markdown file
    document = analyseMarkdown(settings.eingabe)

    # Split the document
    document.splitMarkdownDocument(ignoreTitles=settings.ignoriere_titel,
                                   splitLevel=settings.teilungs_ebene,
                                   ignoreUntilFirstHeading=settings.ignoriere_erste)

    # Update the document elements
    for update in (document.insertFootnotes, document.updateLinks, document.updateNotes):
        update()

    # Output the processed document
    for clause in document.clauses:
        print(f"\n{'#' * clause.level} {clause.title}")
        for line in clause.lines:
            print(line.text, end='')
# Regular expressions
match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
matchFootnote = re.compile(r'\[\^([^\]]*)\]:', re.IGNORECASE)
matchHtmlAnchorLink = re.compile(r'<a\s+name="([^"]*)">[^<]*</a>', re.IGNORECASE)
matchHtmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE)
matchHtmlTag = re.compile(r'<[^>]*>', re.IGNORECASE)
MatchInlineFootnote = re.compile(r'\[\^([^\]]*)\]', re.IGNORECASE)
markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)
matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE)
matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE)
# Allow any amount of trailing whitespace, including none. This also matches
# lines without a line ending, e.g. the last line of a file or lines that
# were split without keeping the line ends.
matchGridTable = re.compile(r'^\s*\+-.*\+\s*$', re.IGNORECASE)
matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE)
matchGridTableBodySeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE)
matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE)
matchHeader = re.compile(r'(#+)\s+(.*)', re.IGNORECASE)
matchHeaderNumber = re.compile(r'\b[A-Za-z0-9]\d*(\.\d+)*\b', re.IGNORECASE)
# Named groups need the '(?P<name>...)' syntax. Without the '?' the text
# 'P<marker>' is matched literally and no named group is created.
matchListInContent = re.compile(r'^(?:\s*(?P<marker>[-*+]|\s*\d+\.))\s+(?P<content>.+)$', re.IGNORECASE)
matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE)
matchNoteStart = re.compile(r'^\s*>\s*(note)?\s*[:]?\s*', re.IGNORECASE)
matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECASE)
matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE)
matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE)

# Inline formatting. Group 1 is the preceding whitespace (or the line start)
# and must be written back by the replacement. The group 'text' is the
# formatted text.
matchBold = re.compile(r'(^|\s)(\*\*|__)(?P<text>.+?)\2(?!\w)')
matchItalic = re.compile(r'(^|\s)(\*|_)(?P<text>.+?)\2(?!\w)')
100644 --- a/toMkdocs/toMkdocs.py +++ b/toMkdocs/toMkdocs.py @@ -1,1146 +1,155 @@ # # toMkdocs.py # -# (c) 2024 by Andreas Kraft +# (c) 2024 by Andreas Kraft & Miguel Angel Reina Ortega +# License: BSD 3-Clause License. See the LICENSE file for further details. + # # This script converts oneM2M spec markdown file to a mkdocs compatible # directory structure. # from __future__ import annotations -import logging -from enum import Enum, auto -import argparse, re, os, shutil, hashlib, base64 -from dataclasses import dataclass +import argparse, os, shutil from rich import print +from markdownTools import Line, Document, analyseMarkdown, setLoggers +from regexMatches import match2spaceListIndention verbose = False veryVerbose = False -class LineType(Enum): - """ Represents the type of a line in the markdown file. """ - HEADING = auto() - TEXT = auto() - CODEFENCESTART = auto() - CODE = auto() - CODEFENCEEND = auto() - LIST = auto() - NOTE = auto() - STANDALONEIMAGE = auto() - TABLEHEADER = auto() - TABLESEPARATOR = auto() - TABLEROW = auto() - TABLELASTROW = auto() - - -@dataclass -class Line: - """ Represents a line in the markdown file. """ - text:str = '\n' - lineType:LineType = LineType.TEXT - - - -@dataclass -class Clause: - """ Represents a clause in the markdown file. """ - _level:int - _clauseNumber:str - _title:str - _lines:list[Line] - - - @property - def level(self) -> int: - """ Return the level of the clause. """ - return self._level - - - @property - def clauseNumber(self) -> str: - """ Return the clause number. """ - return self._clauseNumber if self._clauseNumber else '0' - - - @clauseNumber.setter - def clauseNumber(self, value:str) -> None: - """ Set the clause number. """ - self._clauseNumber = value - - - @property - def title(self) -> str: - """ Return the title of the clause. """ - return self._title - - - @title.setter - def title(self, value:str) -> None: - """ Set the title of the clause. 
""" - self._title = value - - - @property - def lines(self) -> list[Line]: - """ Return the lines of the clause. """ - return self._lines - - - @lines.setter - def lines(self, value:list[Line]) -> None: - """ Set the lines of the clause. """ - self._lines = value - - - @property - def linesCount(self) -> int: - """ Return the number of lines in the clause. - - Returns: - The number of lines in the clause. - """ - return len(self.lines) - - - def append(self, line:Line) -> None: - """ Append a line to the clause. - - Args: - line: The line to append. - """ - self.lines.append(line) - - - def extend(self, clause:Clause) -> None: - """ Extend the clause with the lines of another clause. - - Args: - clause: The clause to extend with. - """ - self.lines.extend(clause.lines) - - - def asStringList(self, paddings:int = 0) -> list[str]: - """ Return the clause as a list of strings. - - Args: - paddings: The number of empty lines to add before the clause. - Returns: - The clause's lines as a list of strings. - """ - return [ '\n' for _ in range(paddings) ] + [ l.text for l in self.lines ] - - - def __len__(self) -> int: - """ Return the number of characters in the clause. This does not include - empty lines or lines that contain only whitespace. - - Returns: - The number of characters in the clause. - """ - return sum([ len(l.text.strip()) for l in self.lines ]) - - -class Footnote: - """ Represents a footnote in the markdown file. """ - def __init__(self, id:str, line:Line) -> None: - self.id = id - self.line = line - - -class Document: - """ Represents the document object. """ - clauses:list[Clause] = [] - footnotes:list[Footnote] = [] - - def __init__(self, clauses:list[Clause], footnotes:list[Footnote]) -> None: - self.clauses = clauses - self.footnotes = footnotes - - - def splitMarkdownDocument(self, - ignoreTitles:list[str] = [], - splitLevel:int = 1, - ignoreUntilFirstHeading:bool = False) -> None: - """ Split the clauses at a certain level. 
This is used to create the separate - markdown files for MkDocs. - - After the split, the clauses are stored in the document object. - - Args: - ignoreTitles: A list of titles that should be ignored. They are not included in the output. - splitLevel: The level at which the clauses should be split. - ignoreUntilFirstHeader: Ignore all clauses until the first heading. - - """ - result:list[Clause] = [] - - ignoreTitles = [ t.casefold() for t in ignoreTitles ] # convert to lower case - - for clause in self.clauses: - level = clause.level - - # Check if the current clause should be ignored - if clause.title.casefold() in ignoreTitles: - continue - - # Add a new output clause if the current clause's level is - # equal or less than the split level - if clause.level <= splitLevel: - result.append(Clause(level, clause.clauseNumber, clause.title, [])) - - # Add the lines to the output clause - result[-1].extend(clause) - - # Remove the first clause if it has no title - if ignoreUntilFirstHeading: - while len(result[0].title) == 0: - result.pop(0) - - self.clauses = result - - - def insertFootnotes(self) -> None: - """ Insert footnotes into the clauses. - - After the insertion, the clauses are stored in the document object. - - """ - print(f'[green]Adding footnotes to clauses') - - for clause in self.clauses: - foundFootnotes:list[Footnote] = [] - for line in clause.lines: - # ATTN: Only footnotes in normal text lines are checked - - if line.lineType == LineType.TEXT and (fn := _inlineFootnote.search(line.text)): - # Find the footnote in the list of footnotes - for f in self.footnotes: - if f.id == fn.groups()[0]: - foundFootnotes.append(f) - - # Insert the footnotes at the end of the clause - if len(foundFootnotes) > 0: - clause.append(Line('\n', LineType.TEXT)) - for f in foundFootnotes: - clause.append(f.line) - - - def updateLinks(self) -> None: - """ Update the links in the clauses to the new structure. 
This is done by - creating a dictionary of all links and their targets and then replacing - the links in the clauses. - - After the update, the clauses are stored in the document object. - """ - print(f'[green]Updating links in clauses') - - # Build the link target dictionary. Mapping anchor -> clause - linkTargets:dict[str, Clause] = {} - - # Find all Markdown headers in the clauses and convert them to anchor format - for i, clause in enumerate(self.clauses): - # Find all headers in the clause - for line in clause.lines: - if (m := _matchHeader.match(line.text)): - - # convert the header to anchor format and add it to the dictionary - # Remove special characters - # TODO move perhaps to an own function - anchor = m.groups()[1].strip().casefold().replace(' ', '-') - for c in ( '.', '(', ')', '[', ']', ':', ',', "'", '"'): - anchor = anchor.replace(c, '') - # remove html tags from the anchor - anchor = re.sub(_htmlTag, '', anchor) - - linkTargets[f'#{anchor}'] = clause - if veryVerbose: - print(f'[dim]Added Markdown anchor "{anchor}"') - - # Find all HTML anchors in the clauses and add them to the dictionary - for i, clause in enumerate(self.clauses): - for line in clause.lines: - if (anchors := _htmlAnchorLink.findall(line.text)): - for a in anchors: - linkTargets[f'#{a}'] = clause - if veryVerbose: - print(f'[dim]Found HTML anchor "{a}" in clause "{clause.title}"') - - # Replace the html links - for clause in self.clauses: - for i, line in enumerate(clause.lines): - if (links := _htmlLink.findall(line.text)): - for lnk in links: - if lnk in linkTargets: - line.text = clause.lines[i].text = line.text.replace(lnk, f'../{linkTargets[lnk].clauseNumber}/#{lnk[1:]}') # Update the current line as well - if veryVerbose: - print(f'[dim]Updated HTML link "{lnk}" in clause "{clause.title}"') - - # Replace the markdown links - for clause in self.clauses: - for i, line in enumerate(clause.lines): - if (links := _markdownLink.findall(line.text)): - # Replace the old link 
targets with converted - # (lower case) versions that point to the output files - for lnk in links: - _lnk =lnk.casefold() - if _lnk in linkTargets: - line.text = clause.lines[i].text = line.text.replace(lnk, f'../{linkTargets[_lnk].clauseNumber}/#{lnk[1:]}') # Update the current line as well - if veryVerbose: - print(f'[dim]Updated Markdown link "{lnk}" in clause "{clause.title}"') - - - def updateNotes(self) -> None: - """ Update the notes in the clauses to the mkDocs notes version. - - After the update, the clauses are stored in the document object. - """ - print(f'[green]Updating notes in clauses') - - for clause in self.clauses: - lines:list[Line] = [] - inNote = False - for line in clause.lines: - if line.lineType == LineType.NOTE: - if not inNote: - lines.append(Line('\n', LineType.TEXT)) - lines.append(Line('!!! note\n', LineType.NOTE)) - inNote = True - lines.append(Line(f"\t{re.sub(_matchNoteStart, '', line.text)}", LineType.NOTE)) - if verbose: - print(f'[dim]Converted note in clause "{clause.title}"') - else: - if inNote: - lines.append(Line('\n', LineType.TEXT)) - inNote = False - lines.append(line) - clause.lines = lines - - - def prepareForMkdocs(self, includeHangingParagraphs:bool = False) -> None: - """ Prepare the clauses for MkDocs. This includes removing the heading - from the clauses and marking the clauses that are only for navigation. - - After the preparation, the clauses are stored in the document object. - - Args: - includeHangingParagraphs: Include hanging paragraphs in the output. - """ - - # Remove the heading from the lines. The heading is the first line - # in the clause. This is done because MkDocs repeats the heading when - # displaying the page. - for clause in self.clauses: - if clause.linesCount > 0: - clause.lines.pop(0) - # Also, remove the first empty lines if they exist - while clause.linesCount > 0 and clause.lines[0].text.strip() == '': - clause.lines.pop(0) - - # Detect and handle hanging paragraphs. 
This is extra text in a clause, which - # has sub-clauses. This text is not allowed in oneM2M specifications. - for i, clause in enumerate(self.clauses): - if clause.level > 0 and clause.linesCount > 0: - # Check if there is a sub-clause in the next clause - if i + 1 < len(self.clauses) and self.clauses[i+1].level > clause.level: - # This is a hanging paragraph. Remove the text from the current clause. - print(f'[yellow]Hanging paragraph in clause "{clause.title}" {"(removed)" if not includeHangingParagraphs else "(kept)"}') - if not includeHangingParagraphs: - self.clauses[i].lines = [] - else: - self.clauses[i].lines = [Line("<mark>Editor note: This is a hanging paragraph and it must be moved to its own clause</mark>")] + [Line()] + self.clauses[i].lines - - # Repair wrong markdown for indented lines. - # Add 2 spaces to existing 2-space indentions - for clause in self.clauses: - for i, line in enumerate(clause.lines): - if _match2spaceListIndention.match(line.text): - clause.lines[i].text = ' ' + line.text - - - def writeClausesMkDocs(self, filename:str, navTitle:str, addNavTitle:bool = False) -> None: - """ Write the clauses to separate files and create a navigation file. - - Args: - filename: The name of the original markdown file. - navTitle: The title of the navigation entry. This is used to determine the directories. - addNavTitle: Add the title as an extra navigation level to the navigation file. - """ - - print(f'[green]Writing clauses to files') - # create directory first - os.makedirs(f'{os.path.dirname(filename)}/{navTitle}', exist_ok = True) - - # Write the files - for i, f in enumerate(self.clauses): - # write to single files, even empty ones - if verbose: - print(f'[dim]Writing "{f.clauseNumber}.md" - "{f.title}"') - with open(f'{os.path.dirname(filename)}/{navTitle}/{f.clauseNumber}.md', 'w') as file: - # Add one empty line before the clause. 
This is done to avoid - # a bug in MkDocs that does not display the first line of a clause - # if it contains a colon. It does not matter otherwise if the line - # is empty or not. - file.writelines(f.asStringList(1)) - - # write nav.yml file - print(f'[green]Writing "_nav.yml"') - indentation = ' ' if addNavTitle else '' # TODO make number of spaces configurable - with open(f'{os.path.dirname(filename)}/_nav.yml', 'w') as file: - if veryVerbose: - print(f'[dim]Writing navigation file') - if addNavTitle: - file.write(f'{indentation}- {navTitle}:\n') - for i, f in enumerate(self.clauses): - - if not f.title: - print("continue") - continue - - # TODO handle if the next clause is more than one level deeper - - _title = f.title.replace("'", '"') - nextClause = self.clauses[i+1] if i+1 < len(self.clauses) else None - if nextClause is None or nextClause.level <= f.level: - file.write(f"{indentation}{' '*f.level}- '{_title}': '{navTitle}/{f.clauseNumber}.md'\n") - else: - file.write(f"{indentation}{' '*f.level}- '{_title}':\n") - if len(f) > 0: - file.write(f"{indentation}{' '*nextClause.level}- 'Hanging paragraph': '{navTitle}/{f.clauseNumber}.md'\n") - - - - -_matchHeader = re.compile(r'(#+)\s+(.*)', re.IGNORECASE) -_matchHeaderNumber = re.compile(r'\b[A-Za-z0-9]\d*(\.\d+)*\b', re.IGNORECASE) -_matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE) -_matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE) -_matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE) -_matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECASE) -_matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE) -_matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE) -_matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE) -_matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE) -_matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE) -_matchGridTableHeaderSeparator = 
re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE) -_matchGridTableBodySeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE) -_match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE) -_matchListInContent = re.compile(r'^(?:\s*(P<marker>[-*+]|\s*\d+\.))\s+(P<content>.+)$', re.IGNORECASE) -_markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE) -_htmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE) -_htmlAnchorLink = re.compile(r'<a\s+name="([^"]*)">[^<]*</a>', re.IGNORECASE) -_htmlTag = re.compile(r'<[^>]*>', re.IGNORECASE) -_matchNoteStart = re.compile(r'^\s*>\s*(note)?\s*[:]?\s*', re.IGNORECASE) -_footnote = re.compile(r'\[\^([^\]]*)\]:', re.IGNORECASE) -_inlineFootnote = re.compile(r'\[\^([^\]]*)\]', re.IGNORECASE) - - -# TODO handle multiple nav levels (left bar) better (make conifgurable) - - -def shortHash(value:str, length:int) -> str: - """ Generate a short hash of a string value. +def printDebug(text:str) -> None: + """ Print a debug message. Args: - value: The value to hash. - length: The length of the hash. - - Returns: - The hash. - """ - return base64.b64encode( - hashlib.sha256( - value.encode() - ).digest() - ).decode()[:length] - -def parse_pandoc_table_with_spans(pandoc_table): - """ - Parse a Pandoc-style grid table into a structure for HTML conversion with rowspan and colspan. - - :param pandoc_table: String of the Pandoc-style grid table. - :return: List of lists representing the table with metadata for spans. + text: The text of the debug message. """ - # Split the input into lines - lines = [line.strip() for line in pandoc_table.strip().split("\n")] - - class Cell: - """ Represents the document object. 
""" - content: str - rowspan: int - colspan: int - colspan_adjusted: bool - alignment: str - position: int - list_flag: bool - auxiliar_index: int - - def __init__(self): - self.content = None - self.rowspan = 0 - self.colspan = 0 - self.colspan_adjusted = False - self.alignment = "align=\"center\"" - self.position = None - self.list_flag = False - - def set_alignment(self): - if has_header: - header_delimiter_index = 0 - while header_delimiter_index in range(len(default_alignments)) and self.position > header_delimiter_positions[header_delimiter_index]: - header_delimiter_index += 1 - if header_delimiter_index in range(len(default_alignments)): - if self.position < header_delimiter_positions[header_delimiter_index]: - self.alignment = default_alignments[header_delimiter_index] - elif self.position == header_delimiter_positions[header_delimiter_index]: - self.alignment = default_alignments[header_delimiter_index] - header_delimiter_index += 1 - else: - raise ValueError("Invalid table formatting") - else: - body_delimiter_index = 0 - while body_delimiter_index in range(len(default_alignments)) and self.position > \ - delimiter_positions[body_delimiter_index]: - body_delimiter_index += 1 - if body_delimiter_index in range(len(default_alignments)): - if self.position < delimiter_positions[body_delimiter_index]: - self.alignment = default_alignments[body_delimiter_index] - elif self.position == delimiter_positions[body_delimiter_index]: - self.alignment = default_alignments[body_delimiter_index] - body_delimiter_index += 1 - else: - raise ValueError("Invalid table formatting") - class Row(): - """ Represents a row in the markdown file. """ - cells:list[Cell] = [] - - def __init__(self, length: int = 1) -> None: - self.cells = [Cell() for _ in range(length)] - - def __getitem__(self, item): - return self.cells[item] - - def __setitem__(self, key, value): - self.cells[key] = value - - class RowTracker(): - """ Represents the document object. 
""" - def __init__(self, items): - self.rowTracker = [0 for _ in range(items)] - - def __getitem__(self, item): - return self.rowTracker[item] - - def __setitem__(self, key, value): - self.rowTracker[key] = value - - # Detect separator lines by pattern (it does not take into account partial separators - def is_separator(line): - return _matchGridTableSeparator.match(line) - - # Set content on the cell - concatenating multilines, flagging lists - def handling_content(cell, content): - if cell.content is None: - cell.rowspan += 1 - cell.colspan += 1 - if content.strip().startswith("- "): # List - cell.list_flag = True - #print(content) - content = re.sub(r'\\\s*$', "\n", content.strip()) - cell.content = content + "@" # Add list element end mark to know when the list element ends - elif cell.list_flag and content.strip() != "": # any other content when handling list is concatenated to the last list element - content = re.sub(r'\\\s*$', "\n", content.strip()) - cell.content += content + "@" #add the list element end mark - elif content.strip == "": # separation between list and other paragraph - #if cell.list_flag: - # cell.list_flag = False - cell.content += "\n" if not cell['content'].endswith("\n") else "" - else: - cell.content = re.sub(r'\\\s*$', "\n", content.strip()) - else: - if content.strip().startswith("- "): # List - if not cell.list_flag: - cell.content += "\n" - #cell['content'] = cell['content'].strip("\n") - cell.list_flag = True - content = re.sub(r'\\\s*$', "\n", content.strip()) - cell.content += content + "@" # Add list element end mark to know when the list element ends - elif cell.list_flag and content.strip() != "": # any other content when handling list is concatenated to the last list element - cell.content = cell.content.strip("@") #remove list element end mark - content = re.sub(r'\\\s*$', "\n", content.strip()) - cell.content += " " + content + "@" #add list element end mark - elif content.strip() == "": # separation between list and other 
paragraph - if cell.list_flag: - cell.list_flag = False - cell.content += "\n\n" #end list by \n - #content = re.sub(r'\\\s*$', "\n", content.strip()) - cell.content += "\n" if not cell.content.endswith("\n") else "" - else: - content = re.sub(r'\\\s*$', "\n", content.strip()) - cell.content += " " + content - #print(cell['content']) - return cell - - # Adjust colspan of a cell - def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions): - for j in range(column_index, number_of_parts): - delimiter_start = None - col_i= column_index - while delimiter_start == None: - delimiter_start = row[col_i - 1].position if col_i > 0 else 0 - col_i -= 1 - positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "|+" if delimiter in line[delimiter_start + 1:]] - position = min(positions) if positions else -1 - if position > delimiter_positions[j]: # Colspan to be increased - row[column_index].colspan += 1 - if position == delimiter_positions[len(delimiter_positions) - 1]: # last cell in row, adjust colspan to get max number columns - colspan_allocated = row[column_index].colspan - #for cell_index in range(number_of_parts): - # colspan_allocated += row[cell_index].colspan - row[column_index].colspan += number_of_columns - colspan_allocated - column_index - elif position < delimiter_positions[j]: - raise ValueError("Wrong cell formatting") - else: - break - return row[column_index] - - separator_indices = [i for i, line in enumerate(lines) if is_separator(line)] - - print(separator_indices) - if not separator_indices: - raise ValueError("No valid separators found in the provided Pandoc table.") - - # Calculate max number of columns - delimiter_positions = [] - number_of_columns = 0 - for separator_index in separator_indices: - if lines[separator_index].count("+") - 1 > number_of_columns: - number_of_columns = lines[separator_index].count("+") - 1 - delimiter_positions = [] - for j in range(number_of_columns): - 
delimiter_positions_start = delimiter_positions[j - 1] if j != 0 else 0 - del_positions = [lines[separator_index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[separator_index][delimiter_positions_start + 1:]] - delimiter_positions.append(min(del_positions) if del_positions else -1) - has_header = False - header_delimiter_positions = [] - header_rows = [] - for index in separator_indices: - if _matchGridTableHeaderSeparator.match(lines[index]): - has_header = True - header_separator_index = index - header_rows = [] - parts = re.split(r"\+", lines[index].strip("+")) - default_alignments = [] - #Calculate default alignments and positions of delimiters - for part_index in range(len(parts)): - if parts[part_index].startswith(":") and not parts[part_index].endswith(":"): - default_alignments.append("align=\"left\"") - elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"): - default_alignments.append("align=\"right\"") - else: - default_alignments.append("align=\"center\"") - # Delimiter position - delimiter_positions_start = delimiter_positions[part_index - 1] if part_index != 0 else 0 - del_positions = [lines[index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[index][delimiter_positions_start + 1:]] - header_delimiter_positions.append(min(del_positions) if del_positions else -1) + if verbose: + print(f'[dim]{text}') - if not has_header: - #Set default alignments from the first separator - parts = re.split(r"\+", lines[0].strip("+")) - default_alignments = [] - # Calculate default alignments and positions of delimiters - for part_index in range(len(parts)): - if parts[part_index].startswith(":") and not parts[part_index].endswith(":"): - default_alignments.append("align=\"left\"") - elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"): - default_alignments.append("align=\"right\"") - else: - 
default_alignments.append("align=\"center\"") - data_rows = [] - for row in range(len(separator_indices) - 1): - rows = [] - rows_tracker = [] - in_data_row = False - start, end = separator_indices[row], separator_indices[row + 1] - row_lines = lines[start:end] # Lines between separators including separator line start as it gives information about the number of columns of the row - if row_lines: - # Combine multiline content into single strings for each cell - for line in row_lines: - if is_separator(line) and not in_data_row: - in_data_row = True - # Add delimiter alignment check for separator lines - if not check_delimiter_alignment(line, delimiter_positions): - raise ValueError(f"Misaligned delimiters in separator row: {line}") - - parts = re.split(r"\s*\+\s*", line.strip("+")) - delimiter_index = 0 - # Determine the alignment of the cell - In order to replicate Pandoc's behaviour (do not support of alignment colons on separator lines (just header separator) - # we need to assign the default alignment as defined in the header separator line - # We may not need the code below, as that supports alignment per cell and row - #alignments = [] - #for part_index in range(len(parts)): - # if parts[part_index].startswith(":") and not parts[part_index].endswith(":"): - # alignments.append("align=\"left\"") - # elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"): - # alignments.append("align=\"right\"") - # else: - # alignments.append("align=\"center\"") - rows.append(Row(number_of_columns)) - #rows_tracker = [RowTracker() for _ in range(number_of_columns)] - rows_tracker = RowTracker(number_of_columns) - i = 0 - for j in range(len(parts)): - if i in range(number_of_columns): - delimiter_index += len(parts[j]) + 1 - # Set position - rows[-1][i].position = delimiter_index # Position of cell delimiter + - # Set alignment as defined by header separator line - rows[-1][i].set_alignment() - while delimiter_index > delimiter_positions[i]: - i += 1 - i 
+= 1 - - elif in_data_row: - # Regular data row or partial separator - if _matchGridTableBodySeparator.match(line): # Partial separator - # Add delimiter alignment check for partial separators - if not check_delimiter_alignment(line, delimiter_positions): - raise ValueError(f"Misaligned delimiters in partial separator: {line}") - - cells_content = re.split(r"[\|\+]", line.strip("|").strip("+")) - #Add another row, set delimiters for each cell - rows.append(Row(number_of_columns)) - aux_delimiter_index = 0 - auxiliar_cell_index = 0 - for i in range(len(cells_content)): - if auxiliar_cell_index in range(number_of_columns): - aux_delimiter_index += len(cells_content[i]) + 1 - rows[-1][auxiliar_cell_index].position = aux_delimiter_index # Position of cell delimiter + - rows[-1][auxiliar_cell_index].set_alignment() - while aux_delimiter_index > delimiter_positions[auxiliar_cell_index]: - auxiliar_cell_index += 1 - auxiliar_cell_index += 1 - - if len(cells_content) <= number_of_columns: # Colspan: Positions of | with respect to + need to be determined - column_index = 0 - for i in range(len(cells_content)): - if _matchGridTableBodySeparatorLine.match(cells_content[i]): # A new row is to be added - rows_tracker[column_index] += 1 - rows[rows_tracker[column_index]][column_index].list_flag = False - #auxiliar_rows[-1]['use_auxiliar_row'][i] = True - #if cells[i].startswith(":") and not cells[i].endswith(":"): - # auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"left\"" - #elif not cells[i].startswith(":") and cells[i].endswith(":"): - # auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"right\"" - #else: - # auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"center\"" - column_forward = 0 - for del_index in range(column_index, len(delimiter_positions)): - if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[del_index]: - column_forward += 1 - rows_tracker[column_index + column_forward - 1] += 1 if column_forward > 
1 else 0 - column_index += column_forward - continue - else: - # Handle content of the cell - rows[rows_tracker[column_index]][column_index] = handling_content(rows[rows_tracker[column_index]][column_index], cells_content[i]) - rows[rows_tracker[column_index]][column_index].rowspan += 1 - if not rows[rows_tracker[column_index]][column_index].colspan_adjusted: - rows[rows_tracker[column_index]][column_index].colspan_adjusted = True - # TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator - rows[rows_tracker[column_index]][column_index] = adjust_colspan(rows[rows_tracker[column_index]], column_index, number_of_columns, line, number_of_columns, delimiter_positions) - - if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[column_index]: - column_index += rows[rows_tracker[column_index]][column_index].colspan if rows[rows_tracker[column_index]][column_index].colspan != 0 else 1 - continue - - else: - raise ValueError("More cells than columns found") - else: # Data row - cells_content = line.strip() - cells_content = re.split(r"\|", line.strip("|")) - - # Add delimiter alignment check - if not check_delimiter_alignment(line, delimiter_positions): - raise ValueError(f"Misaligned delimiters in row: {line}") - - column_index = 0 - if len(cells_content) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined - for i in range(len(cells_content)): - # Handle content of the cell - rows[rows_tracker[column_index]][column_index] = handling_content(rows[rows_tracker[column_index]][column_index], cells_content[i]) - if not rows[rows_tracker[column_index]][column_index].colspan_adjusted: - rows[rows_tracker[column_index]][column_index].colspan_adjusted = True - #TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator - rows[rows_tracker[column_index]][column_index] 
= adjust_colspan(rows[rows_tracker[column_index]], column_index, number_of_columns, line, number_of_columns, delimiter_positions) - if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[column_index]: - column_index += rows[rows_tracker[column_index]][column_index].colspan # Move forward index i - elif len(cells_content) == number_of_columns: # Simple row - for i in range(len(cells_content)): - rows[rows_tracker[i]][i] = handling_content(rows[rows_tracker[i]][i], cells_content[i]) - else: - raise ValueError("More cells than columns found") - else: - raise ValueError("No separator line found for row starting") - - if has_header and start >= header_separator_index: # table_row and auxiliar_row are part of data_rows - for body_row in rows: - data_rows.append(body_row.cells) - elif has_header and start < header_separator_index: # table_row and auxiliar_row are part of header_rows - for header_row in rows: - header_rows.append(header_row.cells) - else: - #only body - for body_row in rows: - data_rows.append(body_row.cells) +def printInfo(text:str) -> None: + """ Print an information message. - #print(header_rows) - #print(data_rows) - # Check if there are any data rows - if not data_rows and not header_rows: - raise ValueError("No valid rows found in the provided Pandoc table.") + Args: + text: The text of the information message. 
+ """ + print(f'[green]{text}') - # Format text - for rows in [header_rows, data_rows]: - bold = "<strong>" - italic = "<i>" - for row in rows: - for cell in row: - if cell.content is not None: - # Replacing "<" by < - #cell.content = cell.content.replace("<", "<") - #Bold - for bold_characters in ["**", "__"]: - while cell.content.find(bold_characters) != -1: - cell.content = cell.content.replace(bold_characters, bold, 1) - if bold == "<strong>": - bold = "</strong>" - else: - bold = "<strong>" - #Italic - while cell.content.find("_") != -1 and cell.content.find("\_") == -1: - cell.content = cell.content.rstrip() .replace("_", italic, 1) - if italic == "<i>": - italic = "</i>" - else: - italic = "<i>" - while cell.content.find("\_") != -1: - cell.content = cell.content.rstrip().replace("\_", "_", 1) +def printWarning(text:str) -> None: + """ Print a warning message. - # Correct newlines characters - for row in header_rows: - for cell in row: - cell.content = cell.content.replace("\n", "<br />") if cell.content is not None else None - for row in data_rows: - for cell in row: - cell.content = cell.content.replace("\n", "<br />") if cell.content is not None else None + Args: + text: The text of the warning message. 
+ """ + print(f'[yellow]{text}') - # Checking that the grid is correct Not too much tested - need to take into account rowspan of previous rows - forward_rowspan = [] - for row_index in range(len(header_rows)): - if len(forward_rowspan) == 0: - forward_rowspan = [0 for _ in range(len(header_rows[row_index]))] - sum = 0 - for cell_index in range(len(header_rows[row_index])): - sum += header_rows[row_index][cell_index].colspan - if row_index > 0 and header_rows[row_index][cell_index].colspan == 0: - if forward_rowspan[cell_index] > 0: - sum += 1 - forward_rowspan[cell_index] -= 1 - if forward_rowspan[cell_index] == 0 and header_rows[row_index][cell_index].rowspan > 1: - forward_rowspan[cell_index] = header_rows[row_index][cell_index].rowspan -1 - if not sum == number_of_columns: - raise ValueError("Grid table not converted properly") - forward_rowspan = [] - for row_index in range(len(data_rows)): - if len(forward_rowspan) == 0: - forward_rowspan = [0 for _ in range(len(data_rows[row_index]))] - sum = 0 - for cell_index in range(len(data_rows[row_index])): - sum += data_rows[row_index][cell_index].colspan - if row_index > 0 and data_rows[row_index][cell_index].colspan == 0: - if forward_rowspan[cell_index] > 0: - sum += 1 - forward_rowspan[cell_index] -= 1 - if forward_rowspan[cell_index] == 0 and data_rows[row_index][cell_index].rowspan > 1: - forward_rowspan[cell_index] = data_rows[row_index][cell_index].rowspan - 1 - if not sum == number_of_columns: - raise ValueError("Grid table not converted properly") - return header_rows, data_rows +def printError(text:str) -> None: + """ Print an error message. -def generate_html_table_with_spans(pandoc_table: str) -> str: + Args: + text: The text of the error message. """ - Generate an HTML table from a Pandoc-style grid table with row and column spans. + print(f'[red]{text}') - Args: - pandoc_table (str): String of the Pandoc-style grid table. 
- - Returns: - str: Generated HTML table markup, or error message if generation fails. - """ - debug_output = [] - def debug_print(msg): - debug_output.append(str(msg)) # Convert message to string +def prepareForMkdocs(document:Document, includeHangingParagraphs:bool = False) -> None: + """ Prepare the clauses for MkDocs. This includes removing the heading + from the clauses and marking the clauses that are only for navigation. - try: - # Redirect print statements to our debug collector - global print - original_print = print - print = debug_print - - grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table) - - # Restore original print - print = original_print - - # Generate table HTML... - html = "<table>\n" - has_header = False + After the preparation, the clauses are stored in the document object. - for row in grid_header: - for cell in row: - if cell.rowspan != 0 and cell.colspan != 0: - has_header = True - if has_header: - html += " <thead>\n" - for row in grid_header: - html += " <tr>\n" - for cell in row: - if cell.rowspan == 0 or cell.colspan == 0: - continue - else: - # Prepare content, in case there's a list - #print(cell.content) - if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+)@", - cell.content): # Update cell in new row - #print("MATCHING") - list = "<ul>" - # Build list the matches - for match in matches: - list += "<li>" + match[1] + "</li>" - list += "</ul>" - cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+(?:(?!@).)+@)+", list, cell.content) - # Enforce left alignment if cell contains a list - cell.alignment = "align=\"left\"" - #else: - # print("NOT MATCHING") + Args: + document: The document object. + includeHangingParagraphs: Include hanging paragraphs in the output. 
+ """ - rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else "" - colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else "" - html += f" <th{rowspan}{colspan} {cell.alignment}>{cell.content}</th>\n" - html += " </tr>\n" - html += " </thead>\n" + # Remove the heading from the lines. The heading is the first line + # in the clause. This is done because MkDocs repeats the heading when + # displaying the page. + for clause in document.clauses: + if clause.linesCount > 0: + clause.lines.pop(0) + # Also, remove the first empty lines if they exist + while clause.linesCount > 0 and clause.lines[0].text.strip() == '': + clause.lines.pop(0) - html += " <tbody>\n" - for row in grid_body: - html += " <tr>\n" - for cell in row: - if cell.rowspan == 0 or cell.colspan == 0: - continue + # Detect and handle hanging paragraphs. This is extra text in a clause, which + # has sub-clauses. This text is not allowed in oneM2M specifications. + for i, clause in enumerate(document.clauses): + if clause.level > 0 and clause.linesCount > 0: + # Check if there is a sub-clause in the next clause + if i + 1 < len(document.clauses) and document.clauses[i+1].level > clause.level: + # This is a hanging paragraph. Remove the text from the current clause. 
+ printWarning(f'Hanging paragraph in clause "{clause.title}" {"(removed)" if not includeHangingParagraphs else "(kept)"}') + if not includeHangingParagraphs: + document.clauses[i].lines = [] else: - #Prepare content, in case there's a list - #print(cell.content) - if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+)@", cell.content): # Update cell in new row - #print("MATCHING") - #print(cell.content) - list = "<ul>" - # Build list the matches - for match in matches: - list += "<li>" + match[1] + "</li>" - list += "</ul>" - cell.content = re.sub(r"\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+@)+",list, cell.content) - # Enforce left alignment if cell contains a list - cell.alignment = "align=\"left\"" - #else: - #print("NOT MATCHING") - rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else "" - colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else "" - html += f" <td{rowspan}{colspan} {cell.alignment}>{cell.content}</td>\n" - html += " </tr>\n" + # Add a note to the hanging paragraph + document.clauses[i].lines = [Line("<mark>Editor note: This is a hanging paragraph and it must be moved to its own clause</mark>")] + [Line()] + document.clauses[i].lines - html += " </tbody>\n" - html += "</table>" - return html - except Exception as e: - logging.error("Grid table could not be generated") - debug_text = "<br>".join(debug_output) # Now all items are strings - return f"HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE.<br><pre>{debug_text}</pre>" + # Repair wrong markdown for indented lines. + # Add 2 spaces to existing 2-space indentions + for clause in document.clauses: + for i, line in enumerate(clause.lines): + if match2spaceListIndention.match(line.text): + clause.lines[i].text = ' ' + line.text -def check_delimiter_alignment(line: str, delimiter_positions: list[int], delimiters: str = "|+") -> bool: - """ - Check if delimiters in a row align with expected positions. 
- - Args: - line: The line of text to check - delimiter_positions: List of expected positions (based on + characters) - delimiters: String containing valid delimiter characters (default: "|+") - - Returns: - bool: True if delimiters align correctly, False otherwise - """ - if not line or not delimiter_positions: - return False - - print(f"\nChecking line: '{line}'") - print(f"Expected delimiter positions: {delimiter_positions}") - - # For full separator lines (only +) - if '+' in line and '|' not in line: - current_positions = [i for i, char in enumerate(line) if (char == '+' and i != 0)] - print(f"Full separator line - Found + at positions: {current_positions}") - return all(delimiter_positions[-1] in current_positions and - line.startswith("+") and - pos in delimiter_positions for pos in current_positions) - - # For data lines (only |) - if '|' in line and '+' not in line: - current_positions = [i for i, char in enumerate(line) if (char == '|' and i != 0)] - print(f"Data line - Found | at positions: {current_positions}") - return all(delimiter_positions[-1] in current_positions and - line.startswith("|") and - pos in delimiter_positions for pos in current_positions) - - # For partial separators (mix of + and |) - current_positions = [i for i, char in enumerate(line) if (char in delimiters and i != 0)] - print(f"Partial separator - Found delimiters at positions: {current_positions}") - print(f"Characters at those positions: {[line[pos] for pos in current_positions]}") - return all(delimiter_positions[-1] in current_positions and - (line.startswith("+") or line.startswith("|")) and - pos in delimiter_positions for pos in current_positions) -def analyseMarkdown(filename:str) -> Document: - """ Analyse the markdown file and split it into clauses. +def writeClausesMkDocs(document:Document, filename:str, navTitle:str, addNavTitle:bool = False) -> None: + """ Write the clauses to separate files and create a navigation file. 
Args: - filename: The name of the markdown file. - - Returns: - The document object. + document: The document object. + filename: The name of the original markdown file. + navTitle: The title of the navigation entry. This is used to determine the directories. + addNavTitle: Add the title as an extra navigation level to the navigation file. """ - print(f'[green]Analyzing "{filename}"') + printInfo(f'Writing clauses to files') + # create directory first + os.makedirs(f'{os.path.dirname(filename)}/{navTitle}', exist_ok = True) + + # Write the files + for i, f in enumerate(document.clauses): + # write to single files, even empty ones + printDebug(f'Writing "{f.clauseNumber}.md" - "{f.title}"') + with open(f'{os.path.dirname(filename)}/{navTitle}/{f.clauseNumber}.md', 'w') as file: + # Add one empty line before the clause. This is done to avoid + # a bug in MkDocs that does not display the first line of a clause + # if it contains a colon. It does not matter otherwise if the line + # is empty or not. + file.writelines(f.asStringList(1)) - # Read the file. - # Note: We use utf-8 and replace errors to avoid problems with special or unknown characters. - with open(filename, 'r', encoding = 'utf-8', errors = 'replace') as file: - inLines = file.readlines() - # The list of clauses. The first clause contains the text before the first heading. - outClauses:list[Clause] = [Clause(0, '', '', [])] - footnotes:list[Footnote] = [] - - # Go through the lines and detect headers and codefences - inCodefence = False - inTable = False - tableHasSeparator = False - inGridTable = False - gridTableHasSeparator = False - gridTable = "" - for line in inLines: - - # Detect and handle codefences - # For the moment we support only codefences that start and end - # with 3 backticks. This is the most common way to define codefences. - # Note, that longer codefences are allowed by the markdown specification. 
- - if _matchCodefenceStart.match(line) and not inCodefence: - inCodefence = True - outClauses[-1].append(Line(line, LineType.CODEFENCESTART)) - continue - if _matchCodefenceEnd.match(line): - inCodefence = False - outClauses[-1].append(Line(line, LineType.CODEFENCEEND)) - continue - if inCodefence: - outClauses[-1].append(Line(line, LineType.CODE)) - continue - - # Detect and handle tables - if _matchTable.match(line) and not inTable and not inGridTable: - inTable = True - outClauses[-1].append(Line(line, LineType.TABLEHEADER)) - continue - if inTable: - if _matchTableSeparator.match(line) and not tableHasSeparator: - outClauses[-1].append(Line(line, LineType.TABLESEPARATOR)) - tableHasSeparator = True - continue - elif _matchTable.match(line): - outClauses[-1].append(Line(line, LineType.TABLEROW)) - continue - else: - inTable = False - tableHasSeparator = False - # Mark the previous line as the last row in the table - outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW - # continue with other matches - - #Detect grid tables and convert them to html table - if _matchGridTable.match(line) and not inGridTable: - inGridTable = True - #outClauses[-1].append(Line(line, LineType.TABLEHEADER)) - gridTable += line - continue - if inGridTable: - if _matchGridTableHeaderSeparator.match(line) or _matchGridTableBodySeparator.match(line): - #outClauses[-1].append(Line(line, LineType.TABLESEPARATOR)) - gridTable += line - continue - elif _matchTable.match(line): - #outClauses[-1].append(Line(line, LineType.TABLEROW)) - gridTable += line + # write nav.yml file + printInfo(f'Writing "_nav.yml"') + indentation = ' ' if addNavTitle else '' # TODO make number of spaces configurable + with open(f'{os.path.dirname(filename)}/_nav.yml', 'w') as file: + printDebug(f'Writing navigation file') + if addNavTitle: + file.write(f'{indentation}- {navTitle}:\n') + for i, f in enumerate(document.clauses): + + if not f.title: + # print("continue") continue - else: - inGridTable = False - # 
Mark the previous line as the last row in the table - #outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW - print(gridTable) - htmltable = "" - htmltable = generate_html_table_with_spans(gridTable) - print(htmltable) - for row in htmltable: - outClauses[-1].append(Line(row, LineType.TABLEROW)) - gridTable = "" - # continue with other matches - - # Detect notes - # Notes are lines that start with a '>'. - if _matchNote.match(line): - outClauses[-1].append(Line(line, LineType.NOTE)) - continue - # Detect footnotes - # Footnotes are lines that start with a '^' - if (_fn := _footnote.match(line)): - footnotes.append(Footnote(_fn.groups()[0], Line(line, LineType.TEXT))) - continue + # TODO handle if the next clause is more than one level deeper - # Detect images on a single line - if (m := _matchStandAloneImage.match(line)): - outClauses[-1].append(Line(line, LineType.STANDALONEIMAGE)) - continue - - # Detect headers - _lineType = LineType.TEXT - if (m := _matchHeader.match(line)): - # Add a new clause - clauseTitle = m.groups()[1].strip() - clauseTitle = re.sub(_htmlTag, '', clauseTitle) - headerNumber = _matchHeaderNumber.search(clauseTitle) - outClauses.append(Clause(len(m.groups()[0]), # level - headerNumber.group() if headerNumber else shortHash(clauseTitle, 6), - clauseTitle, - [])) - _lineType = LineType.HEADING + _title = f.title.replace("'", '"') + nextClause = document.clauses[i+1] if i+1 < len(document.clauses) else None + if nextClause is None or nextClause.level <= f.level: + file.write(f"{indentation}{' '*f.level}- '{_title}': '{navTitle}/{f.clauseNumber}.md'\n") + else: + file.write(f"{indentation}{' '*f.level}- '{_title}':\n") + if len(f) > 0: + file.write(f"{indentation}{' '*nextClause.level}- 'Hanging paragraph': '{navTitle}/{f.clauseNumber}.md'\n") - # Just add the line to the current clause as text - outClauses[-1].append(Line(line, _lineType)) - return Document(outClauses, footnotes) +# TODO handle multiple nav levels (left bar) better (make 
conifgurable) def copyMediaFiles(filename:str, navTitle:str, mediaDirectory:str = 'media') -> None: @@ -1155,10 +164,10 @@ def copyMediaFiles(filename:str, navTitle:str, mediaDirectory:str = 'media') -> targetDirectory = f'{os.path.dirname(filename)}/{navTitle}/{mediaDirectory}' if os.path.exists(sourceDirectory): - print(f'[green]Copying media files from "{sourceDirectory}" to "{targetDirectory}"') + printInfo(f'Copying media files from "{sourceDirectory}" to "{targetDirectory}"') shutil.copytree(sourceDirectory, targetDirectory, dirs_exist_ok = True) else: - print(f'[red]Media directory "{sourceDirectory}" does not exist') + printError(f'Media directory "{sourceDirectory}" does not exist') def processDocument(args:argparse.Namespace) -> None: @@ -1175,20 +184,22 @@ def processDocument(args:argparse.Namespace) -> None: document.insertFootnotes() document.updateLinks() document.updateNotes() - document.prepareForMkdocs(args.include_hanging_paragraphs) + + prepareForMkdocs(document, args.include_hanging_paragraphs) # Write the clauses to files - document.writeClausesMkDocs(inDocumentFilename, args.title, args.nav_add_title) + writeClausesMkDocs(document, inDocumentFilename, args.title, args.nav_add_title) # Copy the media files copyMediaFiles(inDocumentFilename, args.title, args.media_directory) -if __name__ == '__main__': +def main() -> None: parser = argparse.ArgumentParser(description = 'Convert oneM2M markdown specificatios to MkDocs format', formatter_class = argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--verbose', '-v', action = 'store_true', help = 'verbose output during processing') + parser.add_argument('--out', '-o', metavar='outfile', help = 'write output to file instead of stdout') parser.add_argument('--very-verbose', '-vv', action = 'store_true', help = 'very verbose output during processing') parser.add_argument('--ignore-clause', '-ic', metavar = 'clause', nargs = '+', default = [ 'Contents', 'History' ], help = 'ignore headers in the 
markdown document') parser.add_argument('--include-hanging-paragraphs', '-ihp', action = 'store_true', default = False, help = 'include hanging paragraphs (text in clauses with sub-clauses) in the output files') @@ -1198,8 +209,15 @@ if __name__ == '__main__': parser.add_argument('--title', '-t', metavar = 'title', required = True, help = 'mkdocs navigation tile') parser.add_argument('--nav-add-title', '-nat', action = 'store_true', default = False, help = 'add the title as an extra navigation level to the navigation file') - parser.add_argument('document', type = str, help = 'a oneM2M markdown specification document to process') args = parser.parse_args() + setLoggers(info = printInfo, + debug = printDebug, + error = printError) processDocument(args) + + +if __name__ == '__main__': + main() +