#
#   toMkdocs.py
#
#   (c) 2024 by Andreas Kraft
#
#   This script converts oneM2M spec markdown file to a mkdocs compatible
#   directory structure.
#
from __future__ import annotations

from enum import Enum, auto
import argparse, re, os, shutil, hashlib, base64
from dataclasses import dataclass

# Style tags used in this script's console messages. Only needed to strip
# them when "rich" is not installed.
_richStyleTag = re.compile(r'\[/?(?:green|yellow|red|dim)\]')

try:
    from rich import print
except ImportError:
    # "rich" is only used for colored console output. Degrade to plain text
    # output instead of failing when the package is not available.
    import builtins

    def print(*args, **kwargs) -> None:     # type: ignore[no-redef]
        """ Plain-text replacement for `rich.print` that drops the style tags. """
        builtins.print(*(_richStyleTag.sub('', a) if isinstance(a, str) else a for a in args), **kwargs)


verbose = False
veryVerbose = False

# Indentation unit for one level in the generated "_nav.yml" file.
# A nested YAML sequence must not be indented less than its parent key, which
# requires at least two spaces per level for the "- 'key':" entries written here.
_navIndentation = '  '

_matchHeader = re.compile(r'(#+)\s+(.*)', re.IGNORECASE)
_matchHeaderNumber = re.compile(r'\b[A-Za-z0-9]\d*(\.\d+)*\b', re.IGNORECASE)
_matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE)
_matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE)
_matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE)
_matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECASE)
# Any amount of trailing whitespace is accepted, so that a table row on the
# last line of a file (without a newline) is recognized as well.
_matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE)
_matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE)
_match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
# The negative look-behind excludes images ("![...](...)") but, unlike a
# consuming "[^!]", still matches a link at the very start of a line.
_markdownLink = re.compile(r'(?<!!)\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)
_htmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE)
_htmlAnchorLink = re.compile(r'<a\s+name="([^"]*)">[^<]*</a>', re.IGNORECASE)
_htmlTag = re.compile(r'<[^>]*>', re.IGNORECASE)
# Only blanks and tabs are consumed (not the newline), and "note" is only
# removed as a whole word, so that e.g. "> Notes ..." keeps its text.
_matchNoteStart = re.compile(r'^[ \t]*>[ \t]*(?:note\b)?[ \t]*:?[ \t]*', re.IGNORECASE)
_footnote = re.compile(r'\[\^([^\]]*)\]:', re.IGNORECASE)
_inlineFootnote = re.compile(r'\[\^([^\]]*)\]', re.IGNORECASE)

# TODO handle multiple nav levels (left bar) better (make configurable)


class LineType(Enum):
    """ Represents the type of a line in the markdown file. """
    HEADING = auto()
    TEXT = auto()
    CODEFENCESTART = auto()
    CODE = auto()
    CODEFENCEEND = auto()
    LIST = auto()
    NOTE = auto()
    STANDALONEIMAGE = auto()
    TABLEHEADER = auto()
    TABLESEPARATOR = auto()
    TABLEROW = auto()
    TABLELASTROW = auto()


@dataclass
class Line:
    """ Represents a line in the markdown file. """
    text:str = ''
    lineType:LineType = LineType.TEXT


@dataclass
class Clause:
    """ Represents a clause in the markdown file. """
    _level:int
    _clauseNumber:str
    _title:str
    _lines:list[Line]

    @property
    def level(self) -> int:
        """ Return the level of the clause. """
        return self._level

    @property
    def clauseNumber(self) -> str:
        """ Return the clause number, or '0' if the clause has none. """
        return self._clauseNumber if self._clauseNumber else '0'

    @clauseNumber.setter
    def clauseNumber(self, value:str) -> None:
        """ Set the clause number. """
        self._clauseNumber = value

    @property
    def title(self) -> str:
        """ Return the title of the clause. """
        return self._title

    @title.setter
    def title(self, value:str) -> None:
        """ Set the title of the clause. """
        self._title = value

    @property
    def lines(self) -> list[Line]:
        """ Return the lines of the clause. """
        return self._lines

    @lines.setter
    def lines(self, value:list[Line]) -> None:
        """ Set the lines of the clause. """
        self._lines = value

    @property
    def linesCount(self) -> int:
        """ Return the number of lines in the clause.

            Returns:
                The number of lines in the clause.
        """
        return len(self.lines)

    def append(self, line:Line) -> None:
        """ Append a line to the clause.

            Args:
                line: The line to append.
        """
        self.lines.append(line)

    def extend(self, clause:Clause) -> None:
        """ Extend the clause with the lines of another clause.

            Args:
                clause: The clause to extend with.
        """
        self.lines.extend(clause.lines)

    def asStringList(self, paddings:int = 0) -> list[str]:
        """ Return the clause as a list of strings.

            Args:
                paddings: The number of empty lines to add before the clause.

            Returns:
                The clause's lines as a list of strings.
        """
        return ['\n'] * paddings + [l.text for l in self.lines]

    def __len__(self) -> int:
        """ Return the number of characters in the clause.

            Returns:
                The number of characters in the clause.
        """
        return sum(len(l.text) for l in self.lines)


class Footnote:
    """ Represents a footnote in the markdown file. """

    def __init__(self, id:str, line:Line) -> None:
        self.id = id
        self.line = line


def _headingToAnchor(heading:str) -> str:
    """ Convert the text of a markdown heading to its anchor form.

        Args:
            heading: The heading text without the leading '#' characters.

        Returns:
            The lower-case anchor with blanks replaced by '-' and with
            punctuation and HTML tags removed.
    """
    anchor = heading.strip().casefold().replace(' ', '-')
    for c in ('.', '(', ')', '[', ']', ':', ',', "'", '"'):
        anchor = anchor.replace(c, '')
    # remove html tags from the anchor
    return _htmlTag.sub('', anchor)


def _rewriteLinkTargets(text:str, pattern:re.Pattern[str], linkTargets:dict[str, Clause], foldCase:bool) -> tuple[str, list[str]]:
    """ Rewrite the link targets in a text to point to the clause files.

        Only the target itself (group 1 of the pattern) is replaced, and each
        link exactly once. This avoids rewriting a target again when the same
        target, or a target that is a prefix of another one, appears more than
        once in the text.

        Args:
            text: The text to process.
            pattern: A compiled pattern whose first group captures the link target.
            linkTargets: Mapping of anchors (including the leading '#') to clauses.
            foldCase: Look up the target case-insensitively.

        Returns:
            A tuple of the rewritten text and the list of targets that were updated.
    """
    updated:list[str] = []

    def _replace(m:re.Match[str]) -> str:
        target = m.group(1)
        clause = linkTargets.get(target.casefold() if foldCase else target)
        if clause is None:
            return m.group(0)
        updated.append(target)
        whole = m.group(0)
        start = m.start(1) - m.start(0)
        end = m.end(1) - m.start(0)
        return f'{whole[:start]}../{clause.clauseNumber}/#{target[1:]}{whole[end:]}'

    return pattern.sub(_replace, text), updated


class Document:
    """ Represents the document object. """
    # Class-level defaults are kept for compatibility. Both attributes are
    # always replaced by instance attributes in __init__.
    clauses:list[Clause] = []
    footnotes:list[Footnote] = []

    def __init__(self, clauses:list[Clause], footnotes:list[Footnote]) -> None:
        self.clauses = clauses
        self.footnotes = footnotes

    def splitMarkdownDocument(self,
                              ignoreTitles:list[str]|None = None,
                              splitLevel:int = 1,
                              ignoreUntilFirstHeading:bool = False) -> None:
        """ Split the clauses at a certain level. This is used to create the separate
            markdown files for MkDocs.

            After the split, the clauses are stored in the document object.

            Args:
                ignoreTitles: A list of titles that should be ignored. They are not
                    included in the output. The comparison is case-insensitive.
                splitLevel: The level at which the clauses should be split.
                ignoreUntilFirstHeading: Ignore all clauses until the first heading.
        """
        result:list[Clause] = []
        ignored = {t.casefold() for t in (ignoreTitles or [])}

        for clause in self.clauses:
            # Check if the current clause should be ignored
            if clause.title.casefold() in ignored:
                continue

            # Add a new output clause if the current clause's level is
            # equal or less than the split level. Also start one if there is
            # no output clause yet, so that the lines are not lost.
            if clause.level <= splitLevel or not result:
                result.append(Clause(clause.level, clause.clauseNumber, clause.title, []))

            # Add the lines to the output clause
            result[-1].extend(clause)

        # Remove the leading clauses that have no title
        if ignoreUntilFirstHeading:
            while result and not result[0].title:
                result.pop(0)

        self.clauses = result

    def insertFootnotes(self) -> None:
        """ Insert footnotes into the clauses.

            Every footnote that is referenced in a text line of a clause is
            appended once at the end of that clause, in order of first reference.

            After the insertion, the clauses are stored in the document object.
        """
        print('[green]Adding footnotes to clauses')

        # The first definition of an id wins
        footnotesById:dict[str, Footnote] = {}
        for f in self.footnotes:
            footnotesById.setdefault(f.id, f)

        for clause in self.clauses:
            found:dict[str, Footnote] = {}     # dicts keep insertion order
            for line in clause.lines:
                # ATTN: Only footnotes in normal text lines are checked
                if line.lineType != LineType.TEXT:
                    continue
                # A line may reference more than one footnote
                for fnId in _inlineFootnote.findall(line.text):
                    if fnId in footnotesById and fnId not in found:
                        found[fnId] = footnotesById[fnId]

            # Insert the footnotes at the end of the clause
            if found:
                clause.append(Line('\n', LineType.TEXT))
                for f in found.values():
                    clause.append(f.line)

    def updateLinks(self) -> None:
        """ Update the links in the clauses to the new structure. This is done by
            creating a dictionary of all links and their targets and then replacing
            the links in the clauses.

            After the update, the clauses are stored in the document object.
        """
        print('[green]Updating links in clauses')

        # Build the link target dictionary. Mapping anchor -> clause
        linkTargets:dict[str, Clause] = {}

        # Find all Markdown headers in the clauses and convert them to anchor format
        for clause in self.clauses:
            for line in clause.lines:
                if (m := _matchHeader.match(line.text)):
                    anchor = _headingToAnchor(m.group(2))
                    linkTargets[f'#{anchor}'] = clause
                    if veryVerbose:
                        print(f'[dim]Added Markdown anchor "{anchor}"')

        # Find all HTML anchors in the clauses and add them to the dictionary.
        # This is a separate pass so that HTML anchors take precedence.
        for clause in self.clauses:
            for line in clause.lines:
                for a in _htmlAnchorLink.findall(line.text):
                    linkTargets[f'#{a}'] = clause
                    if veryVerbose:
                        print(f'[dim]Found HTML anchor "{a}" in clause "{clause.title}"')

        # Replace the html links (exact match), then the markdown links
        # (case-insensitive match) with links that point to the output files
        for pattern, foldCase, kind in ((_htmlLink, False, 'HTML'), (_markdownLink, True, 'Markdown')):
            for clause in self.clauses:
                for line in clause.lines:
                    line.text, updated = _rewriteLinkTargets(line.text, pattern, linkTargets, foldCase)
                    if veryVerbose:
                        for lnk in updated:
                            print(f'[dim]Updated {kind} link "{lnk}" in clause "{clause.title}"')

    def updateNotes(self) -> None:
        """ Update the notes in the clauses to the mkDocs notes version.

            Consecutive note lines are converted to a single "!!! note" block
            that is surrounded by empty lines.

            After the update, the clauses are stored in the document object.
        """
        print('[green]Updating notes in clauses')

        for clause in self.clauses:
            lines:list[Line] = []
            inNote = False
            for line in clause.lines:
                if line.lineType == LineType.NOTE:
                    if not inNote:
                        lines.append(Line('\n', LineType.TEXT))
                        lines.append(Line('!!! note\n', LineType.NOTE))
                        inNote = True
                    # The content of the note block must be indented
                    lines.append(Line(f"\t{_matchNoteStart.sub('', line.text)}", LineType.NOTE))
                    if verbose:
                        print(f'[dim]Converted note in clause "{clause.title}"')
                else:
                    if inNote:
                        lines.append(Line('\n', LineType.TEXT))
                        inNote = False
                    lines.append(line)
            clause.lines = lines

    def prepareForMkdocs(self, includeHangingParagraphs:bool = False) -> None:
        """ Prepare the clauses for MkDocs. This includes removing the heading
            from the clauses and marking the clauses that are only for navigation.

            After the preparation, the clauses are stored in the document object.

            Args:
                includeHangingParagraphs: Include hanging paragraphs in the output.
        """
        # Remove the heading from the lines. The heading is the first line
        # in the clause. This is done because MkDocs repeats the heading when
        # displaying the page.
        for clause in self.clauses:
            # The clause with the text before the first heading does not start
            # with a heading. Its first line must be kept.
            if clause.linesCount > 0 and clause.lines[0].lineType == LineType.HEADING:
                clause.lines.pop(0)
            # Also, remove the first empty lines if they exist
            while clause.linesCount > 0 and clause.lines[0].text.strip() == '':
                clause.lines.pop(0)

        # Detect and handle hanging paragraphs. This is extra text in a clause, which
        # has sub-clauses. This text is not allowed in oneM2M specifications.
        for i, clause in enumerate(self.clauses):
            if clause.level > 0 and clause.linesCount > 0:
                # Check if there is a sub-clause in the next clause
                if i + 1 < len(self.clauses) and self.clauses[i+1].level > clause.level:
                    # This is a hanging paragraph.
                    print(f'[yellow]Hanging paragraph in clause "{clause.title}" {"(removed)" if not includeHangingParagraphs else "(kept)"}')
                    if not includeHangingParagraphs:
                        clause.lines = []
                    else:
                        # Each inserted line needs its own line ending, otherwise it is
                        # joined with the first line of the paragraph when written.
                        clause.lines = [
                            Line("<mark>Editor note: This is a hanging paragraph and it must be moved to its own clause</mark>\n"),
                            Line('\n'),
                        ] + clause.lines

        # Repair wrong markdown for indented lines.
        # Add 2 spaces to existing 2-space indentions
        for clause in self.clauses:
            for line in clause.lines:
                if _match2spaceListIndention.match(line.text):
                    line.text = '  ' + line.text

    def writeClausesMkDocs(self, filename:str, navTitle:str, addNavTitle:bool = False) -> None:
        """ Write the clauses to separate files and create a navigation file.

            The clause files are written to a sub-directory named after `navTitle`
            next to the original file, and "_nav.yml" is written next to the
            original file. All files are written UTF-8 encoded.

            Args:
                filename: The name of the original markdown file.
                navTitle: The title of the navigation entry. This is used to determine the directories.
                addNavTitle: Add the title as an extra navigation level to the navigation file.
        """
        print('[green]Writing clauses to files')
        baseDirectory = os.path.dirname(filename)
        clauseDirectory = os.path.join(baseDirectory, navTitle)

        # create directory first
        os.makedirs(clauseDirectory, exist_ok = True)

        # Write the files
        for f in self.clauses:
            # write to single files, even empty ones
            if verbose:
                print(f'[dim]Writing "{f.clauseNumber}.md" - "{f.title}"')
            # The input is read as UTF-8, so the output is written as UTF-8 as
            # well instead of relying on the platform's default encoding.
            with open(os.path.join(clauseDirectory, f'{f.clauseNumber}.md'), 'w', encoding = 'utf-8') as file:
                # Add one empty line before the clause. This is done to avoid
                # a bug in MkDocs that does not display the first line of a clause
                # if it contains a colon. It does not matter otherwise if the line
                # is empty or not.
                file.writelines(f.asStringList(1))

        # write nav.yml file
        print('[green]Writing "_nav.yml"')
        indentation = _navIndentation if addNavTitle else ''    # TODO make number of spaces configurable
        with open(os.path.join(baseDirectory, '_nav.yml'), 'w', encoding = 'utf-8') as file:
            if veryVerbose:
                print('[dim]Writing navigation file')
            if addNavTitle:
                file.write(f'{indentation}- {navTitle}:\n')
            for i, f in enumerate(self.clauses):
                # Clauses without a title do not get a navigation entry
                if not f.title:
                    if veryVerbose:
                        print(f'[dim]Skipping navigation entry for untitled clause "{f.clauseNumber}"')
                    continue

                # TODO handle if the next clause is more than one level deeper

                # The title is written single-quoted, so single quotes are replaced
                _title = f.title.replace("'", '"')
                nextClause = self.clauses[i+1] if i+1 < len(self.clauses) else None
                if nextClause is None or nextClause.level <= f.level:
                    file.write(f"{indentation}{_navIndentation * f.level}- '{_title}': '{navTitle}/{f.clauseNumber}.md'\n")
                else:
                    file.write(f"{indentation}{_navIndentation * f.level}- '{_title}':\n")
                    if len(f) > 0:
                        file.write(f"{indentation}{_navIndentation * nextClause.level}- 'Hanging paragraph': '{navTitle}/{f.clauseNumber}.md'\n")


def shortHash(value:str, length:int) -> str:
    """ Generate a short hash of a string value.

        The hash is used as a file name for clauses without a clause number.
        Therefore the URL- and filename-safe base64 alphabet is used, which
        contains '-' and '_' instead of '+' and '/'.

        Args:
            value: The value to hash.
            length: The length of the hash.

        Returns:
            The hash.
    """
    digest = hashlib.sha256(value.encode()).digest()
    return base64.urlsafe_b64encode(digest).decode()[:length]


def analyseMarkdown(filename:str) -> Document:
    """ Analyse the markdown file and split it into clauses.

        Args:
            filename: The name of the markdown file.

        Returns:
            The document object.
    """
    print(f'[green]Analyzing "{filename}"')

    # Read the file.
    # Note: We use utf-8 and replace errors to avoid problems with special or unknown characters.
    with open(filename, 'r', encoding = 'utf-8', errors = 'replace') as file:
        inLines = file.readlines()

    # The list of clauses. The first clause contains the text before the first heading.
    outClauses:list[Clause] = [Clause(0, '', '', [])]
    footnotes:list[Footnote] = []

    # Go through the lines and detect headers and codefences
    inCodefence = False
    inTable = False
    tableHasSeparator = False
    for line in inLines:

        # Detect and handle codefences
        # For the moment we support only codefences that start and end
        # with 3 backticks. This is the most common way to define codefences.
        # Note, that longer codefences are allowed by the markdown specification.
        if inCodefence:
            if _matchCodefenceEnd.match(line):
                inCodefence = False
                outClauses[-1].append(Line(line, LineType.CODEFENCEEND))
            else:
                outClauses[-1].append(Line(line, LineType.CODE))
            continue
        if _matchCodefenceStart.match(line):
            inCodefence = True
            outClauses[-1].append(Line(line, LineType.CODEFENCESTART))
            continue

        # Detect and handle tables
        if _matchTable.match(line) and not inTable:
            inTable = True
            outClauses[-1].append(Line(line, LineType.TABLEHEADER))
            continue
        if inTable:
            if _matchTableSeparator.match(line) and not tableHasSeparator:
                outClauses[-1].append(Line(line, LineType.TABLESEPARATOR))
                tableHasSeparator = True
                continue
            elif _matchTable.match(line):
                outClauses[-1].append(Line(line, LineType.TABLEROW))
                continue
            else:
                inTable = False
                tableHasSeparator = False
                # Mark the previous line as the last row in the table
                outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW
                # continue with other matches

        # Detect notes
        # Notes are lines that start with a '>'.
        if _matchNote.match(line):
            outClauses[-1].append(Line(line, LineType.NOTE))
            continue

        # Detect footnotes
        # Footnote definitions are lines that start with '[^<id>]:'.
        # They are collected separately and are not part of a clause.
        if (_fn := _footnote.match(line)):
            footnotes.append(Footnote(_fn.group(1), Line(line, LineType.TEXT)))
            continue

        # Detect images on a single line
        if _matchStandAloneImage.match(line):
            outClauses[-1].append(Line(line, LineType.STANDALONEIMAGE))
            continue

        # Detect headers
        _lineType = LineType.TEXT
        if (m := _matchHeader.match(line)):
            # Add a new clause
            clauseTitle = _htmlTag.sub('', m.group(2).strip())
            headerNumber = _matchHeaderNumber.search(clauseTitle)
            outClauses.append(Clause(len(m.group(1)),   # level
                                     headerNumber.group() if headerNumber else shortHash(clauseTitle, 6),
                                     clauseTitle,
                                     []))
            _lineType = LineType.HEADING

        # Just add the line to the current clause as text
        outClauses[-1].append(Line(line, _lineType))

    # A table that runs until the end of the file has not been closed yet
    if inTable and outClauses[-1].linesCount > 0:
        outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW

    return Document(outClauses, footnotes)


def copyMediaFiles(filename:str, navTitle:str, mediaDirectory:str = 'media') -> None:
    """ Copy media files from the source directory to the target directory.

        Args:
            filename: The name of the markdown file.
            navTitle: The title of the navigation entry.
            mediaDirectory: The name of the media directory.
    """
    baseDirectory = os.path.dirname(filename)
    sourceDirectory = os.path.join(baseDirectory, mediaDirectory)
    targetDirectory = os.path.join(baseDirectory, navTitle, mediaDirectory)

    if os.path.exists(sourceDirectory):
        print(f'[green]Copying media files from "{sourceDirectory}" to "{targetDirectory}"')
        shutil.copytree(sourceDirectory, targetDirectory, dirs_exist_ok = True)
    else:
        print(f'[red]Media directory "{sourceDirectory}" does not exist')


def processDocument(args:argparse.Namespace) -> None:
    """ Convert a markdown document according to the command line arguments.

        Args:
            args: The parsed command line arguments.
    """
    global verbose, veryVerbose

    inDocumentFilename = os.path.abspath(args.document)
    veryVerbose = args.very_verbose
    verbose = args.verbose
    if veryVerbose:
        verbose = True

    # Analyse the markdown file
    document = analyseMarkdown(inDocumentFilename)
    # NOTE(review): "--include-title" is parsed but not read here. The content
    # before the first heading is always written as "0.md". Confirm whether it
    # should control "ignoreUntilFirstHeading".
    document.splitMarkdownDocument(args.ignore_clause, args.split_level)
    document.insertFootnotes()
    document.updateLinks()
    document.updateNotes()
    document.prepareForMkdocs(args.include_hanging_paragraphs)

    # Write the clauses to files
    document.writeClausesMkDocs(inDocumentFilename, args.title, args.nav_add_title)

    # Copy the media files
    copyMediaFiles(inDocumentFilename, args.title, args.media_directory)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description = 'Convert oneM2M markdown specifications to MkDocs format',
                                     formatter_class = argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--verbose', '-v', action = 'store_true', help = 'verbose output during processing')
    parser.add_argument('--very-verbose', '-vv', action = 'store_true', help = 'very verbose output during processing')
    parser.add_argument('--ignore-clause', '-ic', metavar = 'clause', nargs = '+', default = [ 'Contents', 'History' ], help = 'ignore headers in the markdown document')
    parser.add_argument('--include-hanging-paragraphs', '-ihp', action = 'store_true', default = False, help = 'include hanging paragraphs (text in clauses with sub-clauses) in the output files')
    parser.add_argument('--include-title', '-it', action = 'store_true', help = 'include the content before the first heading in the output files as "0.md"')
    parser.add_argument('--split-level', '-sl', metavar = 'level', type = int, default = 2, help = 'on which level to split clauses to separate files')
    parser.add_argument('--media-directory', '-md', metavar = 'media-directory', default = 'media', help = 'directory name where media files are stored')
    parser.add_argument('--title', '-t', metavar = 'title', required = True, help = 'mkdocs navigation title')
    parser.add_argument('--nav-add-title', '-nat', action = 'store_true', default = False, help = 'add the title as an extra navigation level to the navigation file')

    parser.add_argument('document', type = str, help = 'a oneM2M markdown specification document to process')
    args = parser.parse_args()

    processDocument(args)