# # markdownTools.py # # (c) 2025 by Andreas Kraft & Miguel Angel Reina Ortega # License: BSD 3-Clause License. See the LICENSE file for further details. """ Various tools for markdown processing """ from __future__ import annotations from typing import Callable, Optional from dataclasses import dataclass import base64, hashlib from enum import Enum, auto from gridTableTools import generateHtmlTableWithSpans, setLoggers as setGridTableLoggers from regexMatches import * # TODO use a verbosity level instead verbose = False veryVerbose = False printInfo = print printDebug = print printError = print def setLoggers(info:Callable = print, debug:Callable = print, error:Callable= print) -> None: global printInfo, printDebug, printError printInfo = info printDebug = debug printError = error # Set the loggers for the grid table tools setGridTableLoggers(info, debug, error) def _shortHash(value:str, length:int) -> str: """ Generate a short hash of a string value. Args: value: The value to hash. length: The length of the hash. Returns: The hash. """ return base64.b64encode( hashlib.sha256( value.encode() ).digest() ).decode()[:length] class LineType(Enum): """ Represents the type of a line in the markdown file. """ HEADING = auto() TEXT = auto() CODEFENCESTART = auto() CODE = auto() CODEFENCEEND = auto() LIST = auto() NOTE = auto() STANDALONEIMAGE = auto() TABLEHEADER = auto() TABLESEPARATOR = auto() TABLEROW = auto() TABLELASTROW = auto() RAWHTML = auto() @dataclass class Line: """ Represents a line in the markdown file. """ text:str = '\n' lineType:LineType = LineType.TEXT def __str__(self) -> str: """ Return the line as a string. """ return self.text def __repr__(self) -> str: """ Return the line as a string. """ return self.__str__() @dataclass class Clause: """ Represents a clause in the markdown file. """ _level:int _clauseNumber:str _title:str _lines:list[Line] @property def level(self) -> int: """ Return the level of the clause. """ return self._level @property def clauseNumber(self) -> str: """ Return the clause number. """ return self._clauseNumber if self._clauseNumber else '0' @clauseNumber.setter def clauseNumber(self, value:str) -> None: """ Set the clause number. """ self._clauseNumber = value @property def title(self) -> str: """ Return the title of the clause. """ return self._title @title.setter def title(self, value:str) -> None: """ Set the title of the clause. """ self._title = value @property def lines(self) -> list[Line]: """ Return the lines of the clause. """ return self._lines @lines.setter def lines(self, value:list[Line]) -> None: """ Set the lines of the clause. """ self._lines = value @property def linesCount(self) -> int: """ Return the number of lines in the clause. Returns: The number of lines in the clause. """ return len(self.lines) def append(self, line:Line) -> None: """ Append a line to the clause. Args: line: The line to append. """ self.lines.append(line) def extend(self, clause:Clause) -> None: """ Extend the clause with the lines of another clause. Args: clause: The clause to extend with. """ self.lines.extend(clause.lines) def asStringList(self, paddings:int = 0) -> list[str]: """ Return the clause as a list of strings. Args: paddings: The number of empty lines to add before the clause. Returns: The clause's lines as a list of strings. """ return [ '\n' for _ in range(paddings) ] + [ l.text for l in self.lines ] def __len__(self) -> int: """ Return the number of characters in the clause. This does not include empty lines or lines that contain only whitespace. Returns: The number of characters in the clause. """ return sum([ len(l.text.strip()) for l in self.lines ]) def __str__(self) -> str: """ Return the clause as a string. """ return ''.join([str(l) for l in self.lines ]) def __repr__(self) -> str: """ Return the clause as a string. """ return self.__str__() class Footnote: """ Represents a footnote in the markdown file. """ def __init__(self, id:str, line:Line) -> None: """ Constructor. Args: id: The id of the footnote. line: The line of the footnote. """ self.id = id """ The id of the footnote. """ self.line = line """ The line of the footnote. """ def __str__(self) -> str: return self.line.text def __repr__(self) -> str: return self.__str__() class Document: """ Represents the document object. """ clauses:list[Clause] = [] footnotes:list[Footnote] = [] def __init__(self, clauses:list[Clause], footnotes:list[Footnote] = []) -> None: self.clauses = clauses self.footnotes = footnotes def splitMarkdownDocument(self, ignoreTitles:list[str] = [], splitLevel:int = 1, ignoreUntilFirstHeading:bool = False) -> None: """ Split the clauses at a certain level. This is used to create the separate markdown files for MkDocs. After the split, the clauses are stored in the document object. Args: ignoreTitles: A list of titles that should be ignored. They are not included in the output. splitLevel: The level at which the clauses should be split. ignoreUntilFirstHeader: Ignore all clauses until the first heading. """ result:list[Clause] = [] ignoreTitles = [ t.casefold() for t in ignoreTitles ] # convert to lower case for clause in self.clauses: level = clause.level # Check if the current clause should be ignored if clause.title.casefold() in ignoreTitles: continue # Add a new output clause if the current clause's level is # equal or less than the split level if clause.level <= splitLevel: result.append(Clause(level, clause.clauseNumber, clause.title, [])) # Add the lines to the output clause result[-1].extend(clause) # Remove the first clause if it has no title if ignoreUntilFirstHeading: while len(result[0].title) == 0: result.pop(0) self.clauses = result def insertFootnotes(self) -> None: """ Insert footnotes into the clauses. After the insertion, the clauses are stored in the document object. """ printInfo('Adding footnotes to clauses') for clause in self.clauses: foundFootnotes:list[Footnote] = [] for line in clause.lines: # ATTN: Only footnotes in normal text lines are checked if line.lineType == LineType.TEXT and (fn := MatchInlineFootnote.search(line.text)): # Find the footnote in the list of footnotes for f in self.footnotes: if f.id == fn.groups()[0]: foundFootnotes.append(f) # Insert the footnotes at the end of the clause if len(foundFootnotes) > 0: clause.append(Line('\n', LineType.TEXT)) for f in foundFootnotes: clause.append(f.line) def updateLinks(self) -> None: """ Update the links in the clauses to the new structure. This is done by creating a dictionary of all links and their targets and then replacing the links in the clauses. After the update, the clauses are stored in the document object. """ printInfo('Updating links in clauses') # Build the link target dictionary. Mapping anchor -> clause linkTargets:dict[str, Clause] = {} # Find all Markdown headers in the clauses and convert them to anchor format for i, clause in enumerate(self.clauses): # Find all headers in the clause for line in clause.lines: if (m := matchHeader.match(line.text)): # convert the header to anchor format and add it to the dictionary # Remove special characters # TODO move perhaps to an own function anchor = m.groups()[1].strip().casefold().replace(' ', '-') for c in ( '.', '(', ')', '[', ']', ':', ',', "'", '"'): anchor = anchor.replace(c, '') # remove html tags from the anchor anchor = re.sub(matchHtmlTag, '', anchor) linkTargets[f'#{anchor}'] = clause if veryVerbose: printDebug(f'Added Markdown anchor "{anchor}"') # Find all HTML anchors in the clauses and add them to the dictionary for i, clause in enumerate(self.clauses): for line in clause.lines: if (anchors := matchHtmlAnchorLink.findall(line.text)): for a in anchors: linkTargets[f'#{a}'] = clause if veryVerbose: printDebug(f'Found HTML anchor "{a}" in clause "{clause.title}"') # Replace the html links for clause in self.clauses: for i, line in enumerate(clause.lines): if (links := matchHtmlLink.findall(line.text)): for lnk in links: if lnk in linkTargets: line.text = clause.lines[i].text = line.text.replace(lnk, f'../{linkTargets[lnk].clauseNumber}/#{lnk[1:]}') # Update the current line as well if veryVerbose: printDebug(f'Updated HTML link "{lnk}" in clause "{clause.title}"') # Replace the markdown links for clause in self.clauses: for i, line in enumerate(clause.lines): if (links := markdownLink.findall(line.text)): # Replace the old link targets with converted # (lower case) versions that point to the output files for lnk in links: _lnk =lnk.casefold() if _lnk in linkTargets: line.text = clause.lines[i].text = line.text.replace(lnk, f'../{linkTargets[_lnk].clauseNumber}/#{lnk[1:]}') # Update the current line as well if veryVerbose: printDebug(f'Updated Markdown link "{lnk}" in clause "{clause.title}"') def updateNotes(self) -> None: """ Update the notes in the clauses to the mkDocs notes version. After the update, the clauses are stored in the document object. """ printInfo('Updating notes in clauses') for clause in self.clauses: lines:list[Line] = [] inNote = False for line in clause.lines: if line.lineType == LineType.NOTE: if not inNote: lines.append(Line('\n', LineType.TEXT)) lines.append(Line('!!! note\n', LineType.NOTE)) inNote = True lines.append(Line(f"\t{re.sub(matchNoteStart, '', line.text)}", LineType.NOTE)) if verbose: printDebug(f'Converted note in clause "{clause.title}"') else: if inNote: lines.append(Line('\n', LineType.TEXT)) inNote = False lines.append(line) clause.lines = lines def __str__(self) -> str: """ Return the document as a string. """ return '\n'.join([ str(c) for c in self.clauses + self.footnotes ]) def __repr__(self) -> str: """ Return the document as a string. """ return self.__str__() def analyseMarkdown(filename:Optional[str]=None, inLines:Optional[list[str]]=None) -> Document: """ Analyse the markdown file and split it into clauses. Either the filename or the inLines must be provided. Args: filename: The name of the markdown file. inLines: The lines of the markdown file. Returns: The document object. """ gridTable:str = '' def processGridTable() -> None: """ Process a grid table and convert it to an html table. This function adds the html table to the output clauses and clears the gridTable variable. """ nonlocal gridTable htmltable:str = '' try: htmltable = generateHtmlTableWithSpans(gridTable) printDebug(htmltable) except Exception as e: printError(f"Error: {e}") htmltable = f'<mark>Conversion error: {e}</mark>\n' outClauses[-1].append(Line(htmltable, LineType.RAWHTML)) gridTable = '' printInfo(f'Analyzing "{filename}"') # Read the file. # Note: We use utf-8 and replace errors to avoid problems with special or unknown characters. if filename and not inLines: with open(filename, 'r', encoding = 'utf-8', errors = 'replace') as file: inLines = file.readlines() elif not filename and inLines: pass else: raise ValueError('Either the filename or the lines must be provided.') # The list of clauses. The first clause contains the text before the first heading. outClauses:list[Clause] = [Clause(0, '', '', [])] footnotes:list[Footnote] = [] # Go through the lines and detect headers and codefences inCodefence = False inTable = False tableHasSeparator = False inGridTable = False for line in inLines: # Detect and handle codefences # For the moment we support only codefences that start and end # with 3 backticks. This is the most common way to define codefences. # Note, that longer codefences are allowed by the markdown specification. if matchCodefenceStart.match(line) and not inCodefence: inCodefence = True outClauses[-1].append(Line(line, LineType.CODEFENCESTART)) continue if matchCodefenceEnd.match(line): inCodefence = False outClauses[-1].append(Line(line, LineType.CODEFENCEEND)) continue if inCodefence: outClauses[-1].append(Line(line, LineType.CODE)) continue # Detect and handle tables if matchTable.match(line) and not inTable and not inGridTable: inTable = True outClauses[-1].append(Line(line, LineType.TABLEHEADER)) continue if inTable: if matchTableSeparator.match(line) and not tableHasSeparator: outClauses[-1].append(Line(line, LineType.TABLESEPARATOR)) tableHasSeparator = True continue elif matchTable.match(line): outClauses[-1].append(Line(line, LineType.TABLEROW)) continue else: inTable = False tableHasSeparator = False # Mark the previous line as the last row in the table outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW # continue with other matches #Detect grid tables and convert them to html table if matchGridTable.match(line) and not inGridTable: inGridTable = True #outClauses[-1].append(Line(line, LineType.TABLEHEADER)) gridTable += line continue if inGridTable: if matchGridTableHeaderSeparator.match(line) or matchGridTableBodySeparator.match(line): #outClauses[-1].append(Line(line, LineType.TABLESEPARATOR)) gridTable += line continue elif matchTable.match(line): #outClauses[-1].append(Line(line, LineType.TABLEROW)) gridTable += line continue else: inGridTable = False processGridTable() # continue with other matches # Detect notes # Notes are lines that start with a '>'. if matchNote.match(line): outClauses[-1].append(Line(line, LineType.NOTE)) continue # Detect footnotes # Footnotes are lines that start with a '^' if (_fn := matchFootnote.match(line)): footnotes.append(Footnote(_fn.groups()[0], Line(line, LineType.TEXT))) continue # Detect images on a single line if (m := matchStandAloneImage.match(line)): outClauses[-1].append(Line(line, LineType.STANDALONEIMAGE)) continue # Detect headers _lineType = LineType.TEXT if (m := matchHeader.match(line)): # Add a new clause clauseTitle = m.groups()[1].strip() clauseTitle = re.sub(matchHtmlTag, '', clauseTitle) headerNumber = matchHeaderNumber.search(clauseTitle) outClauses.append(Clause(len(m.groups()[0]), # level headerNumber.group() if headerNumber else _shortHash(clauseTitle, 6), clauseTitle, [])) _lineType = LineType.HEADING # Just add the line to the current clause as text outClauses[-1].append(Line(line, _lineType)) # Process still unfinished cases if gridTable: processGridTable() return Document(outClauses, footnotes) def main() -> None: """Hauptfunktion zur Verarbeitung von Markdown-Dateien über die Kommandozeile.""" import argparse parser = argparse.ArgumentParser(description='Markdown-Dateien verarbeiten, um Gittertabellen zu konvertieren und andere Formatierungen zu handhaben') parser.add_argument('eingabe', help='Eingabe-Markdown-Datei') parser.add_argument('-v', '--verbose', action='store_true', help='Ausführliche Ausgabe aktivieren') parser.add_argument('-vv', '--sehr-verbose', action='store_true', help='Sehr ausführliche Ausgabe aktivieren') parser.add_argument('-i', '--ignoriere-titel', nargs='+', default=[], help='Liste der zu ignorierenden Titel') parser.add_argument('-s', '--teilungs-ebene', type=int, default=1, help='Ebene, auf der das Dokument geteilt werden soll (Standard: 1)') parser.add_argument('-f', '--ignoriere-erste', action='store_true', help='Inhalt bis zur ersten Überschrift ignorieren') args = parser.parse_args() # Verbositätsebenen setzen global verbose, veryVerbose verbose = args.verbose veryVerbose = args.sehr_verbose # Markdown-Datei verarbeiten doc = analyseMarkdown(args.eingabe) # Dokument teilen und verarbeiten doc.splitMarkdownDocument( ignoreTitles=args.ignoriere_titel, splitLevel=args.teilungs_ebene, ignoreUntilFirstHeading=args.ignoriere_erste ) # Dokumentenelemente aktualisieren doc.insertFootnotes() doc.updateLinks() doc.updateNotes() # Verarbeitetes Dokument ausgeben for clause in doc.clauses: print(f"\n{'#' * clause.level} {clause.title}") for line in clause.lines: print(line.text, end='') if __name__ == '__main__': main()