From 78b6c6f45904a50a3c475c376f8202c64f5c795d Mon Sep 17 00:00:00 2001 From: ankraft <an.kraft@gmail.com> Date: Fri, 26 Apr 2024 12:44:03 +0200 Subject: [PATCH] Detecting images and tables in the markdown parser --- toMkdocs/toMkdocs.py | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/toMkdocs/toMkdocs.py b/toMkdocs/toMkdocs.py index aaace80..9277e2b 100644 --- a/toMkdocs/toMkdocs.py +++ b/toMkdocs/toMkdocs.py @@ -24,6 +24,11 @@ class LineType(Enum): CODEFENCEEND = auto() LIST = auto() NOTE = auto() + STANDALONEIMAGE = auto() + TABLEHEADER = auto() + TABLESEPARATOR = auto() + TABLEROW = auto() + TABLELASTROW = auto() @dataclass @@ -136,6 +141,9 @@ _matchHeaderNumber = re.compile(r'\b[A-Za-z0-9]\d*(\.\d+)*\b', re.IGNORECASE) _matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE) _matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE) _matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE) +_matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECASE) +_matchTable = re.compile(r'^\s*\|.*\|\s$', re.IGNORECASE) +_matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE) _match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE) _markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE) _htmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE) @@ -186,6 +194,8 @@ def analyseMarkdown(filename:str) -> list[Clause]: # Go through the lines and detect headers and codefences inCodefence = False + inTable = False + tableHasSeparator = False for line in inLines: # Detect and handle codefences @@ -204,13 +214,38 @@ def analyseMarkdown(filename:str) -> list[Clause]: if inCodefence: outClauses[-1].append(Line(line, LineType.CODE)) continue - + + # Detect and handle tables + if _matchTable.match(line) and not inTable: + inTable = True + outClauses[-1].append(Line(line, LineType.TABLEHEADER)) + continue + if inTable: + if _matchTableSeparator.match(line) and not tableHasSeparator: + outClauses[-1].append(Line(line, LineType.TABLESEPARATOR)) + tableHasSeparator = True + continue + elif _matchTable.match(line): + outClauses[-1].append(Line(line, LineType.TABLEROW)) + continue + else: + inTable = False + tableHasSeparator = False + # Mark the previous line as the last row in the table + outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW + # continue with other matches + # Detect notes # Notes are lines that start with a '>'. if _matchNote.match(line): outClauses[-1].append(Line(line, LineType.NOTE)) continue - + + # Detect images on a single line + if (m := _matchStandAloneImage.match(line)): + outClauses[-1].append(Line(line, LineType.STANDALONEIMAGE)) + continue + # Detect headers _lineType = LineType.TEXT if (m := _matchHeader.match(line)): -- GitLab