From 11136957d61b248473d5ee5d06280cc94bddf0d9 Mon Sep 17 00:00:00 2001
From: ankraft <an.kraft@gmail.com>
Date: Wed, 31 May 2023 16:40:38 +0200
Subject: [PATCH] First version of pandoc filter tool

---
 pandocFilter/pandocFilter.py  | 127 ++++++++++++++++++++++++++++++++++
 pandocFilter/requirements.txt |  14 ++++
 pandocFilter/setup.py         |  13 ++++
 3 files changed, 154 insertions(+)
 create mode 100644 pandocFilter/pandocFilter.py
 create mode 100644 pandocFilter/requirements.txt
 create mode 100644 pandocFilter/setup.py

diff --git a/pandocFilter/pandocFilter.py b/pandocFilter/pandocFilter.py
new file mode 100644
index 0000000..65c65e7
--- /dev/null
+++ b/pandocFilter/pandocFilter.py
@@ -0,0 +1,127 @@
+#
+# pandocFilter.py
+#
+# Script to convert a oneM2M Markdown file to pandoc input format
+#
+# (c) 2023 by Andreas Kraft, Miguel Ortega
+# License: BSD 3-Clause License. See the LICENSE file for further details.
+#
+
+import argparse, os, re
+from rich import print
+from rich.progress import Progress, TextColumn, TimeElapsedColumn
+
+
+def readMDFile(progress:Progress, document:str) -> list[str]:
+    """ Read the markdown file and return a list of lines.
+
+        The lines keep their line endings. If `document` is not an existing file,
+        a message is printed and the script exits with status 1.
+    """
+    _taskID = progress.add_task('[blue]Reading document', start=False, total=0)
+
+    # Check if file exists. isfile() also rejects a directory, which open() cannot read
+    if not os.path.isfile(document):
+        print(f'File {document} does not exist')
+        # exit() is only injected by the site module; raising SystemExit always works
+        raise SystemExit(1)
+
+    # Read the file; undecodable bytes are replaced instead of raising an error
+    with open(document, 'r', encoding='utf-8', errors = 'replace') as f:
+        mdLines = f.readlines()
+    progress.stop_task(_taskID)
+    return mdLines
+
+
+def writeMDFile(progress:Progress, mdLines:list[str], document:str, outDirectory:str) -> None:
+    """ Write the markdown file.
+
+        The file gets the base name of `document` and is created in `outDirectory`.
+    """
+    _taskID = progress.add_task('[blue]Writing document', start=False, total=0)
+
+    # Write the file
+    _path = os.path.join(outDirectory, os.path.basename(document))
+    with open(_path, 'w', encoding='utf-8', errors = 'replace') as f:
+        f.writelines(mdLines)
+    progress.stop_task(_taskID)
+
+
+def correctTOC(progress:Progress, mdLines:list[str], tocSection:str = 'Contents') -> list[str]:
+    """ Correct the TOC to be compatible with pandoc.
+
+        Inside the section headed `# <tocSection>` every line with a bracketed entry
+        is cut after its last `]`; lines without an entry are dropped. Lines outside
+        that section are returned unchanged.
+    """
+    _taskID = progress.add_task('[blue]Correcting TOC', start=False, total=0)
+
+    _contents = f'# {tocSection}'
+    tocregex = re.compile(r'^(.*\[.*\])')  # raw string: '\[' is not a valid str escape
+
+    _lines:list[str] = []
+    _inTOC = False
+    for line in mdLines:
+        if line.rstrip() == _contents:  # TOC heading; trailing whitespace is ignored
+            _inTOC = True
+            _lines.append(line)
+        elif not _inTOC:
+            _lines.append(line)
+        elif line.startswith('#'):  # End of TOC?
+            _inTOC = False
+            _lines.append(line)
+        elif (match := tocregex.match(line)) is not None:
+            # Replace entry. Two trailing spaces are a Markdown hard line break
+            _lines.append(f'{match.group(1)}  \n')
+
+    progress.stop_task(_taskID)
+    return _lines
+
+
+def replaceTableCaptions(progress:Progress, mdLines:list[str]) -> list[str]:
+    """ Replace table captions with a pandoc table caption.
+
+        A `**Table ...**` line is removed and re-inserted as `Table: ...` in front of
+        the run of `|` lines that directly precedes it.
+    """
+    _taskID = progress.add_task('[blue]Replacing table captions', start=False, total=0)
+    tableregex = re.compile(r'^\*\*(Table .*)\*\*')  # raw string: '\*' is not a valid str escape
+
+    _lines:list[str] = []
+    for line in mdLines:
+        match = tableregex.match(line)
+        if match is None:
+            _lines.append(line)
+            continue
+        # move the caption to the beginning of the table and add a "Table:" prefix
+        _idx = len(_lines)
+        while _idx > 0 and _lines[_idx-1].startswith('|'):
+            _idx -= 1
+        # Insert exactly once, also when the table starts at the top of the document
+        _lines.insert(_idx, f'Table: {match.group(1)}\n')
+
+    progress.stop_task(_taskID)
+    return _lines
+
+
+def process(document:str, outDirectory:str) -> None:
+    """ Read `document`, apply all corrections and write the result to `outDirectory`.
+    """
+    with Progress(TextColumn('{task.description}'), TimeElapsedColumn()) as progress:
+        mdLines = readMDFile(progress, document)
+        mdLines = correctTOC(progress, mdLines)
+        mdLines = replaceTableCaptions(progress, mdLines)
+        writeMDFile(progress, mdLines, document, outDirectory)
+
+
+if __name__ == '__main__':
+    # Parse command line arguments
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default = 'out', metavar = '<output directory>', help = 'specify output directory')
+    parser.add_argument('document', help = 'document to parse')
+    args = parser.parse_args()
+
+    # Process documents and print output
+    os.makedirs(args.outDirectory, exist_ok = True)
+
+    process(args.document, args.outDirectory)
diff --git a/pandocFilter/requirements.txt b/pandocFilter/requirements.txt
new file mode 100644
index 0000000..4c6b806
--- /dev/null
+++ b/pandocFilter/requirements.txt
@@ -0,0 +1,14 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile
+#
+markdown-it-py==2.2.0
+    # via rich
+mdurl==0.1.2
+    # via markdown-it-py
+pygments==2.15.1
+    # via rich
+rich==13.3.5
+    # via oneM2M-markdown-to-pandoc-filter (setup.py)
diff --git a/pandocFilter/setup.py b/pandocFilter/setup.py
new file mode 100644
index 0000000..589119e
--- /dev/null
+++ b/pandocFilter/setup.py
@@ -0,0 +1,13 @@
+from setuptools import setup  # packaging metadata for the pandocFilter script
+
+setup(
+    name='oneM2M-markdown-to-pandoc-filter',  # no spaces; the form pip-compile records in requirements.txt
+    version='0.0.1',
+    author='Andreas Kraft, Miguel Ortega',
+    author_email='an.kraft@gmail.com',
+    description='Convert oneM2M Markdown to Pandoc input',
+    py_modules=['pandocFilter'],  # single module next to setup.py; there is no package directory to discover
+    install_requires=[
+        'rich',
+    ]
+)
-- 
GitLab