Skip to content
Snippets Groups Projects
Commit 11136957 authored by Andreas Kraft
Browse files

First version of pandoc filter tool

parent 35c69bd8
No related branches found
No related tags found
No related merge requests found
#
# pandocFilter.py
#
# Script to convert a oneM2M Markdown file to pandoc input format
#
# (c) 2023 by Andreas Kraft, Miguel Ortega
# License: BSD 3-Clause License. See the LICENSE file for further details.
#
import argparse, os, re
from rich import print
from rich.progress import Progress, TextColumn, TimeElapsedColumn
def readMDFile(progress:Progress, document:str) -> list[str]:
    """ Read the markdown file and return a list of lines.

        If the file does not exist then an error message is printed and the
        script terminates with exit code 1.

        Args:
            progress: Progress display. A "Reading document" task is added
                to it and stopped again once the file has been read.
            document: Path of the markdown file to read.

        Return:
            The lines of the file, each one including its line ending.

        Raises:
            SystemExit: With code 1 if *document* does not exist.
    """
    _taskID = progress.add_task('[blue]Reading document', start=False, total=0)

    # Check if file exists
    if not os.path.exists(document):
        print(f'File {document} does not exist')
        # Raise SystemExit directly. The exit() builtin is only injected by
        # the site module for interactive use and may be missing otherwise.
        raise SystemExit(1)

    # Read the file. Bytes that are not valid UTF-8 are replaced instead of
    # aborting the conversion.
    with open(document, 'r', encoding='utf-8', errors='replace') as f:
        _lines = f.readlines()

    progress.stop_task(_taskID)
    return _lines
def writeMDFile(progress:Progress, mdLines:list[str], document:str, outDirectory:str) -> None:
    """ Write the markdown file.

        The output file gets the same file name as the input document and is
        created in (or overwritten in) the output directory.

        Args:
            progress: Progress display. A "Writing document" task is added
                to it and stopped again once the file has been written.
            mdLines: The lines to write. They are written as they are, so
                each one must already carry its line ending.
            document: Path of the input document. Only its base name is used.
            outDirectory: Directory in which the output file is written.
    """
    _taskID = progress.add_task('[blue]Writing document', start=False, total=0)

    # Build the path with os.path.join instead of a hard-coded '/': this
    # keeps an empty output directory relative (instead of pointing to the
    # filesystem root) and uses the platform's path separator.
    _target = os.path.join(outDirectory, os.path.basename(document))

    # Write the file
    with open(_target, 'w', encoding='utf-8', errors='replace') as f:
        f.writelines(mdLines)

    progress.stop_task(_taskID)
def correctTOC(progress:Progress, mdLines:list[str], tocSection:str = 'Contents') -> list[str]:
    """ Correct the TOC to be compatible with pandoc.

        The TOC starts at the heading line "# <tocSection>" and ends at the
        next line that starts with "#". Inside the TOC every line that
        contains a bracketed entry is cut off after its last "]" (which
        removes e.g. a trailing link target) and is terminated with a space
        and a newline. TOC lines without a bracketed entry are dropped.
        All lines outside of the TOC are passed through unchanged.

        Args:
            progress: Progress display. A "Correcting TOC" task is added to
                it and stopped again when the function is done.
            mdLines: The lines of the markdown document.
            tocSection: Title of the level-1 heading that starts the TOC.

        Return:
            A new list with the corrected lines. *mdLines* is not modified.
    """
    _taskID = progress.add_task('[blue]Correcting TOC', start=False, total=0)

    _contents = f'# {tocSection}\n'
    # Raw string: "\[" is an invalid escape sequence in a normal string
    # literal and raises a SyntaxWarning in newer Python versions.
    tocregex = re.compile(r'^(.*\[.*\])')

    _lines:list[str] = []
    _inTOC = False
    for line in mdLines:
        if line == _contents:           # Start of the TOC section
            _inTOC = True
            _lines.append(line)
        elif not _inTOC:                # Outside of the TOC: keep as is
            _lines.append(line)
        elif line.startswith('#'):      # End of TOC?
            _inTOC = False
            _lines.append(line)
        elif (_match := tocregex.match(line)):   # Replace entry
            # NOTE(review): a Markdown hard line break needs two trailing
            # spaces. Confirm that a single space is intended here.
            _lines.append(f'{_match.group(1)} \n')
        # Any other line inside the TOC is intentionally not copied.

    progress.stop_task(_taskID)
    return _lines
def replaceTableCaptions(progress:Progress, mdLines:list[str]) -> list[str]:
    """ Replace table captions with a pandoc table caption.

        A caption is a line that starts with "**Table ...**". It is removed
        from its original position and inserted once, as a
        "Table: <caption>" line, in front of the run of table rows (lines
        starting with "|") that directly precedes it. If no table row
        directly precedes the caption then the new caption line stays at
        the position of the original one. Anything following the closing
        "**" on the caption line is discarded.

        Args:
            progress: Progress display. A "Replacing table captions" task is
                added to it and stopped again when the function is done.
            mdLines: The lines of the markdown document.

        Return:
            A new list with the converted lines. *mdLines* is not modified.
    """
    _taskID = progress.add_task('[blue]Replacing table captions', start=False, total=0)

    # Raw string: "\*" is an invalid escape sequence in a normal string
    # literal and raises a SyntaxWarning in newer Python versions.
    tableregex = re.compile(r'^\*\*(Table .*)\*\*')

    _lines:list[str] = []
    for line in mdLines:
        _match = tableregex.match(line)
        if _match is None:
            _lines.append(line)
            continue

        # Move the caption to the beginning of the table and add a "Table:"
        # prefix. Walk back over the table rows collected so far; _idx ends
        # up as the position of the first row of the table. This may be 0
        # when the table is the very first thing in the document.
        _idx = len(_lines)
        while _idx > 0 and _lines[_idx - 1].startswith('|'):
            _idx -= 1
        # Insert exactly once, and always, so that no caption gets lost or
        # duplicated.
        _lines.insert(_idx, f'Table: {_match.group(1)}\n')

    progress.stop_task(_taskID)
    return _lines
def process(document:str, outDirectory:str) -> None:
    """ Convert a single markdown document.

        The document is read, run through the TOC correction and the table
        caption replacement (in this order), and the result is written to
        the output directory. A progress display showing each task's
        description and elapsed time is active while this runs.

        Args:
            document: Path of the markdown document to convert.
            outDirectory: Directory in which the converted file is written.
    """
    _columns = (TextColumn('{task.description}'), TimeElapsedColumn())
    with Progress(*_columns) as progress:
        _content = readMDFile(progress, document)
        # Apply the conversion steps one after the other
        for _step in (correctTOC, replaceTableCaptions):
            _content = _step(progress, _content)
        writeMDFile(progress, _content, document, outDirectory)
if __name__ == '__main__':

    # Command line interface: one optional output directory and the
    # mandatory document. Defaults are shown in the help text.
    argParser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    argParser.add_argument(
        '--outdir', '-o',
        action='store',
        dest='outDirectory',
        default='out',
        metavar='<output directory>',
        help='specify output directory',
    )
    argParser.add_argument('document', help='document to parse')
    cliArgs = argParser.parse_args()

    # Make sure that the output directory exists, then convert the document
    os.makedirs(cliArgs.outDirectory, exist_ok=True)
    process(cliArgs.document, cliArgs.outDirectory)
#
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile
#
markdown-it-py==2.2.0
# via rich
mdurl==0.1.2
# via markdown-it-py
pygments==2.15.1
# via rich
rich==13.3.5
# via oneM2M-markdown-to-pandoc-filter (setup.py)
from setuptools import setup, find_packages
# Distribution metadata for the oneM2M markdown to pandoc filter.
# 'rich' is the only runtime dependency that is declared here.
_metadata = {
    'name': 'oneM2M markdown to pandoc filter',
    'version': '0.0.1',
    'author': 'Andreas Kraft, Miguel Ortega',
    'author_email': 'an.kraft@gmail.com',
    'description': 'Convert oneM2M Markdown to Pandoc input',
    'packages': find_packages(),
    'install_requires': ['rich'],
}

setup(**_metadata)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment