From 11136957d61b248473d5ee5d06280cc94bddf0d9 Mon Sep 17 00:00:00 2001
From: ankraft <an.kraft@gmail.com>
Date: Wed, 31 May 2023 16:40:38 +0200
Subject: [PATCH] First version of pandoc filter tool

---
 pandocFilter/pandocFilter.py  | 119 ++++++++++++++++++++++++++++++++++
 pandocFilter/requirements.txt |  14 ++++
 pandocFilter/setup.py         |  13 ++++
 3 files changed, 146 insertions(+)
 create mode 100644 pandocFilter/pandocFilter.py
 create mode 100644 pandocFilter/requirements.txt
 create mode 100644 pandocFilter/setup.py

diff --git a/pandocFilter/pandocFilter.py b/pandocFilter/pandocFilter.py
new file mode 100644
index 0000000..65c65e7
--- /dev/null
+++ b/pandocFilter/pandocFilter.py
@@ -0,0 +1,119 @@
+#
+#	pandocFilter.py
+#
+#	Script to convert a oneM2M Markdown file to pandoc input format
+#
+#	(c) 2023 by Andreas Kraft, Miguel Ortega
+#	License: BSD 3-Clause License. See the LICENSE file for further details.
+#
+
+import argparse, os, re
+from rich import print
+from rich.progress import Progress, TextColumn, TimeElapsedColumn
+
+def readMDFile(progress:Progress, document:str) -> list[str]:
+	"""	Read the markdown file and return a list of lines.
+
+		`progress` gets a 'Reading document' task; `document` is the path of the file to read.
+		Returns the lines including their line endings. Exits with status 1 if `document` is not a file.
+	"""
+	_taskID = progress.add_task('[blue]Reading document', start=False, total=0)
+	# isfile() instead of exists(): a directory exists, but open() would fail on it with a traceback
+	if not os.path.isfile(document):
+		print(f'File {document} does not exist')
+		raise SystemExit(1)	# what exit(1) raises, without relying on the site-provided builtin
+	with open(document, 'r', encoding='utf-8', errors = 'replace') as f:
+		progress.stop_task(_taskID)
+		return f.readlines()
+	
+
+def writeMDFile(progress:Progress, mdLines:list[str], document:str, outDirectory:str) -> None:
+	"""	Store the processed lines in the output directory.
+
+		The target is `outDirectory`/<base name of `document`>; an existing file is overwritten.
+	"""
+	_task = progress.add_task('[blue]Writing document', start=False, total=0)
+	_target = f'{outDirectory}/{os.path.basename(document)}'
+	with open(_target, 'w', encoding='utf-8', errors = 'replace') as _out:
+		_out.writelines(mdLines)
+	progress.stop_task(_task)
+
+
+def correctTOC(progress:Progress, mdLines:list[str], tocSection:str = 'Contents') -> list[str]:
+	"""	Correct the TOC to be compatible with pandoc.
+
+		Every TOC entry is cut after its last ']' (which drops a following link target such as
+		'(#anchor)') and terminated with two spaces and a newline. TOC lines without a
+		bracketed part, e.g. empty lines, are removed. The TOC ends at the next line starting with '#'.
+		Args:
+			progress: Progress display on which a 'Correcting TOC' task is registered.
+			mdLines: Lines of the document, including their line endings.
+			tocSection: Title of the level-1 heading ('# <title>') that starts the TOC.
+		Returns:
+			A new list of lines; the headings and all lines outside the TOC are copied unchanged.
+	"""
+	_taskID = progress.add_task('[blue]Correcting TOC', start=False, total=0)
+	_heading = f'# {tocSection}'.rstrip()
+	tocregex = re.compile(r'^(.*\[.*\])')	# raw string: '\[' is an invalid escape in a plain literal
+	_lines:list[str] = []
+	_inTOC = False
+	for line in mdLines:
+		if line.rstrip() == _heading:	# tolerate trailing whitespace or a missing final newline
+			_inTOC = True
+		elif _inTOC and line.startswith('#'):	# the next heading ends the TOC
+			_inTOC = False
+		elif _inTOC:
+			if (match := tocregex.match(line)):
+				_lines.append(f'{match.group(1)}  \n')
+			continue	# inside the TOC only the rewritten entries are kept
+		_lines.append(line)
+	progress.stop_task(_taskID)
+	return _lines
+
+
+def replaceTableCaptions(progress:Progress, mdLines:list[str]) -> list[str]:
+	"""	Replace table captions with a pandoc table caption.
+
+		A line starting with '**Table ...**' becomes 'Table: Table ...'. If table rows (lines
+		starting with '|') directly precede it, the caption is moved in front of those rows.
+		Args:
+			progress: Progress display on which a 'Replacing table captions' task is registered.
+			mdLines: Lines of the document, including their line endings.
+		Returns:
+			A new list of lines in which every caption appears exactly once.
+	"""
+	_taskID = progress.add_task('[blue]Replacing table captions', start=False, total=0)
+	tableregex = re.compile(r'^\*\*(Table .*)\*\*')	# raw string: '\*' is an invalid escape otherwise
+	_lines:list[str] = []
+	for line in mdLines:
+		if (match := tableregex.match(line)) is None:
+			_lines.append(line)
+			continue
+		_idx = len(_lines)	# walk back over the table rows collected so far
+		while _idx > 0 and _lines[_idx-1].startswith('|'):
+			_idx -= 1
+		_lines.insert(_idx, f'Table: {match.group(1)}\n')	# a single insert, also for a table at the top
+	progress.stop_task(_taskID)
+	return _lines
+
+
+def process(document:str, outDirectory:str) -> None:
+	"""	Read `document`, correct its TOC and table captions, and write the result to `outDirectory`. """
+	_columns = (TextColumn('{task.description}'), TimeElapsedColumn())
+	with Progress(*_columns) as progress:
+		_converted = replaceTableCaptions(progress, correctTOC(progress, readMDFile(progress, document)))
+		writeMDFile(progress, _converted, document, outDirectory)
+
+
+
+if __name__ == '__main__':
+	# Command line interface: an optional output directory and the document to convert
+	parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
+	parser.add_argument('--outdir', '-o',
+						action = 'store', dest = 'outDirectory', default = 'out',
+						metavar = '<output directory>', help = 'specify output directory')
+	parser.add_argument('document', help = 'document to parse')
+	args = parser.parse_args()
+	# The output directory must exist before the converted document is written into it
+	os.makedirs(args.outDirectory, exist_ok = True)
+	process(args.document, args.outDirectory)
diff --git a/pandocFilter/requirements.txt b/pandocFilter/requirements.txt
new file mode 100644
index 0000000..4c6b806
--- /dev/null
+++ b/pandocFilter/requirements.txt
@@ -0,0 +1,14 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile
+#
+markdown-it-py==2.2.0
+    # via rich
+mdurl==0.1.2
+    # via markdown-it-py
+pygments==2.15.1
+    # via rich
+rich==13.3.5
+    # via oneM2M-markdown-to-pandoc-filter (setup.py)
diff --git a/pandocFilter/setup.py b/pandocFilter/setup.py
new file mode 100644
index 0000000..589119e
--- /dev/null
+++ b/pandocFilter/setup.py
@@ -0,0 +1,13 @@
+from setuptools import setup, find_packages
+
+setup(
+	name='oneM2M-markdown-to-pandoc-filter',	# distribution names must not contain spaces
+	version='0.0.1',
+	author='Andreas Kraft, Miguel Ortega',
+	author_email='an.kraft@gmail.com',
+	description='Convert oneM2M Markdown to Pandoc input',
+	packages=find_packages(),
+	py_modules=['pandocFilter'],	# the tool is a single module, which find_packages() does not pick up
+	python_requires='>=3.9',	# pandocFilter.py uses built-in generics such as list[str]
+	install_requires=['rich'],
+)
-- 
GitLab