#
#   pandocFilter.py
#
#   Script to convert a oneM2M Markdown file to pandoc input format
#
#   (c) 2023 by Andreas Kraft, Miguel Ortega
#   License: BSD 3-Clause License. See the LICENSE file for further details.
#

import argparse
import os
import re
import sys
from typing import Optional

from rich import print
from rich.progress import Progress, TextColumn, TimeElapsedColumn


def readMDFile(progress: Progress, document: str) -> list[str]:
    """ Read the markdown file and return a list of lines.

        Args:
            progress: Progress display on which a "Reading document" task is registered.
            document: Path of the markdown file to read.

        Returns:
            The lines of the file, each including its line terminator.

        Raises:
            SystemExit: With exit code 1 if `document` does not exist.
    """
    _taskID = progress.add_task('[blue]Reading document', start=False, total=0)

    # Check if file exists
    if not os.path.exists(document):
        print(f'File {document} does not exist')
        # sys.exit instead of the site-provided exit(), which is not
        # guaranteed to be available (e.g. when run with "python -S").
        sys.exit(1)

    # Read the file. Undecodable bytes are replaced rather than raising.
    with open(document, 'r', encoding='utf-8', errors='replace') as f:
        progress.stop_task(_taskID)
        return f.readlines()


def writeMDFile(progress: Progress, mdLines: list[str], document: str, outDirectory: str) -> None:
    """ Write the markdown file.

        The output file gets the base name of `document` and is placed in
        `outDirectory`. The directory is not created here.

        Args:
            progress: Progress display on which a "Writing document" task is registered.
            mdLines: Lines to write; they are written as-is, no terminators are added.
            document: Path of the input document; only its base name is used.
            outDirectory: Directory to write the output file to.
    """
    _taskID = progress.add_task('[blue]Writing document', start=False, total=0)

    # Write the file
    _target = os.path.join(outDirectory, os.path.basename(document))
    with open(_target, 'w', encoding='utf-8', errors='replace') as f:
        f.writelines(mdLines)
    progress.stop_task(_taskID)


def correctTOC(progress: Progress, mdLines: list[str], tocSection: str = 'Contents') -> list[str]:
    """ Correct the TOC to be compatible with pandoc.

        The TOC is the section that starts with the line `# <tocSection>` and
        ends at the next line that starts with `#`. Inside the TOC every line
        is cut after its last `]` (which drops a trailing link target such as
        `(#anchor)`), and lines without a `[...]` part are removed. Lines
        outside the TOC are passed through unchanged.

        Args:
            progress: Progress display on which a "Correcting TOC" task is registered.
            mdLines: Lines of the markdown document.
            tocSection: Title of the level-1 heading that starts the TOC.

        Returns:
            A new list of lines with the corrected TOC.
    """
    _taskID = progress.add_task('[blue]Correcting TOC', start=False, total=0)

    _contents = f'# {tocSection}\n'
    tocregex = re.compile(r'^(.*\[.*\])')

    _lines: list[str] = []
    _inTOC = False
    for line in mdLines:
        # find TOC section first
        if line == _contents:
            _inTOC = True
            _lines.append(line)
            continue

        if not _inTOC:
            _lines.append(line)
            continue

        if line.startswith('#'):    # End of TOC?
            _inTOC = False
            # Blank line so the next heading is separated from the last TOC entry
            _lines.append('\n')
            _lines.append(line)
            continue

        # Replace entry. TOC lines without a "[...]" part are dropped.
        match = tocregex.match(line)
        if match:
            # Two trailing spaces form a Markdown hard line break, so that
            # every TOC entry is rendered on its own line.
            _lines.append(f'{match.group(1)}  \n')

    progress.stop_task(_taskID)
    return _lines


def replaceTableCaptions(progress: Progress, mdLines: list[str]) -> list[str]:
    """ Replace table captions with a pandoc table caption.

        A caption is a line that starts with `**Table ...**`. It is removed
        and re-inserted as `Table: <caption>` in front of the run of table
        rows (lines starting with `|`) that directly precedes it. If no table
        row directly precedes the caption, the `Table:` line simply takes the
        place of the caption.

        Args:
            progress: Progress display on which a "Replacing table captions" task is registered.
            mdLines: Lines of the markdown document.

        Returns:
            A new list of lines with the converted captions.
    """
    _taskID = progress.add_task('[blue]Replacing table captions', start=False, total=0)

    tableregex = re.compile(r'^\*\*(Table .*)\*\*')

    _lines: list[str] = []
    for line in mdLines:
        match = tableregex.match(line)
        if match is None:
            _lines.append(line)
            continue

        # move the caption to the beginning of the table and add a "Table:" prefix
        _idx = len(_lines) - 1
        while _idx >= 0 and _lines[_idx].startswith('|'):
            _idx -= 1
        # _idx is now the last non-table line (or -1), so the caption goes right after it
        _lines.insert(_idx + 1, f'Table: {match.group(1)}\n')

    progress.stop_task(_taskID)
    return _lines


def replaceFigureCaptions(progress: Progress, mdLines: list[str]) -> list[str]:
    """ Replace figure captions with a pandoc figure caption.

        A caption is a line that starts with `**Figure ...**`. The caption
        line is removed and its text replaces the alternative text of the
        closest preceding image line (a line starting with `![`). If there is
        no preceding image line the caption line is kept unchanged.

        Args:
            progress: Progress display on which a "Replacing figure captions" task is registered.
            mdLines: Lines of the markdown document.

        Returns:
            A new list of lines with the converted captions.
    """
    _taskID = progress.add_task('[blue]Replacing figure captions', start=False, total=0)

    figureregex = re.compile(r'^\*\*(Figure .*)\*\*')
    # Everything from the start of an image line up to and including the first "]"
    altTextRegex = re.compile(r'^.*?]')

    _lines: list[str] = []
    for line in mdLines:
        match = figureregex.match(line)
        if match is None:
            _lines.append(line)
            continue

        # Replace the previous figure markdown name with the caption
        _idx = len(_lines) - 1
        while _idx >= 0 and not _lines[_idx].startswith('!['):
            _idx -= 1

        if _idx >= 0:   # index 0 is a valid position: the image may be the very first line
            _replacement = f'![{match.group(1)}]'
            # A callable is used so that backslashes in the caption (e.g. "\_")
            # are inserted literally and not parsed as regex template escapes.
            _lines[_idx] = altTextRegex.sub(lambda _m: _replacement, _lines[_idx])
        else:
            # No image to attach the caption to: keep the line instead of losing it
            _lines.append(line)

    progress.stop_task(_taskID)
    return _lines


def process(document: str, outDirectory: str) -> None:
    """ Run all conversion steps on a document and write the result.

        The steps are: read the file, correct the TOC, replace table captions,
        replace figure captions, write the file to `outDirectory`.

        Args:
            document: Path of the markdown file to convert.
            outDirectory: Directory to write the converted file to.
    """
    with Progress(TextColumn('{task.description}'), TimeElapsedColumn()) as progress:
        mdLines = readMDFile(progress, document)
        mdLines = correctTOC(progress, mdLines)
        mdLines = replaceTableCaptions(progress, mdLines)
        mdLines = replaceFigureCaptions(progress, mdLines)
        writeMDFile(progress, mdLines, document, outDirectory)


def main(args: Optional[list[str]] = None) -> None:
    """ Parse the command line arguments and convert the given document.

        Args:
            args: Argument list to parse. If None, the arguments are taken
                from `sys.argv`.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default='out', metavar='<output directory>', help='specify output directory')
    parser.add_argument('document', help='document to parse')
    # Pass args on, so that callers can supply their own argument list
    pargs = parser.parse_args(args)

    # Process documents and print output
    os.makedirs(pargs.outDirectory, exist_ok=True)
    process(pargs.document, pargs.outDirectory)


if __name__ == '__main__':
    sys.exit(main())