diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..dde3a1c421b37d07b628bcad9de317d284724d2e --- /dev/null +++ b/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2022, Andreas Kraft +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/README.md b/README.md index 0545dd3682dbba62d0a4535df89c8cd220ca0f42..55c16da649db297772be3037b45204c0bd2d5b39 100644 --- a/README.md +++ b/README.md @@ -1 +1,27 @@ -TO BE ADDED \ No newline at end of file +# onem2m-spec2md + +Convert oneM2M specification documents to markdown + +## Installation + +- Requirement: Python version >= 3.10 +- Install the necessary packages with: +``` +python3 -m pip install -r requirements.txt +``` + +## Usage +- Create a directory with the Word document in it. The Word document **must** be in *docx* format. This can be achieved by opening the document with *Word* and saving it in *docx* format to another file. +- Create a configuration file with the same basename as the Word document + *.ini* extension. This file may contain configurations that differ from the standard *config.ini* file provided. + - Alternatively, a file named *config.ini* will apply to all files in that directory. + - It is only necessary to add the settings that are different from the *config.ini* file in the project's root directory. That file will always act as a fallback. +- Run the converter as follows: +``` +python3 spec2md.py <path-to-word-document> +``` + +## FAQ + +### The converter doesn't seem to generate image files. + +Is *LibreOffice* already running? If yes, then close it. diff --git a/config.ini b/config.ini new file mode 100644 index 0000000000000000000000000000000000000000..3428d74fa85a8621df97a6623aa2a1b99645e879 --- /dev/null +++ b/config.ini @@ -0,0 +1,107 @@ +# +# config.ini +# +# Default paragraph mappings for oneM2M specs to Markdown format +# +# Note: all paragraph names must be in lower case. + + +[general] +replaceNbsp = + +; Rename EMF/WMF image references to a different file extension. +; Allowed values: png, svg. +; If not present, no renaming will happen. +renameEMFExtension = svg + +; Skip unreferenced media files in conversion. 
+; Default: false +skipUnreferencedMediaFiles = false + +replaceLt = < + +; Add image captions to the markdown's alternate text. +; Note that the image caption has to follow the image in the document. +imageCaptions2AltText = true + +[toc] +addSectionNumbers = false +excludeFromNumbering = +tocStartParagraph = heading no numbering +tocHeaderLevel = 1 +generateToc = false +addTocMacro = false + + +[paragraphs] +normal = normal +h1 = heading 1 +h2 = heading 2 +h3 = heading 3 +h4 = heading 4 +h5 = heading 5 +h6 = heading 6 +h7 = heading 7 +a1 = heading 1 +a2 = heading 2 +a3 = heading 3 +note = no +example = ex, ew +ul1 = b1, b1+, list paragraph +ul2 = b2, b2+ +ul3 = b3, b3+ +ul4 = b4, b4+ +ul5 = b5, b5+ +ol1 = bn +ol2 = bl +tablecaption = caption, th +imagecaption = tf +image = fl +empty = fp +ignore = toc 1, toc 2, toc 3, toc 4, toc 5, toc 6, toc 7 + + +[characters] +; character value to replace with a character or string. +; The characters to be replaced and the characters that make the +; replacement string must be specified as hex values +; To remove a character from the file set it to 00 (2 zeros) + +; "(c)" +a9 = 286329 +; "(R)" +ae = 285229 +; space +a0 = 20 +; double quote +201c = 22 +201d = 22 +; single quote +2018 = 27 +2019 = 27 +; bullets +2022 = 2D20 +b7 = 2a +; Dashes +2013 = 2d +; Full-size comma +ff0c = 2c20 +; "<=" +2264 = 3c3d +; ">=" +2265 = 3e3d +; "..." +2026 = 2e2e2e +; um +339b = 756d +; "x" +d7 = 78 +; Ligature "fi" +fb01 = 6669 + + +[media] +; The following settings specify the CLI commands to convert a single .emf file to the .png and .svg image formats. +; If no conversion should or can be done, remove or comment out the lines. 
+emfConverterPng = /Applications/LibreOffice.app/Contents/MacOS/soffice --headless --convert-to png "{infile}" --outdir "{outdir}" +emfConverterSvg = /Applications/LibreOffice.app/Contents/MacOS/soffice --headless --convert-to svg "{infile}" --outdir "{outdir}" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..42ccd5d54f107a4502dee1485af7374c0d185e44 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,16 @@ +# +# This file is autogenerated by pip-compile with python 3.10 +# To update, run: +# +# pip-compile +# +commonmark==0.9.1 + # via rich +lxml==4.9.1 + # via python-docx +pygments==2.13.0 + # via rich +python-docx==0.8.11 + # via oneM2M-spec-2-MD-converter (setup.py) +rich==12.5.1 + # via oneM2M-spec-2-MD-converter (setup.py) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..0e24443ca70f1a182d29db70e5815a4a65919243 --- /dev/null +++ b/setup.py @@ -0,0 +1,15 @@ +from setuptools import setup, find_packages + +setup( + name='oneM2M spec-2-MD converter', + version='0.0.1', + url='https://git.onem2m.org/tools/spec2md', + author='Andreas Kraft', + author_email='an.kraft@gmail.com', + description='Convert oneM2M specifications to Markdown', + packages=find_packages(), + install_requires=[ + 'rich', + 'python-docx', + ] +) diff --git a/spec2md.py b/spec2md.py new file mode 100644 index 0000000000000000000000000000000000000000..e7608526cd189a783ecead9092b6e3236b38102d --- /dev/null +++ b/spec2md.py @@ -0,0 +1,772 @@ +# +# spec2md.py +# +# Script to convert oneM2M specs in docx / openXML to markdown. +# +# (c) 2022 by Andreas Kraft +# License: BSD 3-Clause License. See the LICENSE file for further details. 
+# + + +from enum import IntEnum, auto +from typing import Callable, Tuple, Dict, Optional +from pathlib import Path, PurePath +from docx.document import Document +from docx.text.paragraph import Paragraph +import docx.opc.exceptions +from docx.table import _Cell, Table +from docx.oxml.table import CT_Tbl +from docx.oxml.text.paragraph import CT_P +import argparse, os, binascii, re, subprocess +from pathlib import Path +from rich.console import Console +from rich.progress import Progress, TextColumn, BarColumn +from rich.console import Console +from rich import inspect +import configparser, zipfile +from xml.etree import ElementTree as ET + + +class Style(IntEnum): + example = auto() + image = auto() + imagecaption = auto() + none = auto() + normal = auto() + note = auto() + orderedlist = auto() + orderedlist2 = auto() + unorderedlist = auto() + unorderedlist2 = auto() + unorderedlist3 = auto() + unorderedlist4 = auto() + unorderedlist5 = auto() + + # TODO more styles + + +defaultConfigFile = 'config.ini' +imagesSubDir = 'media' +unreferencedSubDir = 'unreferenced' + +# special characters +_linebreak = '<br />' +_entityLt = '<' +_nbsp = ' ' +_tocInsertPoint = '__t_o_c__' +_captionMarker = '__CAPTION__' + + +# https://learn.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing?view=openxml-2.8.1 + +# NOTE Crash when extracting tables +# Until a fix from the python-docx package is available, parsing tables might crash. +# Fix in python-docx table.py, around line 173 the lines in _cells() to: +# +# ... +# if tc.vMerge == ST_Merge.CONTINUE: +# if len(cells) >= col_count: # <-- +# cells.append(cells[-col_count]) +# .. + +# TODO regard addTOCMacro configuration +# TODO move -section switch to configuration +# TODO Support internal links +# TODO at least mark unsupported objects and images? 
class SectionNumbers(object):
    """ Generator for running section numbers of headings and annexes.

        Normal headings are numbered "1", "1.1", "1.1.1", ... while annexes
        are numbered "Annex A", "Annex A.1", ... Both sequences share the
        sub-level counters but keep their own top-level counter, so a
        document can switch from headings to annexes.
    """

    def __init__(self) -> None:
        # One counter per heading level. Seven counters cover the levels 1..7
        # that the heading styles h1..h7 pass in; deeper levels are added on
        # demand in nextSectionNumber().
        self.levels:list[int] = [ 0 ] * 7
        # Last top-level number handed out for a normal heading.
        self.heading:int = 0
        # Code point of the last annex letter handed out. 64 is the code point
        # right before 'A', so the first level-1 annex becomes "A".
        self.annex:int = 64


    def nextSectionNumber(self, level:int, isAnnex:bool = False) -> str:
        """ Advance the counter for `level` and return the formatted number.

            Args:
                level: 1-based heading level to advance.
                isAnnex: Number the section as an annex ("Annex A.1 ")
                    instead of a normal heading ("1.1 ").

            Return:
                The section number followed by a single space, prefixed with
                "Annex " for annexes. Levels whose counter is 0 are left out.

            Raises:
                ValueError: If `level` is smaller than 1.
        """
        if level < 1:
            # A level of 0 or below would silently index from the end of the list.
            raise ValueError(f'section level must be >= 1, got {level}')
        if level > len(self.levels):
            # Grow instead of failing with an IndexError for deep headings.
            self.levels.extend([ 0 ] * (level - len(self.levels)))

        # Headings and annexes keep separate top-level counters. Load the one
        # that belongs to the requested sequence.
        self.levels[0] = self.annex if isAnnex else self.heading

        # Increment the requested level and reset everything below it.
        self.levels[level - 1] += 1
        self.levels[level:] = [ 0 ] * (len(self.levels) - level)

        parts = [ str(x) for x in self.levels if x != 0 ]

        if isAnnex:
            # The annex counter is never 0, so parts[0] is always the top level.
            self.annex = self.levels[0]
            parts[0] = chr(self.annex)
            return f'Annex {".".join(parts)} '

        self.heading = self.levels[0]
        return f'{".".join(parts)} '
self.skipUnreferencedMediaFiles = config.getboolean('general', 'skipUnreferencedMediaFiles', fallback = False) + self.imageCaptions2AltText = config.getboolean('general', 'imageCaptions2AltText', fallback = True) + + # Paragraphs + self.paragraphs = { c : config.getlist('paragraphs', c) # type: ignore [attr-defined] + for c in config['paragraphs'] } + self.normal = self.paragraphs['normal'] + self.h1 = self.paragraphs['h1'] + self.h2 = self.paragraphs['h2'] + self.h3 = self.paragraphs['h3'] + self.h4 = self.paragraphs['h4'] + self.h5 = self.paragraphs['h5'] + self.h6 = self.paragraphs['h6'] + self.h7 = self.paragraphs['h7'] + self.a1 = self.paragraphs['a1'] + self.a2 = self.paragraphs['a2'] + self.a3 = self.paragraphs['a3'] + self.ol1 = self.paragraphs['ol1'] + self.ol2 = self.paragraphs['ol2'] + self.ul1 = self.paragraphs['ul1'] + self.ul2 = self.paragraphs['ul2'] + self.ul3 = self.paragraphs['ul3'] + self.ul4 = self.paragraphs['ul4'] + self.ul5 = self.paragraphs['ul5'] + #self.continuedlist = self.paragraphs['continuedlist'] + self.note = self.paragraphs['note'] + self.example = self.paragraphs['example'] + self.tablecaption = self.paragraphs['tablecaption'] + self.imagecaption = self.paragraphs['imagecaption'] + self.image = self.paragraphs['image'] + self.ignore = self.paragraphs['ignore'] + self.empty = self.paragraphs['empty'] + + # TOC + self.addSectionNumbers = config.getboolean('toc', 'addSectionNumbers', fallback = False) + self.excludeFromNumbering = config.getlist('toc', 'excludeFromNumbering') # type: ignore [attr-defined] + self.tocStartParagraph = config.get('toc', 'tocStartParagraph') + self.tocHeaderLevel = config.getint('toc', 'tocHeaderLevel') + self.addTocMacro = config.getboolean('toc', 'addTocMacro', fallback = False) + self.generateToc = config.getboolean('toc', 'generateToc', fallback = False) + + # characters + self.characters = { int(c, 16) : binascii.unhexlify(config.get('characters', c)).decode('utf-8') # type: ignore [attr-defined] + 
for c in config['characters'] } + + # Media + self.emfConverterPng = config.get('media', 'emfConverterPng', fallback = None) + self.emfConverterSvg = config.get('media', 'emfConverterSvg', fallback = None) + + + + +def processDocuments(documents:list[str], outDirectory:str) -> None: + docs:Dict[str, Tuple[Document, DocumentConfiguration]] = {} + ptasks = {} + mediaRelations:Dict[str, str] = {} + addSectionNumbers = False + excludeFromNumbering:list[str] = [] + headers:list[Tuple[int, str]] = [] + emfFiles:list[str] = [] + referencedImages:list[str] = [] + + global _print + + with Progress( TextColumn('[progress.description]{task.description}'), + BarColumn(), + TextColumn('[progress.percentage]{task.percentage:>3.0f}%'), + speed_estimate_period=2.0) as progress: + + + _print = progress.print # Assign progress internal print to global print + _lastStyle = Style.none + + + def stopProgress(msg:str='') -> None: + progress.stop() + progress.remove_task(readTask) + _print(msg) + + + def iter_block_items(parent): + """ + Yield each paragraph and table child within *parent*, in document order. + Each returned value is an instance of either Table or Paragraph. *parent* + would most commonly be a reference to a main Document object, but + also works for a _Cell object, which itself can contain paragraphs and tables. + """ + if isinstance(parent, Document): + parent_elm = parent.element.body + elif isinstance(parent, _Cell): + parent_elm = parent._tc + else: + raise ValueError("something's not right") + + for child in parent_elm.iterchildren(): + if isinstance(child, CT_P): + yield Paragraph(child, parent) + elif isinstance(child, CT_Tbl): + yield Table(child, parent) + else: + # print(child.__class__.__name__) + ... 
+ + def replaceNL(text:str, rpl:str = '') -> str: + return text.replace('\n', rpl) + + + def toMD(text:str) -> str: + return text.replace('<', docConfig.replaceLt) + + + #sectionNrs:list[int] = [ 0, 0, 0, 0] + sectionNrs = SectionNumbers() + + def toHeader(style:str, text:str, level:int, numbering:bool = True, isAnnex:bool = False) -> list[str]: + nonlocal addSectionNumbers, excludeFromNumbering + + if style in excludeFromNumbering: + numbering = False + + nr = '' + if numbering and addSectionNumbers: + nr = sectionNrs.nextSectionNumber(level, isAnnex = isAnnex) + + # Replace multiple white spaces + text = ' '.join(text.split()) + + # Remove linebreak in header lines + text = text.replace(_linebreak, ' ').strip() + + # Store header + headers.append( (level, replaceNL(text))) + + + return [ '', f'{"#" * level} {nr}{replaceNL(text)}' if text else '' ] + + + def strippedTag(tag:str) -> str: + """ Stripp the namespace from an element or attribute name. + """ + _, _, tag = tag.rpartition('}') + return tag + + + def getTextFromXML(elem:Paragraph) -> str: + + # Not-used document tags. + _ignoredTags = ( 'AlternateContent', + 'fldChar', + 'fldSimple', + 'instrText', + 'lastRenderedPageBreak', + 'noBreakHyphen', + 'pPr', + 'proofErr', + 'rPr', + 'moveFromRangeEnd', + 'ins', + 'del', + 'commentRangeStart', + 'commentRangeEnd', + 'commentReference', + ) + + + def _parseXML(element:ET.Element) -> str: + """ Recursively parse a document paragraph. + """ + nonlocal _ignoredTags + + _result = '' + tag = strippedTag(element.tag) # remove namespaces for easier handlings + match tag: + case 'p': + for x in element: + _result += _parseXML(x) + case 'r': + for x in element: + _result += _parseXML(x) + case 't': + _result += str(toMD(str(element.text))) + case 'br': + _result += _linebreak + case 'bookmarkStart' | 'bookmarkEnd': # TODO ? 
+ pass + + case 'hyperlink': + # Hyperlinks and URLs + _hresult = '' + for x in element: + _hresult += _parseXML(x) + _result += f'[{_hresult}]({_hresult})' + + case 'drawing': + # Get the rID of a media file from the element's XML + # and map to an extracted media file + # inspect(element) + # _print(element.items()) + # _print(element.attrib) + blip = element.findall('ns1:inline/ns3:graphic/ns3:graphicData/ns4:pic/ns4:blipFill/ns3:blip', + namespaces = { + 'ns1' : 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing', + 'ns3' : 'http://schemas.openxmlformats.org/drawingml/2006/main', + 'ns4' : 'http://schemas.openxmlformats.org/drawingml/2006/picture', + }) + if blip and \ + (rId := blip[0].attrib.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')) and \ + (mediaFile := mediaRelations.get(rId)): + referencedImages.append(Path(mediaFile).stem) # Add to referenced files + if docConfig.renameEMFExtension and mediaFile.lower().endswith('.emf'): + mediaFile = str(Path(mediaFile).with_suffix(f'.{docConfig.renameEMFExtension}')) + _print(f'[yellow]Renaming EMF file reference to "{mediaFile}"') + _result += f'' + else: + _print(blip) + + case 'pict': + # for e in element: + # print(f'----{e}') + _print(f'[yellow]unsupported pict element: {element}') + _result += f'<mark>unsupported pict element: {element}</mark>' + + case 'object': + #inspect(element) + pass + + + case 'tab': + _result += ' ' # TODO nbsp? + + case _ if tag in _ignoredTags: # ignore + pass + + case _: # unknown + _print(tag) + _print(element) + return _result + + #_print(ET.fromstring(elem._p.xml)) + return _parseXML(ET.fromstring(elem._p.xml)) + + + + def checkSameStyle(style:Style, action:Optional[Callable] = None) -> None: + """ Check whether the given style is the same as the last one. If no, + then execute the optional action. 
+ """ + nonlocal _lastStyle + if style == _lastStyle: + return + _lastStyle = style + if action: + action() + + + # Preparing tasks for progress + readTask = progress.add_task(f'Reading document{"s" if len(documents) > 1 else ""}', total = len(documents)) + + # + # Reading documents + # + + for d in documents: + if not (dp := Path(d)).exists(): + stopProgress(f'[red]Input document "{d}" does not esist') + return + if not dp.is_file(): + stopProgress(f'[red]Input document "{d}" is not a file') + return + try: + docs[d] = (docx.Document(d), DocumentConfiguration(d)) + ptasks[d] = progress.add_task(f'Processing {d}', total = 1000) + progress.update(readTask, advance=1) + except docx.opc.exceptions.PackageNotFoundError as e: + stopProgress(f'[red]Input document "{d}" is not a .docx file') + return + except Exception as e: + stopProgress(f'[red]Error reading file "{d}"') + console.print_exception() + return + + # + # Processing Documents + # + + for docFileName, (doc, docConfig) in docs.items(): + processTask = ptasks[docFileName] + docItems = list(iter_block_items(doc)) + addSectionNumbers = docConfig.addSectionNumbers + excludeFromNumbering = docConfig.excludeFromNumbering + + paragraphNr = 0 + + # TODO + #(docConfig.replaceNbsp) + + # Create output directories + try: + os.makedirs(documentDirName := f'{outDirectory}{os.sep}{Path(docFileName).stem}', exist_ok = True) + os.makedirs(imageDirName := f'{documentDirName}{os.sep}{imagesSubDir}', exist_ok = True) + os.makedirs(unreferencedDirName := f'{documentDirName}{os.sep}{unreferencedSubDir}', exist_ok = True) + except Exception as e: + stopProgress(f'[red]Error creating output directory "{d}"') + console.print_exception() + return + + # Add sub-progress task + progress.update(processTask, total = len(docItems) + 5) # + relations + image extraction + characters + toc + media convert + + + # Extract the media relations file, and get the mappings from document IDs to media files + mediaRelations = {} + with 
open(docFileName, 'rb') as docfile: + zip = zipfile.ZipFile(docfile) + for z in zip.filelist: + if z.filename == 'word/_rels/document.xml.rels': + xml = ET.fromstring(zip.read(z.filename)) + for element in xml: + if strippedTag(element.tag) == 'Relationship': + if (_a := element.attrib.get('Type')) and _a.endswith('/image'): # Only image relationships + mediaRelations[element.attrib['Id']] = element.attrib['Target'] + break + else: + _print('[red]Media relations file not found in document') + return + progress.update(processTask, advance = 1) # progress update + + + # Extracting images for the document next + with open(docFileName, 'rb') as docfile: + zip = zipfile.ZipFile(docfile) + for zipMediaFilename in [z.filename + for z in zip.filelist + if z.filename.startswith('word/media/')]: + + fn = f'{imageDirName}{os.sep}{os.path.basename(zipMediaFilename)}' + if fn.lower().endswith(('.emf', '.wmf')): + _print(f'[yellow]unsupported media file: {fn}', highlight = False) + emfFiles.append(fn) + with open(f'{fn}', 'wb') as imgFile: + imgFile.write(zip.read(zipMediaFilename)) + progress.update(processTask, advance = 1) # progress update + + # Processing the document + lines:list[str] = [] + imageIndex = 1 + + for elem in docItems: + paragraphNr += 1 + progress.update(processTask, advance = 1) + match type(elem).__name__: + case 'Paragraph': + text = getTextFromXML(elem) + style = elem.style.name.lower() + # print(f'{style} {text}') + + # Normal, body text + if style in docConfig.normal: + checkSameStyle(Style.normal, lambda:lines.append('')) + lines.append(text) + lines.append('') # Add empty line + + # Headers + elif style in docConfig.h1: + lines.extend(toHeader(style, text, 1)) + elif style in docConfig.h2: + lines.extend(toHeader(style, text, 2)) + elif style in docConfig.h3: + lines.extend(toHeader(style, text, 3)) + elif style in docConfig.h4: + lines.extend(toHeader(style, text, 4)) + elif style in docConfig.h5: + lines.extend(toHeader(style, text, 5)) + elif style 
in docConfig.h6: + lines.extend(toHeader(style, text, 6)) + elif style in docConfig.h7: + lines.extend(toHeader(style, text, 7)) + + # Annexes + elif style in docConfig.a1: + lines.extend(toHeader(style, text, 1, isAnnex = True)) + elif style in docConfig.a2: + lines.extend(toHeader(style, text, 2, isAnnex = True)) + elif style in docConfig.a3: + lines.extend(toHeader(style, text, 3, isAnnex = True)) + + # Ordered Lists + elif style in docConfig.ol1: + checkSameStyle(Style.orderedlist2, lambda:lines.append('')) + if len(elem.text): # ignore empty + lines.append(f'1. {text}') + elif style in docConfig.ol2: + checkSameStyle(Style.orderedlist2, lambda:lines.append('')) + if len(elem.text): # ignore empty + lines.append(f' 1. {text}') + + # Unordered Lists + elif style in docConfig.ul1: + checkSameStyle(Style.unorderedlist, lambda:lines.append('')) + if len(elem.text): # ignore empty + lines.append(f'- {text}') + elif style in docConfig.ul2: + checkSameStyle(Style.unorderedlist2, lambda:lines.append('')) + if len(elem.text): # ignore empty + lines.append(f'{" "*1}- {text}') + elif style in docConfig.ul3: + checkSameStyle(Style.unorderedlist3, lambda:lines.append('')) + if len(elem.text): # ignore empty + lines.append(f'{" "*2}- {text}') + elif style in docConfig.ul4: + checkSameStyle(Style.unorderedlist4, lambda:lines.append('')) + if len(elem.text): # ignore empty + lines.append(f'{" "*3}- {text}') + elif style in docConfig.ul5: + checkSameStyle(Style.unorderedlist5, lambda:lines.append('')) + if len(elem.text): # ignore empty + lines.append(f'{" "*4}- {text}') + + # Table Caption + elif style in docConfig.tablecaption: + lines.append('') + lines.append(f'**{replaceNL(text).strip()}**') + + # Image Caption + elif style in docConfig.imagecaption: + checkSameStyle(Style.imagecaption, lambda:lines.append('')) + _t = replaceNL(text).strip() + lines.append(f'**{_t}**') + lines.append('') + if docConfig.imageCaptions2AltText: + # Search and replace the previous image 
reference (max 10 lines back-search TODO configurable) + for idx in range(len(lines)-1, len(lines)-11, -1): + if _captionMarker in lines[idx]: + lines[idx] = lines[idx].replace(_captionMarker, _t) + + # Image & Figure + elif style in docConfig.image: + lines.append('') + lines.append(text) + + # Example + elif style in docConfig.example: + checkSameStyle(Style.example, lambda:lines.append('')) + # Replace linebreaks + for _t in text.split(_linebreak): + lines.append(f'`{_t if _t else " "}` ') # at least an empty space. And 2 spaces at the end for newline + + # Notes + elif style in docConfig.note: + checkSameStyle(Style.note) + lines.append(f'> {text}') + + # Add TOC + elif style in docConfig.tocStartParagraph: + lines.extend(toHeader(style, elem.text, docConfig.tocHeaderLevel, numbering = False)) + if docConfig.addTocMacro: + lines.append('[toc]') + if docConfig.generateToc: + lines.append(_tocInsertPoint) + + # Ignore & empty + elif style in docConfig.ignore: + pass + elif style in docConfig.empty: + lines.append('') + + # Print Unhandled tokens also to the console + else: + _print(f'{paragraphNr} {style}: {elem.style}: {text}') + lines.append(text) + + + case 'Table': + rows:list[list[str]] = [] + nrRows = 0 + for row in elem.rows: + cells:list[str] = [] + for cell in row.cells: + cells.append(f'{toMD(cell.text)} ') # add at least a space + rows.append(cells) + nrRows += 1 + + # Warning if this is a single-row table + if nrRows == 1: + _print(f'[red]Single-row table found. 
Consider replacing it in the original document:\n{rows[0]}') + + lines.append('') # Add an empty line before a table + for idx, row in enumerate(rows): + if idx == 1: + lines.append('-'.join('|' * (len(row) + 1) )) + lines.append(f'|{"|".join(row)}|' + .replace('\n', _linebreak)) # replace line breaks in cells + lines.append('') # Add another empty line after a table + + case _: + _print('[blue] {type(elem).__name__}') + + # + # Replace non-ascii characters + # + progress.update(processTask, advance = 1) # progress update + for i in range(len(lines)): + line = lines[i] + for ch in line: + if not ch.isascii(): + if (_ch := ord(ch)) in docConfig.characters: + if (rch := docConfig.characters[_ch]) == chr(0): + rch = '' + # line = line.replace(ch, docConfig.characters[_ch]) # we need the line for further replacements + line = line.replace(ch, rch) # we need the line for further replacements + lines[i] = line + else: + _print(f'[yellow]Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}') + + # + # Insert auto-genrated table of contents + # + progress.update(processTask, advance = 1) # progress update + if docConfig.generateToc: + toc = '' + for l, t in headers: + link = t + # Convert to + # All text is converted to lowercase. + link = link.lower() + # All non-word text (e.g., punctuation (except: -()), HTML) is removed. Some characters are converted to upper-case hex. + # TODO decide / configurable how to replace special characters in links- Depends on rendering? "Markdown 2" (mac) likes to include + # link = ''.join( c if c not in '():' else f'{ord(c):x}'.upper() + link = ''.join( c if c not in '():' else ''#f'{ord(c):x}'.upper() + for c in link + if c.isspace() or c.isalnum() or c in '-():') + # All spaces are converted to hyphens. + link = ''.join( '-' if c.isspace() else c + for c in link) + # Two or more hyphens in a row are converted to one. 
+ link = '-'.join(link.split()) + # TODO If a header with the same ID has already been generated, a unique incrementing number is appended, starting at 1. + # Add to toc + toc += f'{_nbsp * 4 * (l - 1)}[{t}](#{link}) \n' + + for i in range(len(lines)): + line = lines[i] + if line == _tocInsertPoint: + lines[i] = toc + # continue when found, perhaps we want to have more than one toc? + + # + # Map internal references + # + _definitionExpression = re.compile(r'^[`]?\[([\d]+|i.[\d]+)\]([^`]*)[`]?') + _referenceExpression = re.compile(r'([^>])\[([\d]+|i.[\d]+)\](?!</a>)') + + for i in range(len(lines)): + line = lines[i] + if (m := _definitionExpression.match(line)) is not None: + lines[i] = f'- <a name="_ref_{m.group(1)}"[{m.group(1)}]">[{m.group(1)}]</a>{m.group(2)}' + + def _repl(m:re.Match) -> str|None: + if m.group(1) == '"': + return None + return f'{m.group(1)}<a href="#_ref_{m.group(2)}">[{m.group(2)}]</a>' + + for i in range(len(lines)): + line = lines[i] + lines[i] = re.sub(_referenceExpression, _repl, line) + + # + # Write produced Markdown file + # + + with open(f'{documentDirName}{os.sep}{Path(d).stem}.md', 'w') as file: + file.write('\n'.join(lines)) + + # + # Convert media files + # + + def _convertImage(converter:str, format:str): + if converter: + for fn in list(emfFiles): + _f = Path(fn) # Filename to handle + _t = imageDirName # Target directory + if _f.stem not in referencedImages: + if not fn.startswith(unreferencedDirName): + _print(f'[red]Unreferenced image: {PurePath(fn).name} {"(skipped)" if docConfig.skipUnreferencedMediaFiles else ""}', highlight = False) + _print(f'[yellow]Moving image file to: {unreferencedDirName}', highlight = False) + _n = f'{unreferencedDirName}/{_f.name}' + _p = _f.replace(_n) + emfFiles.remove(fn) + emfFiles.append(_n) + fn = _n + if docConfig.skipUnreferencedMediaFiles: + continue + _t = unreferencedDirName + cmd = converter + cmd = cmd.replace('{infile}', fn).replace('{outdir}', _t) + _print(f'Converting EMF file: 
{fn} to "{format}"', highlight = False) + if (res := subprocess.run(cmd, shell = True, capture_output = True)).returncode != 0: + _print(f'[red] Error running command: {res.stderr.decode("utf-8")}') + + if docConfig.emfConverterPng: + _convertImage(docConfig.emfConverterPng, 'png') + if docConfig.emfConverterSvg: + _convertImage(docConfig.emfConverterSvg, 'svg') + + emfFiles.clear() + referencedImages.clear() + progress.update(processTask, advance = 1) # progress update + + + progress.stop() + + + +if __name__ == '__main__': + + # Parse command line arguments + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default = 'out', metavar = '<output directory>', help = 'specify output directory') + + parser.add_argument('document', nargs = '+', help = 'documents to parse') + args = parser.parse_args() + + # Process documents and print output + os.makedirs(args.outDirectory, exist_ok = True) + + processDocuments(sorted(args.document), args.outDirectory) +