Initial version copied from the original repository

68c2f30b · Andreas Kraft · 45fac16d · 68c2f30b · 68c2f30b · 68c2f30b
Commit 68c2f30b authored 2 years ago by Andreas Kraft
--- a/LICENSE
+++ b/LICENSE
+BSD 3-Clause License
+Copyright (c) 2022, Andreas Kraft
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/README.md
+++ b/README.md
-TO BE ADDED
+# onem2m-spec2md
\ No newline at end of file
+Convert oneM2M specification documents to markdown
+## Installation
+- Requirement: Python version >= 3.10
+- Install the necessary packages with:
+```
+python3 -m pip install -r requirements.txt
+```
+## Usage
+- Create a directory with the Word document in it. The Word document **must** be in *docx* format. This can be achieved by opening the document with *Word* and saving it in *docx* format to another file.
+- Create a configuration file with the same basename as the Word document + *.ini* extension. This file may contain configuration settings that differ from the standard *config.ini* file provided.
+  - Alternatively, a file named *config.ini* will apply to all files in that directory.
+  - It is only necessary to add the settings that are different from the *config.ini* file in the project's root directory. That file will always act as a fallback.
+- Run the converter as follows:
+```
+python3 spec2md.py <path-to-word-document>
+```
+## FAQ
+### The converter doesn't seem to generate image files.
+Is *LibreOffice* already running? If yes, then close it.
--- a/config.ini
+++ b/config.ini
+#
+#	config.ini
+#
+#	Default paragraph mappings for oneM2M specs to Markdown format
+#
+#	Note: all paragraph names must be in lower case.
+[general]
+replaceNbsp = &nbsp;
+; Rename EMF/WMF image references to a different file extension.
+; Allowed values: png, svg.
+; If not present, no renaming will happen.
+renameEMFExtension = svg
+; Skip unreferenced media files in conversion.
+; Default: false
+skipUnreferencedMediaFiles = false
+replaceLt = &lt;
+; Add image captions to the markdown's alternate text.
+; Note that the image caption has to follow the image in the document.
+imageCaptions2AltText = true
+[toc]
+addSectionNumbers = false
+excludeFromNumbering =
+tocStartParagraph = heading no numbering
+tocHeaderLevel = 1
+generateToc = false
+addTocMacro = false
+[paragraphs]
+normal = normal
+h1 = heading 1
+h2 = heading 2
+h3 = heading 3
+h4 = heading 4
+h5 = heading 5
+h6 = heading 6
+h7 = heading 7
+a1 = heading 1
+a2 = heading 2
+a3 = heading 3
+note = no
+example = ex, ew
+ul1 = b1, b1+, list paragraph
+ul2 = b2, b2+
+ul3 = b3, b3+
+ul4 = b4, b4+
+ul5 = b5, b5+
+ol1 = bn
+ol2 = bl
+tablecaption = caption, th
+imagecaption = tf
+image = fl
+empty = fp
+ignore = toc 1, toc 2, toc 3, toc 4, toc 5, toc 6, toc 7
+[characters]
+; character value to replace with a character or string.
+; The characters to be replaced and the characters that make the
+; replacement string must be specified as hex values
+; To remove a character from the file set it to 00 (2 zeros)
+; "(c)"
+a9 = 286329
+; "(R)"
+ae = 285229
+; space
+a0 = 20
+; double quote
+201c = 22
+201d = 22
+; single quote
+2018 = 27
+2019 = 27
+; bullets
+2022 = 2D20
+b7 = 2a
+; Dashes
+2013 = 2d
+; Full-size comma
+ff0c = 2c20
+; "<="
+2264 = 3c3d
+; ">="
+2265 = 3e3d
+; "..."
+2026 = 2e2e2e
+; um
+339b = 756d
+; "x"
+d7 = 78
+; Ligature "fi"
+fb01 = 6669
+[media]
+; The following configurations specify the cli commands to convert a single .emf file to the .png and .svg image formats.
+; If no conversion should or can be done, remove or comment the lines.
+emfConverterPng = /Applications/LibreOffice.app/Contents/MacOS/soffice --headless --convert-to png "{infile}" --outdir "{outdir}"
+emfConverterSvg = /Applications/LibreOffice.app/Contents/MacOS/soffice --headless --convert-to svg "{infile}" --outdir "{outdir}"
--- a/requirements.txt
+++ b/requirements.txt
+#
+# This file is autogenerated by pip-compile with python 3.10
+# To update, run:
+#
+#    pip-compile
+#
+commonmark==0.9.1
+    # via rich
+lxml==4.9.1
+    # via python-docx
+pygments==2.13.0
+    # via rich
+python-docx==0.8.11
+    # via oneM2M-spec-2-MD-converter (setup.py)
+rich==12.5.1
+    # via oneM2M-spec-2-MD-converter (setup.py)
--- a/setup.py
+++ b/setup.py
+from setuptools import setup, find_packages
+setup(
+	name='oneM2M spec-2-MD converter',
+	version='0.0.1',
+	url='https://git.onem2m.org/tools/spec2md',
+	author='Andreas Kraft',
+	author_email='an.kraft@gmail.com',
+	description='Convert oneM2M specifications to Markdown',
+	packages=find_packages(),
+	install_requires=[
+		'rich',
+		'python-docx',
+	 ]
+)
--- a/spec2md.py
+++ b/spec2md.py
+#
+#	spec2md.py
+#
+#	Script to convert oneM2M specs in docx / openXML to markdown.
+#
+#	(c) 2022 by Andreas Kraft
+#	License: BSD 3-Clause License. See the LICENSE file for further details.
+#
+from enum import IntEnum, auto
+from typing import Callable, Tuple, Dict, Optional
+from pathlib import Path, PurePath
+from docx.document import Document
+from docx.text.paragraph import Paragraph
+import docx.opc.exceptions
+from docx.table import _Cell, Table
+from docx.oxml.table import CT_Tbl
+from docx.oxml.text.paragraph import CT_P
+import argparse, os, binascii, re, subprocess
+from pathlib import Path
+from rich.console import Console
+from rich.progress import Progress, TextColumn, BarColumn
+from rich.console import Console
+from rich import inspect
+import configparser, zipfile
+from xml.etree import ElementTree as ET
+class Style(IntEnum):
+	example = auto()
+	image = auto()
+	imagecaption = auto()
+	none = auto()
+	normal = auto()
+	note = auto()
+	orderedlist = auto()
+	orderedlist2 = auto()
+	unorderedlist = auto()
+	unorderedlist2 = auto()
+	unorderedlist3 = auto()
+	unorderedlist4 = auto()
+	unorderedlist5 = auto()
+	# TODO more styles
+# Name of the fallback configuration file. It is always read first.
+defaultConfigFile = 'config.ini'
+# Name of the sub-directory (below a document's output directory) for extracted media files
+imagesSubDir = 'media'
+# Name of the sub-directory for media files that are not referenced in the document
+unreferencedSubDir = 'unreferenced'
+# special characters
+# HTML line break, used for line breaks inside paragraphs and table cells
+_linebreak = '<br />'
+# Default replacement for the "<" character in text
+_entityLt = '&lt;'
+# Non-breaking space entity, used to indent the entries of the generated table of contents
+_nbsp = '&nbsp;'
+# Placeholder line that is replaced by the generated table of contents
+_tocInsertPoint = '__t_o_c__'
+# Placeholder alternate text of an image. It may be replaced by the caption that follows the image
+_captionMarker = '__CAPTION__'
+# https://learn.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing?view=openxml-2.8.1
+# NOTE Crash when extracting tables
+# Until a fix from the python-docx package is available, parsing tables might crash.
+# Workaround: in python-docx's table.py, around line 173, change the lines in _cells() to:
+#
+#	...
+#	if tc.vMerge == ST_Merge.CONTINUE:
+#		if len(cells) >= col_count:         # <--
+#			cells.append(cells[-col_count])
+#	..
+# TODO regard addTOCMacro configuration
+# TODO move -section switch to configuration
+# TODO Support internal links
+# TODO at least mark unsupported objects and images?
+#	Rich console for pretty printing
+console = Console()
+#	Print function used for all messages. processDocuments() re-assigns it
+#	to the print function of its progress display.
+_print:Callable = print
+class SectionNumbers(object):
+	def __init__(self) -> None:
+		self.levels:list[int] = [ 0, 0, 0, 0]
+		self.heading:int = 0
+		self.annex:int = 64
+	def nextSectionNumber(self, level:int, isAnnex:bool = False) -> str:
+		if isAnnex:
+			self.levels[0] = self.annex
+		else:
+			self.levels[0] = self.heading
+		# Increment appropriate level
+		self.levels[level - 1] += 1
+		for i in range(level, len(self.levels)):
+			self.levels[i] = 0
+		if isAnnex:
+			self.levels[0] = chr(self.levels[0]) 	# type: ignore
+		nr =  '.'.join([ str(x) for x in self.levels if x != 0 ]) + ' '
+		if isAnnex:
+			self.annex = ord(self.levels[0])		# type:ignore
+			nr = f'Annex {nr}'
+		else:
+			self.heading = self.levels[0]			# type:ignore
+		return nr
+class DocumentConfiguration(object):
+	"""	Per document configuration settings.
+	"""
+	def __init__(self, documentFileName:str) -> None:
+		self.documentFileName = documentFileName
+		self.configFileNameDef = f'{os.path.split(documentFileName)[0]}/config.ini'
+		self.configFileName = f'{os.path.splitext(documentFileName)[0]}.ini'
+		# print(self.configFileName)
+		try:
+			config = configparser.ConfigParser(	interpolation=configparser.ExtendedInterpolation(),
+												converters={'list': lambda x: [i.strip() for i in x.split(',')]},	# Convert csv to list
+											)
+			if len(config.read( [defaultConfigFile, self.configFileNameDef, self.configFileName])) == 0:
+				_print(f'[grey39]Configuration file missing or not readable for file: "{self.documentFileName}"')	
+				return
+			# print([defaultConfigFile, self.configFileName])
+		except configparser.Error as e:
+			_print('[red]Error in configuration file')	
+			raise e
+		#	General
+		self.replaceNbsp = config.get('general', 'replaceNbsp', fallback = None)
+		self.replaceLt = config.get('general', 'replaceLt', fallback = _entityLt)
+		self.renameEMFExtension = config.get('general', 'renameEMFExtension', fallback = None)
+		self.skipUnreferencedMediaFiles = config.getboolean('general', 'skipUnreferencedMediaFiles', fallback = False)
+		self.imageCaptions2AltText = config.getboolean('general', 'imageCaptions2AltText', fallback = True)
+		#	Paragraphs
+		self.paragraphs = { c : config.getlist('paragraphs', c)	# type: ignore [attr-defined]
+							for c in config['paragraphs'] }
+		self.normal = self.paragraphs['normal']
+		self.h1 = self.paragraphs['h1']
+		self.h2 = self.paragraphs['h2']
+		self.h3 = self.paragraphs['h3']
+		self.h4 = self.paragraphs['h4']
+		self.h5 = self.paragraphs['h5']
+		self.h6 = self.paragraphs['h6']
+		self.h7 = self.paragraphs['h7']
+		self.a1 = self.paragraphs['a1']
+		self.a2 = self.paragraphs['a2']
+		self.a3 = self.paragraphs['a3']
+		self.ol1 = self.paragraphs['ol1']
+		self.ol2 = self.paragraphs['ol2']
+		self.ul1 = self.paragraphs['ul1']
+		self.ul2 = self.paragraphs['ul2']
+		self.ul3 = self.paragraphs['ul3']
+		self.ul4 = self.paragraphs['ul4']
+		self.ul5 = self.paragraphs['ul5']
+		#self.continuedlist = self.paragraphs['continuedlist']
+		self.note = self.paragraphs['note']
+		self.example = self.paragraphs['example']
+		self.tablecaption = self.paragraphs['tablecaption']
+		self.imagecaption = self.paragraphs['imagecaption']
+		self.image = self.paragraphs['image']
+		self.ignore = self.paragraphs['ignore']
+		self.empty = self.paragraphs['empty']
+		#	TOC
+		self.addSectionNumbers = config.getboolean('toc', 'addSectionNumbers', fallback = False)
+		self.excludeFromNumbering = config.getlist('toc', 'excludeFromNumbering')	# type: ignore [attr-defined]
+		self.tocStartParagraph = config.get('toc', 'tocStartParagraph')
+		self.tocHeaderLevel = config.getint('toc', 'tocHeaderLevel')
+		self.addTocMacro = config.getboolean('toc', 'addTocMacro', fallback = False)
+		self.generateToc = config.getboolean('toc', 'generateToc', fallback = False)
+		# characters
+		self.characters = { int(c, 16) : binascii.unhexlify(config.get('characters', c)).decode('utf-8')	# type: ignore [attr-defined]
+							for c in config['characters'] }
+		# Media
+		self.emfConverterPng = config.get('media', 'emfConverterPng', fallback = None)
+		self.emfConverterSvg = config.get('media', 'emfConverterSvg', fallback = None)
+def processDocuments(documents:list[str], outDirectory:str) -> None:
+	docs:Dict[str, Tuple[Document, DocumentConfiguration]]		= {}
+	ptasks 														= {}
+	mediaRelations:Dict[str, str] 								= {}
+	addSectionNumbers 											= False
+	excludeFromNumbering:list[str]								= []
+	headers:list[Tuple[int, str]]								= []
+	emfFiles:list[str]											= []
+	referencedImages:list[str]									= []
+	global _print
+	with Progress(	TextColumn('[progress.description]{task.description}'),
+					BarColumn(),
+					TextColumn('[progress.percentage]{task.percentage:>3.0f}%'),
+					speed_estimate_period=2.0) as progress:
+		_print = progress.print		# Assign progress internal print to global print
+		_lastStyle = Style.none
+		def stopProgress(msg:str='') -> None:
+			progress.stop()
+			progress.remove_task(readTask)
+			_print(msg)
+		def iter_block_items(parent):
+			"""
+			Yield each paragraph and table child within *parent*, in document order.
+			Each returned value is an instance of either Table or Paragraph. *parent*
+			would most commonly be a reference to a main Document object, but
+			also works for a _Cell object, which itself can contain paragraphs and tables.
+			"""
+			if isinstance(parent, Document):
+				parent_elm = parent.element.body
+			elif isinstance(parent, _Cell):
+				parent_elm = parent._tc
+			else:
+				raise ValueError("something's not right")
+			for child in parent_elm.iterchildren():
+				if isinstance(child, CT_P):
+					yield Paragraph(child, parent)
+				elif isinstance(child, CT_Tbl):
+					yield Table(child, parent)
+				else:
+					# print(child.__class__.__name__)
+					...
+		def replaceNL(text:str, rpl:str = '') -> str:
+			return text.replace('\n', rpl)
+		def toMD(text:str) -> str:
+			return text.replace('<', docConfig.replaceLt)
+		#sectionNrs:list[int] = [ 0, 0, 0, 0]
+		sectionNrs = SectionNumbers()
+		def toHeader(style:str, text:str, level:int, numbering:bool = True, isAnnex:bool = False) -> list[str]:
+			nonlocal addSectionNumbers, excludeFromNumbering
+			if style in excludeFromNumbering:
+				numbering = False
+			nr = ''
+			if numbering and addSectionNumbers:
+				nr = sectionNrs.nextSectionNumber(level, isAnnex = isAnnex)
+			# Replace multiple white spaces
+			text = ' '.join(text.split())
+			# Remove linebreak in header lines
+			text = text.replace(_linebreak, ' ').strip()
+			# Store header
+			headers.append( (level, replaceNL(text)))
+			return [ '', f'{"#" * level} {nr}{replaceNL(text)}' if text else '' ]
+		def strippedTag(tag:str) -> str:
+			"""	Stripp the namespace from an element or attribute name.
+			"""
+			_, _, tag = tag.rpartition('}')
+			return tag
+		def getTextFromXML(elem:Paragraph) -> str:
+			#	Not-used document tags.
+			_ignoredTags = ( 'AlternateContent',
+							 'fldChar',
+							 'fldSimple',
+							 'instrText',
+							 'lastRenderedPageBreak',
+							 'noBreakHyphen',
+							 'pPr',
+							 'proofErr',
+							 'rPr',
+							 'moveFromRangeEnd',
+							 'ins',
+							 'del',
+							 'commentRangeStart',
+							 'commentRangeEnd',
+							 'commentReference',
+			)
+			def _parseXML(element:ET.Element) -> str:
+				"""	Recursively parse a document paragraph.
+				"""
+				nonlocal _ignoredTags
+				_result = ''
+				tag = strippedTag(element.tag)	# remove namespaces for easier handlings
+				match tag:
+					case 'p':
+						for x in element:
+							_result += _parseXML(x)
+					case 'r':
+						for x in element:
+							_result += _parseXML(x)
+					case 't':
+						_result += str(toMD(str(element.text)))
+					case 'br':
+						_result += _linebreak
+					case 'bookmarkStart' | 'bookmarkEnd':		# TODO ?
+						pass
+					case 'hyperlink':
+						# Hyperlinks and URLs
+						_hresult = ''
+						for x in element:
+							_hresult += _parseXML(x)
+						_result += f'[{_hresult}]({_hresult})'
+					case 'drawing':
+						# Get the rID of a media file from the element's XML
+						# and map to an extracted media file
+						# inspect(element)
+						# _print(element.items())
+						# _print(element.attrib)
+						blip = element.findall('ns1:inline/ns3:graphic/ns3:graphicData/ns4:pic/ns4:blipFill/ns3:blip', 
+												namespaces = { 
+													'ns1' : 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
+													'ns3' : 'http://schemas.openxmlformats.org/drawingml/2006/main',
+													'ns4' : 'http://schemas.openxmlformats.org/drawingml/2006/picture',
+												})
+						if blip and \
+							(rId := blip[0].attrib.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')) and \
+							(mediaFile := mediaRelations.get(rId)):
+							referencedImages.append(Path(mediaFile).stem)	# Add to referenced files
+							if docConfig.renameEMFExtension and mediaFile.lower().endswith('.emf'):
+								mediaFile = str(Path(mediaFile).with_suffix(f'.{docConfig.renameEMFExtension}'))
+								_print(f'[yellow]Renaming EMF file reference to "{mediaFile}"')
+							_result += f'![{_captionMarker}]({mediaFile})'
+						else:
+							_print(blip)
+					case 'pict':
+						# for e in element:
+						# 	print(f'----{e}')
+						_print(f'[yellow]unsupported pict element: {element}')
+						_result += f'<mark>unsupported pict element: {element}</mark>'
+					case 'object':
+						#inspect(element)
+						pass
+					case 'tab':
+						_result += '    '	# TODO nbsp?
+					case _ if tag in _ignoredTags:	# ignore
+						pass
+					case _:	# unknown
+						_print(tag)
+						_print(element)
+				return _result
+			#_print(ET.fromstring(elem._p.xml))
+			return _parseXML(ET.fromstring(elem._p.xml))
+		def checkSameStyle(style:Style, action:Optional[Callable] = None) -> None:
+			"""	Check whether the given style is the same as the last one. If no,
+				then execute the optional action.
+			"""
+			nonlocal _lastStyle
+			if style == _lastStyle:
+				return
+			_lastStyle = style
+			if action:
+				action()
+		# Preparing tasks for progress
+		readTask 	= progress.add_task(f'Reading document{"s" if len(documents) > 1 else ""}', total = len(documents))
+		#
+		#	Reading documents
+		#
+		for d in documents:
+			if not (dp := Path(d)).exists():
+				stopProgress(f'[red]Input document "{d}" does not esist')
+				return
+			if not dp.is_file():
+				stopProgress(f'[red]Input document "{d}" is not a file')
+				return
+			try:
+				docs[d] = (docx.Document(d), DocumentConfiguration(d))
+				ptasks[d] = progress.add_task(f'Processing {d}', total = 1000)
+				progress.update(readTask, advance=1)
+			except docx.opc.exceptions.PackageNotFoundError as e:
+				stopProgress(f'[red]Input document "{d}" is not a .docx file')
+				return
+			except Exception as e:
+				stopProgress(f'[red]Error reading file "{d}"')
+				console.print_exception()
+				return 
+		#
+		#	Processing Documents
+		#
+		for docFileName, (doc, docConfig) in docs.items():
+			processTask = ptasks[docFileName]
+			docItems = list(iter_block_items(doc))
+			addSectionNumbers = docConfig.addSectionNumbers
+			excludeFromNumbering = docConfig.excludeFromNumbering
+			paragraphNr = 0
+			# TODO
+			#(docConfig.replaceNbsp)
+			#	Create output directories
+			try:
+				os.makedirs(documentDirName := f'{outDirectory}{os.sep}{Path(docFileName).stem}', exist_ok = True)
+				os.makedirs(imageDirName := f'{documentDirName}{os.sep}{imagesSubDir}', exist_ok = True)
+				os.makedirs(unreferencedDirName := f'{documentDirName}{os.sep}{unreferencedSubDir}', exist_ok = True)
+			except Exception as e:
+				stopProgress(f'[red]Error creating output directory "{d}"')
+				console.print_exception()
+				return
+			# Add sub-progress task
+			progress.update(processTask, total = len(docItems) + 5)	# + relations + image extraction + characters + toc + media convert
+			#	Extract the media relations file, and get the mappings from document IDs to media files
+			mediaRelations = {}
+			with open(docFileName, 'rb') as docfile:
+				zip = zipfile.ZipFile(docfile)
+				for z in zip.filelist:
+					if z.filename == 'word/_rels/document.xml.rels':
+						xml = ET.fromstring(zip.read(z.filename))
+						for element in xml:
+							if strippedTag(element.tag) == 'Relationship':
+								if (_a := element.attrib.get('Type')) and _a.endswith('/image'):	# Only image relationships
+									mediaRelations[element.attrib['Id']] = element.attrib['Target']
+						break
+				else:
+					_print('[red]Media relations file not found in document')
+					return
+			progress.update(processTask, advance = 1)	# progress update
+			#	Extracting images for the document next 
+			with open(docFileName, 'rb') as docfile:
+				zip = zipfile.ZipFile(docfile)
+				for zipMediaFilename in [z.filename 
+										 for z in zip.filelist 
+										 if z.filename.startswith('word/media/')]:
+					fn = f'{imageDirName}{os.sep}{os.path.basename(zipMediaFilename)}'
+					if fn.lower().endswith(('.emf', '.wmf')):
+						_print(f'[yellow]unsupported media file: {fn}', highlight = False)
+						emfFiles.append(fn)
+					with open(f'{fn}', 'wb') as imgFile:
+						imgFile.write(zip.read(zipMediaFilename))
+			progress.update(processTask, advance = 1)	# progress update
+			# 	Processing the document			
+			lines:list[str] = []
+			imageIndex = 1
+			for elem in docItems:
+				paragraphNr += 1
+				progress.update(processTask, advance = 1)
+				match type(elem).__name__:
+					case 'Paragraph':
+						text = getTextFromXML(elem)
+						style = elem.style.name.lower()
+						# print(f'{style} {text}')
+						#	Normal, body text
+						if style in docConfig.normal:
+							checkSameStyle(Style.normal, lambda:lines.append(''))
+							lines.append(text)
+							lines.append('')	# Add empty line 
+						#	Headers
+						elif style in docConfig.h1:
+							lines.extend(toHeader(style, text, 1))
+						elif style in docConfig.h2:
+							lines.extend(toHeader(style, text, 2))
+						elif style in docConfig.h3:
+							lines.extend(toHeader(style, text, 3))
+						elif style in docConfig.h4:
+							lines.extend(toHeader(style, text, 4))
+						elif style in docConfig.h5:
+							lines.extend(toHeader(style, text, 5))
+						elif style in docConfig.h6:
+							lines.extend(toHeader(style, text, 6))
+						elif style in docConfig.h7:
+							lines.extend(toHeader(style, text, 7))
+						#	Annexes
+						elif style in docConfig.a1:
+							lines.extend(toHeader(style, text, 1, isAnnex = True))
+						elif style in docConfig.a2:
+							lines.extend(toHeader(style, text, 2, isAnnex = True))
+						elif style in docConfig.a3:
+							lines.extend(toHeader(style, text, 3, isAnnex = True))
+						#	Ordered Lists
+						elif style in docConfig.ol1:
+							checkSameStyle(Style.orderedlist2, lambda:lines.append(''))
+							if len(elem.text):	# ignore empty
+								lines.append(f'1. {text}')
+						elif style in docConfig.ol2:
+							checkSameStyle(Style.orderedlist2, lambda:lines.append(''))
+							if len(elem.text):	# ignore empty
+								lines.append(f'     1. {text}')
+						#	Unordered Lists 
+						elif style in docConfig.ul1:
+							checkSameStyle(Style.unorderedlist, lambda:lines.append(''))
+							if len(elem.text):	# ignore empty
+								lines.append(f'- {text}')
+						elif style in docConfig.ul2:
+							checkSameStyle(Style.unorderedlist2, lambda:lines.append(''))
+							if len(elem.text):	# ignore empty
+								lines.append(f'{"    "*1}- {text}')
+						elif style in docConfig.ul3:
+							checkSameStyle(Style.unorderedlist3, lambda:lines.append(''))
+							if len(elem.text):	# ignore empty
+								lines.append(f'{"    "*2}- {text}')
+						elif style in docConfig.ul4:
+							checkSameStyle(Style.unorderedlist4, lambda:lines.append(''))
+							if len(elem.text):	# ignore empty
+								lines.append(f'{"    "*3}- {text}')
+						elif style in docConfig.ul5:
+							checkSameStyle(Style.unorderedlist5, lambda:lines.append(''))
+							if len(elem.text):	# ignore empty
+								lines.append(f'{"    "*4}- {text}')
+						#	Table Caption
+						elif style in docConfig.tablecaption:
+							lines.append('')
+							lines.append(f'**{replaceNL(text).strip()}**')
+						#	Image Caption
+						elif style in docConfig.imagecaption:
+							checkSameStyle(Style.imagecaption, lambda:lines.append(''))
+							_t = replaceNL(text).strip()
+							lines.append(f'**{_t}**')
+							lines.append('')
+							if docConfig.imageCaptions2AltText:
+								# Search and replace the previous image reference (max 10 lines back-search TODO configurable)
+								for idx in range(len(lines)-1, len(lines)-11, -1):
+									if _captionMarker in lines[idx]:
+										lines[idx] = lines[idx].replace(_captionMarker, _t)
+						#	Image & Figure
+						elif style in docConfig.image:
+							lines.append('')
+							lines.append(text)
+						#	Example
+						elif style in docConfig.example:
+							checkSameStyle(Style.example, lambda:lines.append(''))
+							# Replace linebreaks
+							for _t in text.split(_linebreak):
+								lines.append(f'`{_t if _t else " "}`  ') # at least an empty space. And 2 spaces at the end for newline
+						#	Notes
+						elif style in docConfig.note:
+							checkSameStyle(Style.note)
+							lines.append(f'> {text}')
+						# 	Add TOC
+						elif style in docConfig.tocStartParagraph:
+							lines.extend(toHeader(style, elem.text, docConfig.tocHeaderLevel, numbering = False))
+							if docConfig.addTocMacro:
+								lines.append('[toc]')
+							if docConfig.generateToc:
+								lines.append(_tocInsertPoint)
+						# 	Ignore & empty
+						elif style in docConfig.ignore:
+							pass
+						elif style in docConfig.empty:
+							lines.append('')
+						# Print Unhandled tokens also to the console
+						else:
+							_print(f'{paragraphNr} {style}: {elem.style}: {text}')
+							lines.append(text)
+					case 'Table':
+						rows:list[list[str]] = []
+						nrRows = 0
+						for row in elem.rows:
+							cells:list[str] = []
+							for cell in row.cells:
+								cells.append(f'{toMD(cell.text)} ')	# add at least a space
+							rows.append(cells)
+							nrRows += 1
+						# Warning if this is a single-row table
+						if nrRows == 1:
+							_print(f'[red]Single-row table found. Consider replacing it in the original document:\n{rows[0]}')
+						lines.append('')	# Add an empty line before a table
+						for idx, row in enumerate(rows):
+							if idx == 1:
+								lines.append('-'.join('|' * (len(row) + 1) ))
+							lines.append(f'|{"|".join(row)}|'
+										 .replace('\n', _linebreak))	# replace line breaks in cells
+						lines.append('')	# Add another empty line after a table
+					case _:
+						_print('[blue] {type(elem).__name__}')
+			#
+			#	Replace non-ascii characters
+			#
+			progress.update(processTask, advance = 1)	# progress update
+			for i in range(len(lines)):
+				line = lines[i]
+				for ch in line:
+					if not ch.isascii():
+						if (_ch := ord(ch)) in docConfig.characters:
+							if (rch := docConfig.characters[_ch]) == chr(0):
+								rch = ''
+							# line = line.replace(ch, docConfig.characters[_ch])	# we need the line for further replacements
+							line = line.replace(ch, rch)	# we need the line for further replacements
+							lines[i] = line
+						else:
+							_print(f'[yellow]Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}')
+			#
+			#	Insert auto-genrated table of contents
+			#
+			progress.update(processTask, advance = 1)	# progress update
+			if docConfig.generateToc:
+				toc = ''
+				for l, t in headers:
+					link = t
+					# Convert to 
+					# All text is converted to lowercase.
+					link = link.lower()	
+					# All non-word text (e.g., punctuation (except: -()), HTML) is removed. Some characters are converted to upper-case hex.
+					# TODO decide / configurable how to replace special characters in links- Depends on rendering? "Markdown 2" (mac) likes to include
+					# link = ''.join( c if c not in '():' else f'{ord(c):x}'.upper()
+					link = ''.join( c if c not in '():' else ''#f'{ord(c):x}'.upper()
+									for c in link
+									if c.isspace() or c.isalnum() or c in '-():')
+					# All spaces are converted to hyphens.
+					link = ''.join( '-' if c.isspace() else c 
+										for c in link)
+					# Two or more hyphens in a row are converted to one.
+					link = '-'.join(link.split())
+					# TODO If a header with the same ID has already been generated, a unique incrementing number is appended, starting at 1.
+					# Add to toc
+					toc += f'{_nbsp * 4 * (l - 1)}[{t}](#{link})  \n'
+				for i in range(len(lines)):
+					line = lines[i]
+					if line == _tocInsertPoint:
+						lines[i] = toc
+						# continue when found, perhaps we want to have more than one toc?
+			#
+			#	Map internal references
+			#
+			_definitionExpression = re.compile(r'^[`]?\[([\d]+|i.[\d]+)\]([^`]*)[`]?')
+			_referenceExpression = re.compile(r'([^>])\[([\d]+|i.[\d]+)\](?!</a>)')
+			for i in range(len(lines)):
+				line = lines[i]
+				if (m := _definitionExpression.match(line)) is not None:
+					lines[i] = f'- <a name="_ref_{m.group(1)}"[{m.group(1)}]">[{m.group(1)}]</a>{m.group(2)}'
+			def _repl(m:re.Match) -> str|None:
+				if m.group(1) == '"':
+					return None
+				return f'{m.group(1)}<a href="#_ref_{m.group(2)}">[{m.group(2)}]</a>'
+			for i in range(len(lines)):
+				line = lines[i]
+				lines[i] = re.sub(_referenceExpression, _repl, line)
+			#
+			#	Write produced Markdown file
+			#
+			with open(f'{documentDirName}{os.sep}{Path(d).stem}.md', 'w') as file:
+				file.write('\n'.join(lines))
+			#
+			#	Convert media files
+			#
+			def _convertImage(converter:str, format:str):
+				if converter:
+					for fn in list(emfFiles):
+						_f = Path(fn)		# Filename to handle
+						_t = imageDirName	# Target directory
+						if _f.stem not in referencedImages:
+							if not fn.startswith(unreferencedDirName):
+								_print(f'[red]Unreferenced image: {PurePath(fn).name} {"(skipped)" if docConfig.skipUnreferencedMediaFiles else ""}', highlight = False)
+								_print(f'[yellow]Moving image file to: {unreferencedDirName}', highlight = False)
+								_n = f'{unreferencedDirName}/{_f.name}'
+								_p = _f.replace(_n)
+								emfFiles.remove(fn)
+								emfFiles.append(_n)
+								fn = _n
+							if docConfig.skipUnreferencedMediaFiles:
+								continue
+							_t = unreferencedDirName
+						cmd = converter
+						cmd = cmd.replace('{infile}', fn).replace('{outdir}', _t)
+						_print(f'Converting EMF file: {fn} to "{format}"', highlight = False)
+						if (res := subprocess.run(cmd, shell = True, capture_output = True)).returncode != 0:
+							_print(f'[red] Error running command: {res.stderr.decode("utf-8")}')
+			if docConfig.emfConverterPng:
+				_convertImage(docConfig.emfConverterPng, 'png')
+			if docConfig.emfConverterSvg:
+				_convertImage(docConfig.emfConverterSvg, 'svg')
+			emfFiles.clear()
+			referencedImages.clear()
+			progress.update(processTask, advance = 1)	# progress update
+		progress.stop()
+if __name__ == '__main__':
+	# Parse command line arguments
+	parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+	parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default = 'out', metavar = '<output directory>',  help = 'specify output directory')
+	parser.add_argument('document', nargs = '+', help = 'documents to parse')
+	args = parser.parse_args()
+		# Process documents and print output
+	os.makedirs(args.outDirectory, exist_ok = True)
+	processDocuments(sorted(args.document), args.outDirectory)