#
#	spec2md.py
#
#	Script to convert oneM2M specs in docx / openXML to markdown.
#
#	(c) 2022 by Andreas Kraft
#	License: BSD 3-Clause License. See the LICENSE file for further details.
#


from enum import IntEnum, auto
from typing import Callable, Tuple, Dict, Optional, Any

from pathlib import Path, PurePath
from docx.document import Document
from docx.text.paragraph import Paragraph
from docx.package import Package
import docx.opc.exceptions
from docx.table import _Cell, Table
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
import argparse, os, binascii, re, subprocess
from pathlib import Path
from rich.console import Console
from rich.progress import Progress, TextColumn, BarColumn
from rich.console import Console
from rich import inspect
import configparser, zipfile
from lxml import etree as ET

class Style(IntEnum):
	"""	Kinds of Markdown blocks that the converter keeps track of.

		The converter remembers the style of the block it emitted last in order
		to detect when the style changes from one paragraph to the next.
		The numeric values carry no meaning beyond identifying the member.
	"""

	# Preformatted blocks
	code = 1
	example = 2

	# Figures
	image = 3
	imagecaption = 4

	# Initial state, i.e. nothing has been emitted yet
	none = 5

	# Running text
	normal = 6
	note = 7

	# Numbered lists, by nesting level
	orderedlist = 8
	orderedlist2 = 9

	# Bullet lists, by nesting level
	unorderedlist = 10
	unorderedlist2 = 11
	unorderedlist3 = 12
	unorderedlist4 = 13
	unorderedlist5 = 14

	# TODO more styles


# Name of the configuration file that is looked up relative to the current
# working directory. It is the first file in the list of configuration files
# read by DocumentConfiguration.
defaultConfigFile = 'config.ini'
# Sub-directory (below a document's output directory) for the extracted media files
imagesSubDir = 'media'
# Sub-directory (below a document's output directory) to which EMF/WMF media files
# are moved when they are not referenced in the document
unreferencedSubDir = 'unreferenced'

# special characters
# Markup emitted for a line break element
_linebreak = '<br />'
# Default replacement for '<' in text (configurable via "replaceLt")
_entityLt = '&lt;'
# Non-breaking space entity, used to indent the generated table of contents
_nbsp = '&nbsp;'
# Placeholder line that is replaced by the generated table of contents
_tocInsertPoint = '~~t~o~c~~'
# Placeholder alt text of an image reference, replaced later by the image's caption
_captionMarker = '~~CAPTION~~'


# https://learn.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing?view=openxml-2.8.1

# NOTE Crash when extracting tables
# Until a fix from the python-docx package is available, parsing tables might crash.
# Workaround: in python-docx's table.py, around line 173, change the lines in _cells() to:
#
#	...
#	if tc.vMerge == ST_Merge.CONTINUE:
#		if len(cells) >= col_count:         # <--
#			cells.append(cells[-col_count])
#	..

# TODO regard addTOCMacro configuration
# TODO move -section switch to configuration
# TODO Support internal links
# TODO at least mark unsupported objects and images?

#	Rich console for pretty printing
console = Console()
#	Print function used for all messages. It defaults to the built-in print and is
#	re-assigned to the progress display's print function while documents are processed.
_print:Callable = print

# Some predefined tags and attributes
#	Namespace URI of the WordprocessingML main schema
wns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
#	Namespace-qualified name of the "val" attribute, i.e. '{<wns>}val'
_val = f'{{{wns}}}val'

class SectionNumbers:
	"""	Generator for hierarchical section numbers of headings and annexes.

		Headings are numbered "1", "1.1", "1.2", "2", ... while annexes are
		labelled "Annex A", "Annex A.1", "Annex B", ...
		The first-level counters for headings and annexes are kept separately, 
		all sub-level counters are shared.
	"""

	def __init__(self) -> None:
		# Counter per nesting level. Index 0 is the top level. All entries are
		# always integers, also while annexes are numbered.
		self.levels:list[int] = [ 0 ] * 10
		# Last used top-level heading number
		self.heading:int = 0
		# Code point of the last used annex letter. 64 is the character before 'A'.
		self.annex:int = 64


	def nextSectionNumber(self, level:int, isAnnex:Optional[bool] = False) -> str:
		"""	Advance the counter of a nesting level and return the new section number.

			All counters below `level` are reset to 0. Levels with a counter of 0 
			are left out of the result.

			Args:
				level: Nesting level of the section, starting at 1.
				isAnnex: If true, then the first level is rendered as a letter and the result is prefixed with "Annex ".

			Return:
				The section number, followed by a single space. 
		"""
		# Continue with the top-level counter of the requested numbering scheme
		self.levels[0] = self.annex if isAnnex else self.heading

		# Increment the appropriate level and reset all deeper levels
		self.levels[level - 1] += 1
		for i in range(level, len(self.levels)):
			self.levels[i] = 0

		if isAnnex:
			# The first level is always present and rendered as a letter
			parts = [ chr(self.levels[0]) ] + [ str(x) for x in self.levels[1:] if x != 0 ]
			self.annex = self.levels[0]
			return f'Annex {".".join(parts)} '

		self.heading = self.levels[0]
		return '.'.join(str(x) for x in self.levels if x != 0) + ' '



class DocumentConfiguration(object):
	"""	Per document configuration settings.

		The settings are read from up to three ini files, in this order: 
		
		- the default configuration file in the current working directory,
		- the default configuration file in the directory of the document, and 
		- an ini file with the same path and base name as the document.
		
		Files that do not exist are skipped. Values from files read later
		take precedence.
	"""


	def __init__(self, documentFileName:str) -> None:
		"""	Read the configuration for a document.

			If none of the configuration files could be read then a message is
			printed and the initialization stops early. In this case only the 
			file name attributes are set on the instance.

			Args:
				documentFileName: Path of the document to which the configuration belongs.

			Raises:
				configparser.Error: If a configuration file cannot be parsed, or a
					mandatory option of the "toc" section is missing.
				KeyError: If the "paragraphs" or "characters" section, or a mandatory
					entry of the "paragraphs" section is missing.
				ValueError: If an entry in the "characters" section is not valid hexadecimal.
		"""
		self.documentFileName = documentFileName
		# Use os.path.join, so that a document name without a directory part leads
		# to a relative path, and not to a file in the root directory of the file system.
		self.configFileNameDef = os.path.join(os.path.dirname(documentFileName), defaultConfigFile)
		self.configFileName = f'{os.path.splitext(documentFileName)[0]}.ini'

		try:
			config = configparser.ConfigParser(	interpolation=configparser.ExtendedInterpolation(),
												converters={'list': lambda x: [i.strip() for i in x.split(',')]},	# Convert csv to list
											)
			if len(config.read( [defaultConfigFile, self.configFileNameDef, self.configFileName])) == 0:
				_print(f'[grey39]Configuration file missing or not readable for file: "{self.documentFileName}"')	
				return
		except configparser.Error:
			_print('[red]Error in configuration file')	
			raise

		#	General
		self.replaceNbsp = config.get('general', 'replaceNbsp', fallback = None)
		self.replaceLt = config.get('general', 'replaceLt', fallback = _entityLt)
		self.renameEMFExtension = config.get('general', 'renameEMFExtension', fallback = None)
		self.skipUnreferencedMediaFiles = config.getboolean('general', 'skipUnreferencedMediaFiles', fallback = False)
		self.imageCaptions2AltText = config.getboolean('general', 'imageCaptions2AltText', fallback = True)
		self.combineCodeParagraphs = config.getboolean('general', 'combineCodeParagraphs', fallback = True)

		#	Paragraphs
		#	Each entry maps a kind of Markdown block to a list of names of the 
		#	document's paragraph styles.
		self.paragraphs = { c : config.getlist('paragraphs', c)	# type: ignore [attr-defined]
							for c in config['paragraphs'] }
		self.normal = self.paragraphs['normal']
		#	Headings
		self.h1 = self.paragraphs['h1']
		self.h2 = self.paragraphs['h2']
		self.h3 = self.paragraphs['h3']
		self.h4 = self.paragraphs['h4']
		self.h5 = self.paragraphs['h5']
		self.h6 = self.paragraphs['h6']
		self.h7 = self.paragraphs['h7']
		self.h8 = self.paragraphs['h8']
		self.h9 = self.paragraphs['h9']
		#	Annex headings
		self.a1 = self.paragraphs['a1']
		self.a2 = self.paragraphs['a2']
		self.a3 = self.paragraphs['a3']
		#	Ordered and unordered lists
		self.ol1 = self.paragraphs['ol1']
		self.ol2 = self.paragraphs['ol2']
		self.ul1 = self.paragraphs['ul1']
		self.ul2 = self.paragraphs['ul2']
		self.ul3 = self.paragraphs['ul3']
		self.ul4 = self.paragraphs['ul4']
		self.ul5 = self.paragraphs['ul5']
		#self.continuedlist = self.paragraphs['continuedlist']
		#	Other blocks
		self.code = self.paragraphs['code']
		self.note = self.paragraphs['note']
		self.example = self.paragraphs['example']
		self.tablecaption = self.paragraphs['tablecaption']
		self.imagecaption = self.paragraphs['imagecaption']
		self.image = self.paragraphs['image']
		self.ignore = self.paragraphs['ignore']
		self.empty = self.paragraphs['empty']

		#	TOC
		self.addSectionNumbers = config.getboolean('toc', 'addSectionNumbers', fallback = False)
		self.excludeFromNumbering = config.getlist('toc', 'excludeFromNumbering')	# type: ignore [attr-defined]
		self.tocStartParagraph = config.get('toc', 'tocStartParagraph')
		self.tocHeaderLevel = config.getint('toc', 'tocHeaderLevel')
		self.addTocMacro = config.getboolean('toc', 'addTocMacro', fallback = False)
		self.generateToc = config.getboolean('toc', 'generateToc', fallback = False)

		#	Characters
		#	Maps a character code (the option name, in hex) to its replacement. 
		self.characters:dict[int, str] = {}
		for c, v in config['characters'].items():
			if v.startswith('&'):
				# HTML entity, taken as it is
				self.characters[int(c, 16)] = v
			else:
				# Unicode character, given as the hex representation of its UTF-8 encoding
				self.characters[int(c, 16)] = binascii.unhexlify(v).decode('utf-8')

		# Media & Converter
		self.emfConverterPng = config.get('media', 'emfConverterPng', fallback = None)
		self.emfConverterSvg = config.get('media', 'emfConverterSvg', fallback = None)




def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:bool) -> None:
	docs:Dict[str, Tuple[Document, DocumentConfiguration, Any]]		= {}
	ptasks 															= {}
	mediaRelations:Dict[str, str] 									= {}
	addSectionNumbers 												= False
	excludeFromNumbering:list[str]									= []
	headers:list[Tuple[int, str]]									= []
	emfFiles:list[str]												= []
	referencedImages:list[str]										= []
	footnotes:dict[str, str]										= {}

	global _print
	
	with Progress(	TextColumn('[progress.description]{task.description}'),
					BarColumn(),
					TextColumn('[progress.percentage]{task.percentage:>3.0f}%'),
					speed_estimate_period=2.0) as progress:
		
		
		_print = progress.print		# Assign progress internal print to global print
		_lastStyle = Style.none

		
		def stopProgress(msg:str='') -> None:
			progress.stop()
			progress.remove_task(readTask)
			_print(msg)


		def iter_block_items(parent):
			"""
			Yield each paragraph and table child within *parent*, in document order.
			Each returned value is an instance of either Table or Paragraph. *parent*
			would most commonly be a reference to a main Document object, but
			also works for a _Cell object, which itself can contain paragraphs and tables.
			"""
			if isinstance(parent, Document):
				parent_elm = parent.element.body
			elif isinstance(parent, _Cell):
				parent_elm = parent._tc
			else:
				raise ValueError("something's not right")

			for child in parent_elm.iterchildren():
				if isinstance(child, CT_P):
					yield Paragraph(child, parent)
				elif isinstance(child, CT_Tbl):
					yield Table(child, parent)
				else:
					# print(child.__class__.__name__)
					...

		def replaceNL(text:str, rpl:str = '') -> str:
			return text.replace('\n', rpl)

		
		def toMD(text:str) -> str:
			return text.replace('<', docConfig.replaceLt)


		#sectionNrs:list[int] = [ 0, 0, 0, 0]
		sectionNrs = SectionNumbers()

		def toHeader(style:str, text:str, level:int, numbering:bool = True, isAnnex:bool = False) -> list[str]:
			nonlocal addSectionNumbers, excludeFromNumbering

			if style in excludeFromNumbering:
				numbering = False

			nr = ''
			if numbering and addSectionNumbers:
				nr = sectionNrs.nextSectionNumber(level, isAnnex = isAnnex)
			
			# Replace multiple white spaces
			text = ' '.join(text.split())
			
			# Remove linebreak in header lines
			text = text.replace(_linebreak, ' ').strip()

			# Store header
			headers.append( (level, replaceNL(text)))

			
			return [ '', f'{"#" * level} {nr}{replaceNL(text)}' if text else '' ]


		def strippedTag(tag:str) -> str:
			"""	Stripp the namespace from an element or attribute name.
			"""
			_, _, tag = tag.rpartition('}')
			return tag


		def getTextFromXML(elem:Paragraph|_Cell|ET._Element) -> str:

			#	Not-used document tags.
			_ignoredTags = ( 'AlternateContent',
							 'fldChar',
							 'fldSimple',
							 'instrText',
							 'lastRenderedPageBreak',
							 'noBreakHyphen',
							 'pPr',
							 'proofErr',
							 'rPr',
							 'moveFromRangeEnd',
							 'ins',
							 'del',
							 'commentRangeStart',
							 'commentRangeEnd',
							 'commentReference',
							 'smartTag',
							 'footnoteRef',
			)
			

			def _parseXML(element:ET.Element, inCell:Optional[bool] = False) -> str:
				"""	Recursively parse a document paragraph.
				"""
				nonlocal _ignoredTags

				_result = ''
				tag = strippedTag(element.tag)	# remove namespaces for easier handlings
				match tag:
					case 'p':
						for x in element:
							_result += _parseXML(x, inCell)
					case 'r':
						for x in element:
							_result += _parseXML(x, inCell)
					case 't':
						_bold = ''
						_italics = ''
						for e in element.getparent():
							if strippedTag(e.tag) == 'rPr':	# paragraph style
								for ep in e:
									match strippedTag(ep.tag):
										case 'b' if ep.attrib.get(_val, 'true') == 'true':
											_bold = '**'
										case 'i' if ep.attrib.get(_val, 'true') == 'true':
											_italics = '_'
										# case _:
										# 	_print(f'[yellow]unsupported style: {ep.tag}')
						
						# Strip white spaces if bold or italics
						_s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text)))
						# Replace single * or _
						_s = _s.replace('_', '\\_')
						_s = _s.replace('*', '\\*')
						# Add trailing white space when bold or italics
						_postfix = ' ' if _bold or _italics else ''
						_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_postfix}'
						# print(_result)

					case 'br':
						_result += _linebreak
						
					case 'bookmarkStart' | 'bookmarkEnd':		# TODO ?
						pass

					case 'hyperlink':
						# Hyperlinks and URLs
						_hresult = ''
						for x in element:
							_hresult += _parseXML(x, inCell)
						_result += f'[{_hresult}]({_hresult})'

					case 'drawing':
						# Get the rID of a media file from the element's XML
						# and map to an extracted media file
						# inspect(element)
						# _print(element.items())
						# _print(element.attrib)
						blip = element.findall('ns1:inline/ns3:graphic/ns3:graphicData/ns4:pic/ns4:blipFill/ns3:blip', 
												namespaces = { 
													'ns1' : 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
													'ns3' : 'http://schemas.openxmlformats.org/drawingml/2006/main',
													'ns4' : 'http://schemas.openxmlformats.org/drawingml/2006/picture',
												})
						if blip and \
							(rId := blip[0].attrib.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')) and \
							(mediaFile := mediaRelations.get(rId)):
							mediaFilePath = Path(mediaFile)
							referencedImages.append(mediaFilePath.stem)	# Add to referenced files
							if docConfig.renameEMFExtension and mediaFilePath.suffix.lower() == '.emf':
								mediaFilePath = mediaFilePath.with_suffix(f'.{docConfig.renameEMFExtension}')
								_print(f'[yellow]Renaming EMF file reference to "{str(mediaFilePath)}"')
							_result += f'![{_captionMarker}]({mediaFilePath.as_posix()})'	# image reference as posix path
						# else:
						# 	_print(blip)

					case 'pict':
						# for e in element:
						# 	print(f'----{e}')
						_print(f'[yellow]unsupported pict element: {element}')
						_result += f'<mark>unsupported pict element: {element}</mark>'

					case 'object':
						#inspect(element)
						pass

					case 'tab':
						_result += '    '	# TODO nbsp?
					
					case 'softHyphen':
						pass	# ignore a soft hyphen character which has no meaning in Markdown and zero-width 
					
					case 'sym':

						def _symError(ch:int) -> None:
							nonlocal _result
							_symError = f'unknown font+symbol: {element.attrib["{"+wns+"}font"]} - "{element.attrib["{"+wns+"}char"]} ({ch})"'
							_print(f'[yellow]{_symError}')
							_result += f'<mark>{_symError}</mark>'

						try:
							_ch = 0
							_ch = int(element.attrib["{"+wns+"}char"], 16)
							if _ch in docConfig.characters:
								if (rch := docConfig.characters[_ch]) == chr(0):
									rch = ''
								_result += rch
							else:
								_symError(_ch)
						except ValueError as e:
							_symError(_ch)

					# ignore deleted test
					case 'del':
						pass

					# try to keep the text of inserted text
					case 'ins':
						for x in element:
							_result += _parseXML(x)
					
					case 'footnoteReference':
						id = element.attrib[f'{{{wns}}}id']
						_result += f'[^{id}]'
						footnotes[id] = '<mark>unknown footnote</mark>'
					
					# The footnote itself is not included in the document but in a separate file.
					# Therefore, we need to extract the footnote from the footnotes.xml file. The format
					# of the footnote is the same as a paragraph.
					case 'footnote':
						for x in element:
							_result += _parseXML(x)
					
					case _ if tag in _ignoredTags:	# ignore
						pass
					
					case _:	# unknown
						_print(tag)
						_print(element)
				return _result

			# _print(ET.fromstring(elem._p.xml))
			match elem:
				case Paragraph():	# type: ignore[misc]
					return _parseXML(ET.fromstring(elem._p.xml))
				case _Cell():		# type: ignore[misc]
					# Iterate over all paragraphs in the cell and parse them
					# Create a list of parsed paragraphs and join them with linebreaks
					return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip() 
										   for p in elem.paragraphs ])
				case ET._Element():
					return _parseXML(elem)
				case _:
					return ''



		def checkSameStyle(style:Style, action:Optional[Callable] = None) -> None:
			"""	Check whether the given style is the same as the last one. If no,
				then execute the optional action.
			"""
			nonlocal _lastStyle
			if style == _lastStyle:
				return
			_lastStyle = style
			if action:
				action()


		# Preparing tasks for progress
		readTask 	= progress.add_task(f'Reading document{"s" if len(documents) > 1 else ""}', total = len(documents))

		#
		#	Reading documents
		#

		for d in documents:
			if not (dp := Path(d)).exists():
				stopProgress(f'[red]Input document "{d}" does not esist')
				return
			if not dp.is_file():
				stopProgress(f'[red]Input document "{d}" is not a file')
				return
			try:
				# Search for footnotes in the document XML
				footnotesPart = None
				for part in Package.open(d).parts:
					if part.partname.endswith('/footnotes.xml'):
						footnotesPart = part
				docs[d] = (docx.Document(d), DocumentConfiguration(d), footnotesPart)
				ptasks[d] = progress.add_task(f'Processing {d}', total = None)
				progress.update(readTask, advance=1)
			except docx.opc.exceptions.PackageNotFoundError as e:
				stopProgress(f'[red]Input document "{d}" is not a .docx file')
				return
			except Exception as e:
				stopProgress(f'[red]Error reading file "{d}"')
				console.print_exception()
				return 

		#
		#	Processing Documents
		#

		for docFileName, (doc, docConfig, footnotesPart) in docs.items():
			processTask = ptasks[docFileName]
			docItems = list(iter_block_items(doc))
			addSectionNumbers = docConfig.addSectionNumbers
			excludeFromNumbering = docConfig.excludeFromNumbering

			paragraphNr = 0

			# TODO
			#(docConfig.replaceNbsp)

			#	Create output directories
			try:
				os.makedirs(documentDirName := f'{outDirectory}{os.sep}{Path(docFileName).stem}', exist_ok = True)
				os.makedirs(imageDirName := f'{documentDirName}{os.sep}{imagesSubDir}', exist_ok = True)
				os.makedirs(unreferencedDirName := f'{documentDirName}{os.sep}{unreferencedSubDir}', exist_ok = True)
			except Exception as e:
				stopProgress(f'[red]Error creating output directory "{d}"')
				console.print_exception()
				return

			# Add sub-progress task
			progress.update(processTask, total = len(docItems) + 6)	# + relations + image extraction + characters + toc + footnotes + media convert


			#	Extract the media relations file, and get the mappings from document IDs to media files
			mediaRelations = {}
			with open(docFileName, 'rb') as docfile:
				zip = zipfile.ZipFile(docfile)
				for z in zip.filelist:
					if z.filename == 'word/_rels/document.xml.rels':
						xml = ET.fromstring(zip.read(z.filename))
						for element in xml:
							if strippedTag(element.tag) == 'Relationship':
								if (_a := element.attrib.get('Type')) and _a.endswith('/image'):	# Only image relationships
									mediaRelations[element.attrib['Id']] = element.attrib['Target']
						break
				else:
					_print('[red]Media relations file not found in document')
					return
			progress.update(processTask, advance = 1)	# progress update


			#	Extracting images for the document next 
			with open(docFileName, 'rb') as docfile:
				zip = zipfile.ZipFile(docfile)
				for zipMediaFilename in [z.filename 
										 for z in zip.filelist 
										 if z.filename.startswith('word/media/')]:
					
					fn = f'{imageDirName}{os.sep}{os.path.basename(zipMediaFilename)}'
					if fn.lower().endswith(('.emf', '.wmf')):
						_print(f'[yellow]unsupported media file: {fn}', highlight = False)
						emfFiles.append(fn)
					with open(f'{fn}', 'wb') as imgFile:
						imgFile.write(zip.read(zipMediaFilename))
			progress.update(processTask, advance = 1)	# progress update

			# 	Processing the document			
			lines:list[str] = []
			imageIndex = 1

			for elem in docItems:
				paragraphNr += 1
				progress.update(processTask, advance = 1)
				match type(elem).__name__:
					case 'Paragraph':
						text = getTextFromXML(elem)
						style = elem.style.name.lower()
						# print(f'{style} {text}')

						#	Normal, body text
						if style in docConfig.normal:
							checkSameStyle(Style.normal, lambda:lines.append(''))
							lines.append(text)
							lines.append('')	# Add empty line 
							
						#	Headers
						elif style in docConfig.h1:
							lines.extend(toHeader(style, text, 1))
						elif style in docConfig.h2:
							lines.extend(toHeader(style, text, 2))
						elif style in docConfig.h3:
							lines.extend(toHeader(style, text, 3))
						elif style in docConfig.h4:
							lines.extend(toHeader(style, text, 4))
						elif style in docConfig.h5:
							lines.extend(toHeader(style, text, 5))
						elif style in docConfig.h6:
							lines.extend(toHeader(style, text, 6))
						elif style in docConfig.h7:
							lines.extend(toHeader(style, text, 7))
						elif style in docConfig.h8:
							lines.extend(toHeader(style, text, 8))
						elif style in docConfig.h9:
							lines.extend(toHeader(style, text, 9))

						#	Annexes
						elif style in docConfig.a1:
							lines.extend(toHeader(style, text, 1, isAnnex = True))
						elif style in docConfig.a2:
							lines.extend(toHeader(style, text, 2, isAnnex = True))
						elif style in docConfig.a3:
							lines.extend(toHeader(style, text, 3, isAnnex = True))

						#	Ordered Lists
						elif style in docConfig.ol1:
							checkSameStyle(Style.orderedlist2, lambda:lines.append(''))
							if len(elem.text):	# ignore empty
								lines.append(f'1. {text}')
						elif style in docConfig.ol2:
							checkSameStyle(Style.orderedlist2, lambda:lines.append(''))
							if len(elem.text):	# ignore empty
								lines.append(f'     1. {text}')

						#	Unordered Lists 
						elif style in docConfig.ul1:
							checkSameStyle(Style.unorderedlist, lambda:lines.append(''))
							if len(elem.text):	# ignore empty
								lines.append(f'- {text}')
						elif style in docConfig.ul2:
							checkSameStyle(Style.unorderedlist2, lambda:lines.append(''))
							if len(elem.text):	# ignore empty
								lines.append(f'{"    "*1}- {text}')
						elif style in docConfig.ul3:
							checkSameStyle(Style.unorderedlist3, lambda:lines.append(''))
							if len(elem.text):	# ignore empty
								lines.append(f'{"    "*2}- {text}')
						elif style in docConfig.ul4:
							checkSameStyle(Style.unorderedlist4, lambda:lines.append(''))
							if len(elem.text):	# ignore empty
								lines.append(f'{"    "*3}- {text}')
						elif style in docConfig.ul5:
							checkSameStyle(Style.unorderedlist5, lambda:lines.append(''))
							if len(elem.text):	# ignore empty
								lines.append(f'{"    "*4}- {text}')

						#	Table Caption
						elif style in docConfig.tablecaption:
							lines.append('')
							caption = replaceNL(text).strip()
							anchor = f'<a name="table_{caption[6:].split(":")[0].strip()}"></a>' if caption.startswith('Table ') and ':' in caption else ''
							lines.append(f'**{caption}**{anchor}')

						#	Image Caption
						elif style in docConfig.imagecaption:
							checkSameStyle(Style.imagecaption, lambda:lines.append(''))
							_t = replaceNL(text).strip()
							lines.append(f'**{_t}**')
							lines.append('')
							if docConfig.imageCaptions2AltText:
								# Search and replace the previous image reference (max 10 lines back-search TODO configurable)
								for idx in range(len(lines)-1, len(lines)-11, -1):
									if _captionMarker in lines[idx]:
										lines[idx] = lines[idx].replace(_captionMarker, _t)

						#	Image & Figure
						elif style in docConfig.image:
							lines.append('')
							lines.append(text)

						#	Code
						elif style in docConfig.code:
							checkSameStyle(Style.code, lambda:lines.append(''))
							for _t in text.split(_linebreak):
								lines.append(f'```{_t if _t else " "}```  ') # at least an empty space. And 2 spaces at the end for newline

						#	Example
						elif style in docConfig.example:
							checkSameStyle(Style.example, lambda:lines.append(''))
							# Replace linebreaks
							for _t in text.split(_linebreak):
								lines.append(f'`{_t if _t else " "}`  ') # at least an empty space. And 2 spaces at the end for newline

						#	Notes
						elif style in docConfig.note:
							checkSameStyle(Style.note)
							lines.append(f'> {text}')
							
						# 	Add TOC
						elif style in docConfig.tocStartParagraph:
							lines.extend(toHeader(style, elem.text, docConfig.tocHeaderLevel, numbering = False))
							if docConfig.addTocMacro:
								lines.append('[toc]')
							if docConfig.generateToc:
								lines.append(_tocInsertPoint)

						# 	Ignore & empty
						elif style in docConfig.ignore:
							pass
						elif style in docConfig.empty:
							lines.append('')

						# Print Unhandled tokens also to the console
						else:
							_print(f'{paragraphNr} {style}: {elem.style}: {text}')
							lines.append(text)


					case 'Table':
						rows:list[list[str]] = []
						nrRows = 0
						for row in elem.rows:
							cells:list[str] = []
							for cell in row.cells:
								cells.append(f'{getTextFromXML(cell)} ')	# add at least a space
							rows.append(cells)
							nrRows += 1
						
						# Warning if this is a single-row table
						if nrRows == 1:
							_print(f'[red]Single-row table found. Such tables cannot be converted to markdown.[/red] Please consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False)

						lines.append('')	# Add an empty line before a table
						for idx, row in enumerate(rows):

							# Check for a table caption and add separator line
							if idx == 1:
								lines.append('-'.join('|' * (len(row) + 1) ))
							
							# Add table row
							lines.append(f'|{"|".join(row)}|'
										 .replace('\n', _linebreak))	# replace line breaks in cells
						lines.append('')	# Add another empty line after a table
					
					case _:
						_print('[blue] {type(elem).__name__}')

			#
			#	Replace non-ascii characters
			#
			progress.update(processTask, advance = 1)	# progress update
			for i in range(len(lines)):
				line = lines[i]
				for ch in line:
					if not ch.isascii():
						if (_ch := ord(ch)) in docConfig.characters:
							if (rch := docConfig.characters[_ch]) == chr(0):
								rch = ''
							# line = line.replace(ch, docConfig.characters[_ch])	# we need the line for further replacements
							line = line.replace(ch, rch)	# we need the line for further replacements
							lines[i] = line
						else:
							_print(f'[yellow]Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}')
		

			#
			#	Remove multiple bold / italics on/off occurances
			#	Sometimes word doesn't remove empty bold-on/bold-off (or italics) indicatros
			#
			progress.update(processTask, advance = 1)	# progress update
			for i in range(len(lines)):
				line = lines[i]
				line = line.replace('__', '')
				line = line.replace('****', '')
				#line = line.replace('  ', ' ')
				lines[i] = line


			#
			#	Combine mutiple consecutive "code" lines
			#

			if docConfig.combineCodeParagraphs:
				codeblock:list[str] = []
				_lines:list[str] = []
				for i in range(len(lines)):
					line = lines[i]
					if line.startswith('```') and line.endswith('```  '):
						# Store code block
						codeblock.append(line[3:-5])
					elif codeblock:
						# Add whole code block to lines
						_lines.append('```')
						_lines.append('')
						_lines.extend(codeblock)
						_lines.append('')
						_lines.append('```')
						codeblock = []
					else:
						# Add line
						_lines.append(line)
				lines = _lines

			#
			#	Insert auto-generated table of contents
			#
			progress.update(processTask, advance = 1)	# progress update
			if docConfig.generateToc:
				toc = ''
				for l, t in headers:
					link = t
					# Convert to 
					# All text is converted to lowercase.
					link = link.lower()	
					# All non-word text (e.g., punctuation (except: -()), HTML) is removed. Some characters are converted to upper-case hex.
					# TODO decide / configurable how to replace special characters in links- Depends on rendering? "Markdown 2" (mac) likes to include
					# link = ''.join( c if c not in '():' else f'{ord(c):x}'.upper()
					link = ''.join( c if c not in '():' else ''#f'{ord(c):x}'.upper()
									for c in link
									if c.isspace() or c.isalnum() or c in '-():')
					# All spaces are converted to hyphens.
					link = ''.join( '-' if c.isspace() else c 
										for c in link)
					# Two or more hyphens in a row are converted to one.
					link = '-'.join(link.split())
					# TODO If a header with the same ID has already been generated, a unique incrementing number is appended, starting at 1.
					# Add to toc
					toc += f'{_nbsp * 4 * (l - 1)}[{t}](#{link})  \n'
				
				for i in range(len(lines)):
					line = lines[i]
					if line == _tocInsertPoint:
						lines[i] = toc
						# continue when found, perhaps we want to have more than one toc?
			
			#
			#	Map internal references
			#
			_definitionExpression = re.compile(r'^[`]?\[([\d]+|i.[\d]+)\]([^`]*)[`]?')
			_referenceExpression = re.compile(r'([^>])\[([\d]+|i.[\d]+)\](?!</a>)')

			for i in range(len(lines)):
				line = lines[i]
				if (m := _definitionExpression.match(line)) is not None:
					lines[i] = f'- <a name="_ref_{m.group(1)}"[{m.group(1)}]">[{m.group(1)}]</a>{m.group(2)}'
			
			def _repl(m:re.Match) -> str|None:
				if m.group(1) == '"':
					return None
				return f'{m.group(1)}<a href="#_ref_{m.group(2)}">[{m.group(2)}]</a>'

			for i in range(len(lines)):
				line = lines[i]
				lines[i] = re.sub(_referenceExpression, _repl, line)	# type:ignore[arg-type]


			#
			#	Process footnotes
			#
			progress.update(processTask, advance = 1)	# progress update
			if len(footnotes) and footnotesPart is not None:
				_print(f'[yellow]Footnotes found: {len(footnotes)}')
				# Analyze footnotes file
				footnotesXML = ET.fromstring(footnotesPart.blob)
				# Process the footnotes XML here
				for element in footnotesXML:

					# Footnote found
					if strippedTag(element.tag) == 'footnote':
						footnoteID = element.attrib[f'{{{wns}}}id']
						if footnoteID in footnotes:
							t = getTextFromXML(element)
							footnotes[footnoteID] = t
				
				# Add footnotes to the end of the document
				lines.append('')
				for fid, text in footnotes.items():
					lines.append(f'[^{fid}]: {text}')

			#
			#	List unresolved CAPTION markers
			#
			for i in range(len(lines)):
				line = lines[i]
				if _captionMarker in line:
					_print(f'[yellow]Unresolved / unreferenced figure caption : \[{i}] "{line}"')
			
			#
			#	Write produced Markdown file
			#
			
			with open(f'{documentDirName}{os.sep}{Path(d).stem}.md', 'w') as file:
				file.write('\n'.join(lines))

			#
			#	Convert media files
			#

			def _convertImage(converter:str, format:str) -> None:
				if converter:
					for fn in list(emfFiles):
						_f = Path(fn)		# Filename to handle
						_t = imageDirName	# Target directory
						if _f.stem not in referencedImages:
							if not fn.startswith(unreferencedDirName):
								_print(f'[yellow]Unreferenced image in the document: {PurePath(fn).name} {"(skipped)" if docConfig.skipUnreferencedMediaFiles else ""}', highlight = False)
								_print(f'[yellow]Moving image file to: {unreferencedDirName}', highlight = False)
								_n = f'{unreferencedDirName}/{_f.name}'
								_p = _f.replace(_n)
								emfFiles.remove(fn)
								emfFiles.append(_n)
								fn = _n
							if docConfig.skipUnreferencedMediaFiles:
								continue
							_t = unreferencedDirName
						cmd = converter
						cmd = cmd.replace('{infile}', fn).replace('{outdir}', _t)
						_print(f'Converting EMF file: {fn} to "{format}"', highlight = False)
						if (res := subprocess.run(cmd, shell = True, capture_output = True)).returncode != 0:
							_print(f'[red]Error running command: {res.stderr.decode("utf-8")}')
							_print(f'[red]Please check the configuration file -> section "\[media]" for the converter command: {converter}')
							break

			if not skipImageConversion:
				if docConfig.emfConverterPng:
					_convertImage(docConfig.emfConverterPng, 'png')
				if docConfig.emfConverterSvg:
					_convertImage(docConfig.emfConverterSvg, 'svg')

			emfFiles.clear()
			referencedImages.clear()
			progress.update(processTask, advance = 1)	# progress update


		progress.stop()



if __name__ == '__main__':

	# Define the command line interface: two options and at least one document
	parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
	parser.add_argument('--outdir', '-o',
						action='store',
						dest='outDirectory',
						default = 'out',
						metavar = '<output directory>',
						help = 'specify output directory')
	parser.add_argument('--skip-image-conversion', '-sic',
						action='store_true',
						dest='skipImageConversion',
						help = 'skip image conversion step')
	parser.add_argument('document',
						nargs = '+',
						help = 'documents to parse')
	args = parser.parse_args()

	# Make sure that the base output directory exists
	os.makedirs(args.outDirectory, exist_ok = True)

	# Convert the documents in sorted order
	processDocuments(sorted(args.document), 
					 args.outDirectory, 
					 args.skipImageConversion)