#
#   spec2md.py
#
#   Script to convert oneM2M specs in docx / openXML to markdown.
#
#   (c) 2022 by Andreas Kraft
#   License: BSD 3-Clause License. See the LICENSE file for further details.
#

from enum import IntEnum, auto
from typing import Callable, Tuple, Dict, Optional, Any
from pathlib import Path, PurePath
from docx.document import Document
from docx.text.paragraph import Paragraph
from docx.package import Package
import docx.opc.exceptions
from docx.table import _Cell, Table
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
import argparse
import os
import binascii
import re
import subprocess
from rich.console import Console
from rich.progress import Progress, TextColumn, BarColumn
from rich import inspect
import configparser
import zipfile
from lxml import etree as ET


class Style(IntEnum):
    """ Paragraph categories used to detect a change of style between two
        consecutive paragraphs (see `checkSameStyle()` in `processDocuments()`).
    """
    code = auto()
    example = auto()
    image = auto()
    imagecaption = auto()
    none = auto()
    normal = auto()
    note = auto()
    orderedlist = auto()
    orderedlist2 = auto()
    unorderedlist = auto()
    unorderedlist2 = auto()
    unorderedlist3 = auto()
    unorderedlist4 = auto()
    unorderedlist5 = auto()
    # TODO more styles


# Name of the configuration file that is looked up in the current directory
defaultConfigFile = 'config.ini'
# Sub-directory (below the per-document output directory) for extracted media files
imagesSubDir = 'media'
# Sub-directory for media files that are not referenced from the document text
unreferencedSubDir = 'unreferenced'

# special characters
_linebreak = '<br />'
# HTML entities. These must be the escaped forms: with a literal '<' the
# replacement in toMD() would be a no-op, and with a plain space the TOC
# indentation would turn deeper entries into Markdown code blocks.
_entityLt = '&lt;'
_nbsp = '&nbsp;'
# Placeholder line that is later replaced by the generated table of contents
_tocInsertPoint = '~~t~o~c~~'
# Placeholder alt text of an image reference until its caption is known
_captionMarker = '~~CAPTION~~'

# https://learn.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing?view=openxml-2.8.1

# NOTE Crash when extracting tables
# Until a fix from the python-docx package is available, parsing tables might crash.
# Fix in python-docx table.py, around line 173 the lines in _cells() to:
#
#   ...
#   if tc.vMerge == ST_Merge.CONTINUE:
#       if len(cells) >= col_count:     # <--
#           cells.append(cells[-col_count])
#   ..

# TODO regard addTOCMacro configuration
# TODO move -section switch to configuration
# TODO Support internal links
# TODO at least mark unsupported objects and images?
# Rich console for pretty printing console = Console() _print:Callable = print # Some predefined tags and attributes wns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' _val = f'{{{wns}}}val' class SectionNumbers(object): def __init__(self) -> None: self.levels:list[int] = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] self.heading:int = 0 self.annex:int = 64 def nextSectionNumber(self, level:int, isAnnex:Optional[bool] = False) -> str: if isAnnex: self.levels[0] = self.annex else: self.levels[0] = self.heading # Increment appropriate level self.levels[level - 1] += 1 for i in range(level, len(self.levels)): self.levels[i] = 0 if isAnnex: self.levels[0] = chr(self.levels[0]) # type: ignore nr = '.'.join([ str(x) for x in self.levels if x != 0 ]) + ' ' if isAnnex: self.annex = ord(self.levels[0]) # type:ignore nr = f'Annex {nr}' else: self.heading = self.levels[0] # type:ignore return nr class DocumentConfiguration(object): """ Per document configuration settings. """ def __init__(self, documentFileName:str) -> None: self.documentFileName = documentFileName self.configFileNameDef = f'{os.path.split(documentFileName)[0]}/config.ini' self.configFileName = f'{os.path.splitext(documentFileName)[0]}.ini' # print(self.configFileName) try: config = configparser.ConfigParser( interpolation=configparser.ExtendedInterpolation(), converters={'list': lambda x: [i.strip() for i in x.split(',')]}, # Convert csv to list ) if len(config.read( [defaultConfigFile, self.configFileNameDef, self.configFileName])) == 0: _print(f'[grey39]Configuration file missing or not readable for file: "{self.documentFileName}"') return # print([defaultConfigFile, self.configFileName]) except configparser.Error as e: _print('[red]Error in configuration file') raise e # General self.replaceNbsp = config.get('general', 'replaceNbsp', fallback = None) self.replaceLt = config.get('general', 'replaceLt', fallback = _entityLt) self.renameEMFExtension = config.get('general', 'renameEMFExtension', fallback 
= None) self.skipUnreferencedMediaFiles = config.getboolean('general', 'skipUnreferencedMediaFiles', fallback = False) self.imageCaptions2AltText = config.getboolean('general', 'imageCaptions2AltText', fallback = True) self.combineCodeParagraphs = config.getboolean('general', 'combineCodeParagraphs', fallback = True) # Paragraphs self.paragraphs = { c : config.getlist('paragraphs', c) # type: ignore [attr-defined] for c in config['paragraphs'] } self.normal = self.paragraphs['normal'] self.h1 = self.paragraphs['h1'] self.h2 = self.paragraphs['h2'] self.h3 = self.paragraphs['h3'] self.h4 = self.paragraphs['h4'] self.h5 = self.paragraphs['h5'] self.h6 = self.paragraphs['h6'] self.h7 = self.paragraphs['h7'] self.h8 = self.paragraphs['h8'] self.h9 = self.paragraphs['h9'] self.a1 = self.paragraphs['a1'] self.a2 = self.paragraphs['a2'] self.a3 = self.paragraphs['a3'] self.ol1 = self.paragraphs['ol1'] self.ol2 = self.paragraphs['ol2'] self.ul1 = self.paragraphs['ul1'] self.ul2 = self.paragraphs['ul2'] self.ul3 = self.paragraphs['ul3'] self.ul4 = self.paragraphs['ul4'] self.ul5 = self.paragraphs['ul5'] #self.continuedlist = self.paragraphs['continuedlist'] self.code = self.paragraphs['code'] self.note = self.paragraphs['note'] self.example = self.paragraphs['example'] self.tablecaption = self.paragraphs['tablecaption'] self.imagecaption = self.paragraphs['imagecaption'] self.image = self.paragraphs['image'] self.ignore = self.paragraphs['ignore'] self.empty = self.paragraphs['empty'] # TOC self.addSectionNumbers = config.getboolean('toc', 'addSectionNumbers', fallback = False) self.excludeFromNumbering = config.getlist('toc', 'excludeFromNumbering') # type: ignore [attr-defined] self.tocStartParagraph = config.get('toc', 'tocStartParagraph') self.tocHeaderLevel = config.getint('toc', 'tocHeaderLevel') self.addTocMacro = config.getboolean('toc', 'addTocMacro', fallback = False) self.generateToc = config.getboolean('toc', 'generateToc', fallback = False) # characters # 
self.characters = { int(c, 16) : binascii.unhexlify(config.get('characters', c)).decode('utf-8') # type: ignore [attr-defined] # for c in config['characters'] } self.characters = {} for c,v in config['characters'].items(): if v.startswith('&'): # HTML entity self.characters[int(c, 16)] = v else: # Unicode character self.characters[int(c, 16)] = binascii.unhexlify(config.get('characters', c)).decode('utf-8') # type: ignore [attr-defined] # Media & Converter self.emfConverterPng = config.get('media', 'emfConverterPng', fallback = None) self.emfConverterSvg = config.get('media', 'emfConverterSvg', fallback = None) def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:bool) -> None: docs:Dict[str, Tuple[Document, DocumentConfiguration, Any]] = {} ptasks = {} mediaRelations:Dict[str, str] = {} addSectionNumbers = False excludeFromNumbering:list[str] = [] headers:list[Tuple[int, str]] = [] emfFiles:list[str] = [] referencedImages:list[str] = [] footnotes:dict[str, str] = {} global _print with Progress( TextColumn('[progress.description]{task.description}'), BarColumn(), TextColumn('[progress.percentage]{task.percentage:>3.0f}%'), speed_estimate_period=2.0) as progress: _print = progress.print # Assign progress internal print to global print _lastStyle = Style.none def stopProgress(msg:str='') -> None: progress.stop() progress.remove_task(readTask) _print(msg) def iter_block_items(parent): """ Yield each paragraph and table child within *parent*, in document order. Each returned value is an instance of either Table or Paragraph. *parent* would most commonly be a reference to a main Document object, but also works for a _Cell object, which itself can contain paragraphs and tables. 
""" if isinstance(parent, Document): parent_elm = parent.element.body elif isinstance(parent, _Cell): parent_elm = parent._tc else: raise ValueError("something's not right") for child in parent_elm.iterchildren(): if isinstance(child, CT_P): yield Paragraph(child, parent) elif isinstance(child, CT_Tbl): yield Table(child, parent) else: # print(child.__class__.__name__) ... def replaceNL(text:str, rpl:str = '') -> str: return text.replace('\n', rpl) def toMD(text:str) -> str: return text.replace('<', docConfig.replaceLt) #sectionNrs:list[int] = [ 0, 0, 0, 0] sectionNrs = SectionNumbers() def toHeader(style:str, text:str, level:int, numbering:bool = True, isAnnex:bool = False) -> list[str]: nonlocal addSectionNumbers, excludeFromNumbering if style in excludeFromNumbering: numbering = False nr = '' if numbering and addSectionNumbers: nr = sectionNrs.nextSectionNumber(level, isAnnex = isAnnex) # Replace multiple white spaces text = ' '.join(text.split()) # Remove linebreak in header lines text = text.replace(_linebreak, ' ').strip() # Store header headers.append( (level, replaceNL(text))) return [ '', f'{"#" * level} {nr}{replaceNL(text)}' if text else '' ] def strippedTag(tag:str) -> str: """ Stripp the namespace from an element or attribute name. """ _, _, tag = tag.rpartition('}') return tag def getTextFromXML(elem:Paragraph|_Cell|ET._Element) -> str: # Not-used document tags. _ignoredTags = ( 'AlternateContent', 'fldChar', 'fldSimple', 'instrText', 'lastRenderedPageBreak', 'noBreakHyphen', 'pPr', 'proofErr', 'rPr', 'moveFromRangeEnd', 'ins', 'del', 'commentRangeStart', 'commentRangeEnd', 'commentReference', 'smartTag', 'footnoteRef', ) def _parseXML(element:ET.Element, inCell:Optional[bool] = False) -> str: """ Recursively parse a document paragraph. 
""" nonlocal _ignoredTags _result = '' tag = strippedTag(element.tag) # remove namespaces for easier handlings match tag: case 'p': for x in element: _result += _parseXML(x, inCell) case 'r': for x in element: _result += _parseXML(x, inCell) case 't': _bold = '' _italics = '' for e in element.getparent(): if strippedTag(e.tag) == 'rPr': # paragraph style for ep in e: match strippedTag(ep.tag): case 'b' if ep.attrib.get(_val, 'true') == 'true': _bold = '**' case 'i' if ep.attrib.get(_val, 'true') == 'true': _italics = '_' # case _: # _print(f'[yellow]unsupported style: {ep.tag}') # Strip white spaces if bold or italics _s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text))) # Replace single * or _ _s = _s.replace('_', '\\_') _s = _s.replace('*', '\\*') # Add trailing white space when bold or italics _postfix = ' ' if _bold or _italics else '' _result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_postfix}' # print(_result) case 'br': _result += _linebreak case 'bookmarkStart' | 'bookmarkEnd': # TODO ? 
pass case 'hyperlink': # Hyperlinks and URLs _hresult = '' for x in element: _hresult += _parseXML(x, inCell) _result += f'[{_hresult}]({_hresult})' case 'drawing': # Get the rID of a media file from the element's XML # and map to an extracted media file # inspect(element) # _print(element.items()) # _print(element.attrib) blip = element.findall('ns1:inline/ns3:graphic/ns3:graphicData/ns4:pic/ns4:blipFill/ns3:blip', namespaces = { 'ns1' : 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing', 'ns3' : 'http://schemas.openxmlformats.org/drawingml/2006/main', 'ns4' : 'http://schemas.openxmlformats.org/drawingml/2006/picture', }) if blip and \ (rId := blip[0].attrib.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')) and \ (mediaFile := mediaRelations.get(rId)): mediaFilePath = Path(mediaFile) referencedImages.append(mediaFilePath.stem) # Add to referenced files if docConfig.renameEMFExtension and mediaFilePath.suffix.lower() == '.emf': mediaFilePath = mediaFilePath.with_suffix(f'.{docConfig.renameEMFExtension}') _print(f'[yellow]Renaming EMF file reference to "{str(mediaFilePath)}"') _result += f'})' # image reference as posix path # else: # _print(blip) case 'pict': # for e in element: # print(f'----{e}') _print(f'[yellow]unsupported pict element: {element}') _result += f'<mark>unsupported pict element: {element}</mark>' case 'object': #inspect(element) pass case 'tab': _result += ' ' # TODO nbsp? 
case 'softHyphen': pass # ignore a soft hyphen character which has no meaning in Markdown and zero-width case 'sym': def _symError(ch:int) -> None: nonlocal _result _symError = f'unknown font+symbol: {element.attrib["{"+wns+"}font"]} - "{element.attrib["{"+wns+"}char"]} ({ch})"' _print(f'[yellow]{_symError}') _result += f'<mark>{_symError}</mark>' try: _ch = 0 _ch = int(element.attrib["{"+wns+"}char"], 16) if _ch in docConfig.characters: if (rch := docConfig.characters[_ch]) == chr(0): rch = '' _result += rch else: _symError(_ch) except ValueError as e: _symError(_ch) # ignore deleted test case 'del': pass # try to keep the text of inserted text case 'ins': for x in element: _result += _parseXML(x) case 'footnoteReference': id = element.attrib[f'{{{wns}}}id'] _result += f'[^{id}]' footnotes[id] = '<mark>unknown footnote</mark>' # The footnote itself is not included in the document but in a separate file. # Therefore, we need to extract the footnote from the footnotes.xml file. The format # of the footnote is the same as a paragraph. case 'footnote': for x in element: _result += _parseXML(x) case _ if tag in _ignoredTags: # ignore pass case _: # unknown _print(tag) _print(element) return _result # _print(ET.fromstring(elem._p.xml)) match elem: case Paragraph(): # type: ignore[misc] return _parseXML(ET.fromstring(elem._p.xml)) case _Cell(): # type: ignore[misc] # Iterate over all paragraphs in the cell and parse them # Create a list of parsed paragraphs and join them with linebreaks return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip() for p in elem.paragraphs ]) case ET._Element(): return _parseXML(elem) case _: return '' def checkSameStyle(style:Style, action:Optional[Callable] = None) -> None: """ Check whether the given style is the same as the last one. If no, then execute the optional action. 
""" nonlocal _lastStyle if style == _lastStyle: return _lastStyle = style if action: action() # Preparing tasks for progress readTask = progress.add_task(f'Reading document{"s" if len(documents) > 1 else ""}', total = len(documents)) # # Reading documents # for d in documents: if not (dp := Path(d)).exists(): stopProgress(f'[red]Input document "{d}" does not esist') return if not dp.is_file(): stopProgress(f'[red]Input document "{d}" is not a file') return try: # Search for footnotes in the document XML footnotesPart = None for part in Package.open(d).parts: if part.partname.endswith('/footnotes.xml'): footnotesPart = part docs[d] = (docx.Document(d), DocumentConfiguration(d), footnotesPart) ptasks[d] = progress.add_task(f'Processing {d}', total = None) progress.update(readTask, advance=1) except docx.opc.exceptions.PackageNotFoundError as e: stopProgress(f'[red]Input document "{d}" is not a .docx file') return except Exception as e: stopProgress(f'[red]Error reading file "{d}"') console.print_exception() return # # Processing Documents # for docFileName, (doc, docConfig, footnotesPart) in docs.items(): processTask = ptasks[docFileName] docItems = list(iter_block_items(doc)) addSectionNumbers = docConfig.addSectionNumbers excludeFromNumbering = docConfig.excludeFromNumbering paragraphNr = 0 # TODO #(docConfig.replaceNbsp) # Create output directories try: os.makedirs(documentDirName := f'{outDirectory}{os.sep}{Path(docFileName).stem}', exist_ok = True) os.makedirs(imageDirName := f'{documentDirName}{os.sep}{imagesSubDir}', exist_ok = True) os.makedirs(unreferencedDirName := f'{documentDirName}{os.sep}{unreferencedSubDir}', exist_ok = True) except Exception as e: stopProgress(f'[red]Error creating output directory "{d}"') console.print_exception() return # Add sub-progress task progress.update(processTask, total = len(docItems) + 6) # + relations + image extraction + characters + toc + footnotes + media convert # Extract the media relations file, and get the mappings 
from document IDs to media files mediaRelations = {} with open(docFileName, 'rb') as docfile: zip = zipfile.ZipFile(docfile) for z in zip.filelist: if z.filename == 'word/_rels/document.xml.rels': xml = ET.fromstring(zip.read(z.filename)) for element in xml: if strippedTag(element.tag) == 'Relationship': if (_a := element.attrib.get('Type')) and _a.endswith('/image'): # Only image relationships mediaRelations[element.attrib['Id']] = element.attrib['Target'] break else: _print('[red]Media relations file not found in document') return progress.update(processTask, advance = 1) # progress update # Extracting images for the document next with open(docFileName, 'rb') as docfile: zip = zipfile.ZipFile(docfile) for zipMediaFilename in [z.filename for z in zip.filelist if z.filename.startswith('word/media/')]: fn = f'{imageDirName}{os.sep}{os.path.basename(zipMediaFilename)}' if fn.lower().endswith(('.emf', '.wmf')): _print(f'[yellow]unsupported media file: {fn}', highlight = False) emfFiles.append(fn) with open(f'{fn}', 'wb') as imgFile: imgFile.write(zip.read(zipMediaFilename)) progress.update(processTask, advance = 1) # progress update # Processing the document lines:list[str] = [] imageIndex = 1 for elem in docItems: paragraphNr += 1 progress.update(processTask, advance = 1) match type(elem).__name__: case 'Paragraph': text = getTextFromXML(elem) style = elem.style.name.lower() # print(f'{style} {text}') # Normal, body text if style in docConfig.normal: checkSameStyle(Style.normal, lambda:lines.append('')) lines.append(text) lines.append('') # Add empty line # Headers elif style in docConfig.h1: lines.extend(toHeader(style, text, 1)) elif style in docConfig.h2: lines.extend(toHeader(style, text, 2)) elif style in docConfig.h3: lines.extend(toHeader(style, text, 3)) elif style in docConfig.h4: lines.extend(toHeader(style, text, 4)) elif style in docConfig.h5: lines.extend(toHeader(style, text, 5)) elif style in docConfig.h6: lines.extend(toHeader(style, text, 6)) elif 
style in docConfig.h7: lines.extend(toHeader(style, text, 7)) elif style in docConfig.h8: lines.extend(toHeader(style, text, 8)) elif style in docConfig.h9: lines.extend(toHeader(style, text, 9)) # Annexes elif style in docConfig.a1: lines.extend(toHeader(style, text, 1, isAnnex = True)) elif style in docConfig.a2: lines.extend(toHeader(style, text, 2, isAnnex = True)) elif style in docConfig.a3: lines.extend(toHeader(style, text, 3, isAnnex = True)) # Ordered Lists elif style in docConfig.ol1: checkSameStyle(Style.orderedlist2, lambda:lines.append('')) if len(elem.text): # ignore empty lines.append(f'1. {text}') elif style in docConfig.ol2: checkSameStyle(Style.orderedlist2, lambda:lines.append('')) if len(elem.text): # ignore empty lines.append(f' 1. {text}') # Unordered Lists elif style in docConfig.ul1: checkSameStyle(Style.unorderedlist, lambda:lines.append('')) if len(elem.text): # ignore empty lines.append(f'- {text}') elif style in docConfig.ul2: checkSameStyle(Style.unorderedlist2, lambda:lines.append('')) if len(elem.text): # ignore empty lines.append(f'{" "*1}- {text}') elif style in docConfig.ul3: checkSameStyle(Style.unorderedlist3, lambda:lines.append('')) if len(elem.text): # ignore empty lines.append(f'{" "*2}- {text}') elif style in docConfig.ul4: checkSameStyle(Style.unorderedlist4, lambda:lines.append('')) if len(elem.text): # ignore empty lines.append(f'{" "*3}- {text}') elif style in docConfig.ul5: checkSameStyle(Style.unorderedlist5, lambda:lines.append('')) if len(elem.text): # ignore empty lines.append(f'{" "*4}- {text}') # Table Caption elif style in docConfig.tablecaption: lines.append('') caption = replaceNL(text).strip() anchor = f'<a name="table_{caption[6:].split(":")[0].strip()}"></a>' if caption.startswith('Table ') and ':' in caption else '' lines.append(f'**{caption}**{anchor}') # Image Caption elif style in docConfig.imagecaption: checkSameStyle(Style.imagecaption, lambda:lines.append('')) _t = replaceNL(text).strip() 
lines.append(f'**{_t}**') lines.append('') if docConfig.imageCaptions2AltText: # Search and replace the previous image reference (max 10 lines back-search TODO configurable) for idx in range(len(lines)-1, len(lines)-11, -1): if _captionMarker in lines[idx]: lines[idx] = lines[idx].replace(_captionMarker, _t) # Image & Figure elif style in docConfig.image: lines.append('') lines.append(text) # Code elif style in docConfig.code: checkSameStyle(Style.code, lambda:lines.append('')) for _t in text.split(_linebreak): lines.append(f'```{_t if _t else " "}``` ') # at least an empty space. And 2 spaces at the end for newline # Example elif style in docConfig.example: checkSameStyle(Style.example, lambda:lines.append('')) # Replace linebreaks for _t in text.split(_linebreak): lines.append(f'`{_t if _t else " "}` ') # at least an empty space. And 2 spaces at the end for newline # Notes elif style in docConfig.note: checkSameStyle(Style.note) lines.append(f'> {text}') # Add TOC elif style in docConfig.tocStartParagraph: lines.extend(toHeader(style, elem.text, docConfig.tocHeaderLevel, numbering = False)) if docConfig.addTocMacro: lines.append('[toc]') if docConfig.generateToc: lines.append(_tocInsertPoint) # Ignore & empty elif style in docConfig.ignore: pass elif style in docConfig.empty: lines.append('') # Print Unhandled tokens also to the console else: _print(f'{paragraphNr} {style}: {elem.style}: {text}') lines.append(text) case 'Table': rows:list[list[str]] = [] nrRows = 0 for row in elem.rows: cells:list[str] = [] for cell in row.cells: cells.append(f'{getTextFromXML(cell)} ') # add at least a space rows.append(cells) nrRows += 1 # Warning if this is a single-row table if nrRows == 1: _print(f'[red]Single-row table found. 
Such tables cannot be converted to markdown.[/red] Please consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False) lines.append('') # Add an empty line before a table for idx, row in enumerate(rows): # Check for a table caption and add separator line if idx == 1: lines.append('-'.join('|' * (len(row) + 1) )) # Add table row lines.append(f'|{"|".join(row)}|' .replace('\n', _linebreak)) # replace line breaks in cells lines.append('') # Add another empty line after a table case _: _print('[blue] {type(elem).__name__}') # # Replace non-ascii characters # progress.update(processTask, advance = 1) # progress update for i in range(len(lines)): line = lines[i] for ch in line: if not ch.isascii(): if (_ch := ord(ch)) in docConfig.characters: if (rch := docConfig.characters[_ch]) == chr(0): rch = '' # line = line.replace(ch, docConfig.characters[_ch]) # we need the line for further replacements line = line.replace(ch, rch) # we need the line for further replacements lines[i] = line else: _print(f'[yellow]Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}') # # Remove multiple bold / italics on/off occurances # Sometimes word doesn't remove empty bold-on/bold-off (or italics) indicatros # progress.update(processTask, advance = 1) # progress update for i in range(len(lines)): line = lines[i] line = line.replace('__', '') line = line.replace('****', '') #line = line.replace(' ', ' ') lines[i] = line # # Combine mutiple consecutive "code" lines # if docConfig.combineCodeParagraphs: codeblock:list[str] = [] _lines:list[str] = [] for i in range(len(lines)): line = lines[i] if line.startswith('```') and line.endswith('``` '): # Store code block codeblock.append(line[3:-5]) elif codeblock: # Add whole code block to lines _lines.append('```') _lines.append('') _lines.extend(codeblock) _lines.append('') _lines.append('```') codeblock = [] else: # Add line _lines.append(line) 
lines = _lines # # Insert auto-generated table of contents # progress.update(processTask, advance = 1) # progress update if docConfig.generateToc: toc = '' for l, t in headers: link = t # Convert to # All text is converted to lowercase. link = link.lower() # All non-word text (e.g., punctuation (except: -()), HTML) is removed. Some characters are converted to upper-case hex. # TODO decide / configurable how to replace special characters in links- Depends on rendering? "Markdown 2" (mac) likes to include # link = ''.join( c if c not in '():' else f'{ord(c):x}'.upper() link = ''.join( c if c not in '():' else ''#f'{ord(c):x}'.upper() for c in link if c.isspace() or c.isalnum() or c in '-():') # All spaces are converted to hyphens. link = ''.join( '-' if c.isspace() else c for c in link) # Two or more hyphens in a row are converted to one. link = '-'.join(link.split()) # TODO If a header with the same ID has already been generated, a unique incrementing number is appended, starting at 1. # Add to toc toc += f'{_nbsp * 4 * (l - 1)}[{t}](#{link}) \n' for i in range(len(lines)): line = lines[i] if line == _tocInsertPoint: lines[i] = toc # continue when found, perhaps we want to have more than one toc? 
# # Map internal references # _definitionExpression = re.compile(r'^[`]?\[([\d]+|i.[\d]+)\]([^`]*)[`]?') _referenceExpression = re.compile(r'([^>])\[([\d]+|i.[\d]+)\](?!</a>)') for i in range(len(lines)): line = lines[i] if (m := _definitionExpression.match(line)) is not None: lines[i] = f'- <a name="_ref_{m.group(1)}"[{m.group(1)}]">[{m.group(1)}]</a>{m.group(2)}' def _repl(m:re.Match) -> str|None: if m.group(1) == '"': return None return f'{m.group(1)}<a href="#_ref_{m.group(2)}">[{m.group(2)}]</a>' for i in range(len(lines)): line = lines[i] lines[i] = re.sub(_referenceExpression, _repl, line) # type:ignore[arg-type] # # Process footnotes # progress.update(processTask, advance = 1) # progress update if len(footnotes) and footnotesPart is not None: _print(f'[yellow]Footnotes found: {len(footnotes)}') # Analyze footnotes file footnotesXML = ET.fromstring(footnotesPart.blob) # Process the footnotes XML here for element in footnotesXML: # Footnote found if strippedTag(element.tag) == 'footnote': footnoteID = element.attrib[f'{{{wns}}}id'] if footnoteID in footnotes: t = getTextFromXML(element) footnotes[footnoteID] = t # Add footnotes to the end of the document lines.append('') for fid, text in footnotes.items(): lines.append(f'[^{fid}]: {text}') # # List unresolved CAPTION markers # for i in range(len(lines)): line = lines[i] if _captionMarker in line: _print(f'[yellow]Unresolved / unreferenced figure caption : \[{i}] "{line}"') # # Write produced Markdown file # with open(f'{documentDirName}{os.sep}{Path(d).stem}.md', 'w') as file: file.write('\n'.join(lines)) # # Convert media files # def _convertImage(converter:str, format:str) -> None: if converter: for fn in list(emfFiles): _f = Path(fn) # Filename to handle _t = imageDirName # Target directory if _f.stem not in referencedImages: if not fn.startswith(unreferencedDirName): _print(f'[yellow]Unreferenced image in the document: {PurePath(fn).name} {"(skipped)" if docConfig.skipUnreferencedMediaFiles else ""}', 
highlight = False) _print(f'[yellow]Moving image file to: {unreferencedDirName}', highlight = False) _n = f'{unreferencedDirName}/{_f.name}' _p = _f.replace(_n) emfFiles.remove(fn) emfFiles.append(_n) fn = _n if docConfig.skipUnreferencedMediaFiles: continue _t = unreferencedDirName cmd = converter cmd = cmd.replace('{infile}', fn).replace('{outdir}', _t) _print(f'Converting EMF file: {fn} to "{format}"', highlight = False) if (res := subprocess.run(cmd, shell = True, capture_output = True)).returncode != 0: _print(f'[red]Error running command: {res.stderr.decode("utf-8")}') _print(f'[red]Please check the configuration file -> section "\[media]" for the converter command: {converter}') break if not skipImageConversion: if docConfig.emfConverterPng: _convertImage(docConfig.emfConverterPng, 'png') if docConfig.emfConverterSvg: _convertImage(docConfig.emfConverterSvg, 'svg') emfFiles.clear() referencedImages.clear() progress.update(processTask, advance = 1) # progress update progress.stop() if __name__ == '__main__': # Parse command line arguments parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default = 'out', metavar = '<output directory>', help = 'specify output directory') parser.add_argument('--skip-image-conversion', '-sic', action='store_true', dest='skipImageConversion', help = 'skip image conversion step') parser.add_argument('document', nargs = '+', help = 'documents to parse') args = parser.parse_args() # Process documents and print output os.makedirs(args.outDirectory, exist_ok = True) processDocuments(sorted(args.document), args.outDirectory, args.skipImageConversion)