diff --git a/config.ini b/config.ini index af8585c3683c33d8f012f0e06192e30c0a529099..0ebaa7e74326d118a15089e0b81fb5561ba9f491 100644 --- a/config.ini +++ b/config.ini @@ -31,18 +31,31 @@ imageCaptions2AltText = true [toc] +# Add section numbers to the headlines addSectionNumbers = false -excludeFromNumbering = + +# Exclude the following paragraph types from numbering. +# The default is to exclude the "Content" heading. +excludeFromNumbering = tt + +# The paragraph type that is used in the original document for the table of contents. tocStartParagraph = heading no numbering + +# The level of the table of contents. tocHeaderLevel = 1 + +# Automatically generate a table of contents. generateToc = false + +# Add a macro "[toc]" to the document that can be used to generate a table of contents. +# Some converters and viewers support this macro. addTocMacro = false [paragraphs] -normal = normal -h1 = heading 1, tt -h2 = heading 2 +normal = normal, onem2m-normal +h1 = heading 1, tt, onem2m-heading1 +h2 = heading 2, onem2m-heading2 h3 = heading 3 h4 = heading 4 h5 = heading 5 @@ -53,7 +66,7 @@ h9 = heading 9 a1 = heading 1 a2 = heading 2 a3 = heading 3 -note = no +note = no, onem2m-iprtitle, onem2m-ipr code = pl example = ex, ew ul1 = b1, b1+, list paragraph @@ -63,7 +76,7 @@ ul4 = b4, b4+ ul5 = b5, b5+ ol1 = bn ol2 = bl -tablecaption = caption, th +tablecaption = caption, th, onem2m-tabletitle imagecaption = tf image = fl empty = fp diff --git a/spec2md.py b/spec2md.py index d618ce5007f694e8b94173d9de53f1b564ee2759..1c26aa512d8fdb352742e6a46fb6c997b69a896e 100644 --- a/spec2md.py +++ b/spec2md.py @@ -9,10 +9,12 @@ from enum import IntEnum, auto -from typing import Callable, Tuple, Dict, Optional +from typing import Callable, Tuple, Dict, Optional, Any + from pathlib import Path, PurePath from docx.document import Document from docx.text.paragraph import Paragraph +from docx.package import Package import docx.opc.exceptions from docx.table import _Cell, Table from 
docx.oxml.table import CT_Tbl @@ -201,14 +203,15 @@ class DocumentConfiguration(object): def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:bool) -> None: - docs:Dict[str, Tuple[Document, DocumentConfiguration]] = {} - ptasks = {} - mediaRelations:Dict[str, str] = {} - addSectionNumbers = False - excludeFromNumbering:list[str] = [] - headers:list[Tuple[int, str]] = [] - emfFiles:list[str] = [] - referencedImages:list[str] = [] + docs:Dict[str, Tuple[Document, DocumentConfiguration, Any]] = {} + ptasks = {} + mediaRelations:Dict[str, str] = {} + addSectionNumbers = False + excludeFromNumbering:list[str] = [] + headers:list[Tuple[int, str]] = [] + emfFiles:list[str] = [] + referencedImages:list[str] = [] + footnotes:dict[str, str] = {} global _print @@ -292,7 +295,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: return tag - def getTextFromXML(elem:Paragraph|_Cell) -> str: + def getTextFromXML(elem:Paragraph|_Cell|ET._Element) -> str: # Not-used document tags. _ignoredTags = ( 'AlternateContent', @@ -310,6 +313,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: 'commentRangeStart', 'commentRangeEnd', 'commentReference', + 'smartTag', + 'footnoteRef', ) @@ -405,13 +410,15 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: pass # ignore a soft hyphen character which has no meaning in Markdown and zero-width case 'sym': - def _symError(ch:str) -> str: + + def _symError(ch:int) -> None: nonlocal _result _symError = f'unknown font+symbol: {element.attrib["{"+wns+"}font"]} - "{element.attrib["{"+wns+"}char"]} ({ch})"' _print(f'[yellow]{_symError}') _result += f'<mark>{_symError}</mark>' + try: - _ch = '????' 
+ _ch = 0 _ch = int(element.attrib["{"+wns+"}char"], 16) if _ch in docConfig.characters: if (rch := docConfig.characters[_ch]) == chr(0): @@ -431,6 +438,18 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: for x in element: _result += _parseXML(x) + case 'footnoteReference': + id = element.attrib[f'{{{wns}}}id'] + _result += f'[^{id}]' + footnotes[id] = '<mark>unknown footnote</mark>' + + # The footnote itself is not included in the document but in a separate file. + # Therefore, we need to extract the footnote from the footnotes.xml file. The format + # of the footnote is the same as a paragraph. + case 'footnote': + for x in element: + _result += _parseXML(x) + case _ if tag in _ignoredTags: # ignore pass @@ -439,7 +458,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: _print(element) return _result - #_print(ET.fromstring(elem._p.xml)) + # _print(ET.fromstring(elem._p.xml)) match elem: case Paragraph(): # type: ignore[misc] return _parseXML(ET.fromstring(elem._p.xml)) @@ -448,6 +467,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: # Create a list of parsed paragraphs and join them with linebreaks return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip() for p in elem.paragraphs ]) + case ET._Element(): + return _parseXML(elem) case _: return '' @@ -480,7 +501,12 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: stopProgress(f'[red]Input document "{d}" is not a file') return try: - docs[d] = (docx.Document(d), DocumentConfiguration(d)) + # Search for footnotes in the document XML + footnotesPart = None + for part in Package.open(d).parts: + if part.partname.endswith('/footnotes.xml'): + footnotesPart = part + docs[d] = (docx.Document(d), DocumentConfiguration(d), footnotesPart) ptasks[d] = progress.add_task(f'Processing {d}', total = None) progress.update(readTask, advance=1) except 
docx.opc.exceptions.PackageNotFoundError as e: @@ -495,7 +521,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: # Processing Documents # - for docFileName, (doc, docConfig) in docs.items(): + for docFileName, (doc, docConfig, footnotesPart) in docs.items(): processTask = ptasks[docFileName] docItems = list(iter_block_items(doc)) addSectionNumbers = docConfig.addSectionNumbers @@ -517,7 +543,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: return # Add sub-progress task - progress.update(processTask, total = len(docItems) + 5) # + relations + image extraction + characters + toc + media convert + progress.update(processTask, total = len(docItems) + 6) # + relations + image extraction + characters + toc + footnotes + media convert # Extract the media relations file, and get the mappings from document IDs to media files @@ -769,7 +795,9 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: elif codeblock: # Add whole code block to lines _lines.append('```') + _lines.append('') _lines.extend(codeblock) + _lines.append('') _lines.append('```') codeblock = [] else: @@ -830,6 +858,29 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: lines[i] = re.sub(_referenceExpression, _repl, line) # type:ignore[arg-type] + # + # Process footnotes + # + progress.update(processTask, advance = 1) # progress update + if len(footnotes) and footnotesPart is not None: + _print(f'[yellow]Footnotes found: {len(footnotes)}') + # Analyze footnotes file + footnotesXML = ET.fromstring(footnotesPart.blob) + # Process the footnotes XML here + for element in footnotesXML: + + # Footnote found + if strippedTag(element.tag) == 'footnote': + footnoteID = element.attrib[f'{{{wns}}}id'] + if footnoteID in footnotes: + t = getTextFromXML(element) + footnotes[footnoteID] = t + + # Add footnotes to the end of the document + lines.append('') + for fid, text in 
footnotes.items(): + lines.append(f'[^{fid}]: {text}') + # # List unresolved CAPTION markers #