Added support for footnotes

6f5baea5 · Andreas Kraft · 9328ebdb · 6f5baea5 · 6f5baea5
Commit 6f5baea5 authored 9 months ago by Andreas Kraft
--- a/config.ini
+++ b/config.ini
@@ -31,18 +31,31 @@ imageCaptions2AltText = true


 [toc]
+# Add section numbers to the headlines
 addSectionNumbers = false
-excludeFromNumbering =
+
+# Exclude the following paragraph types from numbering. 
+# The default is to exclude the "Content" heading.
+excludeFromNumbering = tt
+
+# The paragraph type that is used in the original document for the table of contents.
 tocStartParagraph = heading no numbering
+
+# The level of the table of contents.
 tocHeaderLevel = 1
+
+# Automatically generate a table of contents.
 generateToc = false
+
+# Add a macro "[toc]" to the document that can be used to generate a table of contents.
+# Some converters and viewers support this macro.
 addTocMacro = false


 [paragraphs]
-normal = normal
-h1 = heading 1, tt
-h2 = heading 2
+normal = normal, onem2m-normal
+h1 = heading 1, tt, onem2m-heading1
+h2 = heading 2, onem2m-heading2
 h3 = heading 3
 h4 = heading 4
 h5 = heading 5
@@ -53,7 +66,7 @@ h9 = heading 9
 a1 = heading 1
 a2 = heading 2
 a3 = heading 3
-note = no
+note = no, onem2m-iprtitle, onem2m-ipr
 code = pl
 example = ex, ew
 ul1 = b1, b1+, list paragraph
@@ -63,7 +76,7 @@ ul4 = b4, b4+
 ul5 = b5, b5+
 ol1 = bn
 ol2 = bl
-tablecaption = caption, th
+tablecaption = caption, th, onem2m-tabletitle
 imagecaption = tf
 image = fl
 empty = fp

--- a/spec2md.py
+++ b/spec2md.py
@@ -9,10 +9,12 @@


 from enum import IntEnum, auto
-from typing import Callable, Tuple, Dict, Optional
+from typing import Callable, Tuple, Dict, Optional, Any
+
 from pathlib import Path, PurePath
 from docx.document import Document
 from docx.text.paragraph import Paragraph
+from docx.package import Package
 import docx.opc.exceptions
 from docx.table import _Cell, Table
 from docx.oxml.table import CT_Tbl
@@ -201,7 +203,7 @@ class DocumentConfiguration(object):


 def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:bool) -> None:
-	docs:Dict[str, Tuple[Document, DocumentConfiguration]]		= {}
+	docs:Dict[str, Tuple[Document, DocumentConfiguration, Any]]		= {}
 	ptasks 															= {}
 	mediaRelations:Dict[str, str] 									= {}
 	addSectionNumbers 												= False
@@ -209,6 +211,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 	headers:list[Tuple[int, str]]									= []
 	emfFiles:list[str]												= []
 	referencedImages:list[str]										= []
+	footnotes:dict[str, str]										= {}

 	global _print
 	
@@ -292,7 +295,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 			return tag


-		def getTextFromXML(elem:Paragraph|_Cell) -> str:
+		def getTextFromXML(elem:Paragraph|_Cell|ET._Element) -> str:

 			#	Not-used document tags.
 			_ignoredTags = ( 'AlternateContent',
@@ -310,6 +313,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 							 'commentRangeStart',
 							 'commentRangeEnd',
 							 'commentReference',
+							 'smartTag',
+							 'footnoteRef',
 			)
 			

@@ -405,13 +410,15 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 						pass	# ignore a soft hyphen character which has no meaning in Markdown and zero-width 
 					
 					case 'sym':
-						def _symError(ch:str) -> str:
+
+						def _symError(ch:int) -> None:
 							nonlocal _result
 							_symError = f'unknown font+symbol: {element.attrib["{"+wns+"}font"]} - "{element.attrib["{"+wns+"}char"]} ({ch})"'
 							_print(f'[yellow]{_symError}')
 							_result += f'<mark>{_symError}</mark>'
+
 						try:
-							_ch = '????'
+							_ch = 0
 							_ch = int(element.attrib["{"+wns+"}char"], 16)
 							if _ch in docConfig.characters:
 								if (rch := docConfig.characters[_ch]) == chr(0):
@@ -431,6 +438,18 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 						for x in element:
 							_result += _parseXML(x)
 					
+					case 'footnoteReference':
+						id = element.attrib[f'{{{wns}}}id']
+						_result += f'[^{id}]'
+						footnotes[id] = '<mark>unknown footnote</mark>'
+					
+					# The footnote itself is not included in the document but in a separate file.
+					# Therefore, we need to extract the footnote from the footnotes.xml file. The format
+					# of the footnote is the same as a paragraph.
+					case 'footnote':
+						for x in element:
+							_result += _parseXML(x)
+					
 					case _ if tag in _ignoredTags:	# ignore
 						pass
 					
@@ -448,6 +467,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 					# Create a list of parsed paragraphs and join them with linebreaks
 					return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip() 
 										   for p in elem.paragraphs ])
+				case ET._Element():
+					return _parseXML(elem)
 				case _:
 					return ''

@@ -480,7 +501,12 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 				stopProgress(f'[red]Input document "{d}" is not a file')
 				return
 			try:
-				docs[d] = (docx.Document(d), DocumentConfiguration(d))
+				# Look for the footnotes part (footnotes.xml) among the parts of the document package
+				footnotesPart = None
+				for part in Package.open(d).parts:
+					if part.partname.endswith('/footnotes.xml'):
+						footnotesPart = part
+				docs[d] = (docx.Document(d), DocumentConfiguration(d), footnotesPart)
 				ptasks[d] = progress.add_task(f'Processing {d}', total = None)
 				progress.update(readTask, advance=1)
 			except docx.opc.exceptions.PackageNotFoundError as e:
@@ -495,7 +521,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 		#	Processing Documents
 		#

-		for docFileName, (doc, docConfig) in docs.items():
+		for docFileName, (doc, docConfig, footnotesPart) in docs.items():
 			processTask = ptasks[docFileName]
 			docItems = list(iter_block_items(doc))
 			addSectionNumbers = docConfig.addSectionNumbers
@@ -517,7 +543,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 				return

 			# Add sub-progress task
-			progress.update(processTask, total = len(docItems) + 5)	# + relations + image extraction + characters + toc + media convert
+			progress.update(processTask, total = len(docItems) + 6)	# + relations + image extraction + characters + toc + footnotes + media convert


 			#	Extract the media relations file, and get the mappings from document IDs to media files
@@ -769,7 +795,9 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 					elif codeblock:
 						# Add whole code block to lines
 						_lines.append('```')
+						_lines.append('')
 						_lines.extend(codeblock)
+						_lines.append('')
 						_lines.append('```')
 						codeblock = []
 					else:
@@ -830,6 +858,29 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 				lines[i] = re.sub(_referenceExpression, _repl, line)	# type:ignore[arg-type]


+			#
+			#	Process footnotes
+			#
+			progress.update(processTask, advance = 1)	# progress update
+			if len(footnotes) and footnotesPart is not None:
+				_print(f'[yellow]Footnotes found: {len(footnotes)}')
+				# Analyze footnotes file
+				footnotesXML = ET.fromstring(footnotesPart.blob)
+				# Process the footnotes XML here
+				for element in footnotesXML:
+
+					# Footnote found
+					if strippedTag(element.tag) == 'footnote':
+						footnoteID = element.attrib[f'{{{wns}}}id']
+						if footnoteID in footnotes:
+							t = getTextFromXML(element)
+							footnotes[footnoteID] = t
+				
+				# Add footnotes to the end of the document
+				lines.append('')
+				for fid, text in footnotes.items():
+					lines.append(f'[^{fid}]: {text}')
+
 			#
 			#	List unresolved CAPTION markers
 			#