Skip to content
Snippets Groups Projects
Commit 6f5baea5 authored by Andreas Kraft's avatar Andreas Kraft
Browse files

Added support for footnotes

parent 9328ebdb
No related branches found
No related tags found
No related merge requests found
......@@ -31,18 +31,31 @@ imageCaptions2AltText = true
[toc]
# Add section numbers to the headlines
addSectionNumbers = false
excludeFromNumbering =
# Exclude the following paragraph types from numbering.
# The default is to exclude the "Content" heading.
excludeFromNumbering = tt
# The paragraph type that is used in the original document for the table of contents.
tocStartParagraph = heading no numbering
# The level of the table of contents.
tocHeaderLevel = 1
# Automatically generate a table of contents.
generateToc = false
# Add a macro "[toc]" to the document that can be used to generate a table of contents.
# Some converters and viewers support this macro.
addTocMacro = false
[paragraphs]
normal = normal
h1 = heading 1, tt
h2 = heading 2
normal = normal, onem2m-normal
h1 = heading 1, tt, onem2m-heading1
h2 = heading 2, onem2m-heading2
h3 = heading 3
h4 = heading 4
h5 = heading 5
......@@ -53,7 +66,7 @@ h9 = heading 9
a1 = heading 1
a2 = heading 2
a3 = heading 3
note = no
note = no, onem2m-iprtitle, onem2m-ipr
code = pl
example = ex, ew
ul1 = b1, b1+, list paragraph
......@@ -63,7 +76,7 @@ ul4 = b4, b4+
ul5 = b5, b5+
ol1 = bn
ol2 = bl
tablecaption = caption, th
tablecaption = caption, th, onem2m-tabletitle
imagecaption = tf
image = fl
empty = fp
......
......@@ -9,10 +9,12 @@
from enum import IntEnum, auto
from typing import Callable, Tuple, Dict, Optional
from typing import Callable, Tuple, Dict, Optional, Any
from pathlib import Path, PurePath
from docx.document import Document
from docx.text.paragraph import Paragraph
from docx.package import Package
import docx.opc.exceptions
from docx.table import _Cell, Table
from docx.oxml.table import CT_Tbl
......@@ -201,7 +203,7 @@ class DocumentConfiguration(object):
def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:bool) -> None:
docs:Dict[str, Tuple[Document, DocumentConfiguration]] = {}
docs:Dict[str, Tuple[Document, DocumentConfiguration, Any]] = {}
ptasks = {}
mediaRelations:Dict[str, str] = {}
addSectionNumbers = False
......@@ -209,6 +211,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
headers:list[Tuple[int, str]] = []
emfFiles:list[str] = []
referencedImages:list[str] = []
footnotes:dict[str, str] = {}
global _print
......@@ -292,7 +295,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
return tag
def getTextFromXML(elem:Paragraph|_Cell) -> str:
def getTextFromXML(elem:Paragraph|_Cell|ET._Element) -> str:
# Not-used document tags.
_ignoredTags = ( 'AlternateContent',
......@@ -310,6 +313,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
'commentRangeStart',
'commentRangeEnd',
'commentReference',
'smartTag',
'footnoteRef',
)
......@@ -405,13 +410,15 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
pass # ignore a soft hyphen character which has no meaning in Markdown and zero-width
case 'sym':
def _symError(ch:str) -> str:
def _symError(ch:int) -> None:
nonlocal _result
_symError = f'unknown font+symbol: {element.attrib["{"+wns+"}font"]} - "{element.attrib["{"+wns+"}char"]} ({ch})"'
_print(f'[yellow]{_symError}')
_result += f'<mark>{_symError}</mark>'
try:
_ch = '????'
_ch = 0
_ch = int(element.attrib["{"+wns+"}char"], 16)
if _ch in docConfig.characters:
if (rch := docConfig.characters[_ch]) == chr(0):
......@@ -431,6 +438,18 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
for x in element:
_result += _parseXML(x)
case 'footnoteReference':
id = element.attrib[f'{{{wns}}}id']
_result += f'[^{id}]'
footnotes[id] = '<mark>unknown footnote</mark>'
# The footnote text itself is not stored in the main document XML but in a separate
# part of the package (footnotes.xml). Therefore, it has to be extracted from that file.
# A footnote has the same format as a paragraph.
case 'footnote':
for x in element:
_result += _parseXML(x)
case _ if tag in _ignoredTags: # ignore
pass
......@@ -448,6 +467,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
# Create a list of parsed paragraphs and join them with linebreaks
return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip()
for p in elem.paragraphs ])
case ET._Element():
return _parseXML(elem)
case _:
return ''
......@@ -480,7 +501,12 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
stopProgress(f'[red]Input document "{d}" is not a file')
return
try:
docs[d] = (docx.Document(d), DocumentConfiguration(d))
# Search for footnotes in the document XML
footnotesPart = None
for part in Package.open(d).parts:
if part.partname.endswith('/footnotes.xml'):
footnotesPart = part
docs[d] = (docx.Document(d), DocumentConfiguration(d), footnotesPart)
ptasks[d] = progress.add_task(f'Processing {d}', total = None)
progress.update(readTask, advance=1)
except docx.opc.exceptions.PackageNotFoundError as e:
......@@ -495,7 +521,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
# Processing Documents
#
for docFileName, (doc, docConfig) in docs.items():
for docFileName, (doc, docConfig, footnotesPart) in docs.items():
processTask = ptasks[docFileName]
docItems = list(iter_block_items(doc))
addSectionNumbers = docConfig.addSectionNumbers
......@@ -517,7 +543,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
return
# Add sub-progress task
progress.update(processTask, total = len(docItems) + 5) # + relations + image extraction + characters + toc + media convert
progress.update(processTask, total = len(docItems) + 6) # + relations + image extraction + characters + toc + footnotes + media convert
# Extract the media relations file, and get the mappings from document IDs to media files
......@@ -769,7 +795,9 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
elif codeblock:
# Add whole code block to lines
_lines.append('```')
_lines.append('')
_lines.extend(codeblock)
_lines.append('')
_lines.append('```')
codeblock = []
else:
......@@ -830,6 +858,29 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
lines[i] = re.sub(_referenceExpression, _repl, line) # type:ignore[arg-type]
#
# Process footnotes
#
progress.update(processTask, advance = 1) # progress update
if len(footnotes) and footnotesPart is not None:
_print(f'[yellow]Footnotes found: {len(footnotes)}')
# Analyze footnotes file
footnotesXML = ET.fromstring(footnotesPart.blob)
# Process the footnotes XML here
for element in footnotesXML:
# Footnote found
if strippedTag(element.tag) == 'footnote':
footnoteID = element.attrib[f'{{{wns}}}id']
if footnoteID in footnotes:
t = getTextFromXML(element)
footnotes[footnoteID] = t
# Add footnotes to the end of the document
lines.append('')
for fid, text in footnotes.items():
lines.append(f'[^{fid}]: {text}')
#
# List unresolved CAPTION markers
#
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment