Skip to content
Snippets Groups Projects
Commit f46a2a97 authored by Andreas Kraft
Browse files

Improved parsing and generation of tables. Corrected various wrong...

Improved parsing and generation of tables. Corrected various wrong replacements. List unresolved captions
parent 6401024d
No related branches found
No related tags found
No related merge requests found
...@@ -52,8 +52,8 @@ unreferencedSubDir = 'unreferenced' ...@@ -52,8 +52,8 @@ unreferencedSubDir = 'unreferenced'
_linebreak = '<br />' _linebreak = '<br />'
_entityLt = '&lt;' _entityLt = '&lt;'
_nbsp = '&nbsp;' _nbsp = '&nbsp;'
_tocInsertPoint = '__t_o_c__' _tocInsertPoint = '~~t~o~c~~'
_captionMarker = '__CAPTION__' _captionMarker = '~~CAPTION~~'
# https://learn.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing?view=openxml-2.8.1 # https://learn.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing?view=openxml-2.8.1
...@@ -335,6 +335,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: ...@@ -335,6 +335,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
_bold = '**' _bold = '**'
case 'i' if ep.attrib.get(_val, 'true') == 'true': case 'i' if ep.attrib.get(_val, 'true') == 'true':
_italics = '_' _italics = '_'
# case _:
# _print(f'[yellow]unsupported style: {ep.tag}')
# Strip white spaces if bold or italics # Strip white spaces if bold or italics
_s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text))) _s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text)))
...@@ -342,11 +344,13 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: ...@@ -342,11 +344,13 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
_s = _s.replace('_', '\\_') _s = _s.replace('_', '\\_')
_s = _s.replace('*', '\\*') _s = _s.replace('*', '\\*')
# Add trailing white space when bold or italics # Add trailing white space when bold or italics
_prefix = ' ' if _bold or _italics else '' _postfix = ' ' if _bold or _italics else ''
_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_prefix}' _result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_postfix}'
# print(_result)
case 'br': case 'br':
_result += _linebreak _result += _linebreak
case 'bookmarkStart' | 'bookmarkEnd': # TODO ? case 'bookmarkStart' | 'bookmarkEnd': # TODO ?
pass pass
...@@ -366,20 +370,21 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: ...@@ -366,20 +370,21 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
blip = element.findall('ns1:inline/ns3:graphic/ns3:graphicData/ns4:pic/ns4:blipFill/ns3:blip', blip = element.findall('ns1:inline/ns3:graphic/ns3:graphicData/ns4:pic/ns4:blipFill/ns3:blip',
namespaces = { namespaces = {
'ns1' : 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing', 'ns1' : 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
'ns3' : wns, 'ns3' : 'http://schemas.openxmlformats.org/drawingml/2006/main',
'ns4' : 'http://schemas.openxmlformats.org/drawingml/2006/picture', 'ns4' : 'http://schemas.openxmlformats.org/drawingml/2006/picture',
}) })
if blip and \ if blip and \
(rId := blip[0].attrib.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')) and \ (rId := blip[0].attrib.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')) and \
(mediaFile := mediaRelations.get(rId)): (mediaFile := mediaRelations.get(rId)):
referencedImages.append(Path(mediaFile).stem) # Add to referenced files mediaFilePath = Path(mediaFile)
if docConfig.renameEMFExtension and mediaFile.lower().endswith('.emf'): referencedImages.append(mediaFilePath.stem) # Add to referenced files
mediaFile = str(Path(mediaFile).with_suffix(f'.{docConfig.renameEMFExtension}')) if docConfig.renameEMFExtension and mediaFilePath.suffix.lower() == '.emf':
_print(f'[yellow]Renaming EMF file reference to "{mediaFile}"') mediaFilePath = mediaFilePath.with_suffix(f'.{docConfig.renameEMFExtension}')
_result += f'![{_captionMarker}]({mediaFile})' _print(f'[yellow]Renaming EMF file reference to "{str(mediaFilePath)}"')
_result += f'![{_captionMarker}]({mediaFilePath.as_posix()})' # image reference as posix path
# else: # else:
# _print(blip) # _print(blip)
case 'pict': case 'pict':
# for e in element: # for e in element:
# print(f'----{e}') # print(f'----{e}')
...@@ -423,10 +428,10 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: ...@@ -423,10 +428,10 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
case Paragraph(): # type: ignore[misc] case Paragraph(): # type: ignore[misc]
return _parseXML(ET.fromstring(elem._p.xml)) return _parseXML(ET.fromstring(elem._p.xml))
case _Cell(): # type: ignore[misc] case _Cell(): # type: ignore[misc]
result = '' # Iterate over all paragraphs in the cell and parse them
for p in elem.paragraphs: # Create a list of parsed paragraphs and join them with linebreaks
result += _parseXML(ET.fromstring(p._p.xml), True) return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip()
return result for p in elem.paragraphs ])
case _: case _:
return '' return ''
...@@ -614,10 +619,10 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: ...@@ -614,10 +619,10 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
# Table Caption # Table Caption
elif style in docConfig.tablecaption: elif style in docConfig.tablecaption:
lines.append('') lines.append('')
lines.append(f'**{replaceNL(text).strip()}**') caption = replaceNL(text).strip()
anchor = f'<a name="table_{caption[6:].split(":")[0].strip()}"></a>' if caption.startswith('Table ') and ':' in caption else '' anchor = f'<a name="table_{caption[6:].split(":")[0].strip()}"></a>' if caption.startswith('Table ') and ':' in caption else ''
lines.append(f'**{caption}**{anchor}') lines.append(f'**{caption}**{anchor}')
# Image Caption # Image Caption
elif style in docConfig.imagecaption: elif style in docConfig.imagecaption:
checkSameStyle(Style.imagecaption, lambda:lines.append('')) checkSameStyle(Style.imagecaption, lambda:lines.append(''))
...@@ -679,12 +684,16 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: ...@@ -679,12 +684,16 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
# Warning if this is a single-row table # Warning if this is a single-row table
if nrRows == 1: if nrRows == 1:
_print(f'[red]Single-row table found. Consider replacing it in the original document:\n{rows[0]}') _print(f'[red]Single-row table found. Such tables cannot be converted to markdown.[/red] Please consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False)
lines.append('') # Add an empty line before a table lines.append('') # Add an empty line before a table
for idx, row in enumerate(rows): for idx, row in enumerate(rows):
# Check for a table caption and add separator line
if idx == 1: if idx == 1:
lines.append('-'.join('|' * (len(row) + 1) )) lines.append('-'.join('|' * (len(row) + 1) ))
# Add table row
lines.append(f'|{"|".join(row)}|' lines.append(f'|{"|".join(row)}|'
.replace('\n', _linebreak)) # replace line breaks in cells .replace('\n', _linebreak)) # replace line breaks in cells
lines.append('') # Add another empty line after a table lines.append('') # Add another empty line after a table
...@@ -719,7 +728,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: ...@@ -719,7 +728,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
line = lines[i] line = lines[i]
line = line.replace('__', '') line = line.replace('__', '')
line = line.replace('****', '') line = line.replace('****', '')
line = line.replace(' ', ' ') #line = line.replace(' ', ' ')
lines[i] = line lines[i] = line
...@@ -775,6 +784,15 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: ...@@ -775,6 +784,15 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
line = lines[i] line = lines[i]
lines[i] = re.sub(_referenceExpression, _repl, line) # type:ignore[arg-type] lines[i] = re.sub(_referenceExpression, _repl, line) # type:ignore[arg-type]
#
# List unresolved CAPTION markers
#
for i in range(len(lines)):
line = lines[i]
if _captionMarker in line:
_print(f'[yellow]Unresolved figure caption : \[{i}] "{line}"')
# #
# Write produced Markdown file # Write produced Markdown file
# #
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment