Improved parsing and generation of tables. Corrected various wrong...

Improved parsing and generation of tables. Corrected various wrong replacements. Listed unresolved captions.

Improved parsing and generation of tables. Corrected various wrong...
f46a2a97 · Andreas Kraft · 6401024d · f46a2a97
Commit f46a2a97 authored 1 year ago by Andreas Kraft
--- a/spec2md.py
+++ b/spec2md.py
@@ -52,8 +52,8 @@ unreferencedSubDir = 'unreferenced'
 _linebreak = '<br />'
 _entityLt = '&lt;'
 _nbsp = '&nbsp;'
-_tocInsertPoint = '__t_o_c__'
+_tocInsertPoint = '~~t~o~c~~'
-_captionMarker = '__CAPTION__'
+_captionMarker = '~~CAPTION~~'
 # https://learn.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing?view=openxml-2.8.1
@@ -335,6 +335,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 											_bold = '**'
 										case 'i' if ep.attrib.get(_val, 'true') == 'true':
 											_italics = '_'
+										# case _:
+										# 	_print(f'[yellow]unsupported style: {ep.tag}')
 						# Strip white spaces if bold or italics
 						_s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text)))
@@ -342,11 +344,13 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 						_s = _s.replace('_', '\\_')
 						_s = _s.replace('*', '\\*')
 						# Add trailing white space when bold or italics
-						_prefix = ' ' if _bold or _italics else ''
+						_postfix = ' ' if _bold or _italics else ''
-						_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_prefix}'
+						_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_postfix}'
+						# print(_result)
 					case 'br':
 						_result += _linebreak
 					case 'bookmarkStart' | 'bookmarkEnd':		# TODO ?
 						pass
@@ -366,20 +370,21 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 						blip = element.findall('ns1:inline/ns3:graphic/ns3:graphicData/ns4:pic/ns4:blipFill/ns3:blip', 
 												namespaces = { 
 													'ns1' : 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
-													'ns3' : wns,
+													'ns3' : 'http://schemas.openxmlformats.org/drawingml/2006/main',
 													'ns4' : 'http://schemas.openxmlformats.org/drawingml/2006/picture',
 												})
 						if blip and \
 							(rId := blip[0].attrib.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')) and \
 							(mediaFile := mediaRelations.get(rId)):
-							referencedImages.append(Path(mediaFile).stem)	# Add to referenced files
+							mediaFilePath = Path(mediaFile)
-							if docConfig.renameEMFExtension and mediaFile.lower().endswith('.emf'):
+							referencedImages.append(mediaFilePath.stem)	# Add to referenced files
-								mediaFile = str(Path(mediaFile).with_suffix(f'.{docConfig.renameEMFExtension}'))
+							if docConfig.renameEMFExtension and mediaFilePath.suffix.lower() == '.emf':
-								_print(f'[yellow]Renaming EMF file reference to "{mediaFile}"')
+								mediaFilePath = mediaFilePath.with_suffix(f'.{docConfig.renameEMFExtension}')
-							_result += f'![{_captionMarker}]({mediaFile})'
+								_print(f'[yellow]Renaming EMF file reference to "{str(mediaFilePath)}"')
+							_result += f'![{_captionMarker}]({mediaFilePath.as_posix()})'	# image reference as posix path
 						# else:
 						# 	_print(blip)
 					case 'pict':
 						# for e in element:
 						# 	print(f'----{e}')
@@ -423,10 +428,10 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 				case Paragraph():	# type: ignore[misc]
 					return _parseXML(ET.fromstring(elem._p.xml))
 				case _Cell():		# type: ignore[misc]
-					result = ''
+					# Iterate over all paragraphs in the cell and parse them
-					for p in elem.paragraphs:
+					# Create a list of parsed paragraphs and join them with linebreaks
-						result += _parseXML(ET.fromstring(p._p.xml), True)
+					return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip() 
-					return result
+										   for p in elem.paragraphs ])
 				case _:
 					return ''
@@ -614,10 +619,10 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 						#	Table Caption
 						elif style in docConfig.tablecaption:
 							lines.append('')
-							lines.append(f'**{replaceNL(text).strip()}**')
+							caption = replaceNL(text).strip()
 							anchor = f'<a name="table_{caption[6:].split(":")[0].strip()}"></a>' if caption.startswith('Table ') and ':' in caption else ''
 							lines.append(f'**{caption}**{anchor}')
 						#	Image Caption
 						elif style in docConfig.imagecaption:
 							checkSameStyle(Style.imagecaption, lambda:lines.append(''))
@@ -679,12 +684,16 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 						# Warning if this is a single-row table
 						if nrRows == 1:
-							_print(f'[red]Single-row table found. Consider replacing it in the original document:\n{rows[0]}')
+							_print(f'[red]Single-row table found. Such tables cannot be converted to markdown.[/red] Please consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False)
 						lines.append('')	# Add an empty line before a table
 						for idx, row in enumerate(rows):
+							# After the first (header) row, add the markdown header separator line
 							if idx == 1:
 								lines.append('-'.join('|' * (len(row) + 1) ))
+							# Add table row
 							lines.append(f'|{"|".join(row)}|'
 										 .replace('\n', _linebreak))	# replace line breaks in cells
 						lines.append('')	# Add another empty line after a table
@@ -719,7 +728,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 				line = lines[i]
 				line = line.replace('__', '')
 				line = line.replace('****', '')
-				line = line.replace('  ', ' ')
+				#line = line.replace('  ', ' ')
 				lines[i] = line
@@ -775,6 +784,15 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 				line = lines[i]
 				lines[i] = re.sub(_referenceExpression, _repl, line)	# type:ignore[arg-type]
+			#
+			#	List unresolved CAPTION markers
+			#
+			for i in range(len(lines)):
+				line = lines[i]
+				if _captionMarker in line:
+					_print(f'[yellow]Unresolved figure caption : \[{i}] "{line}"')
 			#
 			#	Write produced Markdown file
 			#