Improved handling of sometimes broken inline formatting in table cells. Adding...

Improved handling of sometimes broken inline formatting in table cells. Adding more default heading formats.

Improved handling of sometimes broken inline formatting in table cells. Adding...
78951d3f · Andreas Kraft · 10048d4b · 78951d3f · 78951d3f · 78951d3f
Commit 78951d3f authored 1 year ago by Andreas Kraft
--- a/README.md
+++ b/README.md
-# onem2m-spec2md
+# spec2md
 Convert oneM2M specification documents to markdown
@@ -26,6 +26,27 @@ python3 spec2md.py <path-to-word-document>
 Is *LibreOffice* already running? If yes, then close it.
+### Are linebreaks, paragraphs, and lists supported in table cells?
+Unfortunately, markdown doesn't support multiple paragraphs in table cells. A table cell must be a single line. However, one can add an HTML `<br />` line break to break between lines:
+```markdown
+| Header         |
+|----------------|
+| text<br />text |
+```
+Lists in table cells are also not possible. One may use HTML lists for this, but this use is **discouraged** because it may cause problems in conversions from markdown to other document formats. It is recommended to use simple lists using a dash `-` character:
+```markdown
+| Header                           |
+|----------------------------------|
+| - list item 1<br />- list item 2 |
+```
 ## Changes
+- **2023-08-18** - Improved handling of sometimes broken inline formatting in table cells. Adding more default heading formats.
 - **2023-07-27** - Added converting bold and italic text in paragraphs, headers and tables.
\ No newline at end of file
--- a/config.ini
+++ b/config.ini
@@ -42,6 +42,8 @@ h4 = heading 4
 h5 = heading 5
 h6 = heading 6
 h7 = heading 7
+h8 = heading 8
+h9 = heading 9
 a1 = heading 1
 a2 = heading 2
 a3 = heading 3
@@ -58,7 +60,7 @@ tablecaption = caption, th
 imagecaption = tf
 image = fl
 empty = fp
-ignore = toc 1, toc 2, toc 3, toc 4, toc 5, toc 6, toc 7
+ignore = toc 1, toc 2, toc 3, toc 4, toc 5, toc 6, toc 7, toc 8, toc 9
 [characters]

--- a/spec2md.py
+++ b/spec2md.py
@@ -84,7 +84,7 @@ _val = f'{{{wns}}}val'
 class SectionNumbers(object):
 	def __init__(self) -> None:
-		self.levels:list[int] = [ 0, 0, 0, 0]
+		self.levels:list[int] = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
 		self.heading:int = 0
 		self.annex:int = 64
@@ -157,6 +157,8 @@ class DocumentConfiguration(object):
 		self.h5 = self.paragraphs['h5']
 		self.h6 = self.paragraphs['h6']
 		self.h7 = self.paragraphs['h7']
+		self.h8 = self.paragraphs['h8']
+		self.h9 = self.paragraphs['h9']
 		self.a1 = self.paragraphs['a1']
 		self.a2 = self.paragraphs['a2']
 		self.a3 = self.paragraphs['a3']
@@ -195,7 +197,7 @@ class DocumentConfiguration(object):
-def processDocuments(documents:list[str], outDirectory:str) -> None:
+def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:bool) -> None:
 	docs:Dict[str, Tuple[Document, DocumentConfiguration]]		= {}
 	ptasks 														= {}
 	mediaRelations:Dict[str, str] 								= {}
@@ -308,7 +310,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
 			)
-			def _parseXML(element:ET.Element) -> str:
+			def _parseXML(element:ET.Element, inCell:Optional[bool] = False) -> str:
 				"""	Recursively parse a document paragraph.
 				"""
 				nonlocal _ignoredTags
@@ -318,22 +320,30 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
 				match tag:
 					case 'p':
 						for x in element:
-							_result += _parseXML(x)
+							_result += _parseXML(x, inCell)
 					case 'r':
 						for x in element:
-							_result += _parseXML(x)
+							_result += _parseXML(x, inCell)
 					case 't':
-						bold = ''
+						_bold = ''
-						italics = ''
+						_italics = ''
 						for e in element.getparent():
 							if strippedTag(e.tag) == 'rPr':	# paragraph style
 								for ep in e:
 									match strippedTag(ep.tag):
 										case 'b' if ep.attrib.get(_val, 'true') == 'true':
-											bold = '**'
+											_bold = '**'
 										case 'i' if ep.attrib.get(_val, 'true') == 'true':
-											italics = '_'
+											_italics = '_'
-						_result += f'{bold}{italics}{str(toMD(str(element.text)))}{italics}{bold}'
+						# Strip white spaces if bold or italics
+						_s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text)))
+						# Replace single * or _
+						_s = _s.replace('_', '\\_')
+						_s = _s.replace('*', '\\*')
+						# Add trailing white space when bold or italics
+						_prefix = ' ' if _bold or _italics else ''
+						_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_prefix}'
 					case 'br':
 						_result += _linebreak
@@ -344,7 +354,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
 						# Hyperlinks and URLs
 						_hresult = ''
 						for x in element:
-							_hresult += _parseXML(x)
+							_hresult += _parseXML(x, inCell)
 						_result += f'[{_hresult}]({_hresult})'
 					case 'drawing':
@@ -356,7 +366,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
 						blip = element.findall('ns1:inline/ns3:graphic/ns3:graphicData/ns4:pic/ns4:blipFill/ns3:blip', 
 												namespaces = { 
 													'ns1' : 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
-													'ns3' : 'http://schemas.openxmlformats.org/drawingml/2006/main',
+													'ns3' : wns,
 													'ns4' : 'http://schemas.openxmlformats.org/drawingml/2006/picture',
 												})
 						if blip and \
@@ -367,8 +377,8 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
 								mediaFile = str(Path(mediaFile).with_suffix(f'.{docConfig.renameEMFExtension}'))
 								_print(f'[yellow]Renaming EMF file reference to "{mediaFile}"')
 							_result += f'![{_captionMarker}]({mediaFile})'
-						else:
+						# else:
-							_print(blip)
+						# 	_print(blip)
 					case 'pict':
 						# for e in element:
@@ -380,10 +390,17 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
 						#inspect(element)
 						pass
 					case 'tab':
 						_result += '    '	# TODO nbsp?
+					case 'softHyphen':
+						pass	# ignore a soft hyphen character which has no meaning in Markdown and zero-width 
+					case 'sym':
+						_symError = f'unknown font+symbol: {element.attrib["{"+wns+"}font"]} - "{element.attrib["{"+wns+"}char"]}"'
+						_print(f'[yellow]{_symError}')
+						_result += f'<mark>{_symError}</mark>'
 					case _ if tag in _ignoredTags:	# ignore
 						pass
@@ -399,7 +416,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
 				case _Cell():		# type: ignore[misc]
 					result = ''
 					for p in elem.paragraphs:
-						result += _parseXML(ET.fromstring(p._p.xml))
+						result += _parseXML(ET.fromstring(p._p.xml), True)
 					return result
 				case _:
 					return ''
@@ -434,7 +451,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
 				return
 			try:
 				docs[d] = (docx.Document(d), DocumentConfiguration(d))
-				ptasks[d] = progress.add_task(f'Processing {d}', total = 1000)
+				ptasks[d] = progress.add_task(f'Processing {d}', total = None)
 				progress.update(readTask, advance=1)
 			except docx.opc.exceptions.PackageNotFoundError as e:
 				stopProgress(f'[red]Input document "{d}" is not a .docx file')
@@ -540,6 +557,10 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
 							lines.extend(toHeader(style, text, 6))
 						elif style in docConfig.h7:
 							lines.extend(toHeader(style, text, 7))
+						elif style in docConfig.h8:
+							lines.extend(toHeader(style, text, 8))
+						elif style in docConfig.h9:
+							lines.extend(toHeader(style, text, 9))
 						#	Annexes
 						elif style in docConfig.a1:
@@ -676,9 +697,23 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
 							lines[i] = line
 						else:
 							_print(f'[yellow]Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}')
+			#
+			#	Remove multiple bold / italics on/off occurrences
+			#	Sometimes Word doesn't remove empty bold-on/bold-off (or italics) indicators
+			#
+			progress.update(processTask, advance = 1)	# progress update
+			for i in range(len(lines)):
+				line = lines[i]
+				line = line.replace('__', '')
+				line = line.replace('****', '')
+				line = line.replace('  ', ' ')
+				lines[i] = line
 			#
-			#	Insert auto-genrated table of contents
+			#	Insert auto-generated table of contents
 			#
 			progress.update(processTask, advance = 1)	# progress update
 			if docConfig.generateToc:
@@ -763,10 +798,11 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
 						if (res := subprocess.run(cmd, shell = True, capture_output = True)).returncode != 0:
 							_print(f'[red] Error running command: {res.stderr.decode("utf-8")}')
-			if docConfig.emfConverterPng:
+			if not skipImageConversion:
-				_convertImage(docConfig.emfConverterPng, 'png')
+				if docConfig.emfConverterPng:
-			if docConfig.emfConverterSvg:
+					_convertImage(docConfig.emfConverterPng, 'png')
-				_convertImage(docConfig.emfConverterSvg, 'svg')
+				if docConfig.emfConverterSvg:
+					_convertImage(docConfig.emfConverterSvg, 'svg')
 			emfFiles.clear()
 			referencedImages.clear()
@@ -782,6 +818,7 @@ if __name__ == '__main__':
 	# Parse command line arguments
 	parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 	parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default = 'out', metavar = '<output directory>',  help = 'specify output directory')
+	parser.add_argument('--skip-image-conversion', '-sic', action='store_true', dest='skipImageConversion',  help = 'skip image conversion step')
 	parser.add_argument('document', nargs = '+', help = 'documents to parse')
 	args = parser.parse_args()
@@ -789,5 +826,5 @@ if __name__ == '__main__':
 		# Process documents and print output
 	os.makedirs(args.outDirectory, exist_ok = True)
-	processDocuments(sorted(args.document), args.outDirectory)
+	processDocuments(sorted(args.document), args.outDirectory, args.skipImageConversion)