Compare revisions

Andreas Kraft · Miguel Angel Reina Ortega · Miguel Angel Reina Ortega · 4c12b1be · 4c12b1be · 4c12b1be
--- a/config.ini
+++ b/config.ini
@@ -100,6 +100,11 @@ ff0c = 2c20
 d7 = 78
 ; Ligature "fi"
 fb01 = 6669
+; "<="
+f0fd = 3c3d
+; "=>"
+f0e0 = 3e3d
+


 [media]

--- a/requirements.txt
+++ b/requirements.txt
@@ -20,3 +20,4 @@ rich==13.7.0
    # via oneM2M-spec-2-MD-converter (setup.py)
 typing-extensions==4.8.0
    # via python-docx
+pillow==10.1.0
--- a/spec2md.py
+++ b/spec2md.py
@@ -26,6 +26,8 @@ from rich import inspect
 import configparser, zipfile
 from lxml import etree as ET

+from PIL import Image
+
 class Style(IntEnum):
 	example = auto()
 	image = auto()
@@ -52,8 +54,8 @@ unreferencedSubDir = 'unreferenced'
 _linebreak = '<br />'
 _entityLt = '&lt;'
 _nbsp = '&nbsp;'
-_tocInsertPoint = '__t_o_c__'
-_captionMarker = '__CAPTION__'
+_tocInsertPoint = '~~t~o~c~~'
+_captionMarker = '~~CAPTION~~'


 # https://learn.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing?view=openxml-2.8.1
@@ -335,6 +337,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 											_bold = '**'
 										case 'i' if ep.attrib.get(_val, 'true') == 'true':
 											_italics = '_'
+										# case _:
+										# 	_print(f'[yellow]unsupported style: {ep.tag}')
 						
 						# Strip white spaces if bold or italics
 						_s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text)))
@@ -342,11 +346,13 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 						_s = _s.replace('_', '\\_')
 						_s = _s.replace('*', '\\*')
 						# Add trailing white space when bold or italics
-						_prefix = ' ' if _bold or _italics else ''
-						_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_prefix}'
+						_postfix = ' ' if _bold or _italics else ''
+						_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_postfix}'
+						# print(_result)

 					case 'br':
 						_result += _linebreak
+						
 					case 'bookmarkStart' | 'bookmarkEnd':		# TODO ?
 						pass

@@ -366,20 +372,21 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 						blip = element.findall('ns1:inline/ns3:graphic/ns3:graphicData/ns4:pic/ns4:blipFill/ns3:blip', 
 												namespaces = { 
 													'ns1' : 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
-													'ns3' : wns,
+													'ns3' : 'http://schemas.openxmlformats.org/drawingml/2006/main',
 													'ns4' : 'http://schemas.openxmlformats.org/drawingml/2006/picture',
 												})
 						if blip and \
 							(rId := blip[0].attrib.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')) and \
 							(mediaFile := mediaRelations.get(rId)):
-							referencedImages.append(Path(mediaFile).stem)	# Add to referenced files
-							if docConfig.renameEMFExtension and mediaFile.lower().endswith('.emf'):
-								mediaFile = str(Path(mediaFile).with_suffix(f'.{docConfig.renameEMFExtension}'))
-								_print(f'[yellow]Renaming EMF file reference to "{mediaFile}"')
-							_result += f'![{_captionMarker}]({mediaFile})'
+							mediaFilePath = Path(mediaFile)
+							referencedImages.append(mediaFilePath.stem)	# Add to referenced files
+							if docConfig.renameEMFExtension and mediaFilePath.suffix.lower() == '.emf':
+								mediaFilePath = mediaFilePath.with_suffix(f'.{docConfig.renameEMFExtension}')
+								_print(f'[yellow]Renaming EMF file reference to "{str(mediaFilePath)}"')
+							_result += f'![{_captionMarker}]({mediaFilePath.as_posix()})'	# image reference as posix path
 						# else:
 						# 	_print(blip)
-					
+
 					case 'pict':
 						# for e in element:
 						# 	print(f'----{e}')
@@ -397,9 +404,23 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 						pass	# ignore a soft hyphen character which has no meaning in Markdown and zero-width 
 					
 					case 'sym':
-						_symError = f'unknown font+symbol: {element.attrib["{"+wns+"}font"]} - "{element.attrib["{"+wns+"}char"]}"'
-						_print(f'[yellow]{_symError}')
-						_result += f'<mark>{_symError}</mark>'
+						if inCell:
+							ch = element.attrib["{"+wns+"}char"]
+							_print(f'[yellow]: {ch} ')
+
+							if not ch.isascii():
+								_print(f'[yellow]: {ch}')
+								if (_ch := ord(ch)) in docConfig.characters:
+									if (rch := docConfig.characters[_ch]) == chr(0):
+										rch = ''
+									_result = rch
+								else:
+									_print(
+										f'[yellow]Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}')
+						else:
+							_symError = f'unknown font+symbol: {element.attrib["{"+wns+"}font"]} - "{element.attrib["{"+wns+"}char"]}"'
+							_print(f'[yellow]{_symError}')
+							_result += f'<mark>{_symError}</mark>'

 					# ignore deleted test
 					case 'del':
@@ -423,10 +444,10 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 				case Paragraph():	# type: ignore[misc]
 					return _parseXML(ET.fromstring(elem._p.xml))
 				case _Cell():		# type: ignore[misc]
-					result = ''
-					for p in elem.paragraphs:
-						result += _parseXML(ET.fromstring(p._p.xml), True)
-					return result
+					# Iterate over all paragraphs in the cell and parse them
+					# Create a list of parsed paragraphs and join them with linebreaks
+					return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip() 
+										   for p in elem.paragraphs ])
 				case _:
 					return ''

@@ -535,7 +556,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 			# 	Processing the document			
 			lines:list[str] = []
 			imageIndex = 1
-
+			isAnnex = False
 			for elem in docItems:
 				paragraphNr += 1
 				progress.update(processTask, advance = 1)
@@ -550,25 +571,33 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 							checkSameStyle(Style.normal, lambda:lines.append(''))
 							lines.append(text)
 							lines.append('')	# Add empty line 
-							
+							continue
+
 						#	Headers
-						elif style in docConfig.h1:
+						#print(f'{style} {text}')
+						# Check if annexes start
+						if text.find("Annex A") != -1:
+							isAnnex = True
+						elif text.find("History") != -1:
+							isAnnex = False
+
+						if (style in docConfig.h1) and not isAnnex:
 							lines.extend(toHeader(style, text, 1))
-						elif style in docConfig.h2:
+						elif (style in docConfig.h2) and not isAnnex:
 							lines.extend(toHeader(style, text, 2))
-						elif style in docConfig.h3:
+						elif (style in docConfig.h3) and not isAnnex:
 							lines.extend(toHeader(style, text, 3))
-						elif style in docConfig.h4:
+						elif (style in docConfig.h4) and not isAnnex:
 							lines.extend(toHeader(style, text, 4))
-						elif style in docConfig.h5:
+						elif (style in docConfig.h5) and not isAnnex:
 							lines.extend(toHeader(style, text, 5))
-						elif style in docConfig.h6:
+						elif (style in docConfig.h6) and not isAnnex:
 							lines.extend(toHeader(style, text, 6))
-						elif style in docConfig.h7:
+						elif (style in docConfig.h7) and not isAnnex:
 							lines.extend(toHeader(style, text, 7))
-						elif style in docConfig.h8:
+						elif (style in docConfig.h8) and not isAnnex:
 							lines.extend(toHeader(style, text, 8))
-						elif style in docConfig.h9:
+						elif (style in docConfig.h9) and not isAnnex:
 							lines.extend(toHeader(style, text, 9))

 						#	Annexes
@@ -614,10 +643,11 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 						#	Table Caption
 						elif style in docConfig.tablecaption:
 							lines.append('')
-							lines.append(f'**{replaceNL(text).strip()}**')
+							caption = replaceNL(text).strip()
+							caption = replaceNL(text).strip()
 							anchor = f'<a name="table_{caption[6:].split(":")[0].strip()}"></a>' if caption.startswith('Table ') and ':' in caption else ''
 							lines.append(f'**{caption}**{anchor}')
-							
+
 						#	Image Caption
 						elif style in docConfig.imagecaption:
 							checkSameStyle(Style.imagecaption, lambda:lines.append(''))
@@ -655,6 +685,10 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 							if docConfig.generateToc:
 								lines.append(_tocInsertPoint)

+							# Check when TOC ends
+							if text.find("History"):
+								isAnnex = False
+
 						# 	Ignore & empty
 						elif style in docConfig.ignore:
 							pass
@@ -667,6 +701,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 							lines.append(text)


+
 					case 'Table':
 						rows:list[list[str]] = []
 						nrRows = 0
@@ -679,12 +714,16 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 						
 						# Warning if this is a single-row table
 						if nrRows == 1:
-							_print(f'[red]Single-row table found. Consider replacing it in the original document:\n{rows[0]}')
+							_print(f'[red]Single-row table found. Such tables cannot be converted to markdown.[/red] Please consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False)

 						lines.append('')	# Add an empty line before a table
 						for idx, row in enumerate(rows):
+
+							# Insert the header separator line after the first (header) row
 							if idx == 1:
 								lines.append('-'.join('|' * (len(row) + 1) ))
+							
+							# Add table row
 							lines.append(f'|{"|".join(row)}|'
 										 .replace('\n', _linebreak))	# replace line breaks in cells
 						lines.append('')	# Add another empty line after a table
@@ -719,7 +758,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 				line = lines[i]
 				line = line.replace('__', '')
 				line = line.replace('****', '')
-				line = line.replace('  ', ' ')
+				#line = line.replace('  ', ' ')
 				lines[i] = line


@@ -775,6 +814,15 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 				line = lines[i]
 				lines[i] = re.sub(_referenceExpression, _repl, line)	# type:ignore[arg-type]

+
+			#
+			#	List unresolved CAPTION markers
+			#
+			for i in range(len(lines)):
+				line = lines[i]
+				if _captionMarker in line:
+					_print(f'[yellow]Unresolved figure caption : \[{i}] "{line}"')
+			
 			#
 			#	Write produced Markdown file
 			#
@@ -806,10 +854,12 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
 						cmd = converter
 						cmd = cmd.replace('{infile}', fn).replace('{outdir}', _t)
 						_print(f'Converting EMF file: {fn} to "{format}"', highlight = False)
-						if (res := subprocess.run(cmd, shell = True, capture_output = True)).returncode != 0:
-							_print(f'[red]Error running command: {res.stderr.decode("utf-8")}')
-							_print(f'[red]Please check the configuration file -> section "\[media]" for the converter command: {converter}')
-							break
+
+						convert(fn, fn[:-4])
+						#if (res := subprocess.run(cmd, shell = True, capture_output = True)).returncode != 0:
+						#	_print(f'[red]Error running command: {res.stderr.decode("utf-8")}')
+						#	_print(f'[red]Please check the configuration file -> section "\[media]" for the converter command: {converter}')
+						#	break

 			if not skipImageConversion:
 				if docConfig.emfConverterPng:
@@ -824,6 +874,10 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:

 		progress.stop()

+def convert(input:str, output_name:str):
+	"""Open the image file `input` and save it as `output_name` + ".png" (Pillow infers the format from the extension)."""
+	with Image.open(input) as image:	# context manager closes the source file, even if saving fails
+		image.save(output_name + ".png")


 if __name__ == '__main__':
No results found