Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • tools/spec2md
1 result
Show changes
Commits on Source (3)
......@@ -100,6 +100,11 @@ ff0c = 2c20
d7 = 78
; Ligature "fi"
fb01 = 6669
; "<="
f0fd = 3c3d
; "=>"
f0e0 = 3e3d
[media]
......
......@@ -20,3 +20,4 @@ rich==13.7.0
# via oneM2M-spec-2-MD-converter (setup.py)
typing-extensions==4.8.0
# via python-docx
pillow==10.1.0
......@@ -26,6 +26,8 @@ from rich import inspect
import configparser, zipfile
from lxml import etree as ET
from PIL import Image
class Style(IntEnum):
example = auto()
image = auto()
......@@ -52,8 +54,8 @@ unreferencedSubDir = 'unreferenced'
_linebreak = '<br />'
_entityLt = '&lt;'
_nbsp = '&nbsp;'
_tocInsertPoint = '__t_o_c__'
_captionMarker = '__CAPTION__'
_tocInsertPoint = '~~t~o~c~~'
_captionMarker = '~~CAPTION~~'
# https://learn.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing?view=openxml-2.8.1
......@@ -335,6 +337,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
_bold = '**'
case 'i' if ep.attrib.get(_val, 'true') == 'true':
_italics = '_'
# case _:
# _print(f'[yellow]unsupported style: {ep.tag}')
# Strip white spaces if bold or italics
_s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text)))
......@@ -342,11 +346,13 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
_s = _s.replace('_', '\\_')
_s = _s.replace('*', '\\*')
# Add trailing white space when bold or italics
_prefix = ' ' if _bold or _italics else ''
_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_prefix}'
_postfix = ' ' if _bold or _italics else ''
_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_postfix}'
# print(_result)
case 'br':
_result += _linebreak
case 'bookmarkStart' | 'bookmarkEnd': # TODO ?
pass
......@@ -366,20 +372,21 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
blip = element.findall('ns1:inline/ns3:graphic/ns3:graphicData/ns4:pic/ns4:blipFill/ns3:blip',
namespaces = {
'ns1' : 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
'ns3' : wns,
'ns3' : 'http://schemas.openxmlformats.org/drawingml/2006/main',
'ns4' : 'http://schemas.openxmlformats.org/drawingml/2006/picture',
})
if blip and \
(rId := blip[0].attrib.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')) and \
(mediaFile := mediaRelations.get(rId)):
referencedImages.append(Path(mediaFile).stem) # Add to referenced files
if docConfig.renameEMFExtension and mediaFile.lower().endswith('.emf'):
mediaFile = str(Path(mediaFile).with_suffix(f'.{docConfig.renameEMFExtension}'))
_print(f'[yellow]Renaming EMF file reference to "{mediaFile}"')
_result += f'![{_captionMarker}]({mediaFile})'
mediaFilePath = Path(mediaFile)
referencedImages.append(mediaFilePath.stem) # Add to referenced files
if docConfig.renameEMFExtension and mediaFilePath.suffix.lower() == '.emf':
mediaFilePath = mediaFilePath.with_suffix(f'.{docConfig.renameEMFExtension}')
_print(f'[yellow]Renaming EMF file reference to "{str(mediaFilePath)}"')
_result += f'![{_captionMarker}]({mediaFilePath.as_posix()})' # image reference as posix path
# else:
# _print(blip)
case 'pict':
# for e in element:
# print(f'----{e}')
......@@ -397,9 +404,23 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
pass # ignore a soft hyphen character which has no meaning in Markdown and zero-width
case 'sym':
_symError = f'unknown font+symbol: {element.attrib["{"+wns+"}font"]} - "{element.attrib["{"+wns+"}char"]}"'
_print(f'[yellow]{_symError}')
_result += f'<mark>{_symError}</mark>'
if inCell:
ch = element.attrib["{"+wns+"}char"]
_print(f'[yellow]: {ch} ')
if not ch.isascii():
_print(f'[yellow]: {ch}')
if (_ch := ord(ch)) in docConfig.characters:
if (rch := docConfig.characters[_ch]) == chr(0):
rch = ''
_result = rch
else:
_print(
f'[yellow]Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}')
else:
_symError = f'unknown font+symbol: {element.attrib["{"+wns+"}font"]} - "{element.attrib["{"+wns+"}char"]}"'
_print(f'[yellow]{_symError}')
_result += f'<mark>{_symError}</mark>'
# ignore deleted text
case 'del':
......@@ -423,10 +444,10 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
case Paragraph(): # type: ignore[misc]
return _parseXML(ET.fromstring(elem._p.xml))
case _Cell(): # type: ignore[misc]
result = ''
for p in elem.paragraphs:
result += _parseXML(ET.fromstring(p._p.xml), True)
return result
# Iterate over all paragraphs in the cell and parse them
# Create a list of parsed paragraphs and join them with linebreaks
return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip()
for p in elem.paragraphs ])
case _:
return ''
......@@ -535,7 +556,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
# Processing the document
lines:list[str] = []
imageIndex = 1
isAnnex = False
for elem in docItems:
paragraphNr += 1
progress.update(processTask, advance = 1)
......@@ -550,25 +571,33 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
checkSameStyle(Style.normal, lambda:lines.append(''))
lines.append(text)
lines.append('') # Add empty line
continue
# Headers
elif style in docConfig.h1:
#print(f'{style} {text}')
# Check if annexes start
if text.find("Annex A") != -1:
isAnnex = True
elif text.find("History") != -1:
isAnnex = False
if (style in docConfig.h1) and not isAnnex:
lines.extend(toHeader(style, text, 1))
elif style in docConfig.h2:
elif (style in docConfig.h2) and not isAnnex:
lines.extend(toHeader(style, text, 2))
elif style in docConfig.h3:
elif (style in docConfig.h3) and not isAnnex:
lines.extend(toHeader(style, text, 3))
elif style in docConfig.h4:
elif (style in docConfig.h4) and not isAnnex:
lines.extend(toHeader(style, text, 4))
elif style in docConfig.h5:
elif (style in docConfig.h5) and not isAnnex:
lines.extend(toHeader(style, text, 5))
elif style in docConfig.h6:
elif (style in docConfig.h6) and not isAnnex:
lines.extend(toHeader(style, text, 6))
elif style in docConfig.h7:
elif (style in docConfig.h7) and not isAnnex:
lines.extend(toHeader(style, text, 7))
elif style in docConfig.h8:
elif (style in docConfig.h8) and not isAnnex:
lines.extend(toHeader(style, text, 8))
elif style in docConfig.h9:
elif (style in docConfig.h9) and not isAnnex:
lines.extend(toHeader(style, text, 9))
# Annexes
......@@ -614,10 +643,11 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
# Table Caption
elif style in docConfig.tablecaption:
lines.append('')
lines.append(f'**{replaceNL(text).strip()}**')
caption = replaceNL(text).strip()
caption = replaceNL(text).strip()
anchor = f'<a name="table_{caption[6:].split(":")[0].strip()}"></a>' if caption.startswith('Table ') and ':' in caption else ''
lines.append(f'**{caption}**{anchor}')
# Image Caption
elif style in docConfig.imagecaption:
checkSameStyle(Style.imagecaption, lambda:lines.append(''))
......@@ -655,6 +685,10 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
if docConfig.generateToc:
lines.append(_tocInsertPoint)
# Check when TOC ends
if text.find("History"):
isAnnex = False
# Ignore & empty
elif style in docConfig.ignore:
pass
......@@ -667,6 +701,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
lines.append(text)
case 'Table':
rows:list[list[str]] = []
nrRows = 0
......@@ -679,12 +714,16 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
# Warning if this is a single-row table
if nrRows == 1:
_print(f'[red]Single-row table found. Consider replacing it in the original document:\n{rows[0]}')
_print(f'[red]Single-row table found. Such tables cannot be converted to markdown.[/red] Please consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False)
lines.append('') # Add an empty line before a table
for idx, row in enumerate(rows):
# Check for a table caption and add separator line
if idx == 1:
lines.append('-'.join('|' * (len(row) + 1) ))
# Add table row
lines.append(f'|{"|".join(row)}|'
.replace('\n', _linebreak)) # replace line breaks in cells
lines.append('') # Add another empty line after a table
......@@ -719,7 +758,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
line = lines[i]
line = line.replace('__', '')
line = line.replace('****', '')
line = line.replace(' ', ' ')
#line = line.replace(' ', ' ')
lines[i] = line
......@@ -775,6 +814,15 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
line = lines[i]
lines[i] = re.sub(_referenceExpression, _repl, line) # type:ignore[arg-type]
#
# List unresolved CAPTION markers
#
for i in range(len(lines)):
line = lines[i]
if _captionMarker in line:
_print(f'[yellow]Unresolved figure caption : \[{i}] "{line}"')
#
# Write produced Markdown file
#
......@@ -806,10 +854,12 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
cmd = converter
cmd = cmd.replace('{infile}', fn).replace('{outdir}', _t)
_print(f'Converting EMF file: {fn} to "{format}"', highlight = False)
if (res := subprocess.run(cmd, shell = True, capture_output = True)).returncode != 0:
_print(f'[red]Error running command: {res.stderr.decode("utf-8")}')
_print(f'[red]Please check the configuration file -> section "\[media]" for the converter command: {converter}')
break
convert(fn, fn[:-4])
#if (res := subprocess.run(cmd, shell = True, capture_output = True)).returncode != 0:
# _print(f'[red]Error running command: {res.stderr.decode("utf-8")}')
# _print(f'[red]Please check the configuration file -> section "\[media]" for the converter command: {converter}')
# break
if not skipImageConversion:
if docConfig.emfConverterPng:
......@@ -824,6 +874,10 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
progress.stop()
def convert(input:str, output_name:str) -> None:
    """ Convert an image file to a PNG file.

        The image is opened with Pillow and saved again under the name
        `output_name` with the extension ".png" appended.

        Args:
            input: Path of the image file to read.
            output_name: Path of the target file *without* extension.
    """
    # Use the image as a context manager so that the underlying file handle
    # is closed again, even if saving fails. Otherwise one handle per
    # converted image would stay open until garbage collection.
    with Image.open(input) as image:
        image.save(f'{output_name}.png')
if __name__ == '__main__':
......