Skip to content
Snippets Groups Projects
Commit 78951d3f authored by Andreas Kraft's avatar Andreas Kraft
Browse files

Improved handling of sometimes broken inline formatting in table cells. Adding...

Improved handling of sometimes broken inline formatting in table cells. Adding more default heading formats.
parent 10048d4b
No related branches found
No related tags found
No related merge requests found
# onem2m-spec2md # spec2md
Convert oneM2M specification documents to markdown Convert oneM2M specification documents to markdown
...@@ -26,6 +26,27 @@ python3 spec2md.py <path-to-word-document> ...@@ -26,6 +26,27 @@ python3 spec2md.py <path-to-word-document>
Is *LibreOffice* already running? If yes, then close it. Is *LibreOffice* already running? If yes, then close it.
### Are linebreaks, paragraphs, and lists supported in table cells?
Unfortunately, markdown doesn't support multiple paragraphs in table cells. A table cell must be a single line. However, one can add an HTML `<br />` line break to break between lines:
```markdown
| Header |
|----------------|
| text<br />text |
```
Lists in table cells are also not possible. One may use HTML lists for this, but this use is **discouraged** because it may cause problems in conversions from markdown to other document formats. It is recommended to use simple lists using a dash `-` character:
```markdown
| Header |
|----------------------------------|
| - list item 1<br />- list item 2 |
```
## Changes ## Changes
- **2023-08-18** - Improved handling of sometimes broken inline formatting in table cells. Adding more default heading formats.
- **2023-07-27** - Added converting bold and italic text in paragraphs, headers and tables. - **2023-07-27** - Added converting bold and italic text in paragraphs, headers and tables.
\ No newline at end of file
...@@ -42,6 +42,8 @@ h4 = heading 4 ...@@ -42,6 +42,8 @@ h4 = heading 4
h5 = heading 5 h5 = heading 5
h6 = heading 6 h6 = heading 6
h7 = heading 7 h7 = heading 7
h8 = heading 8
h9 = heading 9
a1 = heading 1 a1 = heading 1
a2 = heading 2 a2 = heading 2
a3 = heading 3 a3 = heading 3
...@@ -58,7 +60,7 @@ tablecaption = caption, th ...@@ -58,7 +60,7 @@ tablecaption = caption, th
imagecaption = tf imagecaption = tf
image = fl image = fl
empty = fp empty = fp
ignore = toc 1, toc 2, toc 3, toc 4, toc 5, toc 6, toc 7 ignore = toc 1, toc 2, toc 3, toc 4, toc 5, toc 6, toc 7, toc 8, toc 9
[characters] [characters]
......
...@@ -84,7 +84,7 @@ _val = f'{{{wns}}}val' ...@@ -84,7 +84,7 @@ _val = f'{{{wns}}}val'
class SectionNumbers(object): class SectionNumbers(object):
def __init__(self) -> None: def __init__(self) -> None:
self.levels:list[int] = [ 0, 0, 0, 0] self.levels:list[int] = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
self.heading:int = 0 self.heading:int = 0
self.annex:int = 64 self.annex:int = 64
...@@ -157,6 +157,8 @@ class DocumentConfiguration(object): ...@@ -157,6 +157,8 @@ class DocumentConfiguration(object):
self.h5 = self.paragraphs['h5'] self.h5 = self.paragraphs['h5']
self.h6 = self.paragraphs['h6'] self.h6 = self.paragraphs['h6']
self.h7 = self.paragraphs['h7'] self.h7 = self.paragraphs['h7']
self.h8 = self.paragraphs['h8']
self.h9 = self.paragraphs['h9']
self.a1 = self.paragraphs['a1'] self.a1 = self.paragraphs['a1']
self.a2 = self.paragraphs['a2'] self.a2 = self.paragraphs['a2']
self.a3 = self.paragraphs['a3'] self.a3 = self.paragraphs['a3']
...@@ -195,7 +197,7 @@ class DocumentConfiguration(object): ...@@ -195,7 +197,7 @@ class DocumentConfiguration(object):
def processDocuments(documents:list[str], outDirectory:str) -> None: def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:bool) -> None:
docs:Dict[str, Tuple[Document, DocumentConfiguration]] = {} docs:Dict[str, Tuple[Document, DocumentConfiguration]] = {}
ptasks = {} ptasks = {}
mediaRelations:Dict[str, str] = {} mediaRelations:Dict[str, str] = {}
...@@ -308,7 +310,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: ...@@ -308,7 +310,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
) )
def _parseXML(element:ET.Element) -> str: def _parseXML(element:ET.Element, inCell:Optional[bool] = False) -> str:
""" Recursively parse a document paragraph. """ Recursively parse a document paragraph.
""" """
nonlocal _ignoredTags nonlocal _ignoredTags
...@@ -318,22 +320,30 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: ...@@ -318,22 +320,30 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
match tag: match tag:
case 'p': case 'p':
for x in element: for x in element:
_result += _parseXML(x) _result += _parseXML(x, inCell)
case 'r': case 'r':
for x in element: for x in element:
_result += _parseXML(x) _result += _parseXML(x, inCell)
case 't': case 't':
bold = '' _bold = ''
italics = '' _italics = ''
for e in element.getparent(): for e in element.getparent():
if strippedTag(e.tag) == 'rPr': # paragraph style if strippedTag(e.tag) == 'rPr': # paragraph style
for ep in e: for ep in e:
match strippedTag(ep.tag): match strippedTag(ep.tag):
case 'b' if ep.attrib.get(_val, 'true') == 'true': case 'b' if ep.attrib.get(_val, 'true') == 'true':
bold = '**' _bold = '**'
case 'i' if ep.attrib.get(_val, 'true') == 'true': case 'i' if ep.attrib.get(_val, 'true') == 'true':
italics = '_' _italics = '_'
_result += f'{bold}{italics}{str(toMD(str(element.text)))}{italics}{bold}'
# Strip white spaces if bold or italics
_s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text)))
# Replace single * or _
_s = _s.replace('_', '\\_')
_s = _s.replace('*', '\\*')
# Add trailing white space when bold or italics
_prefix = ' ' if _bold or _italics else ''
_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_prefix}'
case 'br': case 'br':
_result += _linebreak _result += _linebreak
...@@ -344,7 +354,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: ...@@ -344,7 +354,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
# Hyperlinks and URLs # Hyperlinks and URLs
_hresult = '' _hresult = ''
for x in element: for x in element:
_hresult += _parseXML(x) _hresult += _parseXML(x, inCell)
_result += f'[{_hresult}]({_hresult})' _result += f'[{_hresult}]({_hresult})'
case 'drawing': case 'drawing':
...@@ -356,7 +366,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: ...@@ -356,7 +366,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
blip = element.findall('ns1:inline/ns3:graphic/ns3:graphicData/ns4:pic/ns4:blipFill/ns3:blip', blip = element.findall('ns1:inline/ns3:graphic/ns3:graphicData/ns4:pic/ns4:blipFill/ns3:blip',
namespaces = { namespaces = {
'ns1' : 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing', 'ns1' : 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
'ns3' : 'http://schemas.openxmlformats.org/drawingml/2006/main', 'ns3' : wns,
'ns4' : 'http://schemas.openxmlformats.org/drawingml/2006/picture', 'ns4' : 'http://schemas.openxmlformats.org/drawingml/2006/picture',
}) })
if blip and \ if blip and \
...@@ -367,8 +377,8 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: ...@@ -367,8 +377,8 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
mediaFile = str(Path(mediaFile).with_suffix(f'.{docConfig.renameEMFExtension}')) mediaFile = str(Path(mediaFile).with_suffix(f'.{docConfig.renameEMFExtension}'))
_print(f'[yellow]Renaming EMF file reference to "{mediaFile}"') _print(f'[yellow]Renaming EMF file reference to "{mediaFile}"')
_result += f'![{_captionMarker}]({mediaFile})' _result += f'![{_captionMarker}]({mediaFile})'
else: # else:
_print(blip) # _print(blip)
case 'pict': case 'pict':
# for e in element: # for e in element:
...@@ -380,10 +390,17 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: ...@@ -380,10 +390,17 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
#inspect(element) #inspect(element)
pass pass
case 'tab': case 'tab':
_result += ' ' # TODO nbsp? _result += ' ' # TODO nbsp?
case 'softHyphen':
pass # ignore a soft hyphen character which has no meaning in Markdown and zero-width
case 'sym':
_symError = f'unknown font+symbol: {element.attrib["{"+wns+"}font"]} - "{element.attrib["{"+wns+"}char"]}"'
_print(f'[yellow]{_symError}')
_result += f'<mark>{_symError}</mark>'
case _ if tag in _ignoredTags: # ignore case _ if tag in _ignoredTags: # ignore
pass pass
...@@ -399,7 +416,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: ...@@ -399,7 +416,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
case _Cell(): # type: ignore[misc] case _Cell(): # type: ignore[misc]
result = '' result = ''
for p in elem.paragraphs: for p in elem.paragraphs:
result += _parseXML(ET.fromstring(p._p.xml)) result += _parseXML(ET.fromstring(p._p.xml), True)
return result return result
case _: case _:
return '' return ''
...@@ -434,7 +451,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: ...@@ -434,7 +451,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
return return
try: try:
docs[d] = (docx.Document(d), DocumentConfiguration(d)) docs[d] = (docx.Document(d), DocumentConfiguration(d))
ptasks[d] = progress.add_task(f'Processing {d}', total = 1000) ptasks[d] = progress.add_task(f'Processing {d}', total = None)
progress.update(readTask, advance=1) progress.update(readTask, advance=1)
except docx.opc.exceptions.PackageNotFoundError as e: except docx.opc.exceptions.PackageNotFoundError as e:
stopProgress(f'[red]Input document "{d}" is not a .docx file') stopProgress(f'[red]Input document "{d}" is not a .docx file')
...@@ -540,6 +557,10 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: ...@@ -540,6 +557,10 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
lines.extend(toHeader(style, text, 6)) lines.extend(toHeader(style, text, 6))
elif style in docConfig.h7: elif style in docConfig.h7:
lines.extend(toHeader(style, text, 7)) lines.extend(toHeader(style, text, 7))
elif style in docConfig.h8:
lines.extend(toHeader(style, text, 8))
elif style in docConfig.h9:
lines.extend(toHeader(style, text, 9))
# Annexes # Annexes
elif style in docConfig.a1: elif style in docConfig.a1:
...@@ -676,9 +697,23 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: ...@@ -676,9 +697,23 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
lines[i] = line lines[i] = line
else: else:
_print(f'[yellow]Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}') _print(f'[yellow]Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}')
#
# Remove multiple bold / italics on/off occurrences
# Sometimes Word doesn't remove empty bold-on/bold-off (or italics) indicators
#
progress.update(processTask, advance = 1) # progress update
for i in range(len(lines)):
line = lines[i]
line = line.replace('__', '')
line = line.replace('****', '')
line = line.replace(' ', ' ')
lines[i] = line
# #
# Insert auto-genrated table of contents # Insert auto-generated table of contents
# #
progress.update(processTask, advance = 1) # progress update progress.update(processTask, advance = 1) # progress update
if docConfig.generateToc: if docConfig.generateToc:
...@@ -763,10 +798,11 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: ...@@ -763,10 +798,11 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
if (res := subprocess.run(cmd, shell = True, capture_output = True)).returncode != 0: if (res := subprocess.run(cmd, shell = True, capture_output = True)).returncode != 0:
_print(f'[red] Error running command: {res.stderr.decode("utf-8")}') _print(f'[red] Error running command: {res.stderr.decode("utf-8")}')
if docConfig.emfConverterPng: if not skipImageConversion:
_convertImage(docConfig.emfConverterPng, 'png') if docConfig.emfConverterPng:
if docConfig.emfConverterSvg: _convertImage(docConfig.emfConverterPng, 'png')
_convertImage(docConfig.emfConverterSvg, 'svg') if docConfig.emfConverterSvg:
_convertImage(docConfig.emfConverterSvg, 'svg')
emfFiles.clear() emfFiles.clear()
referencedImages.clear() referencedImages.clear()
...@@ -782,6 +818,7 @@ if __name__ == '__main__': ...@@ -782,6 +818,7 @@ if __name__ == '__main__':
# Parse command line arguments # Parse command line arguments
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default = 'out', metavar = '<output directory>', help = 'specify output directory') parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default = 'out', metavar = '<output directory>', help = 'specify output directory')
parser.add_argument('--skip-image-conversion', '-sic', action='store_true', dest='skipImageConversion', help = 'skip image conversion step')
parser.add_argument('document', nargs = '+', help = 'documents to parse') parser.add_argument('document', nargs = '+', help = 'documents to parse')
args = parser.parse_args() args = parser.parse_args()
...@@ -789,5 +826,5 @@ if __name__ == '__main__': ...@@ -789,5 +826,5 @@ if __name__ == '__main__':
# Process documents and print output # Process documents and print output
os.makedirs(args.outDirectory, exist_ok = True) os.makedirs(args.outDirectory, exist_ok = True)
processDocuments(sorted(args.document), args.outDirectory) processDocuments(sorted(args.document), args.outDirectory, args.skipImageConversion)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment