diff --git a/README.md b/README.md index 3df277ecf0a70ab13d0e8e8554dc2cde8bcf2a2a..9dc1608f07fc8f41541a7c7276ad44f51ec54357 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# onem2m-spec2md +# spec2md Convert oneM2M specification documents to markdown @@ -26,6 +26,27 @@ python3 spec2md.py <path-to-word-document> Is *LibreOffice* already running? If yes, then close it. +### Are linebreaks, paragraphs, and lists supported in table cells? + +Unfortunately, markdown doesn't support multiple paragraphs in table cells. A table cell must be a single line. However, one can add an HTML `<br />` linebreak to break between lines: + +```markdown +| Header | +|----------------| +| text<br />text | +``` + +Lists in table cells are also not possible. One may use HTML lists for this, but this use is **discouraged** because it may cause problems in conversions from markdown to other document formats. It is recommended to use simple lists using a dash `-` character: + +```markdown +| Header | +|----------------------------------| +| - list item 1<br />- list item 2 | +``` + + + ## Changes +- **2023-08-18** - Improved handling of sometimes broken inline formatting in table cells. Added more default heading formats. - **2023-07-27** - Added converting bold and italic text in paragraphs, headers and tables. 
\ No newline at end of file diff --git a/config.ini b/config.ini index 3428d74fa85a8621df97a6623aa2a1b99645e879..e7fe449beea548fb00fc1b27bec0a433084a21ca 100644 --- a/config.ini +++ b/config.ini @@ -42,6 +42,8 @@ h4 = heading 4 h5 = heading 5 h6 = heading 6 h7 = heading 7 +h8 = heading 8 +h9 = heading 9 a1 = heading 1 a2 = heading 2 a3 = heading 3 @@ -58,7 +60,7 @@ tablecaption = caption, th imagecaption = tf image = fl empty = fp -ignore = toc 1, toc 2, toc 3, toc 4, toc 5, toc 6, toc 7 +ignore = toc 1, toc 2, toc 3, toc 4, toc 5, toc 6, toc 7, toc 8, toc 9 [characters] diff --git a/spec2md.py b/spec2md.py index 2c851eba8505bd2802c07f109d3fcca2a7af8a27..02799b4e64c7dac86aeb4753ecbefb8c2d0705ed 100644 --- a/spec2md.py +++ b/spec2md.py @@ -84,7 +84,7 @@ _val = f'{{{wns}}}val' class SectionNumbers(object): def __init__(self) -> None: - self.levels:list[int] = [ 0, 0, 0, 0] + self.levels:list[int] = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] self.heading:int = 0 self.annex:int = 64 @@ -157,6 +157,8 @@ class DocumentConfiguration(object): self.h5 = self.paragraphs['h5'] self.h6 = self.paragraphs['h6'] self.h7 = self.paragraphs['h7'] + self.h8 = self.paragraphs['h8'] + self.h9 = self.paragraphs['h9'] self.a1 = self.paragraphs['a1'] self.a2 = self.paragraphs['a2'] self.a3 = self.paragraphs['a3'] @@ -195,7 +197,7 @@ class DocumentConfiguration(object): -def processDocuments(documents:list[str], outDirectory:str) -> None: +def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:bool) -> None: docs:Dict[str, Tuple[Document, DocumentConfiguration]] = {} ptasks = {} mediaRelations:Dict[str, str] = {} @@ -308,7 +310,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: ) - def _parseXML(element:ET.Element) -> str: + def _parseXML(element:ET.Element, inCell:Optional[bool] = False) -> str: """ Recursively parse a document paragraph. 
""" nonlocal _ignoredTags @@ -318,22 +320,30 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: match tag: case 'p': for x in element: - _result += _parseXML(x) + _result += _parseXML(x, inCell) case 'r': for x in element: - _result += _parseXML(x) + _result += _parseXML(x, inCell) case 't': - bold = '' - italics = '' + _bold = '' + _italics = '' for e in element.getparent(): if strippedTag(e.tag) == 'rPr': # paragraph style for ep in e: match strippedTag(ep.tag): case 'b' if ep.attrib.get(_val, 'true') == 'true': - bold = '**' + _bold = '**' case 'i' if ep.attrib.get(_val, 'true') == 'true': - italics = '_' - _result += f'{bold}{italics}{str(toMD(str(element.text)))}{italics}{bold}' + _italics = '_' + + # Strip white spaces if bold or italics + _s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text))) + # Replace single * or _ + _s = _s.replace('_', '\\_') + _s = _s.replace('*', '\\*') + # Add trailing white space when bold or italics + _prefix = ' ' if _bold or _italics else '' + _result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_prefix}' case 'br': _result += _linebreak @@ -344,7 +354,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: # Hyperlinks and URLs _hresult = '' for x in element: - _hresult += _parseXML(x) + _hresult += _parseXML(x, inCell) _result += f'[{_hresult}]({_hresult})' case 'drawing': @@ -356,7 +366,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: blip = element.findall('ns1:inline/ns3:graphic/ns3:graphicData/ns4:pic/ns4:blipFill/ns3:blip', namespaces = { 'ns1' : 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing', - 'ns3' : 'http://schemas.openxmlformats.org/drawingml/2006/main', + 'ns3' : wns, 'ns4' : 'http://schemas.openxmlformats.org/drawingml/2006/picture', }) if blip and \ @@ -367,8 +377,8 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: mediaFile = 
str(Path(mediaFile).with_suffix(f'.{docConfig.renameEMFExtension}')) _print(f'[yellow]Renaming EMF file reference to "{mediaFile}"') _result += f'' - else: - _print(blip) + # else: + # _print(blip) case 'pict': # for e in element: @@ -380,10 +390,17 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: #inspect(element) pass - case 'tab': _result += ' ' # TODO nbsp? + case 'softHyphen': + pass # ignore a soft hyphen character which has no meaning in Markdown and zero-width + + case 'sym': + _symError = f'unknown font+symbol: {element.attrib["{"+wns+"}font"]} - "{element.attrib["{"+wns+"}char"]}"' + _print(f'[yellow]{_symError}') + _result += f'<mark>{_symError}</mark>' + case _ if tag in _ignoredTags: # ignore pass @@ -399,7 +416,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: case _Cell(): # type: ignore[misc] result = '' for p in elem.paragraphs: - result += _parseXML(ET.fromstring(p._p.xml)) + result += _parseXML(ET.fromstring(p._p.xml), True) return result case _: return '' @@ -434,7 +451,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: return try: docs[d] = (docx.Document(d), DocumentConfiguration(d)) - ptasks[d] = progress.add_task(f'Processing {d}', total = 1000) + ptasks[d] = progress.add_task(f'Processing {d}', total = None) progress.update(readTask, advance=1) except docx.opc.exceptions.PackageNotFoundError as e: stopProgress(f'[red]Input document "{d}" is not a .docx file') @@ -540,6 +557,10 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: lines.extend(toHeader(style, text, 6)) elif style in docConfig.h7: lines.extend(toHeader(style, text, 7)) + elif style in docConfig.h8: + lines.extend(toHeader(style, text, 8)) + elif style in docConfig.h9: + lines.extend(toHeader(style, text, 9)) # Annexes elif style in docConfig.a1: @@ -676,9 +697,23 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: lines[i] = line else: _print(f'[yellow]Non-ASCII 
character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}') + + + # + # Remove multiple bold / italics on/off occurrences + # Sometimes Word doesn't remove empty bold-on/bold-off (or italics) indicators + # + progress.update(processTask, advance = 1) # progress update + for i in range(len(lines)): + line = lines[i] + line = line.replace('__', '') + line = line.replace('****', '') + line = line.replace(' ', ' ') + lines[i] = line + # - # Insert auto-genrated table of contents + # Insert auto-generated table of contents # progress.update(processTask, advance = 1) # progress update if docConfig.generateToc: @@ -763,10 +798,11 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: if (res := subprocess.run(cmd, shell = True, capture_output = True)).returncode != 0: _print(f'[red] Error running command: {res.stderr.decode("utf-8")}') - if docConfig.emfConverterPng: - _convertImage(docConfig.emfConverterPng, 'png') - if docConfig.emfConverterSvg: - _convertImage(docConfig.emfConverterSvg, 'svg') + if not skipImageConversion: + if docConfig.emfConverterPng: + _convertImage(docConfig.emfConverterPng, 'png') + if docConfig.emfConverterSvg: + _convertImage(docConfig.emfConverterSvg, 'svg') emfFiles.clear() referencedImages.clear() @@ -782,6 +818,7 @@ if __name__ == '__main__': # Parse command line arguments parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default = 'out', metavar = '<output directory>', help = 'specify output directory') + parser.add_argument('--skip-image-conversion', '-sic', action='store_true', dest='skipImageConversion', help = 'skip image conversion step') parser.add_argument('document', nargs = '+', help = 'documents to parse') args = parser.parse_args() @@ -789,5 +826,5 @@ if __name__ == '__main__': # Process documents and print output os.makedirs(args.outDirectory, exist_ok = 
True) - processDocuments(sorted(args.document), args.outDirectory) + processDocuments(sorted(args.document), args.outDirectory, args.skipImageConversion)