From 10048d4b5f8a74b27ce0a772177113b49b4a85b3 Mon Sep 17 00:00:00 2001 From: ankraft <an.kraft@gmail.com> Date: Thu, 27 Jul 2023 16:56:28 +0200 Subject: [PATCH] Added converting bold and italic text in paragraphs, headers and tables. --- README.md | 10 +++++++--- requirements.txt | 18 +++++++++++------- setup.py | 1 + spec2md.py | 37 +++++++++++++++++++++++++++++-------- 4 files changed, 48 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 55c16da..3df277e 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,9 @@ python3 -m pip install -r requirements.txt ## Usage - Create a directory with the Word document in it. The Word document **must** be in *docx* format. This can be achieved by opening the document with *Word* and save it in *docx* format to another file. -- Create a configuration file with the same basename as the Word document + *.ini* extension. This file may contain different configurations as the standard *config.ini* file provided. - - Alternativaly, a file named *config.ini* will apply to all files in that directory. - - It is only necessary to add the settings that are different from the *config.ini* file in the projects root directoy. That file will always act as a fallback. +- Create a configuration file with the same base name as the Word document + *.ini* extension. This file may contain different configurations than the standard *config.ini* file provided. +- Alternatively, a file named *config.ini* will apply to all files in that directory. + - It is only necessary to add the settings that are different from the *config.ini* file in the project's root directory. That file will always act as a fallback. - Run the converter as follows: ``` python3 spec2md.py <path-to-word-document> @@ -25,3 +25,7 @@ python3 spec2md.py <path-to-word-document> ### The converter doesn't seem to generate image files. Is *LibreOffice* already running? If yes, then close it. 
+ +## Changes + +- **2023-07-27** - Added converting bold and italic text in paragraphs, headers and tables. \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 42ccd5d..f32fa76 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,20 @@ # -# This file is autogenerated by pip-compile with python 3.10 -# To update, run: +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: # # pip-compile # -commonmark==0.9.1 +lxml==4.9.3 + # via + # oneM2M-spec-2-MD-converter (setup.py) + # python-docx +markdown-it-py==3.0.0 # via rich -lxml==4.9.1 - # via python-docx -pygments==2.13.0 +mdurl==0.1.2 + # via markdown-it-py +pygments==2.15.1 # via rich python-docx==0.8.11 # via oneM2M-spec-2-MD-converter (setup.py) -rich==12.5.1 +rich==13.4.2 # via oneM2M-spec-2-MD-converter (setup.py) diff --git a/setup.py b/setup.py index 0e24443..7ac245e 100644 --- a/setup.py +++ b/setup.py @@ -9,6 +9,7 @@ setup( description='Convert oneM2M specifications to Markdown', packages=find_packages(), install_requires=[ + 'lxml', 'rich', 'python-docx', ] diff --git a/spec2md.py b/spec2md.py index e760852..2c851eb 100644 --- a/spec2md.py +++ b/spec2md.py @@ -24,8 +24,7 @@ from rich.progress import Progress, TextColumn, BarColumn from rich.console import Console from rich import inspect import configparser, zipfile -from xml.etree import ElementTree as ET - +from lxml import etree as ET class Style(IntEnum): example = auto() @@ -78,7 +77,9 @@ _captionMarker = '__CAPTION__' console = Console() _print:Callable = print - +# Some predefined tags and attributes +wns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' +_val = f'{{{wns}}}val' class SectionNumbers(object): @@ -286,7 +287,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: return tag - def getTextFromXML(elem:Paragraph) -> str: + def getTextFromXML(elem:Paragraph|_Cell) -> str: # Not-used document tags. 
_ignoredTags = ( 'AlternateContent', @@ -322,7 +323,18 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: for x in element: _result += _parseXML(x) case 't': - _result += str(toMD(str(element.text))) + bold = '' + italics = '' + for e in element.getparent(): + if strippedTag(e.tag) == 'rPr': # paragraph style + for ep in e: + match strippedTag(ep.tag): + case 'b' if ep.attrib.get(_val, 'true') == 'true': + bold = '**' + case 'i' if ep.attrib.get(_val, 'true') == 'true': + italics = '_' + _result += f'{bold}{italics}{str(toMD(str(element.text)))}{italics}{bold}' + case 'br': _result += _linebreak case 'bookmarkStart' | 'bookmarkEnd': # TODO ? @@ -381,7 +393,16 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: return _result #_print(ET.fromstring(elem._p.xml)) - return _parseXML(ET.fromstring(elem._p.xml)) + match elem: + case Paragraph(): # type: ignore[misc] + return _parseXML(ET.fromstring(elem._p.xml)) + case _Cell(): # type: ignore[misc] + result = '' + for p in elem.paragraphs: + result += _parseXML(ET.fromstring(p._p.xml)) + return result + case _: + return '' @@ -620,7 +641,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: for row in elem.rows: cells:list[str] = [] for cell in row.cells: - cells.append(f'{toMD(cell.text)} ') # add at least a space + cells.append(f'{getTextFromXML(cell)} ') # add at least a space rows.append(cells) nrRows += 1 @@ -706,7 +727,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None: for i in range(len(lines)): line = lines[i] - lines[i] = re.sub(_referenceExpression, _repl, line) + lines[i] = re.sub(_referenceExpression, _repl, line) # type:ignore[arg-type] # # Write produced Markdown file -- GitLab