Added conversion of bold and italic text in paragraphs, headers and tables.

10048d4b · Andreas Kraft · 68c2f30b · 10048d4b · 10048d4b · 10048d4b
Commit 10048d4b authored 1 year ago by Andreas Kraft
--- a/README.md
+++ b/README.md
@@ -13,8 +13,8 @@ python3 -m pip install -r requirements.txt
 ## Usage
 - Create a directory with the Word document in it. The Word document **must** be in *docx* format. This can be achieved by opening the document with *Word* and save it in *docx* format to another file.
 - Create a configuration file with the same base name as the Word document + *.ini* extension. This file may contain different configurations as the standard *config.ini* file provided. 
-  - Alternativaly, a file named *config.ini* will apply to all files in that directory.
-  - It is only necessary to add the settings that are different from the *config.ini* file in the projects root directoy. That file will always act as a fallback.
+- Alternatively, a file named *config.ini* will apply to all files in that directory.
+	- It is only necessary to add the settings that are different from the *config.ini* file in the project's root directory. That file will always act as a fallback.
 - Run the converter as follows:
 ```
 python3 spec2md.py <path-to-word-document>
@@ -25,3 +25,7 @@ python3 spec2md.py <path-to-word-document>
 ### The converter doesn't seem to generate image files.

 Is *LibreOffice* already running? If yes, then close it.
+
+## Changes
+
+- **2023-07-27** - Added conversion of bold and italic text in paragraphs, headers and tables.
\ No newline at end of file
--- a/requirements.txt
+++ b/requirements.txt
 #
-# This file is autogenerated by pip-compile with python 3.10
-# To update, run:
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
 #
 #    pip-compile
 #
-commonmark==0.9.1
+lxml==4.9.3
+    # via
+    #   oneM2M-spec-2-MD-converter (setup.py)
+    #   python-docx
+markdown-it-py==3.0.0
    # via rich
-lxml==4.9.1
-    # via python-docx
-pygments==2.13.0
+mdurl==0.1.2
+    # via markdown-it-py
+pygments==2.15.1
    # via rich
 python-docx==0.8.11
    # via oneM2M-spec-2-MD-converter (setup.py)
-rich==12.5.1
+rich==13.4.2
    # via oneM2M-spec-2-MD-converter (setup.py)
--- a/setup.py
+++ b/setup.py
@@ -9,6 +9,7 @@ setup(
 	description='Convert oneM2M specifications to Markdown',
 	packages=find_packages(),
 	install_requires=[
+        'lxml',
 		'rich',
 		'python-docx',
 	 ]

--- a/spec2md.py
+++ b/spec2md.py
@@ -24,8 +24,7 @@ from rich.progress import Progress, TextColumn, BarColumn
 from rich.console import Console
 from rich import inspect
 import configparser, zipfile
-from xml.etree import ElementTree as ET
-
+from lxml import etree as ET

 class Style(IntEnum):
 	example = auto()
@@ -78,7 +77,9 @@ _captionMarker = '__CAPTION__'
 console = Console()
 _print:Callable = print

-
+# Some predefined tags and attributes
+wns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
+_val = f'{{{wns}}}val'

 class SectionNumbers(object):

@@ -286,7 +287,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
 			return tag


-		def getTextFromXML(elem:Paragraph) -> str:
+		def getTextFromXML(elem:Paragraph|_Cell) -> str:

 			#	Not-used document tags.
 			_ignoredTags = ( 'AlternateContent',
@@ -322,7 +323,18 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
 						for x in element:
 							_result += _parseXML(x)
 					case 't':
-						_result += str(toMD(str(element.text)))
+						bold = ''
+						italics = ''
+						for e in element.getparent():
+							if strippedTag(e.tag) == 'rPr':	# run properties (character formatting of this run)
+								for ep in e:
+									match strippedTag(ep.tag):
+										case 'b' if ep.attrib.get(_val, 'true') == 'true':
+											bold = '**'
+										case 'i' if ep.attrib.get(_val, 'true') == 'true':
+											italics = '_'
+						_result += f'{bold}{italics}{str(toMD(str(element.text)))}{italics}{bold}'
+
 					case 'br':
 						_result += _linebreak
 					case 'bookmarkStart' | 'bookmarkEnd':		# TODO ?
@@ -381,7 +393,16 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
 				return _result

 			#_print(ET.fromstring(elem._p.xml))
+			match elem:
+				case Paragraph():	# type: ignore[misc]
 					return _parseXML(ET.fromstring(elem._p.xml))
+				case _Cell():		# type: ignore[misc]
+					result = ''
+					for p in elem.paragraphs:
+						result += _parseXML(ET.fromstring(p._p.xml))
+					return result
+				case _:
+					return ''



@@ -620,7 +641,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
 						for row in elem.rows:
 							cells:list[str] = []
 							for cell in row.cells:
-								cells.append(f'{toMD(cell.text)} ')	# add at least a space
+								cells.append(f'{getTextFromXML(cell)} ')	# add at least a space
 							rows.append(cells)
 							nrRows += 1
 						
@@ -706,7 +727,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:

 			for i in range(len(lines)):
 				line = lines[i]
-				lines[i] = re.sub(_referenceExpression, _repl, line)
+				lines[i] = re.sub(_referenceExpression, _repl, line)	# type:ignore[arg-type]

 			#
 			#	Write produced Markdown file