Skip to content
Snippets Groups Projects
Commit 10048d4b authored by Andreas Kraft's avatar Andreas Kraft
Browse files

Added converting bold and italic text in paragraphs, headers and tables.

parent 68c2f30b
No related branches found
No related tags found
No related merge requests found
......@@ -13,8 +13,8 @@ python3 -m pip install -r requirements.txt
## Usage
- Create a directory with the Word document in it. The Word document **must** be in *docx* format. This can be achieved by opening the document with *Word* and saving it in *docx* format to another file.
- Create a configuration file with the same base name as the Word document plus the *.ini* extension. This file may contain configurations that differ from the standard *config.ini* file provided.
- Alternativaly, a file named *config.ini* will apply to all files in that directory.
- It is only necessary to add the settings that are different from the *config.ini* file in the projects root directoy. That file will always act as a fallback.
- Alternatively, a file named *config.ini* will apply to all files in that directory.
- It is only necessary to add the settings that are different from the *config.ini* file in the project's root directory. That file will always act as a fallback.
- Run the converter as follows:
```
python3 spec2md.py <path-to-word-document>
......@@ -25,3 +25,7 @@ python3 spec2md.py <path-to-word-document>
### The converter doesn't seem to generate image files.
Is *LibreOffice* already running? If yes, then close it.
## Changes
- **2023-07-27** - Added conversion of bold and italic text in paragraphs, headers and tables.
\ No newline at end of file
#
# This file is autogenerated by pip-compile with python 3.10
# To update, run:
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile
#
commonmark==0.9.1
lxml==4.9.3
# via
# oneM2M-spec-2-MD-converter (setup.py)
# python-docx
markdown-it-py==3.0.0
# via rich
lxml==4.9.1
# via python-docx
pygments==2.13.0
mdurl==0.1.2
# via markdown-it-py
pygments==2.15.1
# via rich
python-docx==0.8.11
# via oneM2M-spec-2-MD-converter (setup.py)
rich==12.5.1
rich==13.4.2
# via oneM2M-spec-2-MD-converter (setup.py)
......@@ -9,6 +9,7 @@ setup(
description='Convert oneM2M specifications to Markdown',
packages=find_packages(),
install_requires=[
'lxml',
'rich',
'python-docx',
]
......
......@@ -24,8 +24,7 @@ from rich.progress import Progress, TextColumn, BarColumn
from rich.console import Console
from rich import inspect
import configparser, zipfile
from xml.etree import ElementTree as ET
from lxml import etree as ET
class Style(IntEnum):
example = auto()
......@@ -78,7 +77,9 @@ _captionMarker = '__CAPTION__'
console = Console()
_print:Callable = print
# Some predefined tags and attributes
wns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
_val = f'{{{wns}}}val'
class SectionNumbers(object):
......@@ -286,7 +287,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
return tag
def getTextFromXML(elem:Paragraph) -> str:
def getTextFromXML(elem:Paragraph|_Cell) -> str:
# Not-used document tags.
_ignoredTags = ( 'AlternateContent',
......@@ -322,7 +323,18 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
for x in element:
_result += _parseXML(x)
case 't':
_result += str(toMD(str(element.text)))
bold = ''
italics = ''
for e in element.getparent():
if strippedTag(e.tag) == 'rPr': # paragraph style
for ep in e:
match strippedTag(ep.tag):
case 'b' if ep.attrib.get(_val, 'true') == 'true':
bold = '**'
case 'i' if ep.attrib.get(_val, 'true') == 'true':
italics = '_'
_result += f'{bold}{italics}{str(toMD(str(element.text)))}{italics}{bold}'
case 'br':
_result += _linebreak
case 'bookmarkStart' | 'bookmarkEnd': # TODO ?
......@@ -381,7 +393,16 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
return _result
#_print(ET.fromstring(elem._p.xml))
match elem:
case Paragraph(): # type: ignore[misc]
return _parseXML(ET.fromstring(elem._p.xml))
case _Cell(): # type: ignore[misc]
result = ''
for p in elem.paragraphs:
result += _parseXML(ET.fromstring(p._p.xml))
return result
case _:
return ''
......@@ -620,7 +641,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
for row in elem.rows:
cells:list[str] = []
for cell in row.cells:
cells.append(f'{toMD(cell.text)} ') # add at least a space
cells.append(f'{getTextFromXML(cell)} ') # add at least a space
rows.append(cells)
nrRows += 1
......@@ -706,7 +727,7 @@ def processDocuments(documents:list[str], outDirectory:str) -> None:
for i in range(len(lines)):
line = lines[i]
lines[i] = re.sub(_referenceExpression, _repl, line)
lines[i] = re.sub(_referenceExpression, _repl, line) # type:ignore[arg-type]
#
# Write produced Markdown file
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment