Skip to content
Snippets Groups Projects
Commit 0a04e26e authored by Andreas Kraft's avatar Andreas Kraft
Browse files

Added "addTrackedChanges.py"

parent f72cbc4c
No related branches found
No related tags found
No related merge requests found
Pipeline #112 passed
#
# addTrackedChanges.py
#
# Script to convert all improvised changes in a docx file (underlines, strike-throughs)
# to either coloured text or tracked changes.
#
# (c) 2023 by Andreas Kraft, Miguel Ortega
# License: BSD 3-Clause License. See the LICENSE file for further details.
#
from typing import Optional
import argparse, tempfile, zipfile
import lxml.etree as ET
wns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
def _setRunColour(rPr, colour:str) -> None:
    """ Set the text colour in a run properties element.

        Any colour element that is already present is replaced, so that the
        properties contain a single ``w:color`` element afterwards.

        Args:
            rPr: The ``w:rPr`` element to modify.
            colour: The colour as a hexadecimal RGB string, e.g. 'FF0000'.
    """
    for oldColour in rPr.findall(f'{wns}color'):
        rPr.remove(oldColour)
    rPr.append(newColour := ET.Element(f'{wns}color'))
    newColour.set(f'{wns}val', colour)


def _wrapRun(run, tag:str, author:Optional[str], date:Optional[str]) -> None:
    """ Wrap a run in a new tracked change element that takes the run's place.

        Args:
            run: The ``w:r`` element to wrap. It must have a parent element.
            tag: Local name of the wrapper element, e.g. 'ins' or 'del'.
            author: The author of the change. Not set if empty or None.
            date: The date of the change. Not set if empty or None.
    """
    parent = run.getparent()
    wrapper = ET.Element(f'{wns}{tag}')
    if author:
        wrapper.set(f'{wns}author', author)
    if date:
        wrapper.set(f'{wns}date', date)
    # Put the wrapper where the run is, then move the run into it
    # (appending an element that already has a parent moves it).
    parent.insert(parent.index(run), wrapper)
    wrapper.append(run)


def convertChanges(fn:str,
                   outputFn:str,
                   author:Optional[str] = None,
                   date:Optional[str] = None,
                   colourOnly:Optional[bool] = False) -> None:
    """ Converts all improvised changes in a docx file to either coloured text or tracked changes.

        All underlined text is converted to either green text or tracked insertions.
        All striked text is converted to either red text or tracked deletions.
        Underline or strike elements that explicitly switch the formatting off
        are left untouched. In tracked changes mode, elements that do not belong
        to a text run (``w:r``) are left untouched as well.

        Only ``word/document.xml`` is modified. All other parts of the document are
        copied unchanged to the output file.

        Args:
            fn: The path to the docx file.
            outputFn: The path and filename to the output docx file.
            author: The author of the changes. Defaults to None.
            date: The date of the changes, e.g. "2023-07-21T14:09:02". Defaults to None.
            colourOnly: If True, only the colour of the changes is changed, but not converted to tracked changes. Defaults to False.
    """
    # Create a temporary directory (automatically cleaned up after the with statement)
    with tempfile.TemporaryDirectory() as tmpDir:

        # Extract the docx file to the temporary directory.
        # The member list is kept to write the output archive in the same order.
        with zipfile.ZipFile(fn) as inputZip:
            originalFileList = inputZip.namelist()
            inputZip.extractall(tmpDir)

        # Get the path to the document.xml file
        xmlFn = f'{tmpDir}/word/document.xml'

        # Register all namespaces in the given XML file.
        # For this the XML file is parsed once and all namespaces are registered.
        namespaces = dict(node for _, node in ET.iterparse(xmlFn, events = ['start-ns']))
        for prefix, uri in namespaces.items():
            ET.register_namespace(prefix, uri)

        # Parse and handle the XML file. The parser reads the file itself, so the
        # encoding is taken from the XML and not from the platform default.
        tree = ET.parse(xmlFn)

        #
        #	Handle insertions
        #

        # Find all underline elements
        for e in tree.findall(f'.//{wns}u'):
            # An underline of type "none" switches underlining off
            if e.get(f'{wns}val') == 'none':
                continue
            rPr = e.getparent()
            if colourOnly:
                _setRunColour(rPr, '00FF00')
            else:
                run = rPr.getparent()
                # Only text runs can be wrapped, e.g. not paragraph mark properties
                if run is None or run.tag != f'{wns}r':
                    continue
                _wrapRun(run, 'ins', author, date)
            # Remove the underline element
            rPr.remove(e)

        #
        #	Handle deletions
        #

        # Find all strike elements
        for e in tree.findall(f'.//{wns}strike'):
            # A missing value means "on". Skip elements that switch strike-through off.
            if e.get(f'{wns}val', 'true') not in ('true', '1', 'on'):
                continue
            rPr = e.getparent()
            if colourOnly:
                _setRunColour(rPr, 'FF0000')
            else:
                run = rPr.getparent()
                # Only text runs can be wrapped, e.g. not paragraph mark properties
                if run is None or run.tag != f'{wns}r':
                    continue
                # Turn every text element of the run into a delText element. Renaming
                # in place keeps the text, its attributes and its position in the run,
                # and also works for runs without any text element.
                for t in run.findall(f'{wns}t'):
                    t.tag = f'{wns}delText'
                _wrapRun(run, 'del', author, date)
            # Remove the strike element
            rPr.remove(e)

        # Write back the modified XML file, including the XML declaration
        tree.write(xmlFn, encoding = 'UTF-8', xml_declaration = True, standalone = True)

        # Create a new docx file with the modified XML file.
        # The with statement closes the archive, which completes the zip file.
        with zipfile.ZipFile(outputFn, 'w') as outputZip:
            for name in originalFileList:
                outputZip.write(f'{tmpDir}/{name}', name, compress_type = zipfile.ZIP_DEFLATED, compresslevel = 9)
def main() -> None:
    """ Command line entry point.

        Parses the command line arguments and converts the given document by
        calling `convertChanges`. Being a function, it can also be used as the
        target of a console script entry point.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--outfile', '-o', action = 'store', dest = 'outfile', default = 'out.docx', metavar = '<output filename>', help = 'specify the output name for the Word document')
    parser.add_argument('--author', '-a', action = 'store', dest = 'author', default = None, metavar = '<author>', help = 'specify the author of the changes')
    parser.add_argument('--date', '-d', action = 'store', dest = 'date', default = None, metavar = '<date>', help = 'specify the date of the changes (e.g. "2023-07-21T14:09:02")')
    parser.add_argument('--colourOnly', '-c', action = 'store_true', dest = 'colourOnly', default = False, help = 'only set the colour of the changes, but do not convert them to tracked changes')
    parser.add_argument('document', help = 'document to parse')
    args = parser.parse_args()

    # Convert the changes
    convertChanges(args.document,
                   args.outfile,
                   author = args.author,
                   date = args.date,
                   colourOnly = args.colourOnly)


if __name__ == '__main__':
    main()
......@@ -4,13 +4,13 @@
#
# pip-compile
#
markdown-it-py==2.2.0
lxml==4.9.3
# via oneM2M-markdown-to-pandoc-filter (setup.py)
markdown-it-py==3.0.0
# via rich
mdurl==0.1.2
# via markdown-it-py
pygments==2.15.1
# via rich
rich==13.3.5
rich==13.4.2
# via oneM2M-markdown-to-pandoc-filter (setup.py)
requests==2.31.0
unidiff==0.7.5
......@@ -9,10 +9,13 @@ setup(
packages=find_packages(),
install_requires=[
'rich',
'lxml',
],
entry_points= {
'console_scripts' : ['pandocFilter=pandocFilter:main',
'changemarks=changemarks:main']
'changemarks=changemarks:main',
'addTrackedChanges=addTrackedChanges:main',
]
}
)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment