#
#	addTrackedChanges.py
#
#	Script to convert all improvised changes in a docx file (underlines, strike-throughs)
#	to either coloured text or tracked changes.
#
#	(c) 2023 by Andreas Kraft, Miguel Ortega
#	License: BSD 3-Clause License. See the LICENSE file for further details.
#

from typing import Optional
import argparse
import tempfile
import zipfile

import lxml.etree as ET

wns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'

# Values of the "w:val" attribute that switch an on/off property (such as w:strike) on.
_onValues = ('true', '1', 'on')


def _isRunProperty(e:'ET._Element') -> bool:
    """ Check whether an element is a direct run property, i.e. it sits in the
        `w:rPr` element of a `w:r` element that itself has a parent.

        Formatting elements may also appear in other places, for example in the
        run properties of a paragraph mark (`w:pPr/w:rPr`). Those must not be
        wrapped in tracked-change elements.

        Args:
            e: The formatting element to check.

        Returns:
            True if the grandparent of the element is a run, False otherwise.
    """
    props = e.getparent()
    if props is None or props.tag != f'{wns}rPr':
        return False
    run = props.getparent()
    return run is not None and run.tag == f'{wns}r' and run.getparent() is not None


def _setColour(props:'ET._Element', colour:str) -> None:
    """ Set the text colour in a run properties element.

        An already existing `w:color` element is re-used instead of adding a second one.

        Args:
            props: The element that receives the colour (usually a `w:rPr` element).
            colour: The colour as a hexadecimal RGB value, e.g. "FF0000".
    """
    color = props.find(f'{wns}color')
    if color is None:
        color = ET.SubElement(props, f'{wns}color')
    # A theme colour would take precedence over the explicit value, so drop it
    for attr in ('themeColor', 'themeTint', 'themeShade'):
        color.attrib.pop(f'{wns}{attr}', None)
    color.set(f'{wns}val', colour)


def _wrapRun(run:'ET._Element', tag:str, author:Optional[str], date:Optional[str]) -> 'ET._Element':
    """ Wrap a run in a new tracked-change element at the run's current position.

        Args:
            run: The run element to wrap. It must have a parent.
            tag: The local name of the wrapping element, either "ins" or "del".
            author: The author of the change. Not set if empty or None.
            date: The date of the change. Not set if empty or None.

        Returns:
            The newly created wrapping element.
    """
    wrapper = ET.Element(f'{wns}{tag}')
    if author:
        wrapper.set(f'{wns}author', author)
    if date:
        wrapper.set(f'{wns}date', date)

    # Put the wrapper where the run is, then move the run into the wrapper
    parent = run.getparent()
    parent.insert(parent.index(run), wrapper)
    wrapper.append(run)
    return wrapper


def convertChanges(fn:str, outputFn:str, author:Optional[str] = None, date:Optional[str] = None, colourOnly:Optional[bool] = False) -> None:
    """ Converts all improvised changes in a docx file to either coloured text or tracked changes.

        All underlined text is converted to either green text or tracked insertions.
        All striked text is converted to either red text or tracked deletions.

        In colour-only mode the underline and strike-through formatting is kept and
        only the colour is set. In tracked-changes mode the formatting element is
        removed from every run that is converted. Formatting that is switched off
        explicitly (underline "none", strike "false") is left untouched.

        Args:
            fn: The path to the docx file.
            outputFn: The path and filename to the output docx file.
            author: The author of the changes. Defaults to None.
            date: The date of the changes. Defaults to None.
            colourOnly: If True, only the colour of the changes is changed, but not converted to tracked changes. Defaults to False.
    """

    # Create a temporary directory (automatically cleaned up after the with statement)
    with tempfile.TemporaryDirectory() as tmpDir:

        # Extract the docx file to the temporary directory.
        # The archive is closed again as soon as everything is extracted.
        with zipfile.ZipFile(fn) as source:
            originalFileList = source.namelist()
            source.extractall(tmpDir)

        # Get the path to the document.xml file
        xmlFn = f'{tmpDir}/word/document.xml'

        # Register all namespaces in the given XML file.
        # This is necessary to avoid the namespaces being removed when parsing the XML file.
        # For this the XML file is parsed once and all namespaces are registered.
        namespaces = {prefix: uri for _, (prefix, uri) in ET.iterparse(xmlFn, events = ['start-ns'])}
        for prefix, uri in namespaces.items():
            ET.register_namespace(prefix, uri)

        # Parse and handle the XML file.
        # The parser reads the file itself, so the encoding is taken from the
        # XML declaration and not from the platform's default text encoding.
        tree = ET.parse(xmlFn)

        #
        #	Handle insertions
        #

        # Find all underline elements
        for e in tree.findall(f'.//{wns}u'):
            # An underline of type "none" explicitly means "not underlined"
            if e.get(f'{wns}val') == 'none':
                continue
            props = e.getparent()

            if colourOnly:
                _setColour(props, '00FF00')
                continue

            # Only real runs can be wrapped in an ins element
            if not _isRunProperty(e):
                continue

            # Add ins element as the parent of the run
            _wrapRun(props.getparent(), 'ins', author, date)

            # Remove the underline element
            props.remove(e)

        #
        #	Handle deletions
        #

        # Find all strike elements
        for e in tree.findall(f'.//{wns}strike'):
            # A missing value means "on"; anything not in the on-values means "off"
            if e.get(f'{wns}val', 'true') not in _onValues:
                continue
            props = e.getparent()

            if colourOnly:
                _setColour(props, 'FF0000')
                continue

            # Only real runs can be wrapped in a del element
            if not _isRunProperty(e):
                continue
            run = props.getparent()

            # Add del element as the parent of the run
            _wrapRun(run, 'del', author, date)

            # Turn every text element of the run into a delText element.
            # Renaming in place keeps the text, the attributes (e.g. xml:space)
            # and the position among the other children of the run.
            for t in run.findall(f'{wns}t'):
                t.tag = f'{wns}delText'

            # Remove the strike element
            props.remove(e)

        # Write back the modified XML file, including the XML declaration
        tree.write(xmlFn, xml_declaration = True, encoding = 'UTF-8', standalone = True)

        # Create a new docx file with the modified XML file.
        # Closing the archive writes its central directory, so the with statement is required.
        with zipfile.ZipFile(outputFn, 'w') as target:
            for name in originalFileList:
                target.write(f'{tmpDir}/{name}', name, compress_type = zipfile.ZIP_DEFLATED, compresslevel = 9)


if __name__ == '__main__':

    # Parse command line arguments
    parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--outfile', '-o', action = 'store', dest = 'outfile', default = 'out.docx', metavar = '<output filename>', help = 'specify the output name for the Word document')
    parser.add_argument('--author', '-a', action = 'store', dest = 'author', default = None, metavar = '<author>', help = 'specify the author of the changes')
    parser.add_argument('--date', '-d', action = 'store', dest = 'date', default = None, metavar = '<date>', help = 'specify the date of the changes (e.g. "2023-07-21T14:09:02")')
    parser.add_argument('--colourOnly', '-c', action = 'store_true', dest = 'colourOnly', default = False, help = 'only set the colour of the changes, but do not convert them to tracked changes')
    parser.add_argument('document', help = 'document to parse')
    args = parser.parse_args()

    # Convert the changes
    convertChanges(args.document, args.outfile, author = args.author, date = args.date, colourOnly = args.colourOnly)