#
#   addTrackedChanges.py
#
#   Script to convert all improvised changes in a docx file (underlines, strike-throughs)
#   to either coloured text or tracked changes.
#
#   (c) 2023 by Andreas Kraft, Miguel Ortega
#   License: BSD 3-Clause License. See the LICENSE file for further details.
#
#   Requires the third-party package lxml (declared in requirements.txt and in the
#   install_requires list of setup.py).
#

from typing import Optional
import argparse, tempfile, zipfile
import lxml.etree as ET

wns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'

# Values of the w:val attribute that switch a formatting property off.
_offValues = ('false', '0', 'off', 'none')

# Colours used in colour-only mode.
_insertColour = '00FF00'
_deleteColour = 'FF0000'


def _isActive(e) -> bool:
    """ Return True if the formatting element is switched on.

        A missing w:val attribute is treated as "on".
    """
    return e.get(f'{wns}val', 'true') not in _offValues


def _setColour(rPr, colour:str) -> None:
    """ Set the text colour inside a run properties element.

        An existing w:color child is re-used so that the properties never end up
        with two colour elements.
    """
    color = rPr.find(f'{wns}color')
    if color is None:
        rPr.append(color := ET.Element(f'{wns}color'))
    color.set(f'{wns}val', colour)


def _enclosingRun(e):
    """ Return the w:r element whose run properties contain the element, or None.

        None is returned when the grandparent is not a run (for example when the
        element belongs to paragraph mark properties) or when the run has no parent.
    """
    rPr = e.getparent()
    run = rPr.getparent() if rPr is not None else None
    if run is None or run.tag != f'{wns}r' or run.getparent() is None:
        return None
    return run


def _wrapRun(run, tag:str, author:Optional[str], date:Optional[str]):
    """ Wrap a run into a new change element (w:ins or w:del) and return the wrapper.

        The wrapper takes the position of the run in the run's parent.
    """
    parent = run.getparent()
    parent.insert(parent.index(run), wrapper := ET.Element(f'{wns}{tag}'))
    if author:
        wrapper.set(f'{wns}author', author)
    if date:
        wrapper.set(f'{wns}date', date)
    wrapper.append(run)    # appending moves the run from its old parent into the wrapper
    return wrapper


def _markChanges(tree,
                 formatTag:str,
                 colour:str,
                 changeTag:str,
                 author:Optional[str],
                 date:Optional[str],
                 colourOnly:Optional[bool]) -> None:
    """ Convert all runs carrying the given formatting element to coloured or tracked text.

        Args:
            tree: The parsed document.xml tree. It is modified in place.
            formatTag: Local name of the formatting element to look for ('u' or 'strike').
            colour: The colour value to set in colour-only mode.
            changeTag: Local name of the tracked change element ('ins' or 'del').
            author: Optional author of the tracked changes.
            date: Optional date of the tracked changes.
            colourOnly: If True then only the colour is set.
    """
    # findall() returns a list, so the tree can be modified while looping
    for e in tree.findall(f'.//{wns}{formatTag}'):
        if not _isActive(e):
            continue    # explicitly switched off, this is not an improvised change

        if colourOnly:
            _setColour(e.getparent(), colour)
            continue

        if (run := _enclosingRun(e)) is None:
            continue    # not part of a run, wrapping would corrupt the structure

        _wrapRun(run, changeTag, author, date)

        if changeTag == 'del':
            # Deleted text must be stored in delText elements. Renaming the elements
            # keeps their position, text and attributes (e.g. xml:space="preserve").
            for t in run.findall(f'{wns}t'):
                t.tag = f'{wns}delText'

        # Remove the improvised formatting, the tracked change replaces it
        e.getparent().remove(e)


def convertChanges(fn:str,
                   outputFn:str,
                   author:Optional[str] = None,
                   date:Optional[str] = None,
                   colourOnly:Optional[bool] = False) -> None:
    """ Converts all improvised changes in a docx file to either coloured text or tracked changes.

        All underlined text is converted to either green text or tracked insertions.
        All striked text is converted to either red text or tracked deletions.
        In colour-only mode the underline and strike formatting itself is kept.

        Args:
            fn: The path to the docx file.
            outputFn: The path and filename to the output docx file.
            author: The author of the changes. Defaults to None.
            date: The date of the changes. Defaults to None.
            colourOnly: If True, only the colour of the changes is changed, but not converted to tracked changes. Defaults to False.
    """

    # Create a temporary directory (automatically cleaned up after the with statement)
    with tempfile.TemporaryDirectory() as tmpDir:

        # Extract the docx file to the temporary directory.
        # The archive is closed again before anything is written, so that the
        # output file may even be the same file as the input file.
        with zipfile.ZipFile(fn) as archive:
            originalFileList = archive.namelist()
            archive.extractall(tmpDir)

        # Get the path to the document.xml file
        xmlFn = f'{tmpDir}/word/document.xml'

        # Register all namespaces in the given XML file.
        # For this the XML file is parsed once and all namespaces are registered.
        for _, (prefix, uri) in ET.iterparse(xmlFn, events = ['start-ns']):
            if not prefix:
                continue    # a default namespace has no prefix that could be registered
            try:
                ET.register_namespace(prefix, uri)
            except ValueError:
                # Best effort only: some prefix forms are rejected by the library.
                # The parsed tree keeps its own namespace declarations anyway.
                continue

        # Parse and handle the XML file. Passing the file name lets the parser
        # deal with the encoding stated in the XML declaration.
        tree = ET.parse(xmlFn)

        # Handle insertions (underlined text)
        _markChanges(tree, 'u', _insertColour, 'ins', author, date, colourOnly)

        # Handle deletions (striked text)
        _markChanges(tree, 'strike', _deleteColour, 'del', author, date, colourOnly)

        # Write back the modified XML file, including the XML declaration
        tree.write(xmlFn, xml_declaration = True, encoding = 'UTF-8', standalone = True)

        # Create a new docx file with the modified XML file.
        # The with statement guarantees that the archive is finalized and closed.
        with zipfile.ZipFile(outputFn, 'w') as outArchive:
            for name in originalFileList:
                outArchive.write(f'{tmpDir}/{name}', name, compress_type = zipfile.ZIP_DEFLATED, compresslevel = 9)


def main() -> None:
    """ Parse the command line arguments and convert the given document.

        This function is the target of the console script entry point
        'addTrackedChanges=addTrackedChanges:main' that is declared in setup.py.
    """

    # Parse command line arguments
    parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--outfile', '-o', action = 'store', dest = 'outfile', default = 'out.docx', metavar = '<output filename>', help = 'specify the output name for the Word document')
    parser.add_argument('--author', '-a', action = 'store', dest = 'author', default = None, metavar = '<author>', help = 'specify the author of the changes')
    parser.add_argument('--date', '-d', action = 'store', dest = 'date', default = None, metavar = '<date>', help = 'specify the date of the changes (e.g. "2023-07-21T14:09:02")')
    parser.add_argument('--colourOnly', '-c', action = 'store_true', dest = 'colourOnly', default = False, help = 'only set the colour of the changes, but do not convert them to tracked changes')
    parser.add_argument('document', help = 'document to parse')
    args = parser.parse_args()

    # Convert the changes
    convertChanges(args.document,
                   args.outfile,
                   author = args.author,
                   date = args.date,
                   colourOnly = args.colourOnly)


if __name__ == '__main__':
    main()