From 0a04e26e3f873721592a3fa607a0313f3dc9172d Mon Sep 17 00:00:00 2001
From: ankraft <an.kraft@gmail.com>
Date: Fri, 21 Jul 2023 15:27:46 +0200
Subject: [PATCH] Added "addTrackedChanges.py"

---
 pandocFilter/addTrackedChanges.py | 152 ++++++++++++++++++++++++++++++
 pandocFilter/requirements.txt     |   8 +-
 pandocFilter/setup.py             |   5 +-
 3 files changed, 160 insertions(+), 5 deletions(-)
 create mode 100644 pandocFilter/addTrackedChanges.py

diff --git a/pandocFilter/addTrackedChanges.py b/pandocFilter/addTrackedChanges.py
new file mode 100644
index 0000000..302b753
--- /dev/null
+++ b/pandocFilter/addTrackedChanges.py
@@ -0,0 +1,152 @@
+#
+#	addTrackedChanges.py
+#
+#	Script to convert all improvised changes in a docx file (underlines, strike-throughs) 
+#	to either coloured text or tracked changes.
+#
+#	(c) 2023 by Andreas Kraft, Miguel Ortega
+#	License: BSD 3-Clause License. See the LICENSE file for further details.
+#
+
+from typing import Optional
+import argparse, tempfile, zipfile
+import lxml.etree as ET
+
+wns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'	# WordprocessingML main namespace in Clark notation ("{uri}"); prepended to tag and attribute names
+
+
+def _setColour(rPr:ET._Element, colour:str) -> None:
+	"""	Give the run properties *rPr* exactly one "color" element with the value *colour*.
+	"""
+	for old in rPr.findall(f'{wns}color'):	# drop an existing colour instead of adding a second one
+		rPr.remove(old)
+	rPr.append(new := ET.Element(f'{wns}color'))
+	new.set(f'{wns}val', colour)
+
+
+def _wrapRun(run:ET._Element, tag:str, author:Optional[str], date:Optional[str]) -> None:
+	"""	Wrap *run* in a new tracked change element *tag* ("ins" or "del") that takes over its position.
+	"""
+	parent = run.getparent()
+	parent.insert(parent.index(run), change := ET.Element(f'{wns}{tag}'))
+	if author:
+		change.set(f'{wns}author', author)
+	if date:
+		change.set(f'{wns}date', date)
+	change.append(run)	# appending moves the run out of its old parent
+
+
+def convertChanges(fn:str, 
+				   outputFn:str,
+				   author:Optional[str] = None, 
+				   date:Optional[str] = None, 
+				   colourOnly:Optional[bool] = False) -> None:
+	"""	Converts all improvised changes in a docx file to either coloured text or tracked changes.
+
+		All underlined text is converted to either green text or tracked insertions.
+		All struck-through text is converted to either red text or tracked deletions.
+
+		Underlines with the value "none", switched-off strike elements and (for tracked changes)
+		formatting that does not belong to a run, e.g. paragraph mark properties, are left alone.
+
+		Args:
+			fn: The path to the docx file.
+			outputFn: The path and filename to the output docx file.
+			author: The author of the changes. Defaults to None.
+			date: The date of the changes. Defaults to None.
+			colourOnly: If True, only the colour of the changes is changed, but not converted to tracked changes. Defaults to False.
+	"""
+	onValues = ('true', '1', 'on')	# attribute values that switch an on/off property on
+	runTag = f'{wns}r'
+
+	# Create a temporary directory (automatically cleaned up after the with statement)
+	with tempfile.TemporaryDirectory() as tmpDir:
+
+		# Extract the docx file to the temporary directory and close the archive again
+		with zipfile.ZipFile(fn) as docx:
+			originalFileList = docx.namelist()
+			docx.extractall(tmpDir)
+
+		# Get the path to the document.xml file
+		xmlFn = f'{tmpDir}/word/document.xml'
+
+		# Register all namespace prefixes of the XML file, so that newly created elements use them.
+		# Prefixes that lxml refuses to register (ValueError) are skipped instead of aborting.
+		for _, (prefix, uri) in ET.iterparse(xmlFn, events = ['start-ns']):
+			try:
+				ET.register_namespace(prefix, uri)
+			except ValueError:
+				continue
+
+		# Parse by path: the parser then reads bytes and detects the document's encoding itself
+		tree = ET.parse(xmlFn)
+
+		#
+		# Handle insertions
+		#
+
+		# Find all underline elements
+		for e in tree.findall(f'.//{wns}u'):
+			if e.get(f'{wns}val') == 'none':	# explicitly not underlined
+				continue
+			rPr = e.getparent()
+			if colourOnly:
+				_setColour(rPr, '00FF00')
+				continue
+			if (run := rPr.getparent()) is None or run.tag != runTag:
+				continue
+
+			# Make an ins element the parent of the run and remove the underline element
+			_wrapRun(run, 'ins', author, date)
+			rPr.remove(e)
+
+		#
+		# Handle deletions
+		#
+
+		# Find all strike elements
+		for e in tree.findall(f'.//{wns}strike'):
+			if e.get(f'{wns}val', 'true') not in onValues:
+				continue
+			rPr = e.getparent()
+			if colourOnly:
+				_setColour(rPr, 'FF0000')
+				continue
+			if (run := rPr.getparent()) is None or run.tag != runTag:
+				continue
+
+			# Turn every t element of the run into a delText element. Renaming in place keeps
+			# position and attributes (e.g. xml:space) and also copes with runs without text.
+			for t in run.findall(f'{wns}t'):
+				t.tag = f'{wns}delText'
+
+			# Make a del element the parent of the run and remove the strike element
+			_wrapRun(run, 'del', author, date)
+			rPr.remove(e)
+
+		# Write back the modified XML file as UTF-8 with an XML declaration
+		tree.write(xmlFn, encoding = 'UTF-8', xml_declaration = True, standalone = tree.docinfo.standalone)
+
+		# Create the new docx file. Leaving the with block closes and thereby finalises the archive.
+		with zipfile.ZipFile(outputFn, 'w', compression = zipfile.ZIP_DEFLATED, compresslevel = 9) as out:
+			for name in originalFileList:
+				out.write(f'{tmpDir}/{name}', name)
+
+
+def main() -> None:
+	"""	Entry point of the script and of the "addTrackedChanges" console script: parse the command line and convert the document. """
+	# Parse command line arguments
+	parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
+	parser.add_argument('--outfile', '-o', action = 'store', dest = 'outfile', default = 'out.docx', metavar = '<output filename>',  help = 'specify the output name for the Word document')
+	parser.add_argument('--author', '-a', action = 'store', dest = 'author', default = None, metavar = '<author>',  help = 'specify the author of the changes')
+	parser.add_argument('--date', '-d', action = 'store', dest = 'date', default = None, metavar = '<date>',  help = 'specify the date of the changes (e.g. "2023-07-21T14:09:02")')
+	parser.add_argument('--colourOnly', '-c', action = 'store_true', dest = 'colourOnly', default = False,  help = 'only set the colour of the changes, but do not convert them to tracked changes')
+	parser.add_argument('document',  help = 'document to parse')
+	args = parser.parse_args()
+
+	# Convert the changes
+	convertChanges(args.document, args.outfile, author = args.author, date = args.date, colourOnly = args.colourOnly)
+
+
+if __name__ == '__main__':
+	main()
diff --git a/pandocFilter/requirements.txt b/pandocFilter/requirements.txt
index 986b28e..0422266 100644
--- a/pandocFilter/requirements.txt
+++ b/pandocFilter/requirements.txt
@@ -4,13 +4,13 @@
 #
 #    pip-compile
 #
-markdown-it-py==2.2.0
+lxml==4.9.3
+    # via oneM2M-markdown-to-pandoc-filter (setup.py)
+markdown-it-py==3.0.0
     # via rich
 mdurl==0.1.2
     # via markdown-it-py
 pygments==2.15.1
     # via rich
-rich==13.3.5
+rich==13.4.2
     # via oneM2M-markdown-to-pandoc-filter (setup.py)
-requests==2.31.0
-unidiff==0.7.5
diff --git a/pandocFilter/setup.py b/pandocFilter/setup.py
index 066d12e..aa23d04 100644
--- a/pandocFilter/setup.py
+++ b/pandocFilter/setup.py
@@ -9,10 +9,13 @@ setup(
 	packages=find_packages(),
 	install_requires=[
 		'rich',
+        'lxml',
 	],
     entry_points= {
             'console_scripts' : ['pandocFilter=pandocFilter:main',
-								 'changemarks=changemarks:main']
+								 'changemarks=changemarks:main',
+								 'addTrackedChanges=addTrackedChanges:main',
+			]
             }
 
 )
-- 
GitLab