Commit 29b1bf60 authored by Miguel Angel Reina Ortega

Generation of changemark CR:

- generation of a changemark MD file per modified clause
- conversion of the MD files to docx files
- combination of the docx files into the CR docx file
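These three steps map onto the docker stages added to generate_changemarks.sh below. A condensed sketch of the flow, assuming the image names used in this commit; the clause file name (6.2.3.md), the delimiter/coversheet paths and the $HOST_URL/$PROJECT_ID/$MERGE_IID values are illustrative placeholders:

# 1) Generate one markdown file per modified clause (new changemarks entry point in the pandocfilter image)
docker run --rm -v "$(pwd)":/tmp/ pandocfilter:latest changemarks -o /tmp/out "$HOST_URL" "$PROJECT_ID" "$MERGE_IID"
# 2) Preprocess each generated MD file, then convert it to docx with pandoc and the oneM2M reference template
docker run --rm -v "$(pwd)":/tmp/ pandocfilter:latest pandocFilter -o /tmp/out /tmp/out/6.2.3.md
docker run --rm -v "$(pwd)":/data pandoc/core:3.1.1.0 /data/out/6.2.3.md -f markdown -t docx --reference-doc Spec-template.docx -o /data/out/6.2.3.md.docx
# 3) Combine the per-clause docx files into the CR docx with forgelib-changedocs
docker run --rm -v "$(pwd)":/tmp/ forge.3gpp.org:5050/tools/3gpp-scripts/forgelib:miguel forgelib-changedocs -sf /tmp/out/ --preprocessor onem2m --outPath=/tmp/docs --startdelimiter /tmp/start.docx --enddelimiter /tmp/end.docx --coversheet /tmp/cover.docx "$HOST_URL" "$PROJECT_ID" "$MERGE_IID"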
parent e4f5e2da
@@ -28,7 +28,7 @@ Build pythonForPandocFilter docker image:
       - pandocFilter/setup.py
       - pandocFilter/requirements.txt
       - pandocFilter/pandocFilter.py
+      - pandocFilter/changemarks.py
 
 Word CR text:
   stage: generation
@@ -36,7 +36,7 @@ Word CR text:
       - merge_requests
   before_script:
     - |
-      curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/generate_changemarks%2Esh/raw?ref=master" >> generate_changemarks.sh
+      curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/generate_changemarks%2Esh/raw?ref=miguel" >> generate_changemarks.sh
    - chmod +x generate_changemarks.sh
    - |
      curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/onem2m_delimiter_start%2Edocx/raw?ref=master" >> onem2m_delimiter_start.docx
 #!/bin/bash
-DOCKER_IMAGE=forge.3gpp.org:5050/tools/3gpp-scripts/forgelib:v2.2.0
+FORGELIB_DOCKER_IMAGE=forge.3gpp.org:5050/tools/3gpp-scripts/forgelib:miguel
+PANDOC_FILTER_DOCKER_IMAGE=pandocfilter:latest
+DOCKER_IMAGE=pandoc/core:3.1.1.0
 
 echo "\n------ Checking for docker image --------"
 docker pull "$DOCKER_IMAGE"
@@ -11,13 +13,26 @@ rm **/*.docx
 echo "------ Parsing repo URL --------"
 HOST_URL=$(echo $1 | cut -d'/' -f 1-3)
-PROJECT_NAME=$(echo $1 | cut -d'/' -f 4- | cut -d'.' -f 1)
+PROJECT_NAME=$(echo $1 | cut -d'/' -f 6- | cut -d'.' -f 1)
 echo "HOST URL:" $HOST_URL
 echo "PROJECT NAME:" $PROJECT_NAME
 echo "PROJECT ID:" $2
 echo "MERGE IID:" $3
 
-echo "\n------ Generating change marks --------"
-docker container run --rm -v $(pwd):/tmp/ "$DOCKER_IMAGE" forgelib-changedocs -vv --combine --diffs --preprocessor onem2m --outPath=/tmp/docs --startdelimiter "/tmp/$4" --enddelimiter "/tmp/$5" --coversheet "/tmp/$6" "$HOST_URL" "$2" "$PROJECT_NAME" "$3"
+echo "\n------ Generating change marks MD --------"
+#docker container run --rm -v $(pwd):/tmp/ "$DOCKER_IMAGE" forgelib-changedocs -vv --combine --diffs --preprocessor onem2m --outPath=/tmp/docs --startdelimiter "/tmp/$4" --enddelimiter "/tmp/$5" --coversheet "/tmp/$6" "$HOST_URL" "$2" "$PROJECT_NAME" "$3"
+docker container run --rm -v $(pwd):/tmp/ -u $(id -u):$(id -g) "$PANDOC_FILTER_DOCKER_IMAGE" changemarks -o "/tmp/out" "$HOST_URL" "$2" "$3"
+
+echo "\n------ Generating changemarks docx --------"
+for i in out/*.md ; do
+    DOCUMENT_NAME=$(echo $i | cut -d'/' -f 2)
+    echo "\n------ Preparing spec --------"
+    docker run --rm -v $(pwd):/tmp/ -u $(id -u):$(id -g) "$PANDOC_FILTER_DOCKER_IMAGE" pandocFilter -o "/tmp/out" "/tmp/$i"
+    echo "\n------ Publishing spec --------"
+    docker run --rm -v $(pwd):/data -u $(id -u):$(id -g) "$DOCKER_IMAGE" "/data/$i" -f markdown -t docx --reference-doc "Spec-template.docx" -o "/data/out/${DOCUMENT_NAME}.docx"
+done
+
+echo "\n------ Combining docx --------"
+docker container run --rm -v $(pwd):/tmp/ -u $(id -u):$(id -g) "$FORGELIB_DOCKER_IMAGE" forgelib-changedocs -vv -sf "/tmp/out/" --preprocessor onem2m --outPath=/tmp/docs --startdelimiter "/tmp/$4" --enddelimiter "/tmp/$5" --coversheet "/tmp/$6" "$HOST_URL" "$2" "$3"
 
 exit 0
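For reference, a hedged example of how the updated script would be invoked. The GitLab predefined variables stand in for the repository URL, project ID and merge request IID actually passed by the CI job; only onem2m_delimiter_start.docx is confirmed by the CI configuration above, so the end-delimiter and coversheet file names are assumptions:

# $1 = repository URL, $2 = project ID, $3 = merge request IID,
# $4 = start delimiter docx, $5 = end delimiter docx, $6 = CR coversheet docx
./generate_changemarks.sh "$CI_REPOSITORY_URL" "$CI_PROJECT_ID" "$CI_MERGE_REQUEST_IID" \
    onem2m_delimiter_start.docx onem2m_delimiter_end.docx CR_coversheet.docx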
##
# changemarks.py
#
# Script to generate a markdown file per clause modified in a merge request
#
# (c) 2023 by Miguel Angel Reina Ortega
# License: BSD 3-Clause License. See the LICENSE file for further details.
#
import argparse, os, re, sys
from rich import print
from rich.progress import Progress, TextColumn, TimeElapsedColumn
import logging
import requests
from unidiff import PatchSet


def fetch(url : str, expected_content_type : str = None) -> requests.Response:
    r = requests.get(url)
    logging.debug(f"Fetching {url}")
    if (r.status_code != 200):
        errorMessage = f"Failed attempting to retrieve {url}, status code {r.status_code}"
        logging.error(errorMessage)
        raise ValueError(errorMessage)
    if expected_content_type:
        if r.headers['Content-Type'] != expected_content_type:
            errorMessage = f"Unexpected content type retrieving {url}. Expected {expected_content_type}, got {r.headers['Content-Type']}"
            logging.error(errorMessage)
            raise ValueError(errorMessage)
    return r


def fetch_text(url : str, expected_content_type : str = None) -> str:
    r = fetch(url, expected_content_type)
    return r.text


def fetch_json(url : str, expected_content_type : str = None):
    r = fetch(url, expected_content_type)
    return r.json()


def readMDFile(progress:Progress, document:str) -> list[str]:
    """ Read the markdown file and return a list of lines.
    """
    _taskID = progress.add_task('[blue]Reading document', start=False, total=0)
    # Check if file exists
    if not os.path.exists(document):
        print(f'File {document} does not exist')
        exit(1)
    # Read the file
    with open(document, 'r', encoding='utf-8', errors = 'replace') as f:
        progress.stop_task(_taskID)
        return f.readlines()


def writeMDFile(progress:Progress, mdLines:list[str], document:str, outDirectory:str) -> None:
    """ Write the markdown file.
    """
    _taskID = progress.add_task('[blue]Writing document', start=False, total=0)
    # Write the file
    with open(f'{outDirectory}/{os.path.basename(document)}', 'w', encoding='utf-8', errors = 'replace') as f:
        f.writelines(mdLines)
    progress.stop_task(_taskID)


class Clause():
    '''
    Defines a clause of the base document
    '''

    def __init__(self, line, from_id, to_id, clause_nr):
        self.raw = line
        self.from_id = from_id
        self.to_id = to_id
        self.clause_nr = clause_nr


def find_all_clauses(progress:Progress, mdLines:list[str]):
    '''
    Scans the body of the document to find all clauses.
    Returns a list of Clauses, each holding its start and end line index.
    '''
    _taskID = progress.add_task('[blue]Find all available clauses', start=False, total=0)
    clauseregex = re.compile(r'^#+\s(\d(\.\d)*|Annex \w|\w*(\.\d)*).*')
    clauses:list[Clause] = []
    index = 1
    empty = ""
    clause = Clause(empty, 0, 0, empty)
    for line in mdLines:
        if line.startswith('#'):
            matches = re.findall(clauseregex, line)  # Match heading
            if matches:  # It may be the end of the clause or the start of a subclause
                if index - 2 == clause.from_id:  # It is a subclause
                    clause.from_id = index
                    clause.raw = line
                    clause.clause_nr = matches[0][0]
                else:  # It is the end of the clause
                    clause.to_id = index - 1
                    clauses.append(clause)
                    clause = Clause(line, index, index, matches[0][0])
            else:  # Heading that does not match the clause numbering pattern
                print("Unknown heading")
        index = index + 1
    # Append last clause (usually History)
    clause.to_id = index - 1
    clauses.append(clause)
    return clauses


class MR:

    def __init__(self, project_id, mr_id, root = "https://git.onem2m.org"):
        self.project_id = project_id
        self.mr_id = mr_id
        self.root = root
        self.raw_project_details = fetch_json(self.api_url())
        self.web_url = self.raw_project_details['web_url']
        self.raw_mr_details = fetch_json(self.api_url(f'merge_requests/{self.mr_id}'))
        self.author = self.raw_mr_details['author']['name']
        self.target_branch = self.raw_mr_details['target_branch']
        self.source_branch = self.raw_mr_details['source_branch']
        self.title = self.raw_mr_details['title']
        self.description = self.raw_mr_details['description']
        self.raw_diff = fetch_text(f'{self.web_url}/-/merge_requests/{self.mr_id}.diff', expected_content_type='text/plain')
        self.patch_set = PatchSet.from_string(self.raw_diff)

    def api_url(self, route : str = "") -> str:
        return f"{self.root}/api/v4/projects/{self.project_id}/{route}"

    def retrieve_text(self, branch: str, filename: str) -> str:
        return fetch_text(f"{self.web_url}/-/raw/{branch}/{filename}")


def find_changed_clauses(progress:Progress, mdLines:list[str], clauses:list[Clause], mr:MR, outDirectory:str):
    '''
    Determine the clauses that have been modified by the merge request.
    Returns the list of changed Clauses.
    '''
    _taskID = progress.add_task('[blue]Find changed clauses', start=False, total=0)
    changed_clauses:list[Clause] = []
    empty = ""
    changed_clause = Clause(empty, 0, 0, empty)
    for patched_file in mr.patch_set:
        if patched_file.source_file.startswith("a/TS"):
            logging.debug(f"Looking at changes in {patched_file.source_file}")
            for change in patched_file:
                # Check the previous changed_clause
                if (changed_clause.from_id <= change.target_start) and (changed_clause.to_id >= (change.target_start - 1 + change.target_length)):
                    generateMDforChange(progress, mdLines, changed_clause, change, outDirectory, True)
                    break
                i = 0
                # Check all clauses
                for clause in clauses:
                    if (clause.from_id <= change.target_start) and (clause.to_id >= (change.target_start - 1 + change.target_length)):
                        changed_clause = clauses.pop(i)
                        changed_clauses.append(clause)
                        generateMDforChange(progress, mdLines, changed_clause, change, outDirectory, False)
                        break
                    i = i + 1
    for clause in changed_clauses:
        logging.debug(f"Clause {clause.clause_nr} contains modifications")
    return changed_clauses


def generateMDforChange(progress:Progress, mdLines:list[str], changed_clause:Clause, change, outDirectory:str, existing_clause:bool):
    '''
    Generate the markdown file for a clause that has been modified by the merge request,
    marking added lines as underlined and removed lines as strikethrough.
    '''
    _taskID = progress.add_task('[blue]Generate MD for changed clauses', start=False, total=0)
    if not existing_clause:
        index = changed_clause.from_id - 1
        clauseMDlines: list[str] = []
        while index < changed_clause.to_id:
            clauseMDlines.append(mdLines[index] + '\n')
            index = index + 1
    else:
        # Re-read the clause file previously written to the output directory (file name as built by writeMDFile)
        clauseMDlines = readMDFile(progress, f'{outDirectory}/{changed_clause.clause_nr.replace(" ", "")}.md')
    j = change.target_start - changed_clause.from_id  # index gap
    for line in change:
        if (not (line.value.strip() == '') and (line.is_added)):
            clauseMDlines.insert(j, "<span class=\"underline\">" + line.value + "</span>\n\n")
            #clauseMDlines.insert(j, "<mark>" + line.value.strip("\n") + "</mark>\n\n")
            clauseMDlines.pop(j + 1)
        elif line.is_removed:
            clauseMDlines.insert(j, "~~" + line.value.strip() + "~~")
        j = j + 1
    writeMDFile(progress, clauseMDlines, changed_clause.clause_nr.replace(" ", "") + '.md', outDirectory)


def process(document:str, outDirectory:str, mr:MR) -> None:
    with Progress(TextColumn('{task.description}'), TimeElapsedColumn()) as progress:
        sourceText = mr.retrieve_text(mr.source_branch, document)
        sourceMdLines = sourceText.splitlines(keepends=False)
        clauses = find_all_clauses(progress, sourceMdLines)
        changed_clauses = find_changed_clauses(progress, sourceMdLines, clauses, mr, outDirectory)


def main(args=None):
    # Parse command line arguments
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default = 'out', metavar = '<output directory>', help = 'specify output directory')
    parser.add_argument('rootURL', help="Forge root URL")
    parser.add_argument('projectID', help="Forge project ID")
    parser.add_argument('mergeID', help="Merge IID")
    pargs = parser.parse_args()

    # Process documents and print output
    os.makedirs(pargs.outDirectory, exist_ok = True)
    mr = MR(pargs.projectID, pargs.mergeID, pargs.rootURL)
    for patched_file in mr.patch_set:
        if patched_file.source_file.startswith("a/TS"):
            filename = patched_file.source_file.split("/")[1]
            process(filename, pargs.outDirectory, mr)
        else:
            logging.debug(f"Cannot process file named {patched_file.source_file}")


if __name__ == '__main__':
    sys.exit(main())
@@ -12,3 +12,5 @@ pygments==2.15.1
     # via rich
 rich==13.3.5
     # via oneM2M-markdown-to-pandoc-filter (setup.py)
+requests==2.31.0
+unidiff==0.7.5
@@ -11,7 +11,8 @@ setup(
         'rich',
     ],
     entry_points= {
-        'console_scripts' : ['pandocFilter=pandocFilter:main']
+        'console_scripts' : ['pandocFilter=pandocFilter:main',
+                             'changemarks=changemarks:main']
     }
 )
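With the second console script registered, the changemark generator can also be exercised outside the docker pipeline once the package is installed; the project ID and merge IID below are placeholder values:

# from the pandocFilter/ directory
pip install .
# positional arguments: rootURL, projectID, mergeID (see changemarks.py)
changemarks -o out https://git.onem2m.org 1234 56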