Changed generated file names to section numbers or a short hash. Added verbose...

Changed generated file names to section numbers or a short hash. Added `--verbose` and `--very-verbose` command-line arguments. Added conversion of notes to MkDocs admonitions.

Changed generated file names to section numbers or a short hash. Added verbose...
c47b9d6a · Andreas Kraft · b52168fa · c47b9d6a
Commit c47b9d6a authored 1 year ago by Andreas Kraft
--- a/toMkdocs/toMkdocs.py
+++ b/toMkdocs/toMkdocs.py
@@ -7,31 +7,81 @@
 #	directory structure.
 #

-from typing import Tuple
-import argparse, re, os, shutil
+from enum import Enum, auto
+import argparse, re, os, shutil, hashlib, base64
 from dataclasses import dataclass
 from rich import print

+verbose = False
+veryVerbose = False
+
+class LineType(Enum):
+	"""	Classifies a single line of the markdown input.
+
+		The explicit values are the ones that `auto()` assigns in
+		declaration order (1, 2, 3, ...).
+	"""
+	HEADING = 1			# heading line ('#', '##', ...)
+	TEXT = 2			# any other line; the default type of a `Line`
+	CODEFENCESTART = 3	# line that opens a fenced code block
+	CODE = 4			# line inside a fenced code block
+	CODEFENCEEND = 5	# line that closes a fenced code block
+	LIST = 6			# presumably a list item; not assigned in the visible code
+	NOTE = 7			# blockquote line (starts with '>')
+
+@dataclass
+class Line:
+	"""	Represents a line in the markdown file.
+
+		Attributes:
+			text: The content of the line. Lines read from the input file
+				keep their line ending.
+			lineType: The classification of the line. Lines are plain
+				text unless stated otherwise.
+	"""
+	text:str
+	lineType:LineType = LineType.TEXT
+
+
+
 @dataclass
 class Clause:
 	"""	Represents a clause in the markdown file. """
+	# Heading depth: the number of leading '#' characters of the heading.
+	# The clause that collects everything before the first heading has level 0.
 	level:int
+	# Clause number found in the heading text or, when there is none, a
+	# short hash of the title. Used as the base name of the generated file.
+	clauseNumber:str
+	# Heading text without the leading '#' characters.
 	title:str
-	lines:list[str]
+	# The lines that belong to the clause.
+	lines:list[Line]
+	# True when the clause only gets a navigation entry and no file of its own.
 	onlyNav:bool = False

-fnLength = 4
+	def asStringList(self) -> list[str]:
+		"""	Return the clause as a list of strings.
+
+			The line types are dropped; only the text of each line is
+			returned, in the order of the lines.
+
+			Returns:
+				The clause's lines as a list of strings.
+		"""
+		return [ l.text for l in self.lines ]
+

+# Heading line. Group 1 is the run of '#' characters, group 2 the heading text.
 _matchHeader = re.compile(r'(#+)\s+(.*)', re.IGNORECASE)
-_matchCodefence = re.compile(r'\s*```\s?.*', re.IGNORECASE)
+# Clause number inside a heading text: a letter or digit followed by digits
+# and optional dotted numeric parts, e.g. "6", "6.2.1" or "A.1".
+# NOTE(review): a stand-alone letter also matches, e.g. the article "a".
+_matchHeaderNumber = re.compile(r'\b[A-Za-z0-9]\d*(\.\d+)*\b', re.IGNORECASE)
+# Line that starts (after optional whitespace) with a code fence, optionally
+# followed by any text such as a language name.
+# NOTE(review): this also matches a bare closing fence. Code that tests this
+# pattern before _matchCodefenceEnd can never reach the end pattern.
+_matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE)
+# Line that starts (after optional whitespace) with a code fence. With
+# `match()` trailing text is not excluded, so opening fences match as well.
+_matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE)
+# Blockquote line, i.e. a line that starts with '>'.
+_matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE)
+# List item that is indented by exactly two whitespace characters.
 _match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
-_markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#.*)\)', re.IGNORECASE)
+# Markdown link to an anchor in the same document, "[text](#anchor)".
+# Group 1 is the target including the '#'. The leading [^!] skips images.
+# NOTE(review): [^!] consumes a character, so a link at the very start of a
+# line is not matched.
+_markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)
+# HTML link, '<a href="...">text</a>'. Group 1 is the href value.
 _htmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE)
+# HTML anchor, '<a name="...">text</a>'. Group 1 is the anchor name.
 _htmlAnchorLink = re.compile(r'<a\s+name="([^"]*)">[^<]*</a>', re.IGNORECASE)
+# Prefix of a blockquote line: '>', an optional "note" and an optional ':'.
+# NOTE(review): the trailing \s* also consumes the line ending when nothing
+# else follows the prefix, and "note" has no word boundary, so the start of
+# words like "Notes" is matched as well.
+_matchNoteStart = re.compile(r'^\s*>\s*(note)?\s*[:]?\s*', re.IGNORECASE)


 # TODO handle multiple nav levels (left bar) better (make configurable)
-# TODO Update links in the markdown files to the new structure
+
+
+def shortHash(value:str, length:int) -> str:
+	"""	Generate a short hash of a string value.
+
+		The hash is the beginning of the base64 encoded SHA-256 digest of
+		the value. The URL- and filename-safe base64 alphabet is used, so
+		the result only contains letters, digits, '-' and '_' (and '=' when
+		the length reaches the padding at the end).
+
+		Args:
+			value: The value to hash.
+			length: The maximum length of the hash. The encoded digest has
+				44 characters, so larger values return all 44 characters.
+
+		Returns:
+			The hash.
+	"""
+	# The standard base64 alphabet contains '/' and '+'. The hash is used as
+	# a file name and as a link target, where '/' would be taken as a
+	# directory separator. The URL-safe alphabet uses '-' and '_' instead.
+	digest = hashlib.sha256(value.encode()).digest()
+	return base64.urlsafe_b64encode(digest).decode()[:length]
+

 def analyseMarkdown(filename:str) -> list[Clause]:
 	"""	Analyse the markdown file and split it into clauses.
@@ -48,26 +98,44 @@ def analyseMarkdown(filename:str) -> list[Clause]:
 	with open(filename, 'r') as file:
 		inLines = file.readlines()
 	
-	outLines:list[Clause] = [Clause(0, '', [])]
+	outLines:list[Clause] = [Clause(0, '', '', [])]

 	# Go through the lines and detect headers and codefences
 	inCodefence = False
 	for line in inLines:

 		# Detect codefences
-		if _matchCodefence.match(line):
-			inCodefence = not inCodefence
+		if _matchCodefenceStart.match(line):
+			inCodefence = True
+			outLines[-1].lines.append(Line(line, LineType.CODEFENCESTART))
+			continue
+
+		if _matchCodefenceEnd.match(line):
+			inCodefence = False
+			outLines[-1].lines.append(Line(line, LineType.CODEFENCEND))
+			continue
+
 		if inCodefence:
-			outLines[-1].lines.append(line)
+			outLines[-1].lines.append(Line(line, LineType.CODE))
+			continue
+	
+		# Detect notes
+		if _matchNote.match(line):
+			outLines[-1].lines.append(Line(line, LineType.NOTE))
 			continue
  
 		# Detect headers
+		_lineType = LineType.TEXT
 		if (m := _matchHeader.match(line)):
-			level = len(m.groups()[0])
 			clauseTitle = m.groups()[1].strip()
-			outLines.append(Clause(level, clauseTitle, []))
+			headerNumber = _matchHeaderNumber.search(clauseTitle)
+			outLines.append(Clause(len(m.groups()[0]), # level
+						  		   headerNumber.group() if headerNumber else shortHash(clauseTitle, 6),
+								   clauseTitle, 
+								   []))
+			_lineType = LineType.HEADING

-		outLines[-1].lines.append(line)
+		outLines[-1].lines.append(Line(line, _lineType))

 	return outLines

@@ -88,7 +156,7 @@ def splitMarkdownDocument(clauses:list[Clause],
 		Returns:
 			The list of clauses.
 	"""
-	outLines:list[Clause] = [Clause(0, '', [])]
+	outLines:list[Clause] = [Clause(0, '', '', [])]

 	for clause in clauses:
 		level = clause.level
@@ -100,7 +168,7 @@ def splitMarkdownDocument(clauses:list[Clause],
 		# Add a new output clause if the current clause's level is 
  		# equal or less than the split level
 		if clause.level <= splitLevel:
-			outLines.append(Clause(level, clause.title, []))
+			outLines.append(Clause(level, clause.clauseNumber, clause.title, []))
 		
 		# Add the lines to the output clause
 		outLines[-1].lines.extend(clause.lines)
@@ -131,64 +199,26 @@ def prepareForMkdocs(clauses:list[Clause]) -> list[Clause]:
 		if len(clause.lines) > 0:
 			clause.lines.pop(0)
 			# Also, remove the first empty lines if they exist
-			while len(clause.lines) > 0 and clause.lines[0].strip() == '':
+			while len(clause.lines) > 0 and clause.lines[0].text.strip() == '':
 				clause.lines.pop(0)
 	
 	# Mark the whole clause if it is the first AND NOT only clause
 	# for a parent clause. Then it is usually empty except the heading.
 	# We still need it for navigation, so we mark it as onlyNav
 	for clause in clauses:
-		if len(''.join(clause.lines).strip()) == 0 and clause.level > 0:
+		if len(''.join(clause.asStringList()).strip()) == 0 and clause.level > 0:
 			clause.onlyNav = True

 	# Repair wrong markdown for indented lines.
 	# Add 2 spaces to existing 2-space indentions
 	for clause in clauses:
 		for i, line in enumerate(clause.lines):
-			if _match2spaceListIndention.match(line):
-				clause.lines[i] = '  ' + line
+			if _match2spaceListIndention.match(line.text):
+				clause.lines[i].text = '  ' + line.text
 	
 	return clauses


-def writeClauses(outLines:list[Clause], filename:str, navTitle:str) -> None:
-	"""	Write the clauses to separate files and create a navigation file.
-
-		Args:
-			outLines: The list of clauses.
-			filename: The name of the original markdown file.
-			navTitle: The title of the navigation entry. This is used to determine the directories.
-	"""
-
-	# Write the files
-	# create directory first
-	os.makedirs(f'{os.path.dirname(filename)}/{navTitle}', exist_ok = True)
-	for i, f in enumerate(outLines):
-		if len(f.lines) == 0 or f.onlyNav:	# ignore empty clauses or clauses that are only for navigation
-			print(f'[green]Navigation only   - "{f.title}"')
-			continue
-	
-		# write to single files
-		print(f'[green]Writing "{i:0{fnLength}}.md" - "{f.title}"')
-		with open(f'{os.path.dirname(filename)}/{navTitle}/{i:0{fnLength}}.md', 'w') as file:
-			file.writelines(f.lines)
-
-	
-	# write nav.yml file
-	print(f'[green]Writing "_nav.yml"')
-	with open(f'{os.path.dirname(filename)}/_nav.yml', 'w') as file:
-		file.write(f'  - {navTitle}:\n')
-		for i, f in enumerate(outLines):
-			if f.onlyNav:
-				file.write(f"  {'  '*f.level}- '{f.title}':\n")
-				#file.write(f"{'  '*f.level}- '{f.title}':\n")
-			else:
-				if len(f.lines) == 0:
-					continue
-				file.write(f"  {'  '*f.level}- '{f.title}': '{navTitle}/{i:0{fnLength}}.md'\n")
-				#file.write(f"{'  '*f.level}- '{f.title}': '{navTitle}/{i:0{fnLength}}.md'\n")
-
-
 def updateLinks(clauses:list[Clause]) -> list[Clause]:
 	"""	Update the links in the clauses to the new structure. This is done by
 		creating a dictionary of all links and their targets and then replacing
@@ -202,47 +232,133 @@ def updateLinks(clauses:list[Clause]) -> list[Clause]:
 	"""
 	print(f'[green]Updating links in clauses')

-	# Build the link target dictionary. Mapping anchor -> (clause index, clause)
-	linkTargets:dict[str, Tuple[int, str]] = {}
+	# Build the link target dictionary. Mapping anchor -> clause
+	linkTargets:dict[str, Clause] = {}

 	# Find all Markdown headers in the clauses and convert them to anchor format
 	for i, clause in enumerate(clauses):
 		# Find all headers in the clause
 		for line in clause.lines:
-			if (m := _matchHeader.match(line)):
+			if (m := _matchHeader.match(line.text)):
 				# convert the header to anchor format and add it to the dictionary
+				# Remove special characters
 				# TODO move perhaps to an own function
-				anchor = m.groups()[1].strip().casefold().replace(' ', '-').replace('.', '')
-				linkTargets[f'#{anchor}'] = (i, clause)
+				anchor = m.groups()[1].strip().casefold().replace(' ', '-')
+				for c in ( '.', '(', ')', '[', ']', ':', ',', "'", '"'):
+					anchor = anchor.replace(c, '')
+				linkTargets[f'#{anchor}'] = clause
+				if veryVerbose:
+					print(f'[dim]Added Markdown anchor "{anchor}"')

 	# Find all HTML anchors in the clauses and add them to the dictionary
 	for i, clause in enumerate(clauses):
 		for line in clause.lines:
-			if (anchors := _htmlAnchorLink.findall(line)):
+			if (anchors := _htmlAnchorLink.findall(line.text)):
 				for a in anchors:
-					linkTargets[f'#{a}'] = (i, clause)
-					print(f'[green]Found anchor "{a}" in clause "{clause.title}"')
+					linkTargets[f'#{a}'] = clause
+					if veryVerbose:
+						print(f'[dim]Found HTML anchor "{a}" in clause "{clause.title}"')

 	# Replace the html links
 	for clause in clauses:
 		for i, line in enumerate(clause.lines):
-			if (links := _htmlLink.findall(line)):
+			if (links := _htmlLink.findall(line.text)):
 				for lnk in links:
-					width = 4
 					if lnk in linkTargets:
-						line = clause.lines[i] = line.replace(lnk, f'../{linkTargets[lnk][0]:0{width}}/#{lnk[1:]}')	# Update the current line as well
+						line.text = clause.lines[i].text = line.text.replace(lnk, f'../{linkTargets[lnk].clauseNumber}/#{lnk[1:]}')	# Update the current line as well
+				if veryVerbose:
+					print(f'[dim]Updated HTML link "{lnk}" in clause "{clause.title}"')

 	# Replace the markdown links
 	for clause in clauses:
 		for i, line in enumerate(clause.lines):
-			if (links := _markdownLink.findall(line)):
+			if (links := _markdownLink.findall(line.text)):
+				# Replace the old link targets with converted 
+				# (lower case) versions that point to the output files
 				for lnk in links:
-					if lnk in linkTargets:
-						line = clause.lines[i] = line.replace(lnk, f'../{linkTargets[lnk][0]:0{fnLength}}/#{lnk[1:]}')	# Update the current line as well
+					_lnk =lnk.casefold()
+					if _lnk in linkTargets:
+						line.text = clause.lines[i].text = line.text.replace(lnk, f'../{linkTargets[_lnk].clauseNumber}/#{lnk[1:]}')	# Update the current line as well
+				if veryVerbose:
+					print(f'[dim]Updated Markdown link "{lnk}" in clause "{clause.title}"')
+
+	return clauses
+
+
+def updateNotes(clauses:list[Clause]) -> list[Clause]:
+	"""	Update the notes in the clauses to the mkDocs notes version.
+
+		Consecutive lines of type NOTE are turned into one admonition: an
+		empty line and a "!!! note" line are put in front of the first
+		line, the blockquote prefix of every line is replaced by a tab,
+		and an empty line is added before the next line that is not a note.
+		The lines of each clause are replaced by the converted lines.
+
+		Args:
+			clauses: The list of clauses.
 		
+		Returns:
+			The list of clauses.
+	"""
+	print(f'[green]Updating notes in clauses')
+
+	for clause in clauses:
+		lines:list[Line] = []
+		inNote = False
+		for line in clause.lines:
+			match line.lineType:
+				case LineType.NOTE:
+					if not inNote:
+						lines.append(Line('\n', LineType.TEXT))
+						lines.append(Line('!!! note\n', LineType.NOTE))
+						inNote = True
+					noteText = _matchNoteStart.sub('', line.text, count = 1)
+					# The prefix pattern ends with \s*. For a line that only
+					# consists of the prefix this also removes the line ending,
+					# and the next line would be joined to this one. Restore it.
+					if line.text.endswith('\n') and not noteText.endswith('\n'):
+						noteText += '\n'
+					lines.append(Line(f'\t{noteText}', LineType.NOTE))
+					if verbose:
+						print(f'[dim]Converted note in clause "{clause.title}"')
+				case _:
+					if inNote:
+						lines.append(Line('\n', LineType.TEXT))
+					inNote = False
+					lines.append(line)
+		clause.lines = lines
 	return clauses


+def writeClauses(outLines:list[Clause], filename:str, navTitle:str) -> None:
+	"""	Write the clauses to separate files and create a navigation file.
+
+		Every clause that has lines and is not marked as navigation-only is
+		written to "<navTitle>/<clauseNumber>.md" in the directory of the
+		original markdown file. The navigation file "_nav.yml" is written to
+		the directory of the original markdown file.
+
+		Args:
+			outLines: The list of clauses.
+			filename: The name of the original markdown file.
+			navTitle: The title of the navigation entry. This is used to determine the directories.
+	"""
+
+	def yamlQuote(value:str) -> str:
+		"""	Return the value as a single-quoted YAML scalar. """
+		# Inside a single-quoted YAML scalar a quote is escaped by doubling it.
+		# Without this a title such as "Device's state" gives invalid YAML.
+		return "'" + value.replace("'", "''") + "'"
+
+	print(f'[green]Writing clauses to files')
+	baseDirectory = os.path.dirname(filename)
+
+	# Write the files
+	# create directory first
+	os.makedirs(f'{baseDirectory}/{navTitle}', exist_ok = True)
+	for f in outLines:
+		if len(f.lines) == 0 or f.onlyNav:	# ignore empty clauses or clauses that are only for navigation
+			if verbose:
+				print(f'[dim]Navigation only - "{f.title}"')
+			continue
+	
+		# write to single files
+		if verbose:
+			print(f'[dim]Writing "{f.clauseNumber}.md" - "{f.title}"')
+		with open(f'{baseDirectory}/{navTitle}/{f.clauseNumber}.md', 'w') as file:
+			file.writelines(f.asStringList())
+
+	
+	# write nav.yml file
+	print(f'[green]Writing "_nav.yml"')
+	with open(f'{baseDirectory}/_nav.yml', 'w') as file:
+		if veryVerbose:
+			print(f'[dim]Writing navigation file')
+		file.write(f'  - {navTitle}:\n')
+		for f in outLines:
+			# The nesting in the navigation follows the level of the clause
+			indent = '  ' * f.level
+			if f.onlyNav:
+				file.write(f'  {indent}- {yamlQuote(f.title)}:\n')
+			elif len(f.lines) > 0:
+				target = f'{navTitle}/{f.clauseNumber}.md'
+				file.write(f'  {indent}- {yamlQuote(f.title)}: {yamlQuote(target)}\n')
+
+
 def copyMediaFiles(filename:str, navTitle:str, mediaDirectory:str = 'media') -> None:
 	"""	Copy media files from the source directory to the target directory.

@@ -262,14 +378,18 @@ def copyMediaFiles(filename:str, navTitle:str, mediaDirectory:str = 'media') ->

 	
 def processDocument(args:argparse.Namespace) -> None:
-	global fnLength
+	global verbose, veryVerbose
 	document = os.path.abspath(args.document)
-	fnLength = args.filename_length
+	veryVerbose = args.very_verbose
+	verbose = args.verbose
+	if veryVerbose:
+		verbose = True

 	# Analyse the markdown file
 	clauses = analyseMarkdown(document)
 	clauses = splitMarkdownDocument(clauses, [ t.casefold() for t in args.ignore_clause ], args.split_level)
 	clauses = updateLinks(clauses)
+	clauses = updateNotes(clauses)
 	clauses = prepareForMkdocs(clauses)

 	# Write the clauses to files
@@ -282,11 +402,12 @@ def processDocument(args:argparse.Namespace) -> None:
 if __name__ == '__main__':
+	# Command line entry point: parse the arguments and process the document
 	parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)

+	# Both flags are plain switches. processDocument() turns on verbose
+	# output as well when very verbose output is requested.
+	parser.add_argument('--verbose', '-v', action = 'store_true', help = 'verbose output during processing')
+	parser.add_argument('--very-verbose', '-vv', action = 'store_true', help = 'very verbose output during processing')
 	parser.add_argument('--title', '-t', metavar = 'title', required = True, help = 'mkdocs navigation tile')
 	parser.add_argument('--ignore-clause', '-ic', metavar = 'clause', nargs = '+', default = [ 'Contents', 'History' ], help = 'ignore headers in the markdown document')
 	parser.add_argument('--split-level', '-sl', metavar = 'level', type = int, default = 2, help = 'split clauses on which level')
 	parser.add_argument('--media-directory', '-md', metavar = 'media-directory', default = 'media', help = 'directory name where media files are stored')
-	parser.add_argument('--filename-length', '-fl', metavar = 'length', default = 4, help = 'length of the filename with leading zeros')
 	parser.add_argument('document', type = str, help = 'a oneM2M markdown specification document to process')
 	args = parser.parse_args()
 	processDocument(args)