diff --git a/spec2md.py b/spec2md.py index 5a19b8ecb680321e05272135a73874c8afb7e012..b8a1da72e74e573da2c33c115b5af507e7e4c833 100644 --- a/spec2md.py +++ b/spec2md.py @@ -52,8 +52,8 @@ unreferencedSubDir = 'unreferenced' _linebreak = '<br />' _entityLt = '<' _nbsp = ' ' -_tocInsertPoint = '__t_o_c__' -_captionMarker = '__CAPTION__' +_tocInsertPoint = '~~t~o~c~~' +_captionMarker = '~~CAPTION~~' # https://learn.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing?view=openxml-2.8.1 @@ -335,6 +335,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: _bold = '**' case 'i' if ep.attrib.get(_val, 'true') == 'true': _italics = '_' + # case _: + # _print(f'[yellow]unsupported style: {ep.tag}') # Strip white spaces if bold or italics _s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text))) @@ -342,11 +344,13 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: _s = _s.replace('_', '\\_') _s = _s.replace('*', '\\*') # Add trailing white space when bold or italics - _prefix = ' ' if _bold or _italics else '' - _result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_prefix}' + _postfix = ' ' if _bold or _italics else '' + _result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_postfix}' + # print(_result) case 'br': _result += _linebreak + case 'bookmarkStart' | 'bookmarkEnd': # TODO ? pass @@ -366,20 +370,21 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: blip = element.findall('ns1:inline/ns3:graphic/ns3:graphicData/ns4:pic/ns4:blipFill/ns3:blip', namespaces = { 'ns1' : 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing', - 'ns3' : wns, + 'ns3' : 'http://schemas.openxmlformats.org/drawingml/2006/main', 'ns4' : 'http://schemas.openxmlformats.org/drawingml/2006/picture', }) if blip and \ (rId := blip[0].attrib.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')) and \ (mediaFile := mediaRelations.get(rId)): - referencedImages.append(Path(mediaFile).stem) # Add to referenced files - if docConfig.renameEMFExtension and mediaFile.lower().endswith('.emf'): - mediaFile = str(Path(mediaFile).with_suffix(f'.{docConfig.renameEMFExtension}')) - _print(f'[yellow]Renaming EMF file reference to "{mediaFile}"') - _result += f'' + mediaFilePath = Path(mediaFile) + referencedImages.append(mediaFilePath.stem) # Add to referenced files + if docConfig.renameEMFExtension and mediaFilePath.suffix.lower() == '.emf': + mediaFilePath = mediaFilePath.with_suffix(f'.{docConfig.renameEMFExtension}') + _print(f'[yellow]Renaming EMF file reference to "{str(mediaFilePath)}"') + _result += f'})' # image reference as posix path # else: # _print(blip) - + case 'pict': # for e in element: # print(f'----{e}') @@ -423,10 +428,10 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: case Paragraph(): # type: ignore[misc] return _parseXML(ET.fromstring(elem._p.xml)) case _Cell(): # type: ignore[misc] - result = '' - for p in elem.paragraphs: - result += _parseXML(ET.fromstring(p._p.xml), True) - return result + # Iterate over all paragraphs in the cell and parse them + # Create a list of parsed paragraphs and join them with linebreaks + return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip() + for p in elem.paragraphs ]) case _: return '' @@ -614,10 +619,10 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: # Table Caption elif style in docConfig.tablecaption: lines.append('') - lines.append(f'**{replaceNL(text).strip()}**') + caption = replaceNL(text).strip() anchor = f'<a name="table_{caption[6:].split(":")[0].strip()}"></a>' if caption.startswith('Table ') and ':' in caption else '' lines.append(f'**{caption}**{anchor}') - + # Image Caption elif style in docConfig.imagecaption: checkSameStyle(Style.imagecaption, lambda:lines.append('')) @@ -679,12 +684,16 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: # Warning if this is a single-row table if nrRows == 1: - _print(f'[red]Single-row table found. Consider replacing it in the original document:\n{rows[0]}') + _print(f'[red]Single-row table found. Such tables cannot be converted to markdown.[/red] Please consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False) lines.append('') # Add an empty line before a table for idx, row in enumerate(rows): + + # Check for a table caption and add separator line if idx == 1: lines.append('-'.join('|' * (len(row) + 1) )) + + # Add table row lines.append(f'|{"|".join(row)}|' .replace('\n', _linebreak)) # replace line breaks in cells lines.append('') # Add another empty line after a table @@ -719,7 +728,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: line = lines[i] line = line.replace('__', '') line = line.replace('****', '') - line = line.replace(' ', ' ') + #line = line.replace(' ', ' ') lines[i] = line @@ -775,6 +784,15 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: line = lines[i] lines[i] = re.sub(_referenceExpression, _repl, line) # type:ignore[arg-type] + + # + # List unresolved CAPTION markers + # + for i in range(len(lines)): + line = lines[i] + if _captionMarker in line: + _print(f'[yellow]Unresolved figure caption : \[{i}] "{line}"') + # # Write produced Markdown file #