Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
S
spec2md
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Specification Tools
spec2md
Commits
6f5baea5
Commit
6f5baea5
authored
9 months ago
by
Andreas Kraft
Browse files
Options
Downloads
Patches
Plain Diff
Added support for footnotes
parent
9328ebdb
No related branches found
No related tags found
No related merge requests found
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
config.ini
+19
-6
19 additions, 6 deletions
config.ini
spec2md.py
+67
-16
67 additions, 16 deletions
spec2md.py
with
86 additions
and
22 deletions
config.ini
+
19
−
6
View file @
6f5baea5
...
...
@@ -31,18 +31,31 @@ imageCaptions2AltText = true
[toc]
# Add section numbers to the headlines
addSectionNumbers
=
false
excludeFromNumbering
=
# Exclude the following paragraph types from numbering.
# The default is to exclude the "Content" heading.
excludeFromNumbering
=
tt
# The paragraph type that is used in the original document for the table of contents.
tocStartParagraph
=
heading no numbering
# The level of the table of contents.
tocHeaderLevel
=
1
# Automatically generate a table of contents.
generateToc
=
false
# Add a macro "[toc]" to the document that can be used to generate a table of contents.
# Some converters and viewers support this macro.
addTocMacro
=
false
[paragraphs]
normal
=
normal
h1
=
heading 1, tt
h2
=
heading 2
normal
=
normal
, onem2m-normal
h1
=
heading 1, tt
, onem2m-heading1
h2
=
heading 2
, onem2m-heading2
h3
=
heading 3
h4
=
heading 4
h5
=
heading 5
...
...
@@ -53,7 +66,7 @@ h9 = heading 9
a1
=
heading 1
a2
=
heading 2
a3
=
heading 3
note
=
no
note
=
no
, onem2m-iprtitle, onem2m-ipr
code
=
pl
example
=
ex, ew
ul1
=
b1, b1+, list paragraph
...
...
@@ -63,7 +76,7 @@ ul4 = b4, b4+
ul5
=
b5, b5+
ol1
=
bn
ol2
=
bl
tablecaption
=
caption, th
tablecaption
=
caption, th
, onem2m-tabletitle
imagecaption
=
tf
image
=
fl
empty
=
fp
...
...
This diff is collapsed.
Click to expand it.
spec2md.py
+
67
−
16
View file @
6f5baea5
...
...
@@ -9,10 +9,12 @@
from
enum
import
IntEnum
,
auto
from
typing
import
Callable
,
Tuple
,
Dict
,
Optional
from
typing
import
Callable
,
Tuple
,
Dict
,
Optional
,
Any
from
pathlib
import
Path
,
PurePath
from
docx.document
import
Document
from
docx.text.paragraph
import
Paragraph
from
docx.package
import
Package
import
docx.opc.exceptions
from
docx.table
import
_Cell
,
Table
from
docx.oxml.table
import
CT_Tbl
...
...
@@ -201,7 +203,7 @@ class DocumentConfiguration(object):
def
processDocuments
(
documents
:
list
[
str
],
outDirectory
:
str
,
skipImageConversion
:
bool
)
->
None
:
docs
:
Dict
[
str
,
Tuple
[
Document
,
DocumentConfiguration
]]
=
{}
docs
:
Dict
[
str
,
Tuple
[
Document
,
DocumentConfiguration
,
Any
]]
=
{}
ptasks
=
{}
mediaRelations
:
Dict
[
str
,
str
]
=
{}
addSectionNumbers
=
False
...
...
@@ -209,6 +211,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
headers
:
list
[
Tuple
[
int
,
str
]]
=
[]
emfFiles
:
list
[
str
]
=
[]
referencedImages
:
list
[
str
]
=
[]
footnotes
:
dict
[
str
,
str
]
=
{}
global
_print
...
...
@@ -292,7 +295,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
return
tag
def
getTextFromXML
(
elem
:
Paragraph
|
_Cell
)
->
str
:
def
getTextFromXML
(
elem
:
Paragraph
|
_Cell
|
ET
.
_Element
)
->
str
:
# Not-used document tags.
_ignoredTags
=
(
'
AlternateContent
'
,
...
...
@@ -310,6 +313,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
'
commentRangeStart
'
,
'
commentRangeEnd
'
,
'
commentReference
'
,
'
smartTag
'
,
'
footnoteRef
'
,
)
...
...
@@ -405,13 +410,15 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
pass
# ignore a soft hyphen character which has no meaning in Markdown and zero-width
case
'
sym
'
:
def
_symError
(
ch
:
str
)
->
str
:
def
_symError
(
ch
:
int
)
->
None
:
nonlocal
_result
_symError
=
f
'
unknown font+symbol:
{
element
.
attrib
[
"
{
"
+wns+
"
}
font
"
]
}
-
"
{
element
.
attrib
[
"
{
"
+wns+
"
}
char
"
]
}
(
{
ch
}
)
"'
_print
(
f
'
[yellow]
{
_symError
}
'
)
_result
+=
f
'
<mark>
{
_symError
}
</mark>
'
try
:
_ch
=
'
????
'
_ch
=
0
_ch
=
int
(
element
.
attrib
[
"
{
"
+
wns
+
"
}char
"
],
16
)
if
_ch
in
docConfig
.
characters
:
if
(
rch
:
=
docConfig
.
characters
[
_ch
])
==
chr
(
0
):
...
...
@@ -431,6 +438,18 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
for
x
in
element
:
_result
+=
_parseXML
(
x
)
case
'
footnoteReference
'
:
id
=
element
.
attrib
[
f
'
{{
{
wns
}
}}id
'
]
_result
+=
f
'
[^
{
id
}
]
'
footnotes
[
id
]
=
'
<mark>unknown footnote</mark>
'
# The footnote itself is not included in the document but in a separate file.
# Therefore, we need to extract the footnote from the footnotes.xml file. The format
# of the footnote is the same as a paragraph.
case
'
footnote
'
:
for
x
in
element
:
_result
+=
_parseXML
(
x
)
case
_
if
tag
in
_ignoredTags
:
# ignore
pass
...
...
@@ -448,6 +467,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
# Create a list of parsed paragraphs and join them with linebreaks
return
'
<br />
'
.
join
([
_parseXML
(
ET
.
fromstring
(
p
.
_p
.
xml
),
True
).
rstrip
()
for
p
in
elem
.
paragraphs
])
case
ET
.
_Element
():
return
_parseXML
(
elem
)
case
_
:
return
''
...
...
@@ -480,7 +501,12 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
stopProgress
(
f
'
[red]Input document
"
{
d
}
"
is not a file
'
)
return
try
:
docs
[
d
]
=
(
docx
.
Document
(
d
),
DocumentConfiguration
(
d
))
# Search for footnotes in the document XML
footnotesPart
=
None
for
part
in
Package
.
open
(
d
).
parts
:
if
part
.
partname
.
endswith
(
'
/footnotes.xml
'
):
footnotesPart
=
part
docs
[
d
]
=
(
docx
.
Document
(
d
),
DocumentConfiguration
(
d
),
footnotesPart
)
ptasks
[
d
]
=
progress
.
add_task
(
f
'
Processing
{
d
}
'
,
total
=
None
)
progress
.
update
(
readTask
,
advance
=
1
)
except
docx
.
opc
.
exceptions
.
PackageNotFoundError
as
e
:
...
...
@@ -495,7 +521,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
# Processing Documents
#
for
docFileName
,
(
doc
,
docConfig
)
in
docs
.
items
():
for
docFileName
,
(
doc
,
docConfig
,
footnotesPart
)
in
docs
.
items
():
processTask
=
ptasks
[
docFileName
]
docItems
=
list
(
iter_block_items
(
doc
))
addSectionNumbers
=
docConfig
.
addSectionNumbers
...
...
@@ -517,7 +543,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
return
# Add sub-progress task
progress
.
update
(
processTask
,
total
=
len
(
docItems
)
+
5
)
# + relations + image extraction + characters + toc + media convert
progress
.
update
(
processTask
,
total
=
len
(
docItems
)
+
6
)
# + relations + image extraction + characters + toc +
footnotes +
media convert
# Extract the media relations file, and get the mappings from document IDs to media files
...
...
@@ -769,7 +795,9 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
elif
codeblock
:
# Add whole code block to lines
_lines
.
append
(
'
```
'
)
_lines
.
append
(
''
)
_lines
.
extend
(
codeblock
)
_lines
.
append
(
''
)
_lines
.
append
(
'
```
'
)
codeblock
=
[]
else
:
...
...
@@ -830,6 +858,29 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
lines
[
i
]
=
re
.
sub
(
_referenceExpression
,
_repl
,
line
)
# type:ignore[arg-type]
#
# Process footnotes
#
progress
.
update
(
processTask
,
advance
=
1
)
# progress update
if
len
(
footnotes
)
and
footnotesPart
is
not
None
:
_print
(
f
'
[yellow]Footnotes found:
{
len
(
footnotes
)
}
'
)
# Analyze footnotes file
footnotesXML
=
ET
.
fromstring
(
footnotesPart
.
blob
)
# Process the footnotes XML here
for
element
in
footnotesXML
:
# Footnote found
if
strippedTag
(
element
.
tag
)
==
'
footnote
'
:
footnoteID
=
element
.
attrib
[
f
'
{{
{
wns
}
}}id
'
]
if
footnoteID
in
footnotes
:
t
=
getTextFromXML
(
element
)
footnotes
[
footnoteID
]
=
t
# Add footnotes to the end of the document
lines
.
append
(
''
)
for
fid
,
text
in
footnotes
.
items
():
lines
.
append
(
f
'
[^
{
fid
}
]:
{
text
}
'
)
#
# List unresolved CAPTION markers
#
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment