Skip to content
Snippets Groups Projects
Commit d59cfbc2 authored by Miguel Angel Reina Ortega
Browse files

Some cleanup + parsing converting lists in cells to html lists

parent bc780760
No related branches found
No related tags found
No related merge requests found
......@@ -417,12 +417,13 @@ _matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE)
# Pre-compiled line classifiers used while scanning the Markdown source.

# Closing code fence: optional leading whitespace, three backticks, at most one
# whitespace character after them.
_matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE)
# Block-quote ("note") line: starts with '>' after optional whitespace.
_matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE)
# Image alone on its line; group 1 captures the target inside the parentheses.
_matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECASE)
# Pipe-table row: begins and ends with '|', any amount of trailing whitespace.
_matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE)
# Pipe-table separator row: cells made only of '-', ':' and spaces.
_matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE)
# First line of a grid table ("+-...+").
# NOTE(review): unlike _matchTable this still requires exactly one whitespace
# character before the end of the line (r'\s$') - confirm that is intended.
_matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE)
# Grid-table separators: body rows are drawn with '-', header rows with '='.
_matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE)
_matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE)
# List item indented by exactly two whitespace characters.
_match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
# List item inside other content. Named groups: 'marker' is the bullet
# ('-', '*', '+') or an ordered marker such as '1.'; 'content' is the item text.
# The '?P<name>' form is required - a bare 'P<name>' would be matched literally.
_matchListInContent = re.compile(r'^(?:\s*(?P<marker>[-*+]|\s*\d+\.))\s+(?P<content>.+)$', re.IGNORECASE)
# Markdown link to an in-document anchor ("[text](#anchor)") that is not an
# image; group 1 captures the '#anchor' part.
_markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)
# HTML <a href="..."> link; group 1 captures the href value.
_htmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE)
# HTML <a name="..."> anchor; group 1 captures the anchor name.
_htmlAnchorLink = re.compile(r'<a\s+name="([^"]*)">[^<]*</a>', re.IGNORECASE)
......@@ -466,6 +467,42 @@ def parse_pandoc_table_with_spans(pandoc_table):
_matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE)
return _matchGridTableSeparator.match(line)
def handling_content(cell, content, list_flag):
    """Add one line of grid-table text to ``cell`` and track list state.

    Args:
        cell: Cell dictionary with at least the keys ``'content'`` (``None``
            until the first line is stored), ``'rowspan'`` and ``'colspan'``.
            It is modified in place.
        content: Raw text of the cell on the current table line.
        list_flag: ``True`` when the previous lines of this cell were part of
            a ``"- "`` list.

    Returns:
        A ``(list_flag, cell)`` tuple: the updated list state and the same
        ``cell`` object that was passed in.
    """
    text = content.strip()
    is_list_item = text.startswith("- ")
    is_blank = text == ""

    if cell['content'] is None:
        # First line seen for this cell: it now occupies one row and column.
        cell['rowspan'] += 1
        cell['colspan'] += 1
        if is_list_item or list_flag:
            # A trailing newline marks where the list element ends. With no
            # previous text there is nothing to append to, so assign.
            list_flag = True
            cell['content'] = text + "\n"
        elif is_blank:
            list_flag = False
            cell['content'] = re.sub(r'\\\s*$', "\n", content)
        else:
            # A trailing backslash is a forced line break.
            cell['content'] = re.sub(r'\\\s*$', "\n", text)
        return list_flag, cell

    if is_list_item:
        if not list_flag:
            # Separate the new list from the preceding paragraph text.
            cell['content'] += "\n"
        list_flag = True
        cell['content'] += text + "\n"
    elif list_flag:
        # Continuation of the last list element: join it onto the same line.
        # NOTE(review): blank lines also land here while a list is open, so
        # the blank-line branch below never ends a list - confirm intent.
        cell['content'] = cell['content'].strip("\n")
        cell['content'] += " " + text + "\n"
    elif is_blank:
        # Blank line between paragraphs: make sure the text ends in a newline.
        list_flag = False
        if not cell['content'].endswith("\n"):
            cell['content'] += "\n"
    else:
        cell['content'] += " " + re.sub(r'\\\s*$', "\n", text)
    return list_flag, cell
_matchGridTableSeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
separator_indices = [i for i, line in enumerate(lines) if is_separator(line)]
......@@ -490,11 +527,13 @@ def parse_pandoc_table_with_spans(pandoc_table):
has_header = True
header_separator_index = index
header_rows = []
data_rows = []
for row in range(len(separator_indices) - 1):
table_row = []
auxiliar_row = []
use_auxiliar_row = []
list_flags = []
has_merged_cells = False
in_data_row = False
start, end = separator_indices[row], separator_indices[row + 1]
......@@ -508,24 +547,38 @@ def parse_pandoc_table_with_spans(pandoc_table):
parts = re.split(r"\s*\+\s*", line.strip("+"))
# Add as many cells as columns with span attributes
delimiter_index = 0
# Determine the alignment of each cell. To replicate Pandoc's behaviour (alignment colons are
# not supported on separator lines, only on the header separator), we need to assign the
# default alignment as defined in the header separator line.
# We may not need the code below, as it supports alignment per cell and per row.
alignments = []
for part_index in range(len(parts)):
if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
alignments.append("align=\"left\"")
elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
alignments.append("align=\"right\"")
else:
alignments.append("align=\"center\"")
for i in range(number_of_columns_row):
delimiter_index += len(parts[i]) + 1
table_row.append({
"content": "NOCONTENT",
"content": None,
"rowspan": 0,
"colspan": 0,
"colspan_adjusted": False,
"alignment": alignments[i] if alignments[i] else "align=\"center\"",
"position": delimiter_index # Position of cell delimiter +
})
for i in range(number_of_columns):
auxiliar_row.append({
"content": "NOCONTENT",
"content": None,
"rowspan": 0,
"colspan": 0,
"colspan_adjusted": False,
"alignment": "align=\"center\"",
"position": 0
})
use_auxiliar_row.append(False)
list_flags.append(False)
elif in_data_row:
# Regular data row or partial separator
......@@ -536,14 +589,35 @@ def parse_pandoc_table_with_spans(pandoc_table):
for i in range(len(cells)):
if _matchGridTableSeparatorLine.match(cells[i]): # A new row is to be added
use_auxiliar_row[i] = True
if cells[i].startswith(":") and not cells[i].endswith(":"):
auxiliar_row[i]['alignment'] = "align=\"left\""
elif not cells[i].startswith(":") and cells[i].endswith(":"):
auxiliar_row[i]['alignment'] = "align=\"right\""
else:
if table_row[i]['content'] == "NOCONTENT":
table_row[i]['rowspan'] += 1
table_row[i]['colspan'] += 1
table_row[i]['content'] = cells[i]
auxiliar_row[i]['alignment'] = "align=\"center\""
else:
table_row[i]['content'] += cells[i]
# Cell which is not separator
list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
#if table_row[i]['content'] is None:
# table_row[i]['rowspan'] += 1
# table_row[i]['colspan'] += 1
#if cells[i].strip().startswith("- "): # List
# handling_list = True
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
#elif handling_list: # any other content when handling list is concatenated to the last list element
# table_row[i]['content'].strip("\n")
# table_row[i]['content'] += cells[i] + "\n"
#elif cells[i].strip(): #separation between list and other paragraph
# handling_list = False
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
#else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
#else:
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) # Cell which is not separator
table_row[i]['rowspan'] += 1
if not table_row[i]['colspan_adjusted']:
table_row[i]['colspan_adjusted'] = True
......@@ -561,13 +635,30 @@ def parse_pandoc_table_with_spans(pandoc_table):
for i in range(len(cells)):
if _matchGridTableSeparatorLine.match(cells[i]): # Update cell in new row
use_auxiliar_row[i] = True
if cells[i].startswith(":") and not cells[i].endswith(":"):
auxiliar_row[i]['alignment'] = "align=\"left\""
elif not cells[i].startswith(":") and cells[i].endswith(":"):
auxiliar_row[i]['alignment'] = "align=\"right\""
else:
if table_row[i]['content'] == "NOCONTENT":
table_row[i]['rowspan'] += 1
table_row[i]['colspan'] += 1
table_row[i]['content'] = cells[i]
auxiliar_row[i]['alignment'] = "align=\"center\""
else:
table_row[i]['content'] += cells[i]
list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
# if table_row[i]['content'] is None:
# table_row[i]['rowspan'] += 1
# table_row[i]['colspan'] += 1
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
# else:
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
# Cell which is not separator
table_row[i]['rowspan'] += 1
# Not needed, no colspan as number of cells is equal to number of columns
......@@ -588,40 +679,62 @@ def parse_pandoc_table_with_spans(pandoc_table):
cells = re.split(r"\s*\|\s*", line.strip("|"))
if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
for i in range(len(cells)):
if table_row[i]['content'] == "NOCONTENT":
table_row[i]['rowspan'] += 1
table_row[i]['colspan'] += 1
table_row[i]['content'] = cells[i]
else:
table_row[i]['content'] += cells[i]
list_flags[i], table_row[i] = handling_content(table_row[i], cells[i], list_flags[i])
# if table_row[i]['content'] is None:
# table_row[i]['rowspan'] += 1
# table_row[i]['colspan'] += 1
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
# else:
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
if not table_row[i]['colspan_adjusted']:
table_row[i]['colspan_adjusted'] = True
for j in range(i, len(cells)):
delimiter_start = table_row[j-1]['position'] if j != 0 else 0
if line.find("|", delimiter_start+1) > delimiter_positions[j]: # Colspan to be increased
table_row[i]['colspan'] += 1
if line.find("|", delimiter_start + 1) == delimiter_positions[len(delimiter_positions) - 1]: # last cell in row, adjust colspan to get max number columns
colspan_remaining = 0
for cell_index in range(number_of_columns_row):
colspan_remaining += table_row[cell_index]['colspan']
table_row[i]['colspan'] += number_of_columns - colspan_remaining
elif line.find("|", delimiter_start+1) < delimiter_positions[j]:
raise ValueError("Wrong cell formatting")
else:
break
elif len(cells) == number_of_columns: # Simple row
for i in range(len(cells)):
if use_auxiliar_row[i]:
if auxiliar_row[i]['content'] == "NOCONTENT":
if auxiliar_row[i]['content'] is None:
auxiliar_row[i]['rowspan'] += 1
auxiliar_row[i]['colspan'] += 1
auxiliar_row[i]['content'] = cells[i]
else:
auxiliar_row[i]['content'] += cells[i]
auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
else:
if table_row[i]['content'] == "NOCONTENT":
table_row[i]['rowspan'] += 1
table_row[i]['colspan'] += 1
table_row[i]['content'] = cells[i]
auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
else:
table_row[i]['content'] += cells[i]
list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
# if table_row[i]['content'] is None:
# table_row[i]['rowspan'] += 1
# table_row[i]['colspan'] += 1
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
# else:
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
else:
raise ValueError("More cells than columns found")
else:
......@@ -638,39 +751,47 @@ def parse_pandoc_table_with_spans(pandoc_table):
#print(header_rows)
#print(data_rows)
# Correct newlines characters
for row in header_rows:
for cell in row:
cell['content'] = cell['content'].replace("\\", "<br>")
for row in data_rows:
for cell in row:
cell['content'] = cell['content'].replace("\\", "<br>")
# Check if there are any data rows
if not data_rows and not header_rows:
raise ValueError("No valid rows found in the provided Pandoc table.")
# Format text
for rows in [header_rows, data_rows]:
bold = "<strong>"
for row in header_rows:
italic = "<i>"
for row in rows:
for cell in row:
while cell['content'].find("**") != -1:
cell['content'] = cell['content'].replace("**", bold, 1)
if cell['content'] is not None:
# Replacing "<" by &lt;
cell['content'] = cell['content'].replace("<", "&lt;")
#Bold
for bold_characters in ["**", "__"]:
while cell['content'].find(bold_characters) != -1:
cell['content'] = cell['content'].replace(bold_characters, bold, 1)
if bold == "<strong>":
bold = "</strong>"
else:
bold = "<strong>"
bold = "<strong>"
#Italic
while cell['content'].find("_") != -1 and cell['content'].find("\_") == -1:
cell['content'] = cell['content'].rstrip() .replace("_", italic, 1)
if italic == "<i>":
italic = "</i>"
else:
italic = "<i>"
while cell['content'].find("\_") != -1:
cell['content'] = cell['content'].rstrip().replace("\_", "_", 1)
# Correct newlines characters
for row in header_rows:
for cell in row:
cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None
for row in data_rows:
for cell in row:
while cell['content'].find("**") != -1:
cell['content'] = cell['content'].replace("**", bold, 1)
if bold == "<strong>":
bold = "</strong>"
else:
bold = "<strong>"
cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None
# Check that the grid is correct. Not thoroughly tested - needs to take into account the rowspan of previous rows
forward_rowspan = []
for row_index in range(len(header_rows)):
if len(forward_rowspan) == 0:
......@@ -701,12 +822,7 @@ def parse_pandoc_table_with_spans(pandoc_table):
forward_rowspan[cell_index] = data_rows[row_index][cell_index]['rowspan'] - 1
if not sum == number_of_columns:
raise ValueError("Grid table not converted properly")
#if has_header:
# table_with_spans = header_rows
#table_with_spans += data_rows
#return table_with_spans
return header_rows, data_rows
def generate_html_table_with_spans(pandoc_table):
......@@ -733,9 +849,23 @@ def generate_html_table_with_spans(pandoc_table):
if cell['rowspan'] == 0 or cell['colspan'] == 0:
continue
else:
# Prepare content, in case there's a list
print(cell['content'])
if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>",
cell['content']): # Update cell in new row
#print("MATCHING")
list = "<ul>"
# Build list the matches
for match in matches:
list += "<li>" + match[1] + "</li>"
list += "</ul>"
cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell['content'])
#else:
# print("NOT MATCHING")
rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
html += f" <td{rowspan}{colspan}>{cell['content']}</td>\n"
html += f" <th{rowspan}{colspan} {cell['alignment']}>{cell['content']}</th>\n"
html += " </tr>\n"
html += " </thead>\n"
......@@ -746,9 +876,21 @@ def generate_html_table_with_spans(pandoc_table):
if cell['rowspan'] == 0 or cell['colspan'] == 0:
continue
else:
#Prepare content, in case there's a list
#print(cell['content'])
if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell['content']): # Update cell in new row
#print("MATCHING")
list = "<ul>"
# Build list the matches
for match in matches:
list += "<li>" + match[1] + "</li>"
list += "</ul>"
cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell['content'])
#else:
#print("NOT MATCHING")
rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
html += f" <td{rowspan}{colspan}>{cell['content']}</td>\n"
html += f" <td{rowspan}{colspan} {cell['alignment']}>{cell['content']}</td>\n"
html += " </tr>\n"
html += " </tbody>\n"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment