Skip to content
Snippets Groups Projects
Commit d59cfbc2 authored by Miguel Angel Reina Ortega
Browse files

Some cleanup + parsing converting lists in cells to html lists

parent bc780760
No related branches found
No related tags found
No related merge requests found
...@@ -417,12 +417,13 @@ _matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE) ...@@ -417,12 +417,13 @@ _matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE)
_matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE) _matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE)
_matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE) _matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE)
_matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECASE) _matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECASE)
_matchTable = re.compile(r'^\s*\|.*\|\s$', re.IGNORECASE) _matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE)
_matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE) _matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE)
_matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE) _matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE)
_matchGridTableBodySeparator = re.compile(r'.*\+([-:]+\+)+.*$', re.IGNORECASE) _matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE)
_matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE) _matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE)
_match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE) _match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
# Matches a list item inside cell content: an unordered marker (-, * or +) or an
# ordered marker (digits followed by a dot), then the item text.
# Named groups need the `(?P<name>...)` form; without the `?` the text
# `P<marker>` is matched literally and no named group is created.
_matchListInContent = re.compile(r'^(?:\s*(?P<marker>[-*+]|\s*\d+\.))\s+(?P<content>.+)$', re.IGNORECASE)
_markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE) _markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)
_htmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE) _htmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE)
_htmlAnchorLink = re.compile(r'<a\s+name="([^"]*)">[^<]*</a>', re.IGNORECASE) _htmlAnchorLink = re.compile(r'<a\s+name="([^"]*)">[^<]*</a>', re.IGNORECASE)
...@@ -466,6 +467,42 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -466,6 +467,42 @@ def parse_pandoc_table_with_spans(pandoc_table):
_matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE) _matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE)
return _matchGridTableSeparator.match(line) return _matchGridTableSeparator.match(line)
def handling_content(cell, content, list_flag):
    """Merge one line of raw cell text into ``cell['content']``.

    Lines starting with ``- `` open (or continue) a list; every list item is
    terminated by a newline so the end of each item can be recognised later.
    While a list is open, non-blank lines are appended to the last item, and a
    blank line closes the list. Outside a list, a trailing backslash is turned
    into a newline and the text is appended to the existing content.

    Args:
        cell: Cell dictionary. ``content`` is read and updated; when it is
            still ``None``, ``rowspan`` and ``colspan`` are incremented too.
        content: Text of this cell taken from the current table line.
        list_flag: ``True`` while the previous lines of this cell belong to a
            list.

    Returns:
        A ``(list_flag, cell)`` tuple: the updated list state and the same
        (mutated) cell dictionary.
    """
    # Only the arguments are inspected; the function does not depend on
    # variables of the enclosing scope.
    text = content.strip()
    is_list_item = text.startswith("- ")

    if cell['content'] is None:
        # First text seen for this cell: it now spans one row and one column.
        cell['rowspan'] += 1
        cell['colspan'] += 1
        if is_list_item:
            list_flag = True
            cell['content'] = text + "\n"  # newline marks the end of the list item
        elif list_flag and text:
            # Nothing to concatenate to yet, so the text starts the content.
            cell['content'] = text + "\n"
        elif not text:
            # Blank line: separation between a list and another paragraph.
            list_flag = False
            cell['content'] = content
        else:
            cell['content'] = re.sub(r'\\\s*$', "\n", text)
        return list_flag, cell

    if is_list_item:
        if not list_flag:
            # Separate the new list from the preceding paragraph.
            cell['content'] += "\n"
        list_flag = True
        cell['content'] += text + "\n"  # newline marks the end of the list item
    elif list_flag and text:
        # Continuation line: glue it to the last list item, then re-terminate it.
        cell['content'] = cell['content'].strip("\n") + " " + text + "\n"
    elif not text:
        # Blank line: closes any open list and ends the current paragraph.
        list_flag = False
        if not cell['content'].endswith("\n"):
            cell['content'] += "\n"
    else:
        cell['content'] += " " + re.sub(r'\\\s*$', "\n", text)
    return list_flag, cell
_matchGridTableSeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE) _matchGridTableSeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
separator_indices = [i for i, line in enumerate(lines) if is_separator(line)] separator_indices = [i for i, line in enumerate(lines) if is_separator(line)]
...@@ -490,11 +527,13 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -490,11 +527,13 @@ def parse_pandoc_table_with_spans(pandoc_table):
has_header = True has_header = True
header_separator_index = index header_separator_index = index
header_rows = [] header_rows = []
data_rows = [] data_rows = []
for row in range(len(separator_indices) - 1): for row in range(len(separator_indices) - 1):
table_row = [] table_row = []
auxiliar_row = [] auxiliar_row = []
use_auxiliar_row = [] use_auxiliar_row = []
list_flags = []
has_merged_cells = False has_merged_cells = False
in_data_row = False in_data_row = False
start, end = separator_indices[row], separator_indices[row + 1] start, end = separator_indices[row], separator_indices[row + 1]
...@@ -508,24 +547,38 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -508,24 +547,38 @@ def parse_pandoc_table_with_spans(pandoc_table):
parts = re.split(r"\s*\+\s*", line.strip("+")) parts = re.split(r"\s*\+\s*", line.strip("+"))
# Add as many cells as columns with span attributes # Add as many cells as columns with span attributes
delimiter_index = 0 delimiter_index = 0
# Determine the alignment of the cell - In order to replicate Pandoc's behaviour (do not support of alignment colons on separator lines (just header separator)
# we need to assign the default alignment as defined in the header separator line
# We may not need the code below, as that supports alignment per cell and row
alignments = []
for part_index in range(len(parts)):
if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
alignments.append("align=\"left\"")
elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
alignments.append("align=\"right\"")
else:
alignments.append("align=\"center\"")
for i in range(number_of_columns_row): for i in range(number_of_columns_row):
delimiter_index += len(parts[i]) + 1 delimiter_index += len(parts[i]) + 1
table_row.append({ table_row.append({
"content": "NOCONTENT", "content": None,
"rowspan": 0, "rowspan": 0,
"colspan": 0, "colspan": 0,
"colspan_adjusted": False, "colspan_adjusted": False,
"alignment": alignments[i] if alignments[i] else "align=\"center\"",
"position": delimiter_index # Position of cell delimiter + "position": delimiter_index # Position of cell delimiter +
}) })
for i in range(number_of_columns): for i in range(number_of_columns):
auxiliar_row.append({ auxiliar_row.append({
"content": "NOCONTENT", "content": None,
"rowspan": 0, "rowspan": 0,
"colspan": 0, "colspan": 0,
"colspan_adjusted": False, "colspan_adjusted": False,
"alignment": "align=\"center\"",
"position": 0 "position": 0
}) })
use_auxiliar_row.append(False) use_auxiliar_row.append(False)
list_flags.append(False)
elif in_data_row: elif in_data_row:
# Regular data row or partial separator # Regular data row or partial separator
...@@ -536,14 +589,35 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -536,14 +589,35 @@ def parse_pandoc_table_with_spans(pandoc_table):
for i in range(len(cells)): for i in range(len(cells)):
if _matchGridTableSeparatorLine.match(cells[i]): # A new row is to be added if _matchGridTableSeparatorLine.match(cells[i]): # A new row is to be added
use_auxiliar_row[i] = True use_auxiliar_row[i] = True
if cells[i].startswith(":") and not cells[i].endswith(":"):
auxiliar_row[i]['alignment'] = "align=\"left\""
elif not cells[i].startswith(":") and cells[i].endswith(":"):
auxiliar_row[i]['alignment'] = "align=\"right\""
else: else:
if table_row[i]['content'] == "NOCONTENT": auxiliar_row[i]['alignment'] = "align=\"center\""
table_row[i]['rowspan'] += 1
table_row[i]['colspan'] += 1
table_row[i]['content'] = cells[i]
else: else:
table_row[i]['content'] += cells[i] list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
# Cell which is not separator #if table_row[i]['content'] is None:
# table_row[i]['rowspan'] += 1
# table_row[i]['colspan'] += 1
#if cells[i].strip().startswith("- "): # List
# handling_list = True
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
#elif handling_list: # any other content when handling list is concatenated to the last list element
# table_row[i]['content'].strip("\n")
# table_row[i]['content'] += cells[i] + "\n"
#elif cells[i].strip(): #separation between list and other paragraph
# handling_list = False
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
#else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
#else:
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) # Cell which is not separator
table_row[i]['rowspan'] += 1 table_row[i]['rowspan'] += 1
if not table_row[i]['colspan_adjusted']: if not table_row[i]['colspan_adjusted']:
table_row[i]['colspan_adjusted'] = True table_row[i]['colspan_adjusted'] = True
...@@ -561,13 +635,30 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -561,13 +635,30 @@ def parse_pandoc_table_with_spans(pandoc_table):
for i in range(len(cells)): for i in range(len(cells)):
if _matchGridTableSeparatorLine.match(cells[i]): # Update cell in new row if _matchGridTableSeparatorLine.match(cells[i]): # Update cell in new row
use_auxiliar_row[i] = True use_auxiliar_row[i] = True
if cells[i].startswith(":") and not cells[i].endswith(":"):
auxiliar_row[i]['alignment'] = "align=\"left\""
elif not cells[i].startswith(":") and cells[i].endswith(":"):
auxiliar_row[i]['alignment'] = "align=\"right\""
else: else:
if table_row[i]['content'] == "NOCONTENT": auxiliar_row[i]['alignment'] = "align=\"center\""
table_row[i]['rowspan'] += 1
table_row[i]['colspan'] += 1
table_row[i]['content'] = cells[i]
else: else:
table_row[i]['content'] += cells[i] list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
# if table_row[i]['content'] is None:
# table_row[i]['rowspan'] += 1
# table_row[i]['colspan'] += 1
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
# else:
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
# Cell which is not separator # Cell which is not separator
table_row[i]['rowspan'] += 1 table_row[i]['rowspan'] += 1
# Not needed, no colspan as number of cells is equal to number of columns # Not needed, no colspan as number of cells is equal to number of columns
...@@ -588,40 +679,62 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -588,40 +679,62 @@ def parse_pandoc_table_with_spans(pandoc_table):
cells = re.split(r"\s*\|\s*", line.strip("|")) cells = re.split(r"\s*\|\s*", line.strip("|"))
if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
for i in range(len(cells)): for i in range(len(cells)):
if table_row[i]['content'] == "NOCONTENT": list_flags[i], table_row[i] = handling_content(table_row[i], cells[i], list_flags[i])
table_row[i]['rowspan'] += 1 # if table_row[i]['content'] is None:
table_row[i]['colspan'] += 1 # table_row[i]['rowspan'] += 1
table_row[i]['content'] = cells[i] # table_row[i]['colspan'] += 1
else: # if cells[i].strip().startswith("- "): # List
table_row[i]['content'] += cells[i] # print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
# else:
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
if not table_row[i]['colspan_adjusted']: if not table_row[i]['colspan_adjusted']:
table_row[i]['colspan_adjusted'] = True table_row[i]['colspan_adjusted'] = True
for j in range(i, len(cells)): for j in range(i, len(cells)):
delimiter_start = table_row[j-1]['position'] if j != 0 else 0 delimiter_start = table_row[j-1]['position'] if j != 0 else 0
if line.find("|", delimiter_start+1) > delimiter_positions[j]: # Colspan to be increased if line.find("|", delimiter_start+1) > delimiter_positions[j]: # Colspan to be increased
table_row[i]['colspan'] += 1 table_row[i]['colspan'] += 1
if line.find("|", delimiter_start + 1) == delimiter_positions[len(delimiter_positions) - 1]: # last cell in row, adjust colspan to get max number columns
colspan_remaining = 0
for cell_index in range(number_of_columns_row):
colspan_remaining += table_row[cell_index]['colspan']
table_row[i]['colspan'] += number_of_columns - colspan_remaining
elif line.find("|", delimiter_start+1) < delimiter_positions[j]: elif line.find("|", delimiter_start+1) < delimiter_positions[j]:
raise ValueError("Wrong cell formatting") raise ValueError("Wrong cell formatting")
else: else:
break break
elif len(cells) == number_of_columns: # Simple row elif len(cells) == number_of_columns: # Simple row
for i in range(len(cells)): for i in range(len(cells)):
if use_auxiliar_row[i]: if use_auxiliar_row[i]:
if auxiliar_row[i]['content'] == "NOCONTENT": if auxiliar_row[i]['content'] is None:
auxiliar_row[i]['rowspan'] += 1 auxiliar_row[i]['rowspan'] += 1
auxiliar_row[i]['colspan'] += 1 auxiliar_row[i]['colspan'] += 1
auxiliar_row[i]['content'] = cells[i] auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
else:
auxiliar_row[i]['content'] += cells[i]
else: else:
if table_row[i]['content'] == "NOCONTENT": auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
table_row[i]['rowspan'] += 1
table_row[i]['colspan'] += 1
table_row[i]['content'] = cells[i]
else: else:
table_row[i]['content'] += cells[i] list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
# if table_row[i]['content'] is None:
# table_row[i]['rowspan'] += 1
# table_row[i]['colspan'] += 1
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
# else:
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
else: else:
raise ValueError("More cells than columns found") raise ValueError("More cells than columns found")
else: else:
...@@ -638,39 +751,47 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -638,39 +751,47 @@ def parse_pandoc_table_with_spans(pandoc_table):
#print(header_rows) #print(header_rows)
#print(data_rows) #print(data_rows)
# Correct newlines characters
for row in header_rows:
for cell in row:
cell['content'] = cell['content'].replace("\\", "<br>")
for row in data_rows:
for cell in row:
cell['content'] = cell['content'].replace("\\", "<br>")
# Check if there are any data rows # Check if there are any data rows
if not data_rows and not header_rows: if not data_rows and not header_rows:
raise ValueError("No valid rows found in the provided Pandoc table.") raise ValueError("No valid rows found in the provided Pandoc table.")
# Format text # Format text
for rows in [header_rows, data_rows]:
bold = "<strong>" bold = "<strong>"
for row in header_rows: italic = "<i>"
for row in rows:
for cell in row: for cell in row:
while cell['content'].find("**") != -1: if cell['content'] is not None:
cell['content'] = cell['content'].replace("**", bold, 1) # Replacing "<" by &lt;
cell['content'] = cell['content'].replace("<", "&lt;")
#Bold
for bold_characters in ["**", "__"]:
while cell['content'].find(bold_characters) != -1:
cell['content'] = cell['content'].replace(bold_characters, bold, 1)
if bold == "<strong>": if bold == "<strong>":
bold = "</strong>" bold = "</strong>"
else: else:
bold = "<strong>" bold = "<strong>"
bold = "<strong>" #Italic
while cell['content'].find("_") != -1 and cell['content'].find("\_") == -1:
cell['content'] = cell['content'].rstrip() .replace("_", italic, 1)
if italic == "<i>":
italic = "</i>"
else:
italic = "<i>"
while cell['content'].find("\_") != -1:
cell['content'] = cell['content'].rstrip().replace("\_", "_", 1)
# Correct newlines characters
for row in header_rows:
for cell in row:
cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None
for row in data_rows: for row in data_rows:
for cell in row: for cell in row:
while cell['content'].find("**") != -1: cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None
cell['content'] = cell['content'].replace("**", bold, 1)
if bold == "<strong>":
bold = "</strong>"
else:
bold = "<strong>"
# Checking that the grid is correct Not too much tested - need to take into account rowspan of previous rows # Checking that the grid is correct Not too much tested - need to take into account rowspan of previous rows
forward_rowspan = [] forward_rowspan = []
for row_index in range(len(header_rows)): for row_index in range(len(header_rows)):
if len(forward_rowspan) == 0: if len(forward_rowspan) == 0:
...@@ -701,12 +822,7 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -701,12 +822,7 @@ def parse_pandoc_table_with_spans(pandoc_table):
forward_rowspan[cell_index] = data_rows[row_index][cell_index]['rowspan'] - 1 forward_rowspan[cell_index] = data_rows[row_index][cell_index]['rowspan'] - 1
if not sum == number_of_columns: if not sum == number_of_columns:
raise ValueError("Grid table not converted properly") raise ValueError("Grid table not converted properly")
#if has_header:
# table_with_spans = header_rows
#table_with_spans += data_rows
#return table_with_spans
return header_rows, data_rows return header_rows, data_rows
def generate_html_table_with_spans(pandoc_table): def generate_html_table_with_spans(pandoc_table):
...@@ -733,9 +849,23 @@ def generate_html_table_with_spans(pandoc_table): ...@@ -733,9 +849,23 @@ def generate_html_table_with_spans(pandoc_table):
if cell['rowspan'] == 0 or cell['colspan'] == 0: if cell['rowspan'] == 0 or cell['colspan'] == 0:
continue continue
else: else:
# Prepare content, in case there's a list
print(cell['content'])
if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>",
cell['content']): # Update cell in new row
#print("MATCHING")
list = "<ul>"
# Build list the matches
for match in matches:
list += "<li>" + match[1] + "</li>"
list += "</ul>"
cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell['content'])
#else:
# print("NOT MATCHING")
rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else "" rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else "" colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
html += f" <td{rowspan}{colspan}>{cell['content']}</td>\n" html += f" <th{rowspan}{colspan} {cell['alignment']}>{cell['content']}</th>\n"
html += " </tr>\n" html += " </tr>\n"
html += " </thead>\n" html += " </thead>\n"
...@@ -746,9 +876,21 @@ def generate_html_table_with_spans(pandoc_table): ...@@ -746,9 +876,21 @@ def generate_html_table_with_spans(pandoc_table):
if cell['rowspan'] == 0 or cell['colspan'] == 0: if cell['rowspan'] == 0 or cell['colspan'] == 0:
continue continue
else: else:
#Prepare content, in case there's a list
#print(cell['content'])
if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell['content']): # Update cell in new row
#print("MATCHING")
list = "<ul>"
# Build list the matches
for match in matches:
list += "<li>" + match[1] + "</li>"
list += "</ul>"
cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell['content'])
#else:
#print("NOT MATCHING")
rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else "" rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else "" colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
html += f" <td{rowspan}{colspan}>{cell['content']}</td>\n" html += f" <td{rowspan}{colspan} {cell['alignment']}>{cell['content']}</td>\n"
html += " </tr>\n" html += " </tr>\n"
html += " </tbody>\n" html += " </tbody>\n"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment