Skip to content
Snippets Groups Projects
Commit 2451610e authored by Miguel Angel Reina Ortega's avatar Miguel Angel Reina Ortega
Browse files

More cleanup + cell alignment as defined in header separator line (Pandoc's behaviour)

parent d59cfbc2
Branches
No related tags found
No related merge requests found
...@@ -422,6 +422,7 @@ _matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE) ...@@ -422,6 +422,7 @@ _matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE)
_matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE) _matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE)
_matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE) _matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE)
_matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE) _matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE)
_matchGridTableBodySeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
_match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE) _match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
_matchListInContent = re.compile(r'^(?:\s*(P<marker>[-*+]|\s*\d+\.))\s+(P<content>.+)$', re.IGNORECASE) _matchListInContent = re.compile(r'^(?:\s*(P<marker>[-*+]|\s*\d+\.))\s+(P<content>.+)$', re.IGNORECASE)
_markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE) _markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)
...@@ -502,8 +503,24 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -502,8 +503,24 @@ def parse_pandoc_table_with_spans(pandoc_table):
#print(cell['content']) #print(cell['content'])
return list_flag, cell return list_flag, cell
def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions):
    """Grow the colspan of ``row[column_index]`` to match the delimiters found in ``line``.

    For every cell index ``j`` from ``column_index`` up to (but excluding)
    ``number_of_parts``, the next ``|`` or ``+`` in ``line`` located after the
    previous cell's recorded ``'position'`` is compared with the expected
    column boundary ``delimiter_positions[j]``:

    * delimiter further right than the boundary: the cell spans one more column;
    * delimiter exactly on the boundary: nothing left to adjust, stop;
    * delimiter left of the boundary (or no delimiter found at all, which is
      reported as ``-1``): the line is considered malformed.

    Args:
        row: List of cell dicts; each must provide ``'position'`` and
            ``'colspan'``. The list is modified in place.
        column_index: Index in ``row`` of the cell whose colspan is adjusted.
        number_of_parts: Number of cells present on ``line``.
        line: Raw text of the table line being analysed.
        number_of_columns: Total number of columns of the table.
        delimiter_positions: Expected delimiter offsets, one per column.

    Returns:
        The (mutated) cell dict ``row[column_index]``.

    Raises:
        ValueError: If a delimiter is found before the expected boundary.
    """
    last_boundary = delimiter_positions[-1] if delimiter_positions else None
    for j in range(column_index, number_of_parts):
        # Start searching right after the delimiter that closes the previous cell
        search_from = (row[j - 1]['position'] if j != 0 else 0) + 1
        candidates = [line.find(delimiter, search_from) for delimiter in "|+"]
        candidates = [found for found in candidates if found != -1]
        position = min(candidates) if candidates else -1

        if position > delimiter_positions[j]:  # Colspan to be increased
            # Always update the cell being adjusted; the original relied on a
            # free variable ``i`` that is not defined in this function.
            row[column_index]['colspan'] += 1
            if position == last_boundary:
                # Last cell in row: absorb whatever columns are still
                # unallocated so the row adds up to number_of_columns
                colspan_allocated = sum(row[cell_index]['colspan'] for cell_index in range(number_of_parts))
                row[column_index]['colspan'] += number_of_columns - colspan_allocated
        elif position < delimiter_positions[j]:
            raise ValueError("Wrong cell formatting")
        else:
            break
    return row[column_index]
_matchGridTableSeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
separator_indices = [i for i, line in enumerate(lines) if is_separator(line)] separator_indices = [i for i, line in enumerate(lines) if is_separator(line)]
print(separator_indices) print(separator_indices)
...@@ -522,11 +539,26 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -522,11 +539,26 @@ def parse_pandoc_table_with_spans(pandoc_table):
del_positions = [lines[separator_index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[separator_index][delimiter_positions_start + 1:]] del_positions = [lines[separator_index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[separator_index][delimiter_positions_start + 1:]]
delimiter_positions.append(min(del_positions) if del_positions else -1) delimiter_positions.append(min(del_positions) if del_positions else -1)
has_header = False has_header = False
header_delimiter_positions = []
for index in separator_indices: for index in separator_indices:
if _matchGridTableHeaderSeparator.match(lines[index]): if _matchGridTableHeaderSeparator.match(lines[index]):
has_header = True has_header = True
header_separator_index = index header_separator_index = index
header_rows = [] header_rows = []
parts = re.split(r"\s*\+\s*", lines[index].strip("+"))
default_alignments = []
#Calculate default alignments and positions of delimiters
for part_index in range(len(parts)):
if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
default_alignments.append("align=\"left\"")
elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
default_alignments.append("align=\"right\"")
else:
default_alignments.append("align=\"center\"")
# Delimiter position
delimiter_positions_start = delimiter_positions[part_index - 1] if part_index != 0 else 0
del_positions = [lines[index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[index][delimiter_positions_start + 1:]]
header_delimiter_positions.append(min(del_positions) if del_positions else -1)
data_rows = [] data_rows = []
for row in range(len(separator_indices) - 1): for row in range(len(separator_indices) - 1):
...@@ -550,14 +582,15 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -550,14 +582,15 @@ def parse_pandoc_table_with_spans(pandoc_table):
# Determine the alignment of the cell - In order to replicate Pandoc's behaviour (do not support of alignment colons on separator lines (just header separator) # Determine the alignment of the cell - In order to replicate Pandoc's behaviour (do not support of alignment colons on separator lines (just header separator)
# we need to assign the default alignment as defined in the header separator line # we need to assign the default alignment as defined in the header separator line
# We may not need the code below, as that supports alignment per cell and row # We may not need the code below, as that supports alignment per cell and row
alignments = [] #alignments = []
for part_index in range(len(parts)): #for part_index in range(len(parts)):
if parts[part_index].startswith(":") and not parts[part_index].endswith(":"): # if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
alignments.append("align=\"left\"") # alignments.append("align=\"left\"")
elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"): # elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
alignments.append("align=\"right\"") # alignments.append("align=\"right\"")
else: # else:
alignments.append("align=\"center\"") # alignments.append("align=\"center\"")
header_delimiter_index = 0
for i in range(number_of_columns_row): for i in range(number_of_columns_row):
delimiter_index += len(parts[i]) + 1 delimiter_index += len(parts[i]) + 1
table_row.append({ table_row.append({
...@@ -565,9 +598,21 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -565,9 +598,21 @@ def parse_pandoc_table_with_spans(pandoc_table):
"rowspan": 0, "rowspan": 0,
"colspan": 0, "colspan": 0,
"colspan_adjusted": False, "colspan_adjusted": False,
"alignment": alignments[i] if alignments[i] else "align=\"center\"", "alignment": default_alignments[i] if i == 0 else "align=\"center\"",
"position": delimiter_index # Position of cell delimiter + "position": delimiter_index # Position of cell delimiter +
}) })
#Set alignment as defined by header separator line
while header_delimiter_index in range(len(default_alignments)) and table_row[i]['position'] > header_delimiter_positions[header_delimiter_index]:
header_delimiter_index += 1
if header_delimiter_index in range(len(default_alignments)):
if table_row[i]['position'] < header_delimiter_positions[header_delimiter_index]:
table_row[i]['alignment'] = default_alignments[header_delimiter_index]
elif table_row[i]['position'] == header_delimiter_positions[header_delimiter_index]:
table_row[i]['alignment'] = default_alignments[i]
header_delimiter_index += 1
else:
raise ValueError("Invalid table formatting")
for i in range(number_of_columns): for i in range(number_of_columns):
auxiliar_row.append({ auxiliar_row.append({
"content": None, "content": None,
...@@ -585,10 +630,11 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -585,10 +630,11 @@ def parse_pandoc_table_with_spans(pandoc_table):
if _matchGridTableBodySeparator.match(line): # Partial separator if _matchGridTableBodySeparator.match(line): # Partial separator
has_merged_cells = True has_merged_cells = True
cells = re.split(r"\s*[\|\+]\s*", line.strip("|").strip("+")) # (?<!\\)[\|\+] cells = re.split(r"\s*[\|\+]\s*", line.strip("|").strip("+")) # (?<!\\)[\|\+]
if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined if len(cells) <= number_of_columns: # Colspan: Positions of | with respect to + need to be determined
for i in range(len(cells)): for i in range(len(cells)):
if _matchGridTableSeparatorLine.match(cells[i]): # A new row is to be added if _matchGridTableBodySeparatorLine.match(cells[i]): # A new row is to be added
use_auxiliar_row[i] = True use_auxiliar_row[i] = True
list_flags[i] = False
if cells[i].startswith(":") and not cells[i].endswith(":"): if cells[i].startswith(":") and not cells[i].endswith(":"):
auxiliar_row[i]['alignment'] = "align=\"left\"" auxiliar_row[i]['alignment'] = "align=\"left\""
elif not cells[i].startswith(":") and cells[i].endswith(":"): elif not cells[i].startswith(":") and cells[i].endswith(":"):
...@@ -596,145 +642,49 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -596,145 +642,49 @@ def parse_pandoc_table_with_spans(pandoc_table):
else: else:
auxiliar_row[i]['alignment'] = "align=\"center\"" auxiliar_row[i]['alignment'] = "align=\"center\""
else: else:
#Handle content of the cell
list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i]) list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
#if table_row[i]['content'] is None: # Cell which is not separator
# table_row[i]['rowspan'] += 1
# table_row[i]['colspan'] += 1
#if cells[i].strip().startswith("- "): # List
# handling_list = True
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
#elif handling_list: # any other content when handling list is concatenated to the last list element
# table_row[i]['content'].strip("\n")
# table_row[i]['content'] += cells[i] + "\n"
#elif cells[i].strip(): #separation between list and other paragraph
# handling_list = False
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
#else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
#else:
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) # Cell which is not separator
table_row[i]['rowspan'] += 1 table_row[i]['rowspan'] += 1
if not table_row[i]['colspan_adjusted']: if not table_row[i]['colspan_adjusted']:
table_row[i]['colspan_adjusted'] = True table_row[i]['colspan_adjusted'] = True
for j in range(i, len(cells)): #TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
delimiter_start = table_row[j-1]['position'] if j != 0 else 0 table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions)
positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "|+" if delimiter in line[delimiter_start + 1:]] #elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added
position = min(positions) if positions else -1 # for i in range(len(cells)):
if position > delimiter_positions_start[j]: # Colspan to add # if _matchGridTableBodySeparatorLine.match(cells[i]): # Update cell in new row
table_row[i]['colspan'] += 1 # use_auxiliar_row[i] = True
elif position < delimiter_positions_start[j]: # list_flags[i] = False
raise ValueError("Wrong cell formatting") # if cells[i].startswith(":") and not cells[i].endswith(":"):
else: # auxiliar_row[i]['alignment'] = "align=\"left\""
break # elif not cells[i].startswith(":") and cells[i].endswith(":"):
elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added # auxiliar_row[i]['alignment'] = "align=\"right\""
for i in range(len(cells)):
if _matchGridTableSeparatorLine.match(cells[i]): # Update cell in new row
use_auxiliar_row[i] = True
if cells[i].startswith(":") and not cells[i].endswith(":"):
auxiliar_row[i]['alignment'] = "align=\"left\""
elif not cells[i].startswith(":") and cells[i].endswith(":"):
auxiliar_row[i]['alignment'] = "align=\"right\""
else:
auxiliar_row[i]['alignment'] = "align=\"center\""
else:
list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
# if table_row[i]['content'] is None:
# table_row[i]['rowspan'] += 1
# table_row[i]['colspan'] += 1
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
# else: # else:
# if cells[i].strip().startswith("- "): # List # auxiliar_row[i]['alignment'] = "align=\"center\""
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else: # else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) # #Handle content of the cell
# list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
# Cell which is not separator # # Cell which is not separator
table_row[i]['rowspan'] += 1 # table_row[i]['rowspan'] += 1
# Not needed, no colspan as number of cells is equal to number of columns # # Adjusting of colspan not needed, no colspan as number of cells is equal to number of columns
#for j in range(i, len(cells)):
# delimiter_start = table_row[j-1]['position'] if j != 0 else 0
# positions = [line.find(delimiter,delimiter_start+1) for delimiter in "|+" if delimiter in line[delimiter_start+1:]]
# position = min(positions) if positions else -1
# if position > table_row[i]['position']: # Only colspan to be increased
# table_row[i]['colspan'] += 1
# elif position + 1 < table_row[i]['position']:
# raise ValueError("Wrong cell formatting")
# else:
# break
else: else:
raise ValueError("More cells than columns found") raise ValueError("More cells than columns found")
else: # Data row else: # Data row
cells = re.split(r"\s*\|\s*", line.strip("|")) cells = re.split(r"\s*\|\s*", line.strip("|"))
if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
for i in range(len(cells)): for i in range(len(cells)):
# Handle content of the cell
list_flags[i], table_row[i] = handling_content(table_row[i], cells[i], list_flags[i]) list_flags[i], table_row[i] = handling_content(table_row[i], cells[i], list_flags[i])
# if table_row[i]['content'] is None:
# table_row[i]['rowspan'] += 1
# table_row[i]['colspan'] += 1
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
# else:
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
if not table_row[i]['colspan_adjusted']: if not table_row[i]['colspan_adjusted']:
table_row[i]['colspan_adjusted'] = True table_row[i]['colspan_adjusted'] = True
for j in range(i, len(cells)): table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions)
delimiter_start = table_row[j-1]['position'] if j != 0 else 0
if line.find("|", delimiter_start+1) > delimiter_positions[j]: # Colspan to be increased
table_row[i]['colspan'] += 1
if line.find("|", delimiter_start + 1) == delimiter_positions[len(delimiter_positions) - 1]: # last cell in row, adjust colspan to get max number columns
colspan_remaining = 0
for cell_index in range(number_of_columns_row):
colspan_remaining += table_row[cell_index]['colspan']
table_row[i]['colspan'] += number_of_columns - colspan_remaining
elif line.find("|", delimiter_start+1) < delimiter_positions[j]:
raise ValueError("Wrong cell formatting")
else:
break
elif len(cells) == number_of_columns: # Simple row elif len(cells) == number_of_columns: # Simple row
for i in range(len(cells)): for i in range(len(cells)):
if use_auxiliar_row[i]: if use_auxiliar_row[i]:
if auxiliar_row[i]['content'] is None: list_flags[i], auxiliar_row[i] = handling_content(auxiliar_row[i], cells[i],list_flags[i])
auxiliar_row[i]['rowspan'] += 1
auxiliar_row[i]['colspan'] += 1
auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
else:
auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
else: else:
# Handle content of the cell
list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i]) list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
# if table_row[i]['content'] is None:
# table_row[i]['rowspan'] += 1
# table_row[i]['colspan'] += 1
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
# else:
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
else: else:
raise ValueError("More cells than columns found") raise ValueError("More cells than columns found")
else: else:
...@@ -850,7 +800,7 @@ def generate_html_table_with_spans(pandoc_table): ...@@ -850,7 +800,7 @@ def generate_html_table_with_spans(pandoc_table):
continue continue
else: else:
# Prepare content, in case there's a list # Prepare content, in case there's a list
print(cell['content']) #print(cell['content'])
if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>",
cell['content']): # Update cell in new row cell['content']): # Update cell in new row
#print("MATCHING") #print("MATCHING")
...@@ -860,6 +810,8 @@ def generate_html_table_with_spans(pandoc_table): ...@@ -860,6 +810,8 @@ def generate_html_table_with_spans(pandoc_table):
list += "<li>" + match[1] + "</li>" list += "<li>" + match[1] + "</li>"
list += "</ul>" list += "</ul>"
cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell['content']) cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell['content'])
# Enforce left alignment if cell contains a list
cell['alignment'] = "align=\"left\""
#else: #else:
# print("NOT MATCHING") # print("NOT MATCHING")
...@@ -886,6 +838,8 @@ def generate_html_table_with_spans(pandoc_table): ...@@ -886,6 +838,8 @@ def generate_html_table_with_spans(pandoc_table):
list += "<li>" + match[1] + "</li>" list += "<li>" + match[1] + "</li>"
list += "</ul>" list += "</ul>"
cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell['content']) cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell['content'])
# Enforce left alignment if cell contains a list
cell['alignment'] = "align=\"left\""
#else: #else:
#print("NOT MATCHING") #print("NOT MATCHING")
rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else "" rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment