Skip to content
Snippets Groups Projects
Commit 2451610e authored by Miguel Angel Reina Ortega's avatar Miguel Angel Reina Ortega
Browse files

More cleanup + cell alignment as defined in header separator line (Pandoc's behaviour)

parent d59cfbc2
No related branches found
No related tags found
No related merge requests found
......@@ -422,6 +422,7 @@ _matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE)
_matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE)
_matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE)
_matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE)
_matchGridTableBodySeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
_match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
_matchListInContent = re.compile(r'^(?:\s*(P<marker>[-*+]|\s*\d+\.))\s+(P<content>.+)$', re.IGNORECASE)
_markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)
......@@ -502,8 +503,24 @@ def parse_pandoc_table_with_spans(pandoc_table):
#print(cell['content'])
return list_flag, cell
def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions):
    """Grow the colspan of ``row[column_index]`` to match the delimiters in ``line``.

    For each cell from ``column_index`` onwards, the position of the next
    ``|`` or ``+`` in ``line`` is compared with the expected column delimiter
    position. Each time the cell's delimiter lies beyond the expected one, the
    cell spans one more column.

    Args:
        row: List of cell dicts; each one read here has ``'position'`` and
            ``'colspan'`` keys.
        column_index: Index in ``row`` of the cell whose colspan is adjusted.
        number_of_parts: Number of cells found in ``line``.
        line: Raw text line of the table being parsed.
        number_of_columns: Total number of columns of the table.
        delimiter_positions: Expected position in the line of each column
            delimiter.

    Returns:
        The (mutated) cell dict ``row[column_index]``.

    Raises:
        ValueError: If a delimiter is found before its expected position.
    """
    # NOTE(review): block nesting was reconstructed from a flattened listing -
    # confirm the "last cell" adjustment belongs inside the "increase" branch.
    for j in range(column_index, number_of_parts):
        delimiter_start = row[j - 1]['position'] if j != 0 else 0
        # Nearest "|" or "+" after the previous cell's delimiter (-1 if none).
        positions = [
            line.find(delimiter, delimiter_start + 1)
            for delimiter in "|+"
            if delimiter in line[delimiter_start + 1:]
        ]
        position = min(positions) if positions else -1
        if position > delimiter_positions[j]:  # Colspan to be increased
            # Use the explicit parameter instead of relying on a variable "i"
            # from an enclosing scope happening to equal column_index.
            row[column_index]['colspan'] += 1
            if position == delimiter_positions[-1]:
                # Last cell in row: take whatever columns are still unallocated
                # so that the row spans the full number of columns.
                colspan_allocated = sum(
                    row[cell_index]['colspan'] for cell_index in range(number_of_parts)
                )
                row[column_index]['colspan'] += number_of_columns - colspan_allocated
        elif position < delimiter_positions[j]:
            raise ValueError("Wrong cell formatting")
        else:
            # Delimiter is exactly where expected: nothing more to span.
            break
    return row[column_index]
_matchGridTableSeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
separator_indices = [i for i, line in enumerate(lines) if is_separator(line)]
print(separator_indices)
......@@ -522,11 +539,26 @@ def parse_pandoc_table_with_spans(pandoc_table):
del_positions = [lines[separator_index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[separator_index][delimiter_positions_start + 1:]]
delimiter_positions.append(min(del_positions) if del_positions else -1)
has_header = False
header_delimiter_positions = []
for index in separator_indices:
if _matchGridTableHeaderSeparator.match(lines[index]):
has_header = True
header_separator_index = index
header_rows = []
parts = re.split(r"\s*\+\s*", lines[index].strip("+"))
default_alignments = []
#Calculate default alignments and positions of delimiters
for part_index in range(len(parts)):
if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
default_alignments.append("align=\"left\"")
elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
default_alignments.append("align=\"right\"")
else:
default_alignments.append("align=\"center\"")
# Delimiter position
delimiter_positions_start = delimiter_positions[part_index - 1] if part_index != 0 else 0
del_positions = [lines[index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[index][delimiter_positions_start + 1:]]
header_delimiter_positions.append(min(del_positions) if del_positions else -1)
data_rows = []
for row in range(len(separator_indices) - 1):
......@@ -550,14 +582,15 @@ def parse_pandoc_table_with_spans(pandoc_table):
# Determine the alignment of the cell. To replicate Pandoc's behaviour (alignment colons are not supported on regular separator lines, only on the header separator line),
# we need to assign the default alignment as defined in the header separator line
# We may not need the code below, as that supports alignment per cell and row
alignments = []
for part_index in range(len(parts)):
if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
alignments.append("align=\"left\"")
elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
alignments.append("align=\"right\"")
else:
alignments.append("align=\"center\"")
#alignments = []
#for part_index in range(len(parts)):
# if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
# alignments.append("align=\"left\"")
# elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
# alignments.append("align=\"right\"")
# else:
# alignments.append("align=\"center\"")
header_delimiter_index = 0
for i in range(number_of_columns_row):
delimiter_index += len(parts[i]) + 1
table_row.append({
......@@ -565,9 +598,21 @@ def parse_pandoc_table_with_spans(pandoc_table):
"rowspan": 0,
"colspan": 0,
"colspan_adjusted": False,
"alignment": alignments[i] if alignments[i] else "align=\"center\"",
"alignment": default_alignments[i] if i == 0 else "align=\"center\"",
"position": delimiter_index # Position of cell delimiter +
})
#Set alignment as defined by header separator line
while header_delimiter_index in range(len(default_alignments)) and table_row[i]['position'] > header_delimiter_positions[header_delimiter_index]:
header_delimiter_index += 1
if header_delimiter_index in range(len(default_alignments)):
if table_row[i]['position'] < header_delimiter_positions[header_delimiter_index]:
table_row[i]['alignment'] = default_alignments[header_delimiter_index]
elif table_row[i]['position'] == header_delimiter_positions[header_delimiter_index]:
table_row[i]['alignment'] = default_alignments[i]
header_delimiter_index += 1
else:
raise ValueError("Invalid table formatting")
for i in range(number_of_columns):
auxiliar_row.append({
"content": None,
......@@ -585,10 +630,11 @@ def parse_pandoc_table_with_spans(pandoc_table):
if _matchGridTableBodySeparator.match(line): # Partial separator
has_merged_cells = True
cells = re.split(r"\s*[\|\+]\s*", line.strip("|").strip("+")) # (?<!\\)[\|\+]
if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
if len(cells) <= number_of_columns: # Colspan: Positions of | with respect to + need to be determined
for i in range(len(cells)):
if _matchGridTableSeparatorLine.match(cells[i]): # A new row is to be added
if _matchGridTableBodySeparatorLine.match(cells[i]): # A new row is to be added
use_auxiliar_row[i] = True
list_flags[i] = False
if cells[i].startswith(":") and not cells[i].endswith(":"):
auxiliar_row[i]['alignment'] = "align=\"left\""
elif not cells[i].startswith(":") and cells[i].endswith(":"):
......@@ -596,145 +642,49 @@ def parse_pandoc_table_with_spans(pandoc_table):
else:
auxiliar_row[i]['alignment'] = "align=\"center\""
else:
#Handle content of the cell
list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
#if table_row[i]['content'] is None:
# table_row[i]['rowspan'] += 1
# table_row[i]['colspan'] += 1
#if cells[i].strip().startswith("- "): # List
# handling_list = True
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
#elif handling_list: # any other content when handling list is concatenated to the last list element
# table_row[i]['content'].strip("\n")
# table_row[i]['content'] += cells[i] + "\n"
#elif cells[i].strip(): #separation between list and other paragraph
# handling_list = False
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
#else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
#else:
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) # Cell which is not separator
# Cell which is not separator
table_row[i]['rowspan'] += 1
if not table_row[i]['colspan_adjusted']:
table_row[i]['colspan_adjusted'] = True
for j in range(i, len(cells)):
delimiter_start = table_row[j-1]['position'] if j != 0 else 0
positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "|+" if delimiter in line[delimiter_start + 1:]]
position = min(positions) if positions else -1
if position > delimiter_positions_start[j]: # Colspan to add
table_row[i]['colspan'] += 1
elif position < delimiter_positions_start[j]:
raise ValueError("Wrong cell formatting")
else:
break
elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added
for i in range(len(cells)):
if _matchGridTableSeparatorLine.match(cells[i]): # Update cell in new row
use_auxiliar_row[i] = True
if cells[i].startswith(":") and not cells[i].endswith(":"):
auxiliar_row[i]['alignment'] = "align=\"left\""
elif not cells[i].startswith(":") and cells[i].endswith(":"):
auxiliar_row[i]['alignment'] = "align=\"right\""
else:
auxiliar_row[i]['alignment'] = "align=\"center\""
else:
list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
# if table_row[i]['content'] is None:
# table_row[i]['rowspan'] += 1
# table_row[i]['colspan'] += 1
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
#TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions)
#elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added
# for i in range(len(cells)):
# if _matchGridTableBodySeparatorLine.match(cells[i]): # Update cell in new row
# use_auxiliar_row[i] = True
# list_flags[i] = False
# if cells[i].startswith(":") and not cells[i].endswith(":"):
# auxiliar_row[i]['alignment'] = "align=\"left\""
# elif not cells[i].startswith(":") and cells[i].endswith(":"):
# auxiliar_row[i]['alignment'] = "align=\"right\""
# else:
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# auxiliar_row[i]['alignment'] = "align=\"center\""
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
# Cell which is not separator
table_row[i]['rowspan'] += 1
# Not needed, no colspan as number of cells is equal to number of columns
#for j in range(i, len(cells)):
# delimiter_start = table_row[j-1]['position'] if j != 0 else 0
# positions = [line.find(delimiter,delimiter_start+1) for delimiter in "|+" if delimiter in line[delimiter_start+1:]]
# position = min(positions) if positions else -1
# if position > table_row[i]['position']: # Only colspan to be increased
# table_row[i]['colspan'] += 1
# elif position + 1 < table_row[i]['position']:
# raise ValueError("Wrong cell formatting")
# else:
# break
# #Handle content of the cell
# list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
# # Cell which is not separator
# table_row[i]['rowspan'] += 1
# # Adjusting of colspan not needed, no colspan as number of cells is equal to number of columns
else:
raise ValueError("More cells than columns found")
else: # Data row
cells = re.split(r"\s*\|\s*", line.strip("|"))
if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
for i in range(len(cells)):
# Handle content of the cell
list_flags[i], table_row[i] = handling_content(table_row[i], cells[i], list_flags[i])
# if table_row[i]['content'] is None:
# table_row[i]['rowspan'] += 1
# table_row[i]['colspan'] += 1
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
# else:
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
if not table_row[i]['colspan_adjusted']:
table_row[i]['colspan_adjusted'] = True
for j in range(i, len(cells)):
delimiter_start = table_row[j-1]['position'] if j != 0 else 0
if line.find("|", delimiter_start+1) > delimiter_positions[j]: # Colspan to be increased
table_row[i]['colspan'] += 1
if line.find("|", delimiter_start + 1) == delimiter_positions[len(delimiter_positions) - 1]: # last cell in row, adjust colspan to get max number columns
colspan_remaining = 0
for cell_index in range(number_of_columns_row):
colspan_remaining += table_row[cell_index]['colspan']
table_row[i]['colspan'] += number_of_columns - colspan_remaining
elif line.find("|", delimiter_start+1) < delimiter_positions[j]:
raise ValueError("Wrong cell formatting")
else:
break
table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions)
elif len(cells) == number_of_columns: # Simple row
for i in range(len(cells)):
if use_auxiliar_row[i]:
if auxiliar_row[i]['content'] is None:
auxiliar_row[i]['rowspan'] += 1
auxiliar_row[i]['colspan'] += 1
auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
else:
auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
list_flags[i], auxiliar_row[i] = handling_content(auxiliar_row[i], cells[i],list_flags[i])
else:
# Handle content of the cell
list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
# if table_row[i]['content'] is None:
# table_row[i]['rowspan'] += 1
# table_row[i]['colspan'] += 1
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
# else:
# if cells[i].strip().startswith("- "): # List
# print(cells[i])
# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
# else:
# table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
else:
raise ValueError("More cells than columns found")
else:
......@@ -850,7 +800,7 @@ def generate_html_table_with_spans(pandoc_table):
continue
else:
# Prepare content, in case there's a list
print(cell['content'])
#print(cell['content'])
if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>",
cell['content']): # Update cell in new row
#print("MATCHING")
......@@ -860,6 +810,8 @@ def generate_html_table_with_spans(pandoc_table):
list += "<li>" + match[1] + "</li>"
list += "</ul>"
cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell['content'])
# Enforce left alignment if cell contains a list
cell['alignment'] = "align=\"left\""
#else:
# print("NOT MATCHING")
......@@ -886,6 +838,8 @@ def generate_html_table_with_spans(pandoc_table):
list += "<li>" + match[1] + "</li>"
list += "</ul>"
cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell['content'])
# Enforce left alignment if cell contains a list
cell['alignment'] = "align=\"left\""
#else:
#print("NOT MATCHING")
rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment