Skip to content
Snippets Groups Projects
Commit 708d9fb8 authored by Miguel Angel Reina Ortega's avatar Miguel Angel Reina Ortega
Browse files

Using class Cell and Row to handle grid tables conversion to html

parent 2451610e
No related branches found
No related tags found
No related merge requests found
......@@ -420,6 +420,7 @@ _matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECA
_matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE)
_matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE)
_matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE)
_matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE)
_matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE)
_matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE)
_matchGridTableBodySeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
......@@ -463,58 +464,85 @@ def parse_pandoc_table_with_spans(pandoc_table):
# Split the input into lines
lines = [line.strip() for line in pandoc_table.strip().split("\n")]
class Cell:
    """One cell of a grid-table row while the table is being parsed.

    Attributes:
        content: Text accumulated for the cell; ``None`` until the first
            content line has been handled.
        rowspan: Row-span counter; a cell whose rowspan is 0 is skipped
            when the HTML is generated.
        colspan: Column-span counter; a cell whose colspan is 0 is skipped
            when the HTML is generated.
        colspan_adjusted: Set to ``True`` once the colspan adjustment has
            been attempted for this cell.
        alignment: HTML ``align`` attribute, emitted verbatim in the tag.
        position: Position of the cell delimiter within the line.
        list_flag: ``True`` while a markdown list item is being accumulated.
        auxiliar_index: Index of the auxiliary row that receives this
            column's content after a partial separator, or ``None``.
    """

    # Quoted so the annotations are never evaluated at class-creation time;
    # both attributes legitimately hold None until they are filled in.
    content: "str | None"
    rowspan: int
    colspan: int
    colspan_adjusted: bool
    alignment: str
    position: int
    list_flag: bool
    auxiliar_index: "int | None"

    def __init__(self) -> None:
        self.content = None
        self.rowspan = 0
        self.colspan = 0
        self.colspan_adjusted = False
        self.alignment = "align=\"center\""
        self.position = 0
        self.list_flag = False
        self.auxiliar_index = None
class Row:
    """A row of a grid table, held as a list of ``Cell`` objects.

    Args:
        length: Number of freshly initialised cells to create for the row.
    """

    # Annotation only: a class-level ``= []`` default would be one list
    # object shared by every Row, and __init__ always replaces it anyway.
    cells: list[Cell]

    def __init__(self, length: int = 1) -> None:
        self.cells = [Cell() for _ in range(length)]
# Detect separator lines by pattern (partial separators are not taken into account)
def is_separator(line):
    """Return the match object if ``line`` is a full grid-table separator, else ``None``.

    Uses the module-level ``_matchGridTableSeparator`` pattern instead of
    recompiling an identical regex under the same name on every call.
    """
    return _matchGridTableSeparator.match(line)
def handling_content(cell, content):
    """Merge one line of raw cell text into ``cell`` and return the cell.

    The decision is taken from ``content`` itself rather than from the
    enclosing loop's ``cells[i]`` (every caller passes ``cells[i]`` as
    ``content``), so the function no longer depends on closure state.

    Args:
        cell: Object with ``content``, ``rowspan``, ``colspan`` and
            ``list_flag`` attributes; it is modified in place.
        content: Raw text of the cell on the current table line.

    Returns:
        The same ``cell`` object, updated.
    """
    text = content.strip()
    starts_list_item = text.startswith("- ")

    if cell.content is None:
        # First line seen for this cell: it now occupies one row and one column.
        cell.rowspan += 1
        cell.colspan += 1
        if starts_list_item:
            cell.list_flag = True
            # Trailing newline marks where the list element ends.
            cell.content = text + "\n"
        elif not text:
            # Blank first line: start from an empty string. Appending to the
            # still-unset content (as the original branch did) would raise.
            cell.list_flag = False
            cell.content = ""
        elif cell.list_flag:
            # Nothing to concatenate to yet, so the text becomes the content.
            cell.content = text + "\n"
        else:
            # A trailing backslash is a forced line break.
            cell.content = re.sub(r'\\\s*$', "\n", text)
        return cell

    if starts_list_item:
        if not cell.list_flag:
            # Separate the new list from the paragraph that precedes it.
            cell.content += "\n"
        cell.list_flag = True
        cell.content += text + "\n"
    elif cell.list_flag and text:
        # Continuation line: glue it onto the last list element.
        cell.content = cell.content.strip("\n") + " " + text + "\n"
    elif not text:
        # Blank line: ends the list and separates it from the next paragraph.
        cell.list_flag = False
        if not cell.content.endswith("\n"):
            cell.content += "\n"
    else:
        cell.content += " " + re.sub(r'\\\s*$', "\n", text)
    return cell
def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions):
for j in range(column_index, number_of_parts):
delimiter_start = row[j - 1]['position'] if j != 0 else 0
delimiter_start = row[j - 1].position if j != 0 else 0
positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "|+" if delimiter in line[delimiter_start + 1:]]
position = min(positions) if positions else -1
if position > delimiter_positions[j]: # Colspan to be increased
row[i]['colspan'] += 1
row[i].colspan += 1
if position == delimiter_positions[len(delimiter_positions) - 1]: # last cell in row, adjust colspan to get max number columns
colspan_allocated = 0
for cell_index in range(number_of_parts):
colspan_allocated += row[cell_index]['colspan']
row[column_index]['colspan'] += number_of_columns - colspan_allocated
colspan_allocated = row[i].colspan
#for cell_index in range(number_of_parts):
# colspan_allocated += row[cell_index].colspan
row[column_index].colspan += number_of_columns - colspan_allocated - column_index
elif position < delimiter_positions[j]:
raise ValueError("Wrong cell formatting")
else:
......@@ -563,6 +591,7 @@ def parse_pandoc_table_with_spans(pandoc_table):
data_rows = []
for row in range(len(separator_indices) - 1):
table_row = []
auxiliar_rows = []
auxiliar_row = []
use_auxiliar_row = []
list_flags = []
......@@ -591,65 +620,70 @@ def parse_pandoc_table_with_spans(pandoc_table):
# else:
# alignments.append("align=\"center\"")
header_delimiter_index = 0
table_row = Row(number_of_columns_row)
for i in range(number_of_columns_row):
delimiter_index += len(parts[i]) + 1
table_row.append({
"content": None,
"rowspan": 0,
"colspan": 0,
"colspan_adjusted": False,
"alignment": default_alignments[i] if i == 0 else "align=\"center\"",
"position": delimiter_index # Position of cell delimiter +
})
table_row.cells[i].alignment = default_alignments[i] if i == 0 else "align=\"center\""
table_row.cells[i].position = delimiter_index # Position of cell delimiter +
#Set alignment as defined by header separator line
while header_delimiter_index in range(len(default_alignments)) and table_row[i]['position'] > header_delimiter_positions[header_delimiter_index]:
while header_delimiter_index in range(len(default_alignments)) and table_row.cells[i].position > header_delimiter_positions[header_delimiter_index]:
header_delimiter_index += 1
if header_delimiter_index in range(len(default_alignments)):
if table_row[i]['position'] < header_delimiter_positions[header_delimiter_index]:
table_row[i]['alignment'] = default_alignments[header_delimiter_index]
elif table_row[i]['position'] == header_delimiter_positions[header_delimiter_index]:
table_row[i]['alignment'] = default_alignments[i]
if table_row.cells[i].position < header_delimiter_positions[header_delimiter_index]:
table_row.cells[i].alignment = default_alignments[header_delimiter_index]
elif table_row.cells[i].position == header_delimiter_positions[header_delimiter_index]:
table_row.cells[i].alignment = default_alignments[i]
header_delimiter_index += 1
else:
raise ValueError("Invalid table formatting")
for i in range(number_of_columns):
auxiliar_row.append({
"content": None,
"rowspan": 0,
"colspan": 0,
"colspan_adjusted": False,
"alignment": "align=\"center\"",
"position": 0
})
use_auxiliar_row.append(False)
list_flags.append(False)
#auxiliar_row = Row(number_of_columns)
#for i in range(number_of_columns):
#auxiliar_row.append(default_cell)
#use_auxiliar_row.append(False)
#auxiliar_rows.append({'auxiliar_row':auxiliar_row, 'use_auxiliar':use_auxiliar_row, 'list_flags':list_flags})
elif in_data_row:
# Regular data row or partial separator
if _matchGridTableBodySeparator.match(line): # Partial separator
has_merged_cells = True
#Add auxiliar line, set delimiters for each cell
auxiliar_rows.append(Row(number_of_columns))
aux_delimiter_index = 0
for i in range(number_of_columns_row):
aux_delimiter_index += len(parts[i]) + 1
auxiliar_rows[-1].cells[i].position = aux_delimiter_index # Position of cell delimiter +
cells = re.split(r"\s*[\|\+]\s*", line.strip("|").strip("+")) # (?<!\\)[\|\+]
if len(cells) <= number_of_columns: # Colspan: Positions of | with respect to + need to be determined
for i in range(len(cells)):
if _matchGridTableBodySeparatorLine.match(cells[i]): # A new row is to be added
use_auxiliar_row[i] = True
list_flags[i] = False
if cells[i].startswith(":") and not cells[i].endswith(":"):
auxiliar_row[i]['alignment'] = "align=\"left\""
elif not cells[i].startswith(":") and cells[i].endswith(":"):
auxiliar_row[i]['alignment'] = "align=\"right\""
else:
auxiliar_row[i]['alignment'] = "align=\"center\""
#auxiliar_rows[-1]['use_auxiliar_row'][i] = True
auxiliar_rows[-1].cells[i].list_flag = False
table_row.cells[i].auxiliar_index = len(auxiliar_rows)-1
#if cells[i].startswith(":") and not cells[i].endswith(":"):
# auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"left\""
#elif not cells[i].startswith(":") and cells[i].endswith(":"):
# auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"right\""
#else:
# auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"center\""
else:
# Handle content of the cell
list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
if table_row.cells[i].auxiliar_index is not None: # and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
auxiliar_rows[table_row.cells[i].auxiliar_index][i] = handling_content(auxiliar_rows[table_row.cells[i].auxiliar_index][i], cells[i])
if not auxiliar_rows[table_row.cells[i].auxiliar_index][i].colspan_adjusted:
auxiliar_rows[table_row.cells[i].auxiliar_index][i].colspan_adjusted = True
# TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
auxiliar_rows[table_row.cells[i].auxiliar_index][i] = adjust_colspan(auxiliar_rows[table_row.cells[i].auxiliar_index], i, len(cells), line, number_of_columns, delimiter_positions)
else:
table_row.cells[i] = handling_content(table_row.cells[i], cells[i])
# Cell which is not separator
table_row[i]['rowspan'] += 1
if not table_row[i]['colspan_adjusted']:
table_row[i]['colspan_adjusted'] = True
table_row.cells[i].rowspan += 1
if not table_row.cells[i].colspan_adjusted:
table_row.cells[i].colspan_adjusted = True
#TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions)
table_row.cells[i] = adjust_colspan(table_row.cells, i, len(cells), line, number_of_columns, delimiter_positions)
#elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added
# for i in range(len(cells)):
# if _matchGridTableBodySeparatorLine.match(cells[i]): # Update cell in new row
......@@ -674,30 +708,42 @@ def parse_pandoc_table_with_spans(pandoc_table):
if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
for i in range(len(cells)):
# Handle content of the cell
list_flags[i], table_row[i] = handling_content(table_row[i], cells[i], list_flags[i])
if not table_row[i]['colspan_adjusted']:
table_row[i]['colspan_adjusted'] = True
table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions)
if table_row.cells[i].auxiliar_index is not None:# and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i] = handling_content(auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i], cells[i])
if not auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i].colspan_adjusted:
auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i].colspan_adjusted = True
#TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i] = adjust_colspan(auxiliar_rows[table_row.cells[i].auxiliar_index].cells, i, len(cells), line, number_of_columns, delimiter_positions)
else:
table_row.cells[i] = handling_content(table_row.cells[i], cells[i])
if not table_row.cells[i].colspan_adjusted:
table_row.cells[i].colspan_adjusted = True
table_row.cells[i] = adjust_colspan(table_row.cells, i, len(cells), line, number_of_columns, delimiter_positions)
elif len(cells) == number_of_columns: # Simple row
for i in range(len(cells)):
if use_auxiliar_row[i]:
list_flags[i], auxiliar_row[i] = handling_content(auxiliar_row[i], cells[i],list_flags[i])
if table_row.cells[i].auxiliar_index is not None:# and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i] = handling_content(auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i], cells[i])
else:
# Handle content of the cell
list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
table_row.cells[i] = handling_content(table_row.cells[i], cells[i])
else:
raise ValueError("More cells than columns found")
else:
raise ValueError("No separator line found for row starting")
if has_header and start >= header_separator_index: # table_row and auxiliar_row are part of data_rows
data_rows.append(table_row)
data_rows.append(table_row.cells)
if has_merged_cells:
data_rows.append(auxiliar_row)
for row in auxiliar_rows:
#for i in range(len(row.cells)):
# print(row.cells[i].content)
data_rows.append(row.cells)
elif has_header and start < header_separator_index: # table_row and auxiliar_row are part of header_rows
header_rows.append(table_row)
header_rows.append(table_row.cells)
if has_merged_cells:
header_rows.append(auxiliar_row)
for row in auxiliar_rows:
header_rows.append(row.cells)
#print(header_rows)
#print(data_rows)
......@@ -711,35 +757,35 @@ def parse_pandoc_table_with_spans(pandoc_table):
italic = "<i>"
for row in rows:
for cell in row:
if cell['content'] is not None:
if cell.content is not None:
# Replacing "<" by &lt;
cell['content'] = cell['content'].replace("<", "&lt;")
cell.content = cell.content.replace("<", "&lt;")
#Bold
for bold_characters in ["**", "__"]:
while cell['content'].find(bold_characters) != -1:
cell['content'] = cell['content'].replace(bold_characters, bold, 1)
while cell.content.find(bold_characters) != -1:
cell.content = cell.content.replace(bold_characters, bold, 1)
if bold == "<strong>":
bold = "</strong>"
else:
bold = "<strong>"
#Italic
while cell['content'].find("_") != -1 and cell['content'].find("\_") == -1:
cell['content'] = cell['content'].rstrip() .replace("_", italic, 1)
while cell.content.find("_") != -1 and cell.content.find("\_") == -1:
cell.content = cell.content.rstrip() .replace("_", italic, 1)
if italic == "<i>":
italic = "</i>"
else:
italic = "<i>"
while cell['content'].find("\_") != -1:
cell['content'] = cell['content'].rstrip().replace("\_", "_", 1)
while cell.content.find("\_") != -1:
cell.content = cell.content.rstrip().replace("\_", "_", 1)
# Correct newlines characters
for row in header_rows:
for cell in row:
cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None
cell.content = cell.content.replace("\n", "<br />") if cell.content is not None else None
for row in data_rows:
for cell in row:
cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None
cell.content = cell.content.replace("\n", "<br />") if cell.content is not None else None
# Checking that the grid is correct Not too much tested - need to take into account rowspan of previous rows
forward_rowspan = []
......@@ -748,13 +794,13 @@ def parse_pandoc_table_with_spans(pandoc_table):
forward_rowspan = [0 for _ in range(len(header_rows[row_index]))]
sum = 0
for cell_index in range(len(header_rows[row_index])):
sum += header_rows[row_index][cell_index]['colspan']
if row_index > 0 and header_rows[row_index][cell_index]['colspan'] == 0:
sum += header_rows[row_index][cell_index].colspan
if row_index > 0 and header_rows[row_index][cell_index].colspan == 0:
if forward_rowspan[cell_index] > 0:
sum += 1
forward_rowspan[cell_index] -= 1
if forward_rowspan[cell_index] == 0 and header_rows[row_index][cell_index]['rowspan'] > 1:
forward_rowspan[cell_index] = header_rows[row_index][cell_index]['rowspan'] -1
if forward_rowspan[cell_index] == 0 and header_rows[row_index][cell_index].rowspan > 1:
forward_rowspan[cell_index] = header_rows[row_index][cell_index].rowspan -1
if not sum == number_of_columns:
raise ValueError("Grid table not converted properly")
forward_rowspan = []
......@@ -763,13 +809,13 @@ def parse_pandoc_table_with_spans(pandoc_table):
forward_rowspan = [0 for _ in range(len(data_rows[row_index]))]
sum = 0
for cell_index in range(len(data_rows[row_index])):
sum += data_rows[row_index][cell_index]['colspan']
if row_index > 0 and data_rows[row_index][cell_index]['colspan'] == 0:
sum += data_rows[row_index][cell_index].colspan
if row_index > 0 and data_rows[row_index][cell_index].colspan == 0:
if forward_rowspan[cell_index] > 0:
sum += 1
forward_rowspan[cell_index] -= 1
if forward_rowspan[cell_index] == 0 and data_rows[row_index][cell_index]['rowspan'] > 1:
forward_rowspan[cell_index] = data_rows[row_index][cell_index]['rowspan'] - 1
if forward_rowspan[cell_index] == 0 and data_rows[row_index][cell_index].rowspan > 1:
forward_rowspan[cell_index] = data_rows[row_index][cell_index].rowspan - 1
if not sum == number_of_columns:
raise ValueError("Grid table not converted properly")
......@@ -789,35 +835,35 @@ def generate_html_table_with_spans(pandoc_table):
for row in grid_header:
for cell in row:
if cell['rowspan'] != 0 and cell['colspan'] != 0:
if cell.rowspan != 0 and cell.colspan != 0:
has_header = True
if has_header:
html += " <thead>\n"
for row in grid_header:
html += " <tr>\n"
for cell in row:
if cell['rowspan'] == 0 or cell['colspan'] == 0:
if cell.rowspan == 0 or cell.colspan == 0:
continue
else:
# Prepare content, in case there's a list
#print(cell['content'])
#print(cell.content)
if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>",
cell['content']): # Update cell in new row
cell.content): # Update cell in new row
#print("MATCHING")
list = "<ul>"
# Build list the matches
for match in matches:
list += "<li>" + match[1] + "</li>"
list += "</ul>"
cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell['content'])
cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell.content)
# Enforce left alignment if cell contains a list
cell['alignment'] = "align=\"left\""
cell.alignment = "align=\"left\""
#else:
# print("NOT MATCHING")
rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
html += f" <th{rowspan}{colspan} {cell['alignment']}>{cell['content']}</th>\n"
rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else ""
colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else ""
html += f" <th{rowspan}{colspan} {cell.alignment}>{cell.content}</th>\n"
html += " </tr>\n"
html += " </thead>\n"
......@@ -825,26 +871,27 @@ def generate_html_table_with_spans(pandoc_table):
for row in grid_body:
html += " <tr>\n"
for cell in row:
if cell['rowspan'] == 0 or cell['colspan'] == 0:
if cell.rowspan == 0 or cell.colspan == 0:
continue
else:
#Prepare content, in case there's a list
#print(cell['content'])
if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell['content']): # Update cell in new row
#print(cell.content)
if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell.content): # Update cell in new row
#print("MATCHING")
#print(cell.content)
list = "<ul>"
# Build list the matches
for match in matches:
list += "<li>" + match[1] + "</li>"
list += "</ul>"
cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell['content'])
cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell.content)
# Enforce left alignment if cell contains a list
cell['alignment'] = "align=\"left\""
cell.alignment = "align=\"left\""
#else:
#print("NOT MATCHING")
rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
html += f" <td{rowspan}{colspan} {cell['alignment']}>{cell['content']}</td>\n"
rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else ""
colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else ""
html += f" <td{rowspan}{colspan} {cell.alignment}>{cell.content}</td>\n"
html += " </tr>\n"
html += " </tbody>\n"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment