Skip to content
Snippets Groups Projects
Commit d15b1ca0 authored by Miguel Angel Reina Ortega
Browse files

Change algorithm to convert grid tables + compacting code

parent a1dbd013
No related branches found
No related tags found
No related merge requests found
...@@ -484,7 +484,6 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -484,7 +484,6 @@ def parse_pandoc_table_with_spans(pandoc_table):
self.alignment = "align=\"center\"" self.alignment = "align=\"center\""
self.position = None self.position = None
self.list_flag = False self.list_flag = False
self.auxiliar_index = None
def set_alignment(self): def set_alignment(self):
header_delimiter_index = 0 header_delimiter_index = 0
...@@ -512,10 +511,22 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -512,10 +511,22 @@ def parse_pandoc_table_with_spans(pandoc_table):
def __setitem__(self, key, value): def __setitem__(self, key, value):
self.cells[key] = value self.cells[key] = value
class RowTracker():
    """Per-column list of integer counters, all starting at 0.

    The container is created with one counter per column and supports
    index-based reads and writes (``tracker[i]``, ``tracker[i] += 1``).
    NOTE(review): callers appear to use each counter as the index of the
    row currently being filled for that column -- confirm against the
    parsing loop.
    """

    def __init__(self, items):
        # ``items`` is the number of counters to create (one per column).
        self.rowTracker = [0] * items

    def __getitem__(self, item):
        # Return the counter stored at position ``item``.
        return self.rowTracker[item]

    def __setitem__(self, key, value):
        # Overwrite the counter stored at position ``key``.
        self.rowTracker[key] = value
# Detect separator lines by pattern (it does not take into account partial separators # Detect separator lines by pattern (it does not take into account partial separators
def is_separator(line): def is_separator(line):
return _matchGridTableSeparator.match(line) return _matchGridTableSeparator.match(line)
# Set content on the cell - concatenating multilines, flagging lists
def handling_content(cell, content): def handling_content(cell, content):
if cell.content is None: if cell.content is None:
cell.rowspan += 1 cell.rowspan += 1
...@@ -524,9 +535,9 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -524,9 +535,9 @@ def parse_pandoc_table_with_spans(pandoc_table):
cell.list_flag = True cell.list_flag = True
#print(content) #print(content)
cell.content = content.strip() + "\n" # Add newline to know when the list element ends cell.content = content.strip() + "\n" # Add newline to know when the list element ends
elif cell.list_flag and cells[i].strip() != "": # any other content when handling list is concatenated to the last list element elif cell.list_flag and content.strip() != "": # any other content when handling list is concatenated to the last list element
cell.content += content.strip() + "\n" cell.content += content.strip() + "\n"
elif cells[i].strip == "": # separation between list and other paragraph elif content.strip == "": # separation between list and other paragraph
cell.list_flag = False cell.list_flag = False
cell.content += "\n" #if not cell['content'].endswith("\n") else "" cell.content += "\n" #if not cell['content'].endswith("\n") else ""
else: else:
...@@ -538,10 +549,10 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -538,10 +549,10 @@ def parse_pandoc_table_with_spans(pandoc_table):
#cell['content'] = cell['content'].strip("\n") #cell['content'] = cell['content'].strip("\n")
cell.list_flag = True cell.list_flag = True
cell.content += content.strip() + "\n" # Add newline to know when the list element ends cell.content += content.strip() + "\n" # Add newline to know when the list element ends
elif cell.list_flag and cells[i].strip() != "": # any other content when handling list is concatenated to the last list element elif cell.list_flag and content.strip() != "": # any other content when handling list is concatenated to the last list element
cell.content = cell.content.strip("\n") cell.content = cell.content.strip("\n")
cell.content += " " + content.strip() + "\n" cell.content += " " + content.strip() + "\n"
elif cells[i].strip() == "": # separation between list and other paragraph elif content.strip() == "": # separation between list and other paragraph
cell.list_flag = False cell.list_flag = False
#content = re.sub(r'\\\s*$', "\n", content.strip()) #content = re.sub(r'\\\s*$', "\n", content.strip())
cell.content += "\n" if not cell.content.endswith("\n") else "" cell.content += "\n" if not cell.content.endswith("\n") else ""
...@@ -551,6 +562,7 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -551,6 +562,7 @@ def parse_pandoc_table_with_spans(pandoc_table):
#print(cell['content']) #print(cell['content'])
return cell return cell
# Adjust colspan of a cell
def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions): def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions):
for j in range(column_index, number_of_parts): for j in range(column_index, number_of_parts):
delimiter_start = None delimiter_start = None
...@@ -614,9 +626,8 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -614,9 +626,8 @@ def parse_pandoc_table_with_spans(pandoc_table):
data_rows = [] data_rows = []
for row in range(len(separator_indices) - 1): for row in range(len(separator_indices) - 1):
table_row = [] rows = []
auxiliar_rows = [] rows_tracker = []
has_merged_cells = False
in_data_row = False in_data_row = False
start, end = separator_indices[row], separator_indices[row + 1] start, end = separator_indices[row], separator_indices[row + 1]
row_lines = lines[start:end] # Lines between separators including separator line start as it gives information about the number of columns of the row row_lines = lines[start:end] # Lines between separators including separator line start as it gives information about the number of columns of the row
...@@ -624,10 +635,8 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -624,10 +635,8 @@ def parse_pandoc_table_with_spans(pandoc_table):
# Combine multiline content into single strings for each cell # Combine multiline content into single strings for each cell
for line in row_lines: for line in row_lines:
if is_separator(line) and not in_data_row: if is_separator(line) and not in_data_row:
number_of_columns_row = line.count("+") - 1
in_data_row = True in_data_row = True
parts = re.split(r"\s*\+\s*", line.strip("+")) parts = re.split(r"\s*\+\s*", line.strip("+"))
# Add as many cells as columns with span attributes
delimiter_index = 0 delimiter_index = 0
# Determine the alignment of the cell - In order to replicate Pandoc's behaviour (do not support of alignment colons on separator lines (just header separator) # Determine the alignment of the cell - In order to replicate Pandoc's behaviour (do not support of alignment colons on separator lines (just header separator)
# we need to assign the default alignment as defined in the header separator line # we need to assign the default alignment as defined in the header separator line
...@@ -640,113 +649,101 @@ def parse_pandoc_table_with_spans(pandoc_table): ...@@ -640,113 +649,101 @@ def parse_pandoc_table_with_spans(pandoc_table):
# alignments.append("align=\"right\"") # alignments.append("align=\"right\"")
# else: # else:
# alignments.append("align=\"center\"") # alignments.append("align=\"center\"")
header_delimiter_index = 0 rows.append(Row(number_of_columns))
table_row = Row(number_of_columns) #rows_tracker = [RowTracker() for _ in range(number_of_columns)]
rows_tracker = RowTracker(number_of_columns)
i = 0 i = 0
j = 0 for j in range(len(parts)):
while i in range(number_of_columns) and j in range(len(parts)): if i in range(number_of_columns):
delimiter_index += len(parts[j]) + 1 delimiter_index += len(parts[j]) + 1
#table_row[i].alignment = default_alignments[i] if i == 0 else "align=\"center\"" # Set position
table_row[i].position = delimiter_index # Position of cell delimiter + rows[-1][i].position = delimiter_index # Position of cell delimiter +
# Set alignment as defined by header separator line # Set alignment as defined by header separator line
table_row[i].set_alignment() rows[-1][i].set_alignment()
while delimiter_index > delimiter_positions[i]: while delimiter_index > delimiter_positions[i]:
i += 1 i += 1
i += 1 i += 1
j += 1
elif in_data_row: elif in_data_row:
# Regular data row or partial separator # Regular data row or partial separator
if _matchGridTableBodySeparator.match(line): # Partial separator if _matchGridTableBodySeparator.match(line): # Partial separator
has_merged_cells = True cells_content = re.split(r"[\|\+]", line.strip("|").strip("+")) # (?<!\\)[\|\+]
cells = re.split(r"[\|\+]", line.strip("|").strip("+")) # (?<!\\)[\|\+] #Add another row, set delimiters for each cell
#Add auxiliar line, set delimiters for each cell rows.append(Row(number_of_columns))
auxiliar_rows.append(Row(number_of_columns))
aux_delimiter_index = 0 aux_delimiter_index = 0
for auxiliar_cell_index in range(number_of_columns): auxiliar_cell_index = 0
aux_delimiter_index += len(cells[auxiliar_cell_index]) + 1 for i in range(len(cells_content)):
auxiliar_rows[-1][auxiliar_cell_index].position = aux_delimiter_index # Position of cell delimiter + if auxiliar_cell_index in range(number_of_columns):
auxiliar_rows[-1][auxiliar_cell_index].set_alignment() aux_delimiter_index += len(cells_content[i]) + 1
rows[-1][auxiliar_cell_index].position = aux_delimiter_index # Position of cell delimiter +
if len(cells) <= number_of_columns: # Colspan: Positions of | with respect to + need to be determined rows[-1][auxiliar_cell_index].set_alignment()
table_row_index = 0 while aux_delimiter_index > delimiter_positions[auxiliar_cell_index]:
for i in range(len(cells)): auxiliar_cell_index += 1
if _matchGridTableBodySeparatorLine.match(cells[i]): # A new row is to be added auxiliar_cell_index += 1
if len(cells_content) <= number_of_columns: # Colspan: Positions of | with respect to + need to be determined
column_index = 0
for i in range(len(cells_content)):
if _matchGridTableBodySeparatorLine.match(cells_content[i]): # A new row is to be added
rows_tracker[column_index] += 1
rows[rows_tracker[column_index]][column_index].list_flag = False
#auxiliar_rows[-1]['use_auxiliar_row'][i] = True #auxiliar_rows[-1]['use_auxiliar_row'][i] = True
auxiliar_rows[-1][i].list_flag = False
table_row[i].auxiliar_index = len(auxiliar_rows)-1
#if cells[i].startswith(":") and not cells[i].endswith(":"): #if cells[i].startswith(":") and not cells[i].endswith(":"):
# auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"left\"" # auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"left\""
#elif not cells[i].startswith(":") and cells[i].endswith(":"): #elif not cells[i].startswith(":") and cells[i].endswith(":"):
# auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"right\"" # auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"right\""
#else: #else:
# auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"center\"" # auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"center\""
column_forward = 0
for del_index in range(column_index, len(delimiter_positions)):
if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[del_index]:
column_forward += 1
rows_tracker[column_index + column_forward - 1] += 1 if column_forward > 1 else 0
column_index += column_forward
continue
else: else:
# Handle content of the cell # Handle content of the cell
if table_row[i].auxiliar_index is not None: # and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]: rows[rows_tracker[column_index]][column_index] = handling_content(rows[rows_tracker[column_index]][column_index], cells_content[i])
auxiliar_rows[table_row[i].auxiliar_index][i] = handling_content(auxiliar_rows[table_row[i].auxiliar_index][i], cells[i]) rows[rows_tracker[column_index]][column_index].rowspan += 1
if not auxiliar_rows[table_row[i].auxiliar_index][i].colspan_adjusted: if not rows[rows_tracker[column_index]][column_index].colspan_adjusted:
auxiliar_rows[table_row[i].auxiliar_index][i].colspan_adjusted = True rows[rows_tracker[column_index]][column_index].colspan_adjusted = True
# TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator # TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
auxiliar_rows[table_row[i].auxiliar_index][i] = adjust_colspan(auxiliar_rows[table_row[i].auxiliar_index], i, number_of_columns, line, number_of_columns, delimiter_positions) rows[rows_tracker[column_index]][column_index] = adjust_colspan(rows[rows_tracker[column_index]], column_index, number_of_columns, line, number_of_columns, delimiter_positions)
table_row_index += auxiliar_rows[table_row[table_row_index].auxiliar_index][i].colspan - 1
else: if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[column_index]:
table_row[table_row_index] = handling_content(table_row[table_row_index], cells[i]) column_index += rows[rows_tracker[column_index]][column_index].colspan if rows[rows_tracker[column_index]][column_index].colspan != 0 else 1
# Cell which is not separator continue
table_row[table_row_index].rowspan += 1
if not table_row.cells[table_row_index].colspan_adjusted:
table_row[table_row_index].colspan_adjusted = True
#TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
table_row[table_row_index] = adjust_colspan(table_row, table_row_index, number_of_columns, line, number_of_columns, delimiter_positions)
#table_row_index += table_row[i].colspan - 1 #Move forward index i
if table_row[table_row_index].position == delimiter_positions[i]:
table_row_index += table_row[table_row_index].colspan if table_row[table_row_index].colspan != 0 else 1
else: else:
raise ValueError("More cells than columns found") raise ValueError("More cells than columns found")
else: # Data row else: # Data row
cells = re.split(r"\s*\|\s*", line.strip("|")) cells_content = re.split(r"\s*\|\s*", line.strip("|"))
if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined column_index = 0
table_row_index = 0 if len(cells_content) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
for i in range(len(cells)): for i in range(len(cells_content)):
# Handle content of the cell # Handle content of the cell
if table_row[table_row_index].auxiliar_index is not None:# and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]: rows[rows_tracker[column_index]][column_index] = handling_content(rows[rows_tracker[column_index]][column_index], cells_content[i])
auxiliar_rows[table_row.cells[table_row_index].auxiliar_index][i] = handling_content(auxiliar_rows[table_row[table_row_index].auxiliar_index][i], cells[i]) if not rows[rows_tracker[column_index]][column_index].colspan_adjusted:
if not auxiliar_rows[table_row[table_row_index].auxiliar_index].cells[i].colspan_adjusted: rows[rows_tracker[column_index]][column_index].colspan_adjusted = True
auxiliar_rows[table_row[table_row_index].auxiliar_index].cells[i].colspan_adjusted = True
#TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator #TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
auxiliar_rows[table_row[table_row_index].auxiliar_index][i] = adjust_colspan(auxiliar_rows[table_row[table_row_index].auxiliar_index].cells, i, number_of_columns, line, number_of_columns, delimiter_positions) rows[rows_tracker[column_index]][column_index] = adjust_colspan(rows[rows_tracker[column_index]], column_index, number_of_columns, line, number_of_columns, delimiter_positions)
table_row_index += auxiliar_rows[table_row[table_row_index].auxiliar_index][i].colspan - 1 # Move forward index i if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[column_index]:
else: column_index += rows[rows_tracker[column_index]][column_index].colspan # Move forward index i
table_row[table_row_index] = handling_content(table_row[table_row_index], cells[i])
if not table_row.cells[table_row_index].colspan_adjusted: elif len(cells_content) == number_of_columns: # Simple row
table_row[table_row_index].colspan_adjusted = True for i in range(len(cells_content)):
table_row[table_row_index] = adjust_colspan(table_row.cells, table_row_index, number_of_columns, line, number_of_columns, delimiter_positions) rows[rows_tracker[i]][i] = handling_content(rows[rows_tracker[i]][i], cells_content[i])
table_row_index += table_row[table_row_index].colspan - 1 # Move forward index i
table_row_index += 1
elif len(cells) == number_of_columns: # Simple row
for i in range(len(cells)):
if table_row[i].auxiliar_index is not None:# and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
auxiliar_rows[table_row[i].auxiliar_index][i] = handling_content(auxiliar_rows[table_row[i].auxiliar_index][i], cells[i])
else:
# Handle content of the cell
table_row[i] = handling_content(table_row[i], cells[i])
else: else:
raise ValueError("More cells than columns found") raise ValueError("More cells than columns found")
else: else:
raise ValueError("No separator line found for row starting") raise ValueError("No separator line found for row starting")
if has_header and start >= header_separator_index: # table_row and auxiliar_row are part of data_rows if has_header and start >= header_separator_index: # table_row and auxiliar_row are part of data_rows
data_rows.append(table_row.cells) for body_row in rows:
if has_merged_cells: data_rows.append(body_row.cells)
for row in auxiliar_rows:
#for i in range(len(row.cells)):
# print(row.cells[i].content)
data_rows.append(row.cells)
elif has_header and start < header_separator_index: # table_row and auxiliar_row are part of header_rows elif has_header and start < header_separator_index: # table_row and auxiliar_row are part of header_rows
header_rows.append(table_row.cells) for header_row in rows:
if has_merged_cells: header_rows.append(header_row.cells)
for row in auxiliar_rows:
header_rows.append(row.cells)
#print(header_rows) #print(header_rows)
#print(data_rows) #print(data_rows)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment