From d15b1ca019f39184e45281e9253284d160a01944 Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Fri, 6 Dec 2024 08:32:28 +0100 Subject: [PATCH] Change algorithm to convert grid tables + compacting code --- toMkdocs/toMkdocs.py | 179 +++++++++++++++++++++---------------------- 1 file changed, 88 insertions(+), 91 deletions(-) diff --git a/toMkdocs/toMkdocs.py b/toMkdocs/toMkdocs.py index c5cb22c..ae0be2a 100644 --- a/toMkdocs/toMkdocs.py +++ b/toMkdocs/toMkdocs.py @@ -484,7 +484,6 @@ def parse_pandoc_table_with_spans(pandoc_table): self.alignment = "align=\"center\"" self.position = None self.list_flag = False - self.auxiliar_index = None def set_alignment(self): header_delimiter_index = 0 @@ -512,10 +511,22 @@ def parse_pandoc_table_with_spans(pandoc_table): def __setitem__(self, key, value): self.cells[key] = value + class RowTracker(): + """ Represents the document object. """ + def __init__(self, items): + self.rowTracker = [0 for _ in range(items)] + + def __getitem__(self, item): + return self.rowTracker[item] + + def __setitem__(self, key, value): + self.rowTracker[key] = value + # Detect separator lines by pattern (it does not take into account partial separators def is_separator(line): return _matchGridTableSeparator.match(line) + # Set content on the cell - concatenating multilines, flagging lists def handling_content(cell, content): if cell.content is None: cell.rowspan += 1 @@ -524,9 +535,9 @@ def parse_pandoc_table_with_spans(pandoc_table): cell.list_flag = True #print(content) cell.content = content.strip() + "\n" # Add newline to know when the list element ends - elif cell.list_flag and cells[i].strip() != "": # any other content when handling list is concatenated to the last list element + elif cell.list_flag and content.strip() != "": # any other content when handling list is concatenated to the last list element cell.content += content.strip() + "\n" - elif cells[i].strip == "": # separation between list and other paragraph + elif content.strip == "": # separation between list and other paragraph cell.list_flag = False cell.content += "\n" #if not cell['content'].endswith("\n") else "" else: @@ -538,10 +549,10 @@ def parse_pandoc_table_with_spans(pandoc_table): #cell['content'] = cell['content'].strip("\n") cell.list_flag = True cell.content += content.strip() + "\n" # Add newline to know when the list element ends - elif cell.list_flag and cells[i].strip() != "": # any other content when handling list is concatenated to the last list element + elif cell.list_flag and content.strip() != "": # any other content when handling list is concatenated to the last list element cell.content = cell.content.strip("\n") cell.content += " " + content.strip() + "\n" - elif cells[i].strip() == "": # separation between list and other paragraph + elif content.strip() == "": # separation between list and other paragraph cell.list_flag = False #content = re.sub(r'\\\s*$', "\n", content.strip()) cell.content += "\n" if not cell.content.endswith("\n") else "" @@ -551,6 +562,7 @@ def parse_pandoc_table_with_spans(pandoc_table): #print(cell['content']) return cell + # Adjust colspan of a cell def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions): for j in range(column_index, number_of_parts): delimiter_start = None @@ -614,9 +626,8 @@ def parse_pandoc_table_with_spans(pandoc_table): data_rows = [] for row in range(len(separator_indices) - 1): - table_row = [] - auxiliar_rows = [] - has_merged_cells = False + rows = [] + rows_tracker = [] in_data_row = False start, end = separator_indices[row], separator_indices[row + 1] row_lines = lines[start:end] # Lines between separators including separator line start as it gives information about the number of columns of the row @@ -624,10 +635,8 @@ def parse_pandoc_table_with_spans(pandoc_table): # Combine multiline content into single strings for each cell for line in row_lines: if is_separator(line) and not in_data_row: - number_of_columns_row = line.count("+") - 1 in_data_row = True parts = re.split(r"\s*\+\s*", line.strip("+")) - # Add as many cells as columns with span attributes delimiter_index = 0 # Determine the alignment of the cell - In order to replicate Pandoc's behaviour (do not support of alignment colons on separator lines (just header separator) # we need to assign the default alignment as defined in the header separator line @@ -640,113 +649,101 @@ def parse_pandoc_table_with_spans(pandoc_table): # alignments.append("align=\"right\"") # else: # alignments.append("align=\"center\"") - header_delimiter_index = 0 - table_row = Row(number_of_columns) + rows.append(Row(number_of_columns)) + #rows_tracker = [RowTracker() for _ in range(number_of_columns)] + rows_tracker = RowTracker(number_of_columns) i = 0 - j = 0 - while i in range(number_of_columns) and j in range(len(parts)): - delimiter_index += len(parts[j]) + 1 - #table_row[i].alignment = default_alignments[i] if i == 0 else "align=\"center\"" - table_row[i].position = delimiter_index # Position of cell delimiter + - #Set alignment as defined by header separator line - table_row[i].set_alignment() - while delimiter_index > delimiter_positions[i]: + for j in range(len(parts)): + if i in range(number_of_columns): + delimiter_index += len(parts[j]) + 1 + # Set position + rows[-1][i].position = delimiter_index # Position of cell delimiter + + # Set alignment as defined by header separator line + rows[-1][i].set_alignment() + while delimiter_index > delimiter_positions[i]: + i += 1 i += 1 - i += 1 - j += 1 + elif in_data_row: # Regular data row or partial separator if _matchGridTableBodySeparator.match(line): # Partial separator - has_merged_cells = True - cells = re.split(r"[\|\+]", line.strip("|").strip("+")) # (?<!\\)[\|\+] - #Add auxiliar line, set delimiters for each cell - auxiliar_rows.append(Row(number_of_columns)) + cells_content = re.split(r"[\|\+]", line.strip("|").strip("+")) # (?<!\\)[\|\+] + #Add another row, set delimiters for each cell + rows.append(Row(number_of_columns)) aux_delimiter_index = 0 - for auxiliar_cell_index in range(number_of_columns): - aux_delimiter_index += len(cells[auxiliar_cell_index]) + 1 - auxiliar_rows[-1][auxiliar_cell_index].position = aux_delimiter_index # Position of cell delimiter + - auxiliar_rows[-1][auxiliar_cell_index].set_alignment() - - if len(cells) <= number_of_columns: # Colspan: Positions of | with respect to + need to be determined - table_row_index = 0 - for i in range(len(cells)): - if _matchGridTableBodySeparatorLine.match(cells[i]): # A new row is to be added + auxiliar_cell_index = 0 + for i in range(len(cells_content)): + if auxiliar_cell_index in range(number_of_columns): + aux_delimiter_index += len(cells_content[i]) + 1 + rows[-1][auxiliar_cell_index].position = aux_delimiter_index # Position of cell delimiter + + rows[-1][auxiliar_cell_index].set_alignment() + while aux_delimiter_index > delimiter_positions[auxiliar_cell_index]: + auxiliar_cell_index += 1 + auxiliar_cell_index += 1 + + if len(cells_content) <= number_of_columns: # Colspan: Positions of | with respect to + need to be determined + column_index = 0 + for i in range(len(cells_content)): + if _matchGridTableBodySeparatorLine.match(cells_content[i]): # A new row is to be added + rows_tracker[column_index] += 1 + rows[rows_tracker[column_index]][column_index].list_flag = False #auxiliar_rows[-1]['use_auxiliar_row'][i] = True - auxiliar_rows[-1][i].list_flag = False - table_row[i].auxiliar_index = len(auxiliar_rows)-1 #if cells[i].startswith(":") and not cells[i].endswith(":"): # auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"left\"" #elif not cells[i].startswith(":") and cells[i].endswith(":"): # auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"right\"" #else: # auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"center\"" + column_forward = 0 + for del_index in range(column_index, len(delimiter_positions)): + if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[del_index]: + column_forward += 1 + rows_tracker[column_index + column_forward - 1] += 1 if column_forward > 1 else 0 + column_index += column_forward + continue else: # Handle content of the cell - if table_row[i].auxiliar_index is not None: # and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]: - auxiliar_rows[table_row[i].auxiliar_index][i] = handling_content(auxiliar_rows[table_row[i].auxiliar_index][i], cells[i]) - if not auxiliar_rows[table_row[i].auxiliar_index][i].colspan_adjusted: - auxiliar_rows[table_row[i].auxiliar_index][i].colspan_adjusted = True - # TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator - auxiliar_rows[table_row[i].auxiliar_index][i] = adjust_colspan(auxiliar_rows[table_row[i].auxiliar_index], i, number_of_columns, line, number_of_columns, delimiter_positions) - table_row_index += auxiliar_rows[table_row[table_row_index].auxiliar_index][i].colspan - 1 - else: - table_row[table_row_index] = handling_content(table_row[table_row_index], cells[i]) - # Cell which is not separator - table_row[table_row_index].rowspan += 1 - if not table_row.cells[table_row_index].colspan_adjusted: - table_row[table_row_index].colspan_adjusted = True - #TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator - table_row[table_row_index] = adjust_colspan(table_row, table_row_index, number_of_columns, line, number_of_columns, delimiter_positions) - #table_row_index += table_row[i].colspan - 1 #Move forward index i - if table_row[table_row_index].position == delimiter_positions[i]: - table_row_index += table_row[table_row_index].colspan if table_row[table_row_index].colspan != 0 else 1 + rows[rows_tracker[column_index]][column_index] = handling_content(rows[rows_tracker[column_index]][column_index], cells_content[i]) + rows[rows_tracker[column_index]][column_index].rowspan += 1 + if not rows[rows_tracker[column_index]][column_index].colspan_adjusted: + rows[rows_tracker[column_index]][column_index].colspan_adjusted = True + # TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator + rows[rows_tracker[column_index]][column_index] = adjust_colspan(rows[rows_tracker[column_index]], column_index, number_of_columns, line, number_of_columns, delimiter_positions) + + if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[column_index]: + column_index += rows[rows_tracker[column_index]][column_index].colspan if rows[rows_tracker[column_index]][column_index].colspan != 0 else 1 + continue + else: raise ValueError("More cells than columns found") else: # Data row - cells = re.split(r"\s*\|\s*", line.strip("|")) - if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined - table_row_index = 0 - for i in range(len(cells)): + cells_content = re.split(r"\s*\|\s*", line.strip("|")) + column_index = 0 + if len(cells_content) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined + for i in range(len(cells_content)): # Handle content of the cell - if table_row[table_row_index].auxiliar_index is not None:# and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]: - auxiliar_rows[table_row.cells[table_row_index].auxiliar_index][i] = handling_content(auxiliar_rows[table_row[table_row_index].auxiliar_index][i], cells[i]) - if not auxiliar_rows[table_row[table_row_index].auxiliar_index].cells[i].colspan_adjusted: - auxiliar_rows[table_row[table_row_index].auxiliar_index].cells[i].colspan_adjusted = True - #TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator - auxiliar_rows[table_row[table_row_index].auxiliar_index][i] = adjust_colspan(auxiliar_rows[table_row[table_row_index].auxiliar_index].cells, i, number_of_columns, line, number_of_columns, delimiter_positions) - table_row_index += auxiliar_rows[table_row[table_row_index].auxiliar_index][i].colspan - 1 # Move forward index i - else: - table_row[table_row_index] = handling_content(table_row[table_row_index], cells[i]) - if not table_row.cells[table_row_index].colspan_adjusted: - table_row[table_row_index].colspan_adjusted = True - table_row[table_row_index] = adjust_colspan(table_row.cells, table_row_index, number_of_columns, line, number_of_columns, delimiter_positions) - table_row_index += table_row[table_row_index].colspan - 1 # Move forward index i - - table_row_index += 1 - elif len(cells) == number_of_columns: # Simple row - for i in range(len(cells)): - if table_row[i].auxiliar_index is not None:# and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]: - auxiliar_rows[table_row[i].auxiliar_index][i] = handling_content(auxiliar_rows[table_row[i].auxiliar_index][i], cells[i]) - else: - # Handle content of the cell - table_row[i] = handling_content(table_row[i], cells[i]) + rows[rows_tracker[column_index]][column_index] = handling_content(rows[rows_tracker[column_index]][column_index], cells_content[i]) + if not rows[rows_tracker[column_index]][column_index].colspan_adjusted: + rows[rows_tracker[column_index]][column_index].colspan_adjusted = True + #TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator + rows[rows_tracker[column_index]][column_index] = adjust_colspan(rows[rows_tracker[column_index]], column_index, number_of_columns, line, number_of_columns, delimiter_positions) + if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[column_index]: + column_index += rows[rows_tracker[column_index]][column_index].colspan # Move forward index i + + elif len(cells_content) == number_of_columns: # Simple row + for i in range(len(cells_content)): + rows[rows_tracker[i]][i] = handling_content(rows[rows_tracker[i]][i], cells_content[i]) else: raise ValueError("More cells than columns found") else: raise ValueError("No separator line found for row starting") if has_header and start >= header_separator_index: # table_row and auxiliar_row are part of data_rows - data_rows.append(table_row.cells) - if has_merged_cells: - for row in auxiliar_rows: - #for i in range(len(row.cells)): - # print(row.cells[i].content) - data_rows.append(row.cells) + for body_row in rows: + data_rows.append(body_row.cells) elif has_header and start < header_separator_index: # table_row and auxiliar_row are part of header_rows - header_rows.append(table_row.cells) - if has_merged_cells: - for row in auxiliar_rows: - header_rows.append(row.cells) + for header_row in rows: + header_rows.append(header_row.cells) #print(header_rows) #print(data_rows) -- GitLab