From 708d9fb8d786b0f14ab5182667f2aa3ec41a1558 Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Fri, 29 Nov 2024 10:32:45 +0100 Subject: [PATCH] Using class Cell and Row to handle grid tables conversion to html --- toMkdocs/toMkdocs.py | 281 +++++++++++++++++++++++++------------------ 1 file changed, 164 insertions(+), 117 deletions(-) diff --git a/toMkdocs/toMkdocs.py b/toMkdocs/toMkdocs.py index 33947cb..71a5b85 100644 --- a/toMkdocs/toMkdocs.py +++ b/toMkdocs/toMkdocs.py @@ -420,6 +420,7 @@ _matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECA _matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE) _matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE) _matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE) +_matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE) _matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE) _matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE) _matchGridTableBodySeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE) @@ -463,58 +464,85 @@ def parse_pandoc_table_with_spans(pandoc_table): # Split the input into lines lines = [line.strip() for line in pandoc_table.strip().split("\n")] + class Cell: + """ Represents the document object. """ + content: str + rowspan: int + colspan: int + colspan_adjusted: bool + alignment: str + position: int + list_flag: bool + auxiliar_index: int + + def __init__(self): + self.content = None + self.rowspan = 0 + self.colspan = 0 + self.colspan_adjusted = False + self.alignment = "align=\"center\"" + self.position = 0 + self.list_flag = False + self.auxiliar_index = None + + class Row(): + """ Represents a row in the markdown file. """ + cells:list[Cell] = [] + + def __init__(self, length: int = 1) -> None: + self.cells = [Cell() for _ in range(length)] + # Detect separator lines by pattern (it does not take into account partial separators def is_separator(line): - _matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE) return _matchGridTableSeparator.match(line) - def handling_content(cell, content, list_flag): - if cell['content'] is None: - cell['rowspan'] += 1 - cell['colspan'] += 1 + def handling_content(cell, content): + if cell.content is None: + cell.rowspan += 1 + cell.colspan += 1 if content.strip().startswith("- "): # List - list_flag = True - print(content) - cell['content'] = content.strip() + "\n" # Add newline to know when the list element ends - elif list_flag: # any other content when handling list is concatenated to the last list element - cell['content'] += content.strip() + "\n" - elif cells[i].strip() == "": # separation between list and other paragraph - list_flag = False - cell['content'] = re.sub(r'\\\s*$', "\n", content) + cell.list_flag = True + #print(content) + cell.content = content.strip() + "\n" # Add newline to know when the list element ends + elif cell.list_flag and cells[i].strip() != "": # any other content when handling list is concatenated to the last list element + cell.content += content.strip() + "\n" + elif cells[i].strip == "": # separation between list and other paragraph + cell.list_flag = False + cell.content += "\n" #if not cell['content'].endswith("\n") else "" else: - cell['content'] = re.sub(r'\\\s*$', "\n", content.strip()) + cell.content = re.sub(r'\\\s*$', "\n", content.strip()) else: if content.strip().startswith("- "): # List - if not list_flag: - cell['content'] += "\n" + if not cell.list_flag: + cell.content += "\n" #cell['content'] = cell['content'].strip("\n") - list_flag = True - cell['content'] += content.strip() + "\n" # Add newline to know when the list element ends - elif list_flag: # any other content when handling list is concatenated to the last list element - cell['content'] = cell['content'].strip("\n") - cell['content'] += " " + content.strip() + "\n" + cell.list_flag = True + cell.content += content.strip() + "\n" # Add newline to know when the list element ends + elif cell.list_flag and cells[i].strip() != "": # any other content when handling list is concatenated to the last list element + cell.content = cell.content.strip("\n") + cell.content += " " + content.strip() + "\n" elif cells[i].strip() == "": # separation between list and other paragraph - list_flag = False + cell.list_flag = False #content = re.sub(r'\\\s*$', "\n", content.strip()) - cell['content'] += "\n" if not cell['content'].endswith("\n") else "" + cell.content += "\n" if not cell.content.endswith("\n") else "" else: content = re.sub(r'\\\s*$', "\n", content.strip()) - cell['content'] += " " + content + cell.content += " " + content #print(cell['content']) - return list_flag, cell + return cell def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions): for j in range(column_index, number_of_parts): - delimiter_start = row[j - 1]['position'] if j != 0 else 0 + delimiter_start = row[j - 1].position if j != 0 else 0 positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "|+" if delimiter in line[delimiter_start + 1:]] position = min(positions) if positions else -1 if position > delimiter_positions[j]: # Colspan to be increased - row[i]['colspan'] += 1 + row[i].colspan += 1 if position == delimiter_positions[len(delimiter_positions) - 1]: # last cell in row, adjust colspan to get max number columns - colspan_allocated = 0 - for cell_index in range(number_of_parts): - colspan_allocated += row[cell_index]['colspan'] - row[column_index]['colspan'] += number_of_columns - colspan_allocated + colspan_allocated = row[i].colspan + #for cell_index in range(number_of_parts): + # colspan_allocated += row[cell_index].colspan + row[column_index].colspan += number_of_columns - colspan_allocated - column_index elif position < delimiter_positions[j]: raise ValueError("Wrong cell formatting") else: @@ -563,6 +591,7 @@ def parse_pandoc_table_with_spans(pandoc_table): data_rows = [] for row in range(len(separator_indices) - 1): table_row = [] + auxiliar_rows = [] auxiliar_row = [] use_auxiliar_row = [] list_flags = [] @@ -591,65 +620,70 @@ def parse_pandoc_table_with_spans(pandoc_table): # else: # alignments.append("align=\"center\"") header_delimiter_index = 0 + table_row = Row(number_of_columns_row) for i in range(number_of_columns_row): delimiter_index += len(parts[i]) + 1 - table_row.append({ - "content": None, - "rowspan": 0, - "colspan": 0, - "colspan_adjusted": False, - "alignment": default_alignments[i] if i == 0 else "align=\"center\"", - "position": delimiter_index # Position of cell delimiter + - }) + table_row.cells[i].alignment = default_alignments[i] if i == 0 else "align=\"center\"" + table_row.cells[i].position = delimiter_index # Position of cell delimiter + + #Set alignment as defined by header separator line - while header_delimiter_index in range(len(default_alignments)) and table_row[i]['position'] > header_delimiter_positions[header_delimiter_index]: + while header_delimiter_index in range(len(default_alignments)) and table_row.cells[i].position > header_delimiter_positions[header_delimiter_index]: header_delimiter_index += 1 if header_delimiter_index in range(len(default_alignments)): - if table_row[i]['position'] < header_delimiter_positions[header_delimiter_index]: - table_row[i]['alignment'] = default_alignments[header_delimiter_index] - elif table_row[i]['position'] == header_delimiter_positions[header_delimiter_index]: - table_row[i]['alignment'] = default_alignments[i] + if table_row.cells[i].position < header_delimiter_positions[header_delimiter_index]: + table_row.cells[i].alignment = default_alignments[header_delimiter_index] + elif table_row.cells[i].position == header_delimiter_positions[header_delimiter_index]: + table_row.cells[i].alignment = default_alignments[i] header_delimiter_index += 1 else: raise ValueError("Invalid table formatting") - for i in range(number_of_columns): - auxiliar_row.append({ - "content": None, - "rowspan": 0, - "colspan": 0, - "colspan_adjusted": False, - "alignment": "align=\"center\"", - "position": 0 - }) - use_auxiliar_row.append(False) - list_flags.append(False) + #auxiliar_row = Row(number_of_columns) + #for i in range(number_of_columns): + #auxiliar_row.append(default_cell) + #use_auxiliar_row.append(False) + #auxiliar_rows.append({'auxiliar_row':auxiliar_row, 'use_auxiliar':use_auxiliar_row, 'list_flags':list_flags}) elif in_data_row: # Regular data row or partial separator if _matchGridTableBodySeparator.match(line): # Partial separator has_merged_cells = True + #Add auxiliar line, set delimiters for each cell + auxiliar_rows.append(Row(number_of_columns)) + aux_delimiter_index = 0 + for i in range(number_of_columns_row): + aux_delimiter_index += len(parts[i]) + 1 + auxiliar_rows[-1].cells[i].position = aux_delimiter_index # Position of cell delimiter + + cells = re.split(r"\s*[\|\+]\s*", line.strip("|").strip("+")) # (?<!\\)[\|\+] if len(cells) <= number_of_columns: # Colspan: Positions of | with respect to + need to be determined for i in range(len(cells)): if _matchGridTableBodySeparatorLine.match(cells[i]): # A new row is to be added - use_auxiliar_row[i] = True - list_flags[i] = False - if cells[i].startswith(":") and not cells[i].endswith(":"): - auxiliar_row[i]['alignment'] = "align=\"left\"" - elif not cells[i].startswith(":") and cells[i].endswith(":"): - auxiliar_row[i]['alignment'] = "align=\"right\"" - else: - auxiliar_row[i]['alignment'] = "align=\"center\"" + #auxiliar_rows[-1]['use_auxiliar_row'][i] = True + auxiliar_rows[-1].cells[i].list_flag = False + table_row.cells[i].auxiliar_index = len(auxiliar_rows)-1 + #if cells[i].startswith(":") and not cells[i].endswith(":"): + # auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"left\"" + #elif not cells[i].startswith(":") and cells[i].endswith(":"): + # auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"right\"" + #else: + # auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"center\"" else: - #Handle content of the cell - list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i]) - # Cell which is not separator - table_row[i]['rowspan'] += 1 - if not table_row[i]['colspan_adjusted']: - table_row[i]['colspan_adjusted'] = True - #TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator - table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions) + # Handle content of the cell + if table_row.cells[i].auxiliar_index is not None: # and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]: + auxiliar_rows[table_row.cells[i].auxiliar_index][i] = handling_content(auxiliar_rows[table_row.cells[i].auxiliar_index][i], cells[i]) + if not auxiliar_rows[table_row.cells[i].auxiliar_index][i].colspan_adjusted: + auxiliar_rows[table_row.cells[i].auxiliar_index][i].colspan_adjusted = True + # TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator + auxiliar_rows[table_row.cells[i].auxiliar_index][i] = adjust_colspan(auxiliar_rows[table_row.cells[i].auxiliar_index], i, len(cells), line, number_of_columns, delimiter_positions) + else: + table_row.cells[i] = handling_content(table_row.cells[i], cells[i]) + # Cell which is not separator + table_row.cells[i].rowspan += 1 + if not table_row.cells[i].colspan_adjusted: + table_row.cells[i].colspan_adjusted = True + #TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator + table_row.cells[i] = adjust_colspan(table_row.cells, i, len(cells), line, number_of_columns, delimiter_positions) #elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added # for i in range(len(cells)): # if _matchGridTableBodySeparatorLine.match(cells[i]): # Update cell in new row @@ -674,30 +708,42 @@ def parse_pandoc_table_with_spans(pandoc_table): if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined for i in range(len(cells)): # Handle content of the cell - list_flags[i], table_row[i] = handling_content(table_row[i], cells[i], list_flags[i]) - if not table_row[i]['colspan_adjusted']: - table_row[i]['colspan_adjusted'] = True - table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions) + if table_row.cells[i].auxiliar_index is not None:# and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]: + auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i] = handling_content(auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i], cells[i]) + if not auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i].colspan_adjusted: + auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i].colspan_adjusted = True + #TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator + auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i] = adjust_colspan(auxiliar_rows[table_row.cells[i].auxiliar_index].cells, i, len(cells), line, number_of_columns, delimiter_positions) + else: + table_row.cells[i] = handling_content(table_row.cells[i], cells[i]) + if not table_row.cells[i].colspan_adjusted: + table_row.cells[i].colspan_adjusted = True + table_row.cells[i] = adjust_colspan(table_row.cells, i, len(cells), line, number_of_columns, delimiter_positions) elif len(cells) == number_of_columns: # Simple row for i in range(len(cells)): - if use_auxiliar_row[i]: - list_flags[i], auxiliar_row[i] = handling_content(auxiliar_row[i], cells[i],list_flags[i]) + if table_row.cells[i].auxiliar_index is not None:# and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]: + auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i] = handling_content(auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i], cells[i]) else: # Handle content of the cell - list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i]) + table_row.cells[i] = handling_content(table_row.cells[i], cells[i]) else: raise ValueError("More cells than columns found") else: raise ValueError("No separator line found for row starting") + if has_header and start >= header_separator_index: # table_row and auxiliar_row are part of data_rows - data_rows.append(table_row) + data_rows.append(table_row.cells) if has_merged_cells: - data_rows.append(auxiliar_row) + for row in auxiliar_rows: + #for i in range(len(row.cells)): + # print(row.cells[i].content) + data_rows.append(row.cells) elif has_header and start < header_separator_index: # table_row and auxiliar_row are part of header_rows - header_rows.append(table_row) + header_rows.append(table_row.cells) if has_merged_cells: - header_rows.append(auxiliar_row) + for row in auxiliar_rows: + header_rows.append(row.cells) #print(header_rows) #print(data_rows) @@ -711,35 +757,35 @@ def parse_pandoc_table_with_spans(pandoc_table): italic = "<i>" for row in rows: for cell in row: - if cell['content'] is not None: + if cell.content is not None: # Replacing "<" by < - cell['content'] = cell['content'].replace("<", "<") + cell.content = cell.content.replace("<", "<") #Bold for bold_characters in ["**", "__"]: - while cell['content'].find(bold_characters) != -1: - cell['content'] = cell['content'].replace(bold_characters, bold, 1) + while cell.content.find(bold_characters) != -1: + cell.content = cell.content.replace(bold_characters, bold, 1) if bold == "<strong>": bold = "</strong>" else: bold = "<strong>" #Italic - while cell['content'].find("_") != -1 and cell['content'].find("\_") == -1: - cell['content'] = cell['content'].rstrip() .replace("_", italic, 1) + while cell.content.find("_") != -1 and cell.content.find("\_") == -1: + cell.content = cell.content.rstrip() .replace("_", italic, 1) if italic == "<i>": italic = "</i>" else: italic = "<i>" - while cell['content'].find("\_") != -1: - cell['content'] = cell['content'].rstrip().replace("\_", "_", 1) + while cell.content.find("\_") != -1: + cell.content = cell.content.rstrip().replace("\_", "_", 1) # Correct newlines characters for row in header_rows: for cell in row: - cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None + cell.content = cell.content.replace("\n", "<br />") if cell.content is not None else None for row in data_rows: for cell in row: - cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None + cell.content = cell.content.replace("\n", "<br />") if cell.content is not None else None # Checking that the grid is correct Not too much tested - need to take into account rowspan of previous rows forward_rowspan = [] @@ -748,13 +794,13 @@ def parse_pandoc_table_with_spans(pandoc_table): forward_rowspan = [0 for _ in range(len(header_rows[row_index]))] sum = 0 for cell_index in range(len(header_rows[row_index])): - sum += header_rows[row_index][cell_index]['colspan'] - if row_index > 0 and header_rows[row_index][cell_index]['colspan'] == 0: + sum += header_rows[row_index][cell_index].colspan + if row_index > 0 and header_rows[row_index][cell_index].colspan == 0: if forward_rowspan[cell_index] > 0: sum += 1 forward_rowspan[cell_index] -= 1 - if forward_rowspan[cell_index] == 0 and header_rows[row_index][cell_index]['rowspan'] > 1: - forward_rowspan[cell_index] = header_rows[row_index][cell_index]['rowspan'] -1 + if forward_rowspan[cell_index] == 0 and header_rows[row_index][cell_index].rowspan > 1: + forward_rowspan[cell_index] = header_rows[row_index][cell_index].rowspan -1 if not sum == number_of_columns: raise ValueError("Grid table not converted properly") forward_rowspan = [] @@ -763,13 +809,13 @@ def parse_pandoc_table_with_spans(pandoc_table): forward_rowspan = [0 for _ in range(len(data_rows[row_index]))] sum = 0 for cell_index in range(len(data_rows[row_index])): - sum += data_rows[row_index][cell_index]['colspan'] - if row_index > 0 and data_rows[row_index][cell_index]['colspan'] == 0: + sum += data_rows[row_index][cell_index].colspan + if row_index > 0 and data_rows[row_index][cell_index].colspan == 0: if forward_rowspan[cell_index] > 0: sum += 1 forward_rowspan[cell_index] -= 1 - if forward_rowspan[cell_index] == 0 and data_rows[row_index][cell_index]['rowspan'] > 1: - forward_rowspan[cell_index] = data_rows[row_index][cell_index]['rowspan'] - 1 + if forward_rowspan[cell_index] == 0 and data_rows[row_index][cell_index].rowspan > 1: + forward_rowspan[cell_index] = data_rows[row_index][cell_index].rowspan - 1 if not sum == number_of_columns: raise ValueError("Grid table not converted properly") @@ -789,35 +835,35 @@ def generate_html_table_with_spans(pandoc_table): for row in grid_header: for cell in row: - if cell['rowspan'] != 0 and cell['colspan'] != 0: + if cell.rowspan != 0 and cell.colspan != 0: has_header = True if has_header: html += " <thead>\n" for row in grid_header: html += " <tr>\n" for cell in row: - if cell['rowspan'] == 0 or cell['colspan'] == 0: + if cell.rowspan == 0 or cell.colspan == 0: continue else: # Prepare content, in case there's a list - #print(cell['content']) + #print(cell.content) if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", - cell['content']): # Update cell in new row + cell.content): # Update cell in new row #print("MATCHING") list = "<ul>" # Build list the matches for match in matches: list += "<li>" + match[1] + "</li>" list += "</ul>" - cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell['content']) + cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell.content) # Enforce left alignment if cell contains a list - cell['alignment'] = "align=\"left\"" + cell.alignment = "align=\"left\"" #else: # print("NOT MATCHING") - rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else "" - colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else "" - html += f" <th{rowspan}{colspan} {cell['alignment']}>{cell['content']}</th>\n" + rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else "" + colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else "" + html += f" <th{rowspan}{colspan} {cell.alignment}>{cell.content}</th>\n" html += " </tr>\n" html += " </thead>\n" @@ -825,26 +871,27 @@ def generate_html_table_with_spans(pandoc_table): for row in grid_body: html += " <tr>\n" for cell in row: - if cell['rowspan'] == 0 or cell['colspan'] == 0: + if cell.rowspan == 0 or cell.colspan == 0: continue else: #Prepare content, in case there's a list - #print(cell['content']) - if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell['content']): # Update cell in new row + #print(cell.content) + if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell.content): # Update cell in new row #print("MATCHING") + #print(cell.content) list = "<ul>" # Build list the matches for match in matches: list += "<li>" + match[1] + "</li>" list += "</ul>" - cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell['content']) + cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell.content) # Enforce left alignment if cell contains a list - cell['alignment'] = "align=\"left\"" + cell.alignment = "align=\"left\"" #else: #print("NOT MATCHING") - rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else "" - colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else "" - html += f" <td{rowspan}{colspan} {cell['alignment']}>{cell['content']}</td>\n" + rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else "" + colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else "" + html += f" <td{rowspan}{colspan} {cell.alignment}>{cell.content}</td>\n" html += " </tr>\n" html += " </tbody>\n" -- GitLab