From 2451610e5d7b6278076cd1640cc04870de606ce6 Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Wed, 27 Nov 2024 15:06:25 +0100 Subject: [PATCH] More cleanup + cell alignment as defined in header separator line (Pandoc's behaviour) --- toMkdocs/toMkdocs.py | 222 +++++++++++++++++-------------------------- 1 file changed, 88 insertions(+), 134 deletions(-) diff --git a/toMkdocs/toMkdocs.py b/toMkdocs/toMkdocs.py index 69427be..33947cb 100644 --- a/toMkdocs/toMkdocs.py +++ b/toMkdocs/toMkdocs.py @@ -422,6 +422,7 @@ _matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE) _matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE) _matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE) _matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE) +_matchGridTableBodySeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE) _match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE) _matchListInContent = re.compile(r'^(?:\s*(P<marker>[-*+]|\s*\d+\.))\s+(P<content>.+)$', re.IGNORECASE) _markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE) @@ -502,8 +503,24 @@ def parse_pandoc_table_with_spans(pandoc_table): #print(cell['content']) return list_flag, cell + def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions): + for j in range(column_index, number_of_parts): + delimiter_start = row[j - 1]['position'] if j != 0 else 0 + positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "|+" if delimiter in line[delimiter_start + 1:]] + position = min(positions) if positions else -1 + if position > delimiter_positions[j]: # Colspan to be increased + row[column_index]['colspan'] += 1 + if position == delimiter_positions[len(delimiter_positions) - 1]: # last cell in row, adjust colspan to get max number columns + colspan_allocated = 0 + for cell_index in range(number_of_parts): + colspan_allocated += 
row[cell_index]['colspan'] + row[column_index]['colspan'] += number_of_columns - colspan_allocated + elif position < delimiter_positions[j]: + raise ValueError("Wrong cell formatting") + else: + break + return row[column_index] - _matchGridTableSeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE) separator_indices = [i for i, line in enumerate(lines) if is_separator(line)] print(separator_indices) @@ -522,11 +539,26 @@ def parse_pandoc_table_with_spans(pandoc_table): del_positions = [lines[separator_index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[separator_index][delimiter_positions_start + 1:]] delimiter_positions.append(min(del_positions) if del_positions else -1) has_header = False + header_delimiter_positions = [] for index in separator_indices: if _matchGridTableHeaderSeparator.match(lines[index]): has_header = True header_separator_index = index header_rows = [] + parts = re.split(r"\s*\+\s*", lines[index].strip("+")) + default_alignments = [] + #Calculate default alignments and positions of delimiters + for part_index in range(len(parts)): + if parts[part_index].startswith(":") and not parts[part_index].endswith(":"): + default_alignments.append("align=\"left\"") + elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"): + default_alignments.append("align=\"right\"") + else: + default_alignments.append("align=\"center\"") + # Delimiter position + delimiter_positions_start = delimiter_positions[part_index - 1] if part_index != 0 else 0 + del_positions = [lines[index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[index][delimiter_positions_start + 1:]] + header_delimiter_positions.append(min(del_positions) if del_positions else -1) data_rows = [] for row in range(len(separator_indices) - 1): @@ -550,14 +582,15 @@ def parse_pandoc_table_with_spans(pandoc_table): # Determine the alignment of the cell - In order to replicate Pandoc's behaviour (do 
not support of alignment colons on separator lines (just header separator) # we need to assign the default alignment as defined in the header separator line # We may not need the code below, as that supports alignment per cell and row - alignments = [] - for part_index in range(len(parts)): - if parts[part_index].startswith(":") and not parts[part_index].endswith(":"): - alignments.append("align=\"left\"") - elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"): - alignments.append("align=\"right\"") - else: - alignments.append("align=\"center\"") + #alignments = [] + #for part_index in range(len(parts)): + # if parts[part_index].startswith(":") and not parts[part_index].endswith(":"): + # alignments.append("align=\"left\"") + # elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"): + # alignments.append("align=\"right\"") + # else: + # alignments.append("align=\"center\"") + header_delimiter_index = 0 for i in range(number_of_columns_row): delimiter_index += len(parts[i]) + 1 table_row.append({ @@ -565,9 +598,21 @@ def parse_pandoc_table_with_spans(pandoc_table): "rowspan": 0, "colspan": 0, "colspan_adjusted": False, - "alignment": alignments[i] if alignments[i] else "align=\"center\"", + "alignment": default_alignments[i] if i == 0 else "align=\"center\"", "position": delimiter_index # Position of cell delimiter + }) + #Set alignment as defined by header separator line + while header_delimiter_index in range(len(default_alignments)) and table_row[i]['position'] > header_delimiter_positions[header_delimiter_index]: + header_delimiter_index += 1 + if header_delimiter_index in range(len(default_alignments)): + if table_row[i]['position'] < header_delimiter_positions[header_delimiter_index]: + table_row[i]['alignment'] = default_alignments[header_delimiter_index] + elif table_row[i]['position'] == header_delimiter_positions[header_delimiter_index]: + table_row[i]['alignment'] = default_alignments[i] + 
header_delimiter_index += 1 + else: + raise ValueError("Invalid table formatting") + for i in range(number_of_columns): auxiliar_row.append({ "content": None, @@ -585,10 +630,11 @@ def parse_pandoc_table_with_spans(pandoc_table): if _matchGridTableBodySeparator.match(line): # Partial separator has_merged_cells = True cells = re.split(r"\s*[\|\+]\s*", line.strip("|").strip("+")) # (?<!\\)[\|\+] - if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined + if len(cells) <= number_of_columns: # Colspan: Positions of | with respect to + need to be determined for i in range(len(cells)): - if _matchGridTableSeparatorLine.match(cells[i]): # A new row is to be added + if _matchGridTableBodySeparatorLine.match(cells[i]): # A new row is to be added use_auxiliar_row[i] = True + list_flags[i] = False if cells[i].startswith(":") and not cells[i].endswith(":"): auxiliar_row[i]['alignment'] = "align=\"left\"" elif not cells[i].startswith(":") and cells[i].endswith(":"): @@ -596,145 +642,49 @@ def parse_pandoc_table_with_spans(pandoc_table): else: auxiliar_row[i]['alignment'] = "align=\"center\"" else: + #Handle content of the cell list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i]) - #if table_row[i]['content'] is None: - # table_row[i]['rowspan'] += 1 - # table_row[i]['colspan'] += 1 - #if cells[i].strip().startswith("- "): # List - # handling_list = True - # print(cells[i]) - # table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends - #elif handling_list: # any other content when handling list is concatenated to the last list element - # table_row[i]['content'].strip("\n") - # table_row[i]['content'] += cells[i] + "\n" - #elif cells[i].strip(): #separation between list and other paragraph - # handling_list = False - # table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) - #else: - # table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) - #else: - # if 
cells[i].strip().startswith("- "): # List - # print(cells[i]) - # table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends - # else: - # table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) # Cell which is not separator + # Cell which is not separator table_row[i]['rowspan'] += 1 if not table_row[i]['colspan_adjusted']: table_row[i]['colspan_adjusted'] = True - for j in range(i, len(cells)): - delimiter_start = table_row[j-1]['position'] if j != 0 else 0 - positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "|+" if delimiter in line[delimiter_start + 1:]] - position = min(positions) if positions else -1 - if position > delimiter_positions_start[j]: # Colspan to add - table_row[i]['colspan'] += 1 - elif position < delimiter_positions_start[j]: - raise ValueError("Wrong cell formatting") - else: - break - elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added - for i in range(len(cells)): - if _matchGridTableSeparatorLine.match(cells[i]): # Update cell in new row - use_auxiliar_row[i] = True - if cells[i].startswith(":") and not cells[i].endswith(":"): - auxiliar_row[i]['alignment'] = "align=\"left\"" - elif not cells[i].startswith(":") and cells[i].endswith(":"): - auxiliar_row[i]['alignment'] = "align=\"right\"" - else: - auxiliar_row[i]['alignment'] = "align=\"center\"" - else: - list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i]) - -# if table_row[i]['content'] is None: -# table_row[i]['rowspan'] += 1 -# table_row[i]['colspan'] += 1 -# if cells[i].strip().startswith("- "): # List -# print(cells[i]) -# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends -# else: -# table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i]) -# else: -# if cells[i].strip().startswith("- "): # List -# print(cells[i]) -# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list 
element ends -# else: -# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) - - # Cell which is not separator - table_row[i]['rowspan'] += 1 - # Not needed, no colspan as number of cells is equal to number of columns - #for j in range(i, len(cells)): - # delimiter_start = table_row[j-1]['position'] if j != 0 else 0 - # positions = [line.find(delimiter,delimiter_start+1) for delimiter in "|+" if delimiter in line[delimiter_start+1:]] - # position = min(positions) if positions else -1 - # if position > table_row[i]['position']: # Only colspan to be increased - # table_row[i]['colspan'] += 1 - # elif position + 1 < table_row[i]['position']: - # raise ValueError("Wrong cell formatting") - # else: - # break - + #TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator + table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions) + #elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added + # for i in range(len(cells)): + # if _matchGridTableBodySeparatorLine.match(cells[i]): # Update cell in new row + # use_auxiliar_row[i] = True + # list_flags[i] = False + # if cells[i].startswith(":") and not cells[i].endswith(":"): + # auxiliar_row[i]['alignment'] = "align=\"left\"" + # elif not cells[i].startswith(":") and cells[i].endswith(":"): + # auxiliar_row[i]['alignment'] = "align=\"right\"" + # else: + # auxiliar_row[i]['alignment'] = "align=\"center\"" + # else: + # #Handle content of the cell + # list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i]) + # # Cell which is not separator + # table_row[i]['rowspan'] += 1 + # # Adjusting of colspan not needed, no colspan as number of cells is equal to number of columns else: raise ValueError("More cells than columns found") else: # Data row cells = re.split(r"\s*\|\s*", line.strip("|")) if len(cells) < number_of_columns: # 
Colspan: Positions of | with respect to + need to be determined for i in range(len(cells)): + # Handle content of the cell list_flags[i], table_row[i] = handling_content(table_row[i], cells[i], list_flags[i]) -# if table_row[i]['content'] is None: -# table_row[i]['rowspan'] += 1 -# table_row[i]['colspan'] += 1 -# if cells[i].strip().startswith("- "): # List -# print(cells[i]) -# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends -# else: -# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) -# else: -# if cells[i].strip().startswith("- "): # List -# print(cells[i]) -# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends -# else: -# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) if not table_row[i]['colspan_adjusted']: table_row[i]['colspan_adjusted'] = True - for j in range(i, len(cells)): - delimiter_start = table_row[j-1]['position'] if j != 0 else 0 - if line.find("|", delimiter_start+1) > delimiter_positions[j]: # Colspan to be increased - table_row[i]['colspan'] += 1 - if line.find("|", delimiter_start + 1) == delimiter_positions[len(delimiter_positions) - 1]: # last cell in row, adjust colspan to get max number columns - colspan_remaining = 0 - for cell_index in range(number_of_columns_row): - colspan_remaining += table_row[cell_index]['colspan'] - table_row[i]['colspan'] += number_of_columns - colspan_remaining - elif line.find("|", delimiter_start+1) < delimiter_positions[j]: - raise ValueError("Wrong cell formatting") - else: - break - + table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions) elif len(cells) == number_of_columns: # Simple row for i in range(len(cells)): if use_auxiliar_row[i]: - if auxiliar_row[i]['content'] is None: - auxiliar_row[i]['rowspan'] += 1 - auxiliar_row[i]['colspan'] += 1 - auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) - else: - auxiliar_row[i]['content'] = 
re.sub(r'\\\s*$', "\n", cells[i]) + list_flags[i], auxiliar_row[i] = handling_content(auxiliar_row[i], cells[i],list_flags[i]) else: + # Handle content of the cell list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i]) -# if table_row[i]['content'] is None: -# table_row[i]['rowspan'] += 1 -# table_row[i]['colspan'] += 1 -# if cells[i].strip().startswith("- "): # List -# print(cells[i]) -# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends -# else: -# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) -# else: -# if cells[i].strip().startswith("- "): # List -# print(cells[i]) -# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends -# else: -# table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i]) else: raise ValueError("More cells than columns found") else: @@ -850,7 +800,7 @@ def generate_html_table_with_spans(pandoc_table): continue else: # Prepare content, in case there's a list - print(cell['content']) + #print(cell['content']) if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell['content']): # Update cell in new row #print("MATCHING") @@ -860,6 +810,8 @@ def generate_html_table_with_spans(pandoc_table): list += "<li>" + match[1] + "</li>" list += "</ul>" cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell['content']) + # Enforce left alignment if cell contains a list + cell['alignment'] = "align=\"left\"" #else: # print("NOT MATCHING") @@ -886,6 +838,8 @@ def generate_html_table_with_spans(pandoc_table): list += "<li>" + match[1] + "</li>" list += "</ul>" cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell['content']) + # Enforce left alignment if cell contains a list + cell['alignment'] = "align=\"left\"" #else: #print("NOT MATCHING") rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else "" -- GitLab