From 944ea98ee8fe1d122e29b9924cc40e6d156f8fcf Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Fri, 21 Feb 2025 09:41:06 +0100 Subject: [PATCH] Improvements for grid tables conversion --- toMkdocs/toMkdocs.py | 172 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 141 insertions(+), 31 deletions(-) diff --git a/toMkdocs/toMkdocs.py b/toMkdocs/toMkdocs.py index ae0be2a..0222748 100644 --- a/toMkdocs/toMkdocs.py +++ b/toMkdocs/toMkdocs.py @@ -486,18 +486,31 @@ def parse_pandoc_table_with_spans(pandoc_table): self.list_flag = False def set_alignment(self): - header_delimiter_index = 0 - while header_delimiter_index in range(len(default_alignments)) and self.position > header_delimiter_positions[header_delimiter_index]: - header_delimiter_index += 1 - if header_delimiter_index in range(len(default_alignments)): - if self.position < header_delimiter_positions[header_delimiter_index]: - self.alignment = default_alignments[header_delimiter_index] - elif self.position == header_delimiter_positions[header_delimiter_index]: - self.alignment = default_alignments[header_delimiter_index] + if has_header: + header_delimiter_index = 0 + while header_delimiter_index in range(len(default_alignments)) and self.position > header_delimiter_positions[header_delimiter_index]: header_delimiter_index += 1 + if header_delimiter_index in range(len(default_alignments)): + if self.position < header_delimiter_positions[header_delimiter_index]: + self.alignment = default_alignments[header_delimiter_index] + elif self.position == header_delimiter_positions[header_delimiter_index]: + self.alignment = default_alignments[header_delimiter_index] + header_delimiter_index += 1 + else: + raise ValueError("Invalid table formatting") else: - raise ValueError("Invalid table formatting") - + body_delimiter_index = 0 + while body_delimiter_index in range(len(default_alignments)) and self.position > \ + delimiter_positions[body_delimiter_index]: + body_delimiter_index += 1 + if body_delimiter_index in range(len(default_alignments)): + if self.position < delimiter_positions[body_delimiter_index]: + self.alignment = default_alignments[body_delimiter_index] + elif self.position == delimiter_positions[body_delimiter_index]: + self.alignment = default_alignments[body_delimiter_index] + body_delimiter_index += 1 + else: + raise ValueError("Invalid table formatting") class Row(): """ Represents a row in the markdown file. """ cells:list[Cell] = [] @@ -534,12 +547,15 @@ def parse_pandoc_table_with_spans(pandoc_table): if content.strip().startswith("- "): # List cell.list_flag = True #print(content) - cell.content = content.strip() + "\n" # Add newline to know when the list element ends + content = re.sub(r'\\\s*$', "\n", content.strip()) + cell.content = content + "@" # Add list element end mark to know when the list element ends elif cell.list_flag and content.strip() != "": # any other content when handling list is concatenated to the last list element - cell.content += content.strip() + "\n" + content = re.sub(r'\\\s*$', "\n", content.strip()) + cell.content += content + "@" #add the list element end mark elif content.strip == "": # separation between list and other paragraph - cell.list_flag = False - cell.content += "\n" #if not cell['content'].endswith("\n") else "" + #if cell.list_flag: + # cell.list_flag = False + cell.content += "\n" if not cell['content'].endswith("\n") else "" else: cell.content = re.sub(r'\\\s*$', "\n", content.strip()) else: @@ -548,12 +564,16 @@ def parse_pandoc_table_with_spans(pandoc_table): cell.content += "\n" #cell['content'] = cell['content'].strip("\n") cell.list_flag = True - cell.content += content.strip() + "\n" # Add newline to know when the list element ends + content = re.sub(r'\\\s*$', "\n", content.strip()) + cell.content += content + "@" # Add list element end mark to know when the list element ends elif cell.list_flag and content.strip() != "": # any other content when handling list is concatenated to the last list element - cell.content = cell.content.strip("\n") - cell.content += " " + content.strip() + "\n" + cell.content = cell.content.strip("@") #remove list element end mark + content = re.sub(r'\\\s*$', "\n", content.strip()) + cell.content += " " + content + "@" #add list element end mark elif content.strip() == "": # separation between list and other paragraph - cell.list_flag = False + if cell.list_flag: + cell.list_flag = False + cell.content += "\n\n" #end list by \n #content = re.sub(r'\\\s*$', "\n", content.strip()) cell.content += "\n" if not cell.content.endswith("\n") else "" else: @@ -604,6 +624,7 @@ def parse_pandoc_table_with_spans(pandoc_table): delimiter_positions.append(min(del_positions) if del_positions else -1) has_header = False header_delimiter_positions = [] + header_rows = [] for index in separator_indices: if _matchGridTableHeaderSeparator.match(lines[index]): has_header = True @@ -624,6 +645,18 @@ def parse_pandoc_table_with_spans(pandoc_table): del_positions = [lines[index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[index][delimiter_positions_start + 1:]] header_delimiter_positions.append(min(del_positions) if del_positions else -1) + if not has_header: + #Set default alignments from the first separator + parts = re.split(r"\+", lines[0].strip("+")) + default_alignments = [] + # Calculate default alignments and positions of delimiters + for part_index in range(len(parts)): + if parts[part_index].startswith(":") and not parts[part_index].endswith(":"): + default_alignments.append("align=\"left\"") + elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"): + default_alignments.append("align=\"right\"") + else: + default_alignments.append("align=\"center\"") data_rows = [] for row in range(len(separator_indices) - 1): rows = [] @@ -636,6 +669,10 @@ def parse_pandoc_table_with_spans(pandoc_table): for line in row_lines: if is_separator(line) and not in_data_row: in_data_row = True + # Add delimiter alignment check for separator lines + if not check_delimiter_alignment(line, delimiter_positions): + raise ValueError(f"Misaligned delimiters in separator row: {line}") + parts = re.split(r"\s*\+\s*", line.strip("+")) delimiter_index = 0 # Determine the alignment of the cell - In order to replicate Pandoc's behaviour (do not support of alignment colons on separator lines (just header separator) @@ -667,7 +704,11 @@ def parse_pandoc_table_with_spans(pandoc_table): elif in_data_row: # Regular data row or partial separator if _matchGridTableBodySeparator.match(line): # Partial separator - cells_content = re.split(r"[\|\+]", line.strip("|").strip("+")) # (?<!\\)[\|\+] + # Add delimiter alignment check for partial separators + if not check_delimiter_alignment(line, delimiter_positions): + raise ValueError(f"Misaligned delimiters in partial separator: {line}") + + cells_content = re.split(r"[\|\+]", line.strip("|").strip("+")) #Add another row, set delimiters for each cell rows.append(Row(number_of_columns)) aux_delimiter_index = 0 @@ -717,7 +758,13 @@ def parse_pandoc_table_with_spans(pandoc_table): else: raise ValueError("More cells than columns found") else: # Data row - cells_content = re.split(r"\s*\|\s*", line.strip("|")) + cells_content = line.strip() + cells_content = re.split(r"\|", line.strip("|")) + + # Add delimiter alignment check + if not check_delimiter_alignment(line, delimiter_positions): + raise ValueError(f"Misaligned delimiters in row: {line}") + column_index = 0 if len(cells_content) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined for i in range(len(cells_content)): @@ -744,6 +791,10 @@ def parse_pandoc_table_with_spans(pandoc_table): elif has_header and start < header_separator_index: # table_row and auxiliar_row are part of header_rows for header_row in rows: header_rows.append(header_row.cells) + else: + #only body + for body_row in rows: + data_rows.append(body_row.cells) #print(header_rows) #print(data_rows) @@ -821,19 +872,32 @@ def parse_pandoc_table_with_spans(pandoc_table): return header_rows, data_rows -def generate_html_table_with_spans(pandoc_table): +def generate_html_table_with_spans(pandoc_table: str) -> str: """ Generate an HTML table from a Pandoc-style grid table with row and column spans. - :param pandoc_table: String of the Pandoc-style grid table. - :return: HTML string. + Args: + pandoc_table (str): String of the Pandoc-style grid table. + + Returns: + str: Generated HTML table markup, or error message if generation fails. """ + debug_output = [] + def debug_print(msg): + debug_output.append(str(msg)) # Convert message to string + try: + # Redirect print statements to our debug collector + global print + original_print = print + print = debug_print + grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table) - except: - logging.ERROR("Grid table could not be generated") - return "HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS" - else: + + # Restore original print + print = original_print + + # Generate table HTML... html = "<table>\n" has_header = False @@ -851,7 +915,7 @@ def generate_html_table_with_spans(pandoc_table): else: # Prepare content, in case there's a list #print(cell.content) - if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", + if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+)@", cell.content): # Update cell in new row #print("MATCHING") list = "<ul>" @@ -859,7 +923,7 @@ def generate_html_table_with_spans(pandoc_table): for match in matches: list += "<li>" + match[1] + "</li>" list += "</ul>" - cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell.content) + cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+(?:(?!@).)+@)+", list, cell.content) # Enforce left alignment if cell contains a list cell.alignment = "align=\"left\"" #else: @@ -880,7 +944,7 @@ def generate_html_table_with_spans(pandoc_table): else: #Prepare content, in case there's a list #print(cell.content) - if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell.content): # Update cell in new row + if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+)@", cell.content): # Update cell in new row #print("MATCHING") #print(cell.content) list = "<ul>" @@ -888,7 +952,7 @@ def generate_html_table_with_spans(pandoc_table): for match in matches: list += "<li>" + match[1] + "</li>" list += "</ul>" - cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell.content) + cell.content = re.sub(r"\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+@)+",list, cell.content) # Enforce left alignment if cell contains a list cell.alignment = "align=\"left\"" #else: @@ -901,6 +965,52 @@ def generate_html_table_with_spans(pandoc_table): html += " </tbody>\n" html += "</table>" return html + except Exception as e: + logging.error("Grid table could not be generated") + debug_text = "<br>".join(debug_output) # Now all items are strings + return f"HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE.<br><pre>{debug_text}</pre>" + +def check_delimiter_alignment(line: str, delimiter_positions: list[int], delimiters: str = "|+") -> bool: + """ + Check if delimiters in a row align with expected positions. + + Args: + line: The line of text to check + delimiter_positions: List of expected positions (based on + characters) + delimiters: String containing valid delimiter characters (default: "|+") + + Returns: + bool: True if delimiters align correctly, False otherwise + """ + if not line or not delimiter_positions: + return False + + print(f"\nChecking line: '{line}'") + print(f"Expected delimiter positions: {delimiter_positions}") + + # For full separator lines (only +) + if '+' in line and '|' not in line: + current_positions = [i for i, char in enumerate(line) if (char == '+' and i != 0)] + print(f"Full separator line - Found + at positions: {current_positions}") + return all(delimiter_positions[-1] in current_positions and + line.startswith("+") and + pos in delimiter_positions for pos in current_positions) + + # For data lines (only |) + if '|' in line and '+' not in line: + current_positions = [i for i, char in enumerate(line) if (char == '|' and i != 0)] + print(f"Data line - Found | at positions: {current_positions}") + return all(delimiter_positions[-1] in current_positions and + line.startswith("|") and + pos in delimiter_positions for pos in current_positions) + + # For partial separators (mix of + and |) + current_positions = [i for i, char in enumerate(line) if (char in delimiters and i != 0)] + print(f"Partial separator - Found delimiters at positions: {current_positions}") + print(f"Characters at those positions: {[line[pos] for pos in current_positions]}") + return all(delimiter_positions[-1] in current_positions and + (line.startswith("+") or line.startswith("|")) and + pos in delimiter_positions for pos in current_positions) def analyseMarkdown(filename:str) -> Document: """ Analyse the markdown file and split it into clauses. -- GitLab