From d59cfbc2f99f58e9a1fb7f9ce7c4f7df5eeb040e Mon Sep 17 00:00:00 2001 From: Miguel Angel Reina Ortega <miguelangel.reinaortega@etsi.org> Date: Tue, 26 Nov 2024 14:11:30 +0100 Subject: [PATCH] Some cleanup + parsing converting lists in cells to html lists --- toMkdocs/toMkdocs.py | 266 +++++++++++++++++++++++++++++++++---------- 1 file changed, 204 insertions(+), 62 deletions(-) diff --git a/toMkdocs/toMkdocs.py b/toMkdocs/toMkdocs.py index 3908718..69427be 100644 --- a/toMkdocs/toMkdocs.py +++ b/toMkdocs/toMkdocs.py @@ -417,12 +417,13 @@ _matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE) _matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE) _matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE) _matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECASE) -_matchTable = re.compile(r'^\s*\|.*\|\s$', re.IGNORECASE) +_matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE) _matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE) _matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE) -_matchGridTableBodySeparator = re.compile(r'.*\+([-:]+\+)+.*$', re.IGNORECASE) +_matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE) _matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE) _match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE) +_matchListInContent = re.compile(r'^(?:\s*(P<marker>[-*+]|\s*\d+\.))\s+(P<content>.+)$', re.IGNORECASE) _markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE) _htmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE) _htmlAnchorLink = re.compile(r'<a\s+name="([^"]*)">[^<]*</a>', re.IGNORECASE) @@ -466,6 +467,42 @@ def parse_pandoc_table_with_spans(pandoc_table): _matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE) return _matchGridTableSeparator.match(line) + def handling_content(cell, content, list_flag): + if cell['content'] is None: + cell['rowspan'] += 1 + cell['colspan'] += 1 + if content.strip().startswith("- "): # List + list_flag = True + print(content) + cell['content'] = content.strip() + "\n" # Add newline to know when the list element ends + elif list_flag: # any other content when handling list is concatenated to the last list element + cell['content'] += content.strip() + "\n" + elif cells[i].strip() == "": # separation between list and other paragraph + list_flag = False + cell['content'] = re.sub(r'\\\s*$', "\n", content) + else: + cell['content'] = re.sub(r'\\\s*$', "\n", content.strip()) + else: + if content.strip().startswith("- "): # List + if not list_flag: + cell['content'] += "\n" + #cell['content'] = cell['content'].strip("\n") + list_flag = True + cell['content'] += content.strip() + "\n" # Add newline to know when the list element ends + elif list_flag: # any other content when handling list is concatenated to the last list element + cell['content'] = cell['content'].strip("\n") + cell['content'] += " " + content.strip() + "\n" + elif cells[i].strip() == "": # separation between list and other paragraph + list_flag = False + #content = re.sub(r'\\\s*$', "\n", content.strip()) + cell['content'] += "\n" if not cell['content'].endswith("\n") else "" + else: + content = re.sub(r'\\\s*$', "\n", content.strip()) + cell['content'] += " " + content + #print(cell['content']) + return list_flag, cell + + _matchGridTableSeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE) separator_indices = [i for i, line in enumerate(lines) if is_separator(line)] @@ -490,11 +527,13 @@ def parse_pandoc_table_with_spans(pandoc_table): has_header = True header_separator_index = index header_rows = [] + data_rows = [] for row in range(len(separator_indices) - 1): table_row = [] auxiliar_row = [] use_auxiliar_row = [] + list_flags = [] has_merged_cells = False in_data_row = False start, end = separator_indices[row], separator_indices[row + 1] @@ -508,24 +547,38 @@ def parse_pandoc_table_with_spans(pandoc_table): parts = re.split(r"\s*\+\s*", line.strip("+")) # Add as many cells as columns with span attributes delimiter_index = 0 + # Determine the alignment of the cell - In order to replicate Pandoc's behaviour (do not support of alignment colons on separator lines (just header separator) + # we need to assign the default alignment as defined in the header separator line + # We may not need the code below, as that supports alignment per cell and row + alignments = [] + for part_index in range(len(parts)): + if parts[part_index].startswith(":") and not parts[part_index].endswith(":"): + alignments.append("align=\"left\"") + elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"): + alignments.append("align=\"right\"") + else: + alignments.append("align=\"center\"") for i in range(number_of_columns_row): delimiter_index += len(parts[i]) + 1 table_row.append({ - "content": "NOCONTENT", + "content": None, "rowspan": 0, "colspan": 0, "colspan_adjusted": False, + "alignment": alignments[i] if alignments[i] else "align=\"center\"", "position": delimiter_index # Position of cell delimiter + }) for i in range(number_of_columns): auxiliar_row.append({ - "content": "NOCONTENT", + "content": None, "rowspan": 0, "colspan": 0, "colspan_adjusted": False, + "alignment": "align=\"center\"", "position": 0 }) use_auxiliar_row.append(False) + list_flags.append(False) elif in_data_row: # Regular data row or partial separator @@ -536,14 +589,35 @@ def parse_pandoc_table_with_spans(pandoc_table): for i in range(len(cells)): if _matchGridTableSeparatorLine.match(cells[i]): # A new row is to be added use_auxiliar_row[i] = True - else: - if table_row[i]['content'] == "NOCONTENT": - table_row[i]['rowspan'] += 1 - table_row[i]['colspan'] += 1 - table_row[i]['content'] = cells[i] + if cells[i].startswith(":") and not cells[i].endswith(":"): + auxiliar_row[i]['alignment'] = "align=\"left\"" + elif not cells[i].startswith(":") and cells[i].endswith(":"): + auxiliar_row[i]['alignment'] = "align=\"right\"" else: - table_row[i]['content'] += cells[i] - # Cell which is not separator + auxiliar_row[i]['alignment'] = "align=\"center\"" + else: + list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i]) + #if table_row[i]['content'] is None: + # table_row[i]['rowspan'] += 1 + # table_row[i]['colspan'] += 1 + #if cells[i].strip().startswith("- "): # List + # handling_list = True + # print(cells[i]) + # table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends + #elif handling_list: # any other content when handling list is concatenated to the last list element + # table_row[i]['content'].strip("\n") + # table_row[i]['content'] += cells[i] + "\n" + #elif cells[i].strip(): #separation between list and other paragraph + # handling_list = False + # table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) + #else: + # table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) + #else: + # if cells[i].strip().startswith("- "): # List + # print(cells[i]) + # table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends + # else: + # table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) # Cell which is not separator table_row[i]['rowspan'] += 1 if not table_row[i]['colspan_adjusted']: table_row[i]['colspan_adjusted'] = True @@ -561,13 +635,30 @@ def parse_pandoc_table_with_spans(pandoc_table): for i in range(len(cells)): if _matchGridTableSeparatorLine.match(cells[i]): # Update cell in new row use_auxiliar_row[i] = True - else: - if table_row[i]['content'] == "NOCONTENT": - table_row[i]['rowspan'] += 1 - table_row[i]['colspan'] += 1 - table_row[i]['content'] = cells[i] + if cells[i].startswith(":") and not cells[i].endswith(":"): + auxiliar_row[i]['alignment'] = "align=\"left\"" + elif not cells[i].startswith(":") and cells[i].endswith(":"): + auxiliar_row[i]['alignment'] = "align=\"right\"" else: - table_row[i]['content'] += cells[i] + auxiliar_row[i]['alignment'] = "align=\"center\"" + else: + list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i]) + +# if table_row[i]['content'] is None: +# table_row[i]['rowspan'] += 1 +# table_row[i]['colspan'] += 1 +# if cells[i].strip().startswith("- "): # List +# print(cells[i]) +# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends +# else: +# table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i]) +# else: +# if cells[i].strip().startswith("- "): # List +# print(cells[i]) +# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends +# else: +# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) + # Cell which is not separator table_row[i]['rowspan'] += 1 # Not needed, no colspan as number of cells is equal to number of columns @@ -588,40 +679,62 @@ def parse_pandoc_table_with_spans(pandoc_table): cells = re.split(r"\s*\|\s*", line.strip("|")) if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined for i in range(len(cells)): - if table_row[i]['content'] == "NOCONTENT": - table_row[i]['rowspan'] += 1 - table_row[i]['colspan'] += 1 - table_row[i]['content'] = cells[i] - else: - table_row[i]['content'] += cells[i] + list_flags[i], table_row[i] = handling_content(table_row[i], cells[i], list_flags[i]) +# if table_row[i]['content'] is None: +# table_row[i]['rowspan'] += 1 +# table_row[i]['colspan'] += 1 +# if cells[i].strip().startswith("- "): # List +# print(cells[i]) +# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends +# else: +# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) +# else: +# if cells[i].strip().startswith("- "): # List +# print(cells[i]) +# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends +# else: +# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) if not table_row[i]['colspan_adjusted']: table_row[i]['colspan_adjusted'] = True for j in range(i, len(cells)): delimiter_start = table_row[j-1]['position'] if j != 0 else 0 if line.find("|", delimiter_start+1) > delimiter_positions[j]: # Colspan to be increased table_row[i]['colspan'] += 1 + if line.find("|", delimiter_start + 1) == delimiter_positions[len(delimiter_positions) - 1]: # last cell in row, adjust colspan to get max number columns + colspan_remaining = 0 + for cell_index in range(number_of_columns_row): + colspan_remaining += table_row[cell_index]['colspan'] + table_row[i]['colspan'] += number_of_columns - colspan_remaining elif line.find("|", delimiter_start+1) < delimiter_positions[j]: raise ValueError("Wrong cell formatting") else: - break elif len(cells) == number_of_columns: # Simple row for i in range(len(cells)): if use_auxiliar_row[i]: - if auxiliar_row[i]['content'] == "NOCONTENT": + if auxiliar_row[i]['content'] is None: auxiliar_row[i]['rowspan'] += 1 auxiliar_row[i]['colspan'] += 1 - auxiliar_row[i]['content'] = cells[i] + auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) else: - auxiliar_row[i]['content'] += cells[i] + auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) else: - if table_row[i]['content'] == "NOCONTENT": - table_row[i]['rowspan'] += 1 - table_row[i]['colspan'] += 1 - table_row[i]['content'] = cells[i] - else: - table_row[i]['content'] += cells[i] + list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i]) +# if table_row[i]['content'] is None: +# table_row[i]['rowspan'] += 1 +# table_row[i]['colspan'] += 1 +# if cells[i].strip().startswith("- "): # List +# print(cells[i]) +# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends +# else: +# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) +# else: +# if cells[i].strip().startswith("- "): # List +# print(cells[i]) +# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends +# else: +# table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i]) else: raise ValueError("More cells than columns found") else: @@ -638,39 +751,47 @@ def parse_pandoc_table_with_spans(pandoc_table): #print(header_rows) #print(data_rows) - # Correct newlines characters - for row in header_rows: - for cell in row: - cell['content'] = cell['content'].replace("\\", "<br>") - for row in data_rows: - for cell in row: - cell['content'] = cell['content'].replace("\\", "<br>") # Check if there are any data rows if not data_rows and not header_rows: raise ValueError("No valid rows found in the provided Pandoc table.") # Format text - bold = "<strong>" + for rows in [header_rows, data_rows]: + bold = "<strong>" + italic = "<i>" + for row in rows: + for cell in row: + if cell['content'] is not None: + # Replacing "<" by < + cell['content'] = cell['content'].replace("<", "<") + + #Bold + for bold_characters in ["**", "__"]: + while cell['content'].find(bold_characters) != -1: + cell['content'] = cell['content'].replace(bold_characters, bold, 1) + if bold == "<strong>": + bold = "</strong>" + else: + bold = "<strong>" + #Italic + while cell['content'].find("_") != -1 and cell['content'].find("\_") == -1: + cell['content'] = cell['content'].rstrip() .replace("_", italic, 1) + if italic == "<i>": + italic = "</i>" + else: + italic = "<i>" + while cell['content'].find("\_") != -1: + cell['content'] = cell['content'].rstrip().replace("\_", "_", 1) + + # Correct newlines characters for row in header_rows: for cell in row: - while cell['content'].find("**") != -1: - cell['content'] = cell['content'].replace("**", bold, 1) - if bold == "<strong>": - bold = "</strong>" - else: - bold = "<strong>" - bold = "<strong>" + cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None for row in data_rows: for cell in row: - while cell['content'].find("**") != -1: - cell['content'] = cell['content'].replace("**", bold, 1) - if bold == "<strong>": - bold = "</strong>" - else: - bold = "<strong>" + cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None # Checking that the grid is correct Not too much tested - need to take into account rowspan of previous rows - forward_rowspan = [] for row_index in range(len(header_rows)): if len(forward_rowspan) == 0: @@ -701,12 +822,7 @@ def parse_pandoc_table_with_spans(pandoc_table): forward_rowspan[cell_index] = data_rows[row_index][cell_index]['rowspan'] - 1 if not sum == number_of_columns: raise ValueError("Grid table not converted properly") - #if has_header: - # table_with_spans = header_rows - - #table_with_spans += data_rows - #return table_with_spans return header_rows, data_rows def generate_html_table_with_spans(pandoc_table): @@ -733,9 +849,23 @@ def generate_html_table_with_spans(pandoc_table): if cell['rowspan'] == 0 or cell['colspan'] == 0: continue else: + # Prepare content, in case there's a list + print(cell['content']) + if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", + cell['content']): # Update cell in new row + #print("MATCHING") + list = "<ul>" + # Build list the matches + for match in matches: + list += "<li>" + match[1] + "</li>" + list += "</ul>" + cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell['content']) + #else: + # print("NOT MATCHING") + rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else "" colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else "" - html += f" <td{rowspan}{colspan}>{cell['content']}</td>\n" + html += f" <th{rowspan}{colspan} {cell['alignment']}>{cell['content']}</th>\n" html += " </tr>\n" html += " </thead>\n" @@ -746,9 +876,21 @@ def generate_html_table_with_spans(pandoc_table): if cell['rowspan'] == 0 or cell['colspan'] == 0: continue else: + #Prepare content, in case there's a list + #print(cell['content']) + if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell['content']): # Update cell in new row + #print("MATCHING") + list = "<ul>" + # Build list the matches + for match in matches: + list += "<li>" + match[1] + "</li>" + list += "</ul>" + cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell['content']) + #else: + #print("NOT MATCHING") rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else "" colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else "" - html += f" <td{rowspan}{colspan}>{cell['content']}</td>\n" + html += f" <td{rowspan}{colspan} {cell['alignment']}>{cell['content']}</td>\n" html += " </tr>\n" html += " </tbody>\n" -- GitLab