From 3bf002707c270a3b136320d6d3b1cb8352c93ccb Mon Sep 17 00:00:00 2001 From: krande Date: Thu, 18 Jan 2024 16:11:26 +0100 Subject: [PATCH 1/2] parse existing tables from markdown --- files/doc_regular_table/00-main/table.md | 14 +++++++++ files/doc_regular_table/metadata.yaml | 5 ++++ src/paradoc/cli_app.py | 2 +- src/paradoc/common.py | 38 +++++++++++++++++++++++- src/paradoc/document.py | 26 +++++++++------- tests/tables/test_tables.py | 7 +++++ 6 files changed, 79 insertions(+), 13 deletions(-) create mode 100644 files/doc_regular_table/00-main/table.md create mode 100644 files/doc_regular_table/metadata.yaml diff --git a/files/doc_regular_table/00-main/table.md b/files/doc_regular_table/00-main/table.md new file mode 100644 index 0000000..c595590 --- /dev/null +++ b/files/doc_regular_table/00-main/table.md @@ -0,0 +1,14 @@ +# A basic table + +Some text before the table + +| | cat A [unit] | cat 2 [unitB] | num ex [-] | +|:----------|-------------:|--------------:|-----------:| +| example1 | 4000 | 1.13 | 6 | +| example4 | 4000 | 2.15 | 6 | +| example9 | 4000 | 4.04 | 6 | +| example10 | 4500 | 2 | 6 | + +Table: A basic table {#tbl:a-basic-table} + +And some text after \ No newline at end of file diff --git a/files/doc_regular_table/metadata.yaml b/files/doc_regular_table/metadata.yaml new file mode 100644 index 0000000..4aa9cf6 --- /dev/null +++ b/files/doc_regular_table/metadata.yaml @@ -0,0 +1,5 @@ +lang: en-GB +linkReferences: true +nameInLink: true +figPrefix: "Figure" +tblPrefix: "Table" \ No newline at end of file diff --git a/src/paradoc/cli_app.py b/src/paradoc/cli_app.py index 3400d34..df224f2 100644 --- a/src/paradoc/cli_app.py +++ b/src/paradoc/cli_app.py @@ -11,7 +11,7 @@ def main( source_dir: str, report_name: str, auto_open: bool = False, - work_dir: str = None, + work_dir: str = "temp", export_format: ExportFormats = ExportFormats.DOCX, ): one = OneDoc(source_dir, work_dir=work_dir) diff --git a/src/paradoc/common.py b/src/paradoc/common.py index 23b216b..b1b1af7 100644 --- a/src/paradoc/common.py +++ b/src/paradoc/common.py @@ -41,6 +41,7 @@ class Table: add_link: bool = True md_instances: List[MarkDownFile] = field(default_factory=list) docx_instances: List[object] = field(default_factory=list) + link_name_override: str = None def __post_init__(self): if self.df is None: @@ -61,9 +62,40 @@ def to_markdown(self, include_name_in_cell=False, flags=None): return tbl_str tbl_str += f"\n\nTable: {self.caption}" if self.add_link: - tbl_str += f" {{#tbl:{self.name}}}" + if self.link_name_override is None: + link_name = self.name + else: + link_name = self.link_name_override + + tbl_str += f" {{#tbl:{link_name}}}" return tbl_str + @staticmethod + def from_markdown_str(table_str: str) -> Table: + """Parse a markdown table string and return a Table instance""" + lines = table_str.splitlines() + header = [x.strip() for x in lines[0].split("|")[1:-1]] + data = [] + table_caption_str = None + for line in lines[2:]: + if line == "": + continue + if line.strip().startswith("Table:"): + table_caption_str = line.strip() + break + data.append([x.strip() for x in line.split("|")[1:-1]]) + + caption = table_caption_str.split("Table:")[1].strip() + caption = caption.split('{')[0].strip() + # Create a pandas DataFrame using the extracted header and data rows + df = pd.DataFrame(data, columns=header) + name = str(df.values[0][0]) + tbl_ref = re.search(r"{#tbl:(.*?)}", table_str) + link_override = None + if tbl_ref is not None: + link_override = tbl_ref.group(1) + return Table(name=name, df=df, caption=caption, link_name_override=link_override) + @dataclass class Figure: @@ -110,6 +142,10 @@ def get_figures(self): regx = re.compile(r'(?P<caption>.*?)') yield from regx.finditer(self.read_original_file()) + def get_tables(self): + regx = re.compile(r'(\|.*?\nTable:.*?$)', re.MULTILINE | re.DOTALL) + yield from regx.finditer(self.read_original_file()) + class ExportFormats(str, Enum): DOCX = "docx" diff --git a/src/paradoc/document.py b/src/paradoc/document.py index 9fea5b7..678363a 100644 --- a/src/paradoc/document.py +++ b/src/paradoc/document.py @@ -61,16 +61,16 @@ class OneDoc: FORMATS = ExportFormats def __init__( - self, - source_dir=None, - main_prefix="00-main", - app_prefix="01-app", - clean_build_dir=True, - create_dirs=False, - output_dir=None, - work_dir="temp", - use_default_html_style=True, - **kwargs, + self, + source_dir=None, + main_prefix="00-main", + app_prefix="01-app", + clean_build_dir=True, + create_dirs=False, + output_dir=None, + work_dir="temp", + use_default_html_style=True, + **kwargs, ): self.source_dir = pathlib.Path().resolve().absolute() if source_dir is None else pathlib.Path(source_dir) self.work_dir = pathlib.Path(work_dir).resolve().absolute() @@ -138,7 +138,7 @@ def _setup(self, create_dirs, clean_build_dir): # Check if the figure is commented out # Get first newline right before regex search found start and till the end (capture entire line) start = fig.string[: fig.start()].rfind("\n") + 1 - end = fig.string[fig.start() :].find("\n") + fig.start() + end = fig.string[fig.start():].find("\n") + fig.start() line = fig.string[start:end] if line.startswith("[//]: #"): continue @@ -155,6 +155,10 @@ def _setup(self, create_dirs, clean_build_dir): ) self.figures[caption] = Figure(name, caption, ref, file_path, md_instance=md_file) + for re_table in md_file.get_tables(): + table = Table.from_markdown_str(re_table.group(1)) + self.tables[table.name] = table + if clean_build_dir is True: shutil.rmtree(self.build_dir, ignore_errors=True) diff --git a/tests/tables/test_tables.py b/tests/tables/test_tables.py index 1938118..e33d8d2 100644 --- a/tests/tables/test_tables.py +++ b/tests/tables/test_tables.py @@ -16,3 +16,10 @@ def test_table(files_dir, test_dir): one.add_table("my_table_5", df, "No Space 3") one.compile("TableDoc") + + +def test_regular_table(files_dir, test_dir): + report_dir = files_dir / "doc_regular_table" + one = OneDoc(report_dir, work_dir=test_dir / "doc_regular_table") + + one.compile("TableDoc", export_format="docx") From e9b986ef1a32b8d611b87272e3556c7b1e8ce9e9 Mon Sep 17 00:00:00 2001 From: krande Date: Thu, 18 Jan 2024 16:31:25 +0100 Subject: [PATCH 2/2] fix black formatting --- src/paradoc/common.py | 4 ++-- src/paradoc/document.py | 22 +++++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/paradoc/common.py b/src/paradoc/common.py index b1b1af7..6cdf142 100644 --- a/src/paradoc/common.py +++ b/src/paradoc/common.py @@ -86,7 +86,7 @@ def from_markdown_str(table_str: str) -> Table: data.append([x.strip() for x in line.split("|")[1:-1]]) caption = table_caption_str.split("Table:")[1].strip() - caption = caption.split('{')[0].strip() + caption = caption.split("{")[0].strip() # Create a pandas DataFrame using the extracted header and data rows df = pd.DataFrame(data, columns=header) name = str(df.values[0][0]) @@ -143,7 +143,7 @@ def get_figures(self): yield from regx.finditer(self.read_original_file()) def get_tables(self): - regx = re.compile(r'(\|.*?\nTable:.*?$)', re.MULTILINE | re.DOTALL) + regx = re.compile(r"(\|.*?\nTable:.*?$)", re.MULTILINE | re.DOTALL) yield from regx.finditer(self.read_original_file()) diff --git a/src/paradoc/document.py b/src/paradoc/document.py index 678363a..aa43fc5 100644 --- a/src/paradoc/document.py +++ b/src/paradoc/document.py @@ -61,16 +61,16 @@ class OneDoc: FORMATS = ExportFormats def __init__( - self, - source_dir=None, - main_prefix="00-main", - app_prefix="01-app", - clean_build_dir=True, - create_dirs=False, - output_dir=None, - work_dir="temp", - use_default_html_style=True, - **kwargs, + self, + source_dir=None, + main_prefix="00-main", + app_prefix="01-app", + clean_build_dir=True, + create_dirs=False, + output_dir=None, + work_dir="temp", + use_default_html_style=True, + **kwargs, ): self.source_dir = pathlib.Path().resolve().absolute() if source_dir is None else pathlib.Path(source_dir) self.work_dir = pathlib.Path(work_dir).resolve().absolute() @@ -138,7 +138,7 @@ def _setup(self, create_dirs, clean_build_dir): # Check if the figure is commented out # Get first newline right before regex search found start and till the end (capture entire line) start = fig.string[: fig.start()].rfind("\n") + 1 - end = fig.string[fig.start():].find("\n") + fig.start() + end = fig.string[fig.start() :].find("\n") + fig.start() line = fig.string[start:end] if line.startswith("[//]: #"): continue