diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1d0b5bc..8e52bd9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -55,7 +55,6 @@ jobs: activate-environment: anaconda-client-env python-version: ${{ matrix.pyver.distver }} environment-file: conda/environment.yml - - name: build, test and upload conda package if: github.event_name == 'push' && github.ref == 'refs/heads/main' env: @@ -63,7 +62,6 @@ jobs: shell: bash -l {0} run: | conda-build -c krande -c conda-forge conda --python=${{ matrix.pyver.distver }} --token=$ANACONDA_TOKEN --user krande --override-channels - - name: build and test conda package if: github.event_name == 'push' && github.ref != 'refs/heads/main' shell: bash -l {0} diff --git a/conda/environment.yml b/conda/environment.yml index b2c92ac..f7dedcb 100644 --- a/conda/environment.yml +++ b/conda/environment.yml @@ -1,5 +1,6 @@ name: anaconda-client-env channels: + - krande - conda-forge dependencies: - conda-build diff --git a/files/doc_math/00-main/00-intro.md b/files/doc_math/00-main/00-intro.md index ecd493e..3982310 100644 --- a/files/doc_math/00-main/00-intro.md +++ b/files/doc_math/00-main/00-intro.md @@ -1,8 +1,8 @@ # A List of functions -Here are two basic function @eq:my_equation and @eq:my_equation_2. +Here are two basic function @eq:my_equation_1 and @eq:my_equation_2. -{{__my_equation__}} +{{__my_equation_1__}} And here is a small edit of that function diff --git a/files/doc_math/00-main/00-results-tables.md b/files/doc_math/00-main/00-results-tables.md deleted file mode 100644 index 0cc9b3e..0000000 --- a/files/doc_math/00-main/00-results-tables.md +++ /dev/null @@ -1,21 +0,0 @@ -# Results in tabular form - -Here are some tables created by running the equations in for loops - -## Equation 1 - -The results from equation @eq:my_equation_1 is presented in table @tbl:results below. - - -{{__results__}} - - -some text after the table - -## Equation 2 - -The results from equation @eq:my_equation_2 is presented in table @tbl:results_2 below. - - -{{__results_2__}} - diff --git a/files/doc_math/metadata.yaml b/files/doc_math/metadata.yaml index 4aa9cf6..6e32f5c 100644 --- a/files/doc_math/metadata.yaml +++ b/files/doc_math/metadata.yaml @@ -1,5 +1,7 @@ lang: en-GB +date: \today +urlcolor: "black" linkReferences: true nameInLink: true figPrefix: "Figure" -tblPrefix: "Table" \ No newline at end of file +tblPrefix: "Table" diff --git a/files/doc_table/00-main/table.md b/files/doc_table/00-main/table.md index f3a9eb1..0e5d1f3 100644 --- a/files/doc_table/00-main/table.md +++ b/files/doc_table/00-main/table.md @@ -4,4 +4,6 @@ Some text before the table {{__my_table__}} +{{__my_table_3__}} + And some text after \ No newline at end of file diff --git a/files/doc_table/01-app/table.md b/files/doc_table/01-app/table.md index c70e7a1..b3b1710 100644 --- a/files/doc_table/01-app/table.md +++ b/files/doc_table/01-app/table.md @@ -4,4 +4,8 @@ Some text before the table {{__my_table_2__}} +{{__my_table_4__}} + +{{__my_table_5__}} + and some text after \ No newline at end of file diff --git a/setup.py b/setup.py index 1560f20..c83684a 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,3 @@ from setuptools import setup -setup(version="0.0.3") +setup(version="0.0.4") diff --git a/src/paradoc/common.py b/src/paradoc/common.py index b69b2f2..0fdd5f6 100644 --- a/src/paradoc/common.py +++ b/src/paradoc/common.py @@ -1,8 +1,9 @@ from __future__ import annotations import pathlib -from dataclasses import dataclass -from typing import Callable +import re +from dataclasses import dataclass, field +from typing import Callable, List, Union import pandas as pd @@ -15,6 +16,7 @@ class TableFormat: style: str = "Grid Table 1 Light" font_size: float = 11 font_style: str = "Arial" + float_fmt: Union[str, tuple] = None class TableFlags: @@ -27,19 +29,29 @@ class Table: df: pd.DataFrame caption: str format: TableFormat = TableFormat() - add_link: bool = False + add_link: bool = True + md_instances: List[MarkDownFile] = field(default_factory=list) + docx_instances: List[object] = field(default_factory=list) + + # def get_cell0(self): + # col_name = self.df.columns[0] + # df.iloc[0, df.columns.get_loc(col_name)] def to_markdown(self, include_name_in_cell=False, flags=None): df = self.df.copy() if include_name_in_cell: col_name = df.columns[0] df.iloc[0, df.columns.get_loc(col_name)] = self.name - tbl_str = df.to_markdown(index=False, tablefmt="grid") + + props = dict(index=False, tablefmt="grid") + if self.format.float_fmt is not None: + props["floatfmt"] = self.format.float_fmt + tbl_str = df.to_markdown(**props) if flags is not None and TableFlags.NO_CAPTION in flags: return tbl_str tbl_str += f"\n\nTable: {self.caption}" if self.add_link: - tbl_str += f"{{#tbl:{self.name}}}" + tbl_str += f" {{#tbl:{self.name}}}" return tbl_str @@ -48,12 +60,16 @@ class Equation: name: str func: Callable custom_eq_str_compiler: Callable = None + add_link: bool = True + include_python_code: bool = False + md_instances: List[MarkDownFile] = field(default_factory=list) + docx_instances: List[object] = field(default_factory=list) def to_latex(self, print_latex=False, print_formula=False, flags=None): if self.custom_eq_str_compiler is not None: return self.custom_eq_str_compiler(self.func) - from inspect import getsourcelines + from inspect import getsource, getsourcelines import pytexit @@ -68,8 +84,14 @@ def to_latex(self, print_latex=False, print_formula=False, flags=None): continue if dots >= 6 or dots == 0: eq_latex += pytexit.py2tex(line, print_latex=print_latex, print_formula=print_formula) + "\n" + eq_str = eq_latex + + if self.add_link: + eq_str += f"{{#eq:{self.name}}}" - return eq_latex + f"{{#eq:{self.name}}}" + if self.include_python_code: + eq_str = f"\n\n```python\n{getsource(self.func)}\n```\n\n" + eq_str + return eq_str @dataclass @@ -85,6 +107,20 @@ class MarkDownFile: new_file: pathlib.Path build_file: pathlib.Path + def read_original_file(self): + with open(self.path, "r") as f: + return f.read() + + def read_built_file(self): + """Read the Markdown file after performed variable substitution""" + with open(self.build_file, "r") as f: + return f.read() + + def get_variables(self): + md_doc_str = self.read_original_file() + key_re = re.compile("{{(.*)}}") + return key_re.finditer(md_doc_str) + class ExportFormats: DOCX = "docx" diff --git a/src/paradoc/document.py b/src/paradoc/document.py index 9fe8892..f67553b 100644 --- a/src/paradoc/document.py +++ b/src/paradoc/document.py @@ -7,7 +7,6 @@ from typing import Dict import pandas as pd -import pypandoc from .common import ( DocXFormat, @@ -17,7 +16,8 @@ Table, TableFormat, ) -from .utils import get_list_of_files, variable_sub +from .exceptions import LatexNotInstalled +from .utils import get_list_of_files class OneDoc: @@ -49,11 +49,11 @@ class OneDoc: "Body Text": "Normal Indent", "Compact": "Normal Indent", } + FORMATS = ExportFormats def __init__( self, source_dir=None, - export_format=ExportFormats.DOCX, main_prefix="00-main", app_prefix="01-app", clean_build_dir=True, @@ -64,7 +64,6 @@ def __init__( self.work_dir = kwargs.get("work_dir", pathlib.Path("").resolve().absolute()) self._main_prefix = main_prefix self._app_prefix = app_prefix - self.export_format = export_format self.variables = dict() self.tables: Dict[str, Table] = dict() self.equations: Dict[str, Equation] = dict() @@ -73,9 +72,9 @@ def __init__( # Style info: https://python-docx.readthedocs.io/en/latest/user/styles-using.html self.paragraph_style_map = kwargs.get("paragraph_style_map", OneDoc.default_paragraph_map) self.appendix_heading_map = kwargs.get("appendix_heading_map", OneDoc.default_app_map) - self.md_files_main = [] self.md_files_app = [] + self.metadata_file = None for md_file in get_list_of_files(self.source_dir, ".md"): is_appendix = True if app_prefix in md_file else False @@ -101,65 +100,106 @@ def __init__( if clean_build_dir is True: shutil.rmtree(self.build_dir, ignore_errors=True) - def compile(self, output_name, auto_open=False, metadata_file=None): - dest_file = (self.dist_dir / output_name).with_suffix(f".{self.export_format}").resolve().absolute() + def compile(self, output_name, auto_open=False, metadata_file=None, export_format=ExportFormats.DOCX, **kwargs): + dest_file = (self.dist_dir / output_name).with_suffix(f".{export_format}").resolve().absolute() logging.debug(f'Compiling report to "{dest_file}"') os.makedirs(self.build_dir, exist_ok=True) os.makedirs(self.dist_dir, exist_ok=True) - for mdf in self.md_files_main + self.md_files_app: - md_file = mdf.path - os.makedirs(mdf.new_file.parent, exist_ok=True) + self.metadata_file = self.source_dir / "metadata.yaml" if metadata_file is None else pathlib.Path(metadata_file) - # Substitute parameters/tables in the creation of the document - with open(md_file, "r") as f: - tmp_md_doc = f.read() - tmp_md_doc = variable_sub(tmp_md_doc, self.tables) - tmp_md_doc = variable_sub(tmp_md_doc, self.variables) - tmp_md_doc = variable_sub(tmp_md_doc, self.equations) + if self.metadata_file.exists() is False: + with open(self.metadata_file, "w") as f: + f.write('linkReferences: true\nnameInLink: true\nfigPrefix: "Figure"\ntblPrefix: "Table"') - with open(mdf.build_file, "w") as f: - f.write(tmp_md_doc) - - metadata_file = self.source_dir / "metadata.yaml" if metadata_file is None else metadata_file - if metadata_file.exists() is False: - with open(metadata_file, "w") as f: - f.write('linkReferences: true\nnameInLink: true\nfigPrefix: "Figure"\ntblPrefix: "Table"') - - pypandoc.convert_file( - str(mdf.build_file), - self.export_format, - outputfile=str(mdf.new_file), - format="markdown", - extra_args=[ - "-M2GB", - "+RTS", - "-K64m", - "-RTS", - f"--resource-path={md_file.parent}", - f"--metadata-file={metadata_file}" - # f"--reference-doc={MY_DOCX_TMPL}", - ], - filters=["pandoc-crossref"], - encoding="utf8", - ) - if self.export_format == ExportFormats.DOCX: + if export_format == ExportFormats.DOCX: from paradoc.io.word.exporter import WordExporter - wordx = WordExporter(self) - wordx.convert_to_docx(output_name, dest_file) + use_custom_compile = kwargs.get("use_custom_docx_compile", True) + if use_custom_compile is False: + use_table_name_in_cell_as_index = False + else: + use_table_name_in_cell_as_index = True + + self._perform_variable_substitution(use_table_name_in_cell_as_index) + + wordx = WordExporter(self, **kwargs) + wordx.export(output_name, dest_file) + elif export_format == ExportFormats.PDF: + from paradoc.io.pdf.exporter import PdfExporter + + latex_path = shutil.which("latex") + if latex_path is None: + latex_url = "https://www.latex-project.org/get/" + raise LatexNotInstalled( + "Latex was not installed on your system. " + f'Please install latex before exporting to pdf. See "{latex_url}" for installation packages' + ) + self._perform_variable_substitution(False) + pdf = PdfExporter(self) + pdf.export(dest_file) + else: + raise NotImplementedError(f'Export format "{export_format}" is not yet supported') if auto_open is True: os.startfile(dest_file) - def add_table(self, name, df: pd.DataFrame, caption: str, tbl_format: TableFormat = TableFormat()): + def add_table(self, name, df: pd.DataFrame, caption: str, tbl_format: TableFormat = TableFormat(), **kwargs): if '"' in caption: raise ValueError('Using characters such as " currently breaks the caption search in the docs compiler') - self.tables[name] = Table(name, df, caption, tbl_format) + self._uniqueness_check(name) + self.tables[name] = Table(name, df, caption, tbl_format, **kwargs) - def add_equation(self, name, eq, custom_eq_str_compiler=None): - self.equations[name] = Equation(name, eq, custom_eq_str_compiler=custom_eq_str_compiler) + def add_equation(self, name, eq, custom_eq_str_compiler=None, **kwargs): + self._uniqueness_check(name) + self.equations[name] = Equation(name, eq, custom_eq_str_compiler=custom_eq_str_compiler, **kwargs) + + def _perform_variable_substitution(self, use_table_var_substitution): + logging.info("Performing variable substitution") + for mdf in self.md_files_main + self.md_files_app: + md_file = mdf.path + os.makedirs(mdf.new_file.parent, exist_ok=True) + md_str = mdf.read_original_file() + for m in mdf.get_variables(): + res = m.group(1) + key = res.split("|")[0] if "|" in res else res + list_of_flags = res.split("|")[1:] if "|" in res else None + key_clean = key[2:-2] + + tbl = self.tables.get(key_clean, None) + eq = self.equations.get(key_clean, None) + variables = self.variables.get(key_clean, None) + + if tbl is not None: + tbl.md_instances.append(mdf) + new_str = tbl.to_markdown(use_table_var_substitution, list_of_flags) + elif eq is not None: + eq.md_instances.append(mdf) + new_str = eq.to_latex() + elif variables is not None: + new_str = str(variables) + else: + logging.error(f'key "{key_clean}" located in {md_file} has not been substituted') + new_str = m.group(0) + + md_str = md_str.replace(m.group(0), new_str) + + with open(mdf.build_file, "w") as f: + f.write(md_str) + + def _uniqueness_check(self, name): + error_msg = 'Table name "{name}" must be unique. This name is already used by {cont_type}="{container}"' + + tbl = self.tables.get(name, None) + if tbl is not None: + raise ValueError(error_msg.format(name=name, cont_type="Table", container=tbl)) + eq = self.equations.get(name, None) + if eq is not None: + raise ValueError(error_msg.format(name=name, cont_type="Equation", container=eq)) + v = self.variables.get(name, None) + if v is not None: + raise ValueError(error_msg.format(name=name, cont_type="Variable", container=v)) @property def main_dir(self): diff --git a/src/paradoc/exceptions.py b/src/paradoc/exceptions.py new file mode 100644 index 0000000..5cd6475 --- /dev/null +++ b/src/paradoc/exceptions.py @@ -0,0 +1,2 @@ +class LatexNotInstalled(Exception): + pass diff --git a/src/paradoc/io/pdf/__init__.py b/src/paradoc/io/pdf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/paradoc/io/pdf/exporter.py b/src/paradoc/io/pdf/exporter.py new file mode 100644 index 0000000..f4fa8a7 --- /dev/null +++ b/src/paradoc/io/pdf/exporter.py @@ -0,0 +1,34 @@ +import pypandoc + +from paradoc import OneDoc + + +class PdfExporter: + def __init__(self, one_doc: OneDoc): + self.one_doc = one_doc + + def export(self, dest_file): + one = self.one_doc + + md_main_str = "\n".join([md.read_built_file() for md in one.md_files_main]) + + app_str = """\n\n\\appendix\n\n""" + + md_app_str = "\n".join([md.read_built_file() for md in one.md_files_app]) + combined_str = md_main_str + app_str + md_app_str + pypandoc.convert_text( + combined_str, + one.FORMATS.PDF, + outputfile=str(dest_file), + format="markdown", + extra_args=[ + "-M2GB", + "+RTS", + "-K64m", + "-RTS", + f"--metadata-file={one.metadata_file}" + # f"--reference-doc={MY_DOCX_TMPL}", + ], + filters=["pandoc-crossref"], + encoding="utf8", + ) diff --git a/src/paradoc/io/word/common.py b/src/paradoc/io/word/common.py index 15b70e1..e424e67 100644 --- a/src/paradoc/io/word/common.py +++ b/src/paradoc/io/word/common.py @@ -1,7 +1,7 @@ import logging from dataclasses import dataclass -from docx.enum.table import WD_TABLE_ALIGNMENT +from docx.enum.table import WD_CELL_VERTICAL_ALIGNMENT, WD_TABLE_ALIGNMENT from docx.enum.text import WD_PARAGRAPH_ALIGNMENT from docx.shared import Pt from docx.table import Table as DocxTable @@ -20,23 +20,33 @@ class DocXTableRef: docx_caption: Paragraph = None docx_following_pg: Paragraph = None is_appendix = False + document_index: int = None def is_complete(self): docx_attr = [self.docx_caption, self.docx_table, self.docx_following_pg] return all([x is not None for x in docx_attr]) - def format_table(self, is_appendix): + def get_content_cell0_pg(self) -> Paragraph: + tbl = self.docx_table + return tbl.rows[1].cells[0].paragraphs[0] + + def format_table(self, is_appendix, should_restart_caption_numbering=False): tbl = self.docx_table tbl_format = self.table_ref.format # Format content of table tbl.style = tbl_format.style tbl.alignment = WD_TABLE_ALIGNMENT.CENTER + logging.info(f'Changed Table style from "{tbl.style}" to "{tbl_format.style}"') for i, row in enumerate(tbl.rows): for cell in row.cells: + # https://python-docx.readthedocs.io/en/latest/api/enum/WdCellVerticalAlignment.html + cell.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER paragraphs = cell.paragraphs for paragraph in paragraphs: + # assert isinstance(paragraph, Paragraph) + paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT for run in paragraph.runs: font = run.font font.name = tbl_format.font_style @@ -48,32 +58,50 @@ def format_table(self, is_appendix): tbl.autofit = True # Format table Caption - caption = self.docx_caption - caption.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER - - rebuild_caption(caption, self.table_ref.caption, is_appendix) + self.docx_caption.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER + rebuild_caption(self.docx_caption, self.table_ref.caption, is_appendix, should_restart_caption_numbering) - for run in caption.runs: + for run in self.docx_caption.runs: run.font.name = tbl_format.font_style + # Fix formatting before Table + self.docx_caption.paragraph_format.space_before = Pt(18) + # Fix formatting after Table + self.docx_following_pg.paragraph_format.space_before = Pt(22) + + def substitute_back_temp_var(self): + pg_0 = self.get_content_cell0_pg() - self.docx_following_pg.paragraph_format.space_before = Pt(12) - # follower_pg = self.docx_following_pg - # - # i = par_index(follower_pg) - # - # follower_pg.runs[0].text = "\n" + follower_pg.runs[0].text - # follower_pg.paragraph_format.space_before = None + df = self.table_ref.df + col_name = df.columns[0] + res = df.iloc[0, df.columns.get_loc(col_name)] + fmt = self.table_ref.format.float_fmt + use_decimals = True + if len(self.docx_table.rows) > 1: + row2 = self.docx_table.rows[2].cells[0].paragraphs[0] + if "." not in row2.text: + use_decimals = False + if fmt is None or use_decimals is False: + pg_0.text = f"{res}" + else: + pg_0.text = f"{res:{fmt}}" -def rebuild_caption(caption: Paragraph, caption_str, is_appendix): + +def rebuild_caption(caption: Paragraph, caption_str, is_appendix, should_restart=False): caption.clear() caption.runs.clear() run = caption.add_run() - heading_ref = "Appendix" if is_appendix is True else '"Heading 1"' + heading_ref = '"Appendix"' if is_appendix is True else '"Heading 1"' + + sub_heading_ref = "\\s" if should_restart is False else "\\r" + if is_appendix and should_restart: + sub_heading_ref += '"Appendix X.1"' + else: + sub_heading_ref += '"Heading 1"' seq1 = caption._element._new_r() seq1.text = "Table " @@ -86,7 +114,7 @@ def rebuild_caption(caption: Paragraph, caption_str, is_appendix): new_run.text = "-" run._element.addprevious(stroke) seq2 = caption._element._new_r() - add_seq_reference(seq2, "SEQ Table \\* ARABIC \\s 1", run._parent) + add_seq_reference(seq2, f"SEQ {sub_heading_ref} ARABIC", run._parent) run._element.addprevious(seq2) fin = caption._element._new_r() fin_run = Run(fin, run._parent) diff --git a/src/paradoc/io/word/exporter.py b/src/paradoc/io/word/exporter.py index a66f959..a318a38 100644 --- a/src/paradoc/io/word/exporter.py +++ b/src/paradoc/io/word/exporter.py @@ -1,16 +1,10 @@ from __future__ import annotations -import logging -import re -from typing import List, Union - -import numpy as np +import pypandoc from docx import Document from docx.table import Table as DocxTable -from docx.text.paragraph import Paragraph -from docxcompose.composer import Composer -from paradoc.common import MY_DOCX_TMPL, MY_DOCX_TMPL_BLANK, MarkDownFile, Table +from paradoc.common import MY_DOCX_TMPL, MY_DOCX_TMPL_BLANK, ExportFormats from paradoc.document import OneDoc from .common import DocXTableRef @@ -19,37 +13,66 @@ format_image_captions, format_paragraphs_and_headings, ) -from .utils import close_word_docs_by_name, docx_update, iter_block_items +from .utils import ( + add_to_composer, + close_word_docs_by_name, + docx_update, + get_from_doc_by_index, + iter_block_items, +) class WordExporter: - def __init__(self, one_doc: OneDoc): + def __init__(self, one_doc: OneDoc, **kwargs): self.one_doc = one_doc + self.use_custom_docx_compile = kwargs.get("use_custom_docx_compile", True) - def convert_to_docx(self, output_name, dest_file): - one_doc = self.one_doc - - composer_main = add_to_composer(MY_DOCX_TMPL, one_doc.md_files_main) - composer_app = add_to_composer(MY_DOCX_TMPL_BLANK, one_doc.md_files_app) - - for tbl in self.identify_tables(composer_main.doc): + def export(self, output_name, dest_file): + if self.use_custom_docx_compile: + self._compile_individual_md_files_to_docx(output_name, dest_file) + else: + self._compile_docx_from_str(dest_file) - tbl.format_table(is_appendix=False) - - for tbl in self.identify_tables(composer_app.doc): - tbl.format_table(is_appendix=True) + def _compile_individual_md_files_to_docx(self, output_name, dest_file): + one = self.one_doc + for mdf in one.md_files_main + one.md_files_app: + md_file = mdf.path + pypandoc.convert_file( + str(mdf.build_file), + ExportFormats.DOCX, + outputfile=str(mdf.new_file), + format="markdown", + extra_args=[ + "-M2GB", + "+RTS", + "-K64m", + "-RTS", + f"--resource-path={md_file.parent}", + f"--metadata-file={one.metadata_file}" + # f"--reference-doc={MY_DOCX_TMPL}", + ], + filters=["pandoc-crossref"], + encoding="utf8", + ) + + composer_main = add_to_composer(MY_DOCX_TMPL, one.md_files_main) + composer_app = add_to_composer(MY_DOCX_TMPL_BLANK, one.md_files_app) + + # self.format_tables(composer_main, composer_app) + self.format_tables(composer_main.doc, False) + self.format_tables(composer_app.doc, True) format_image_captions(composer_main.doc, False) format_image_captions(composer_app.doc, True) - format_paragraphs_and_headings(composer_app.doc, one_doc.appendix_heading_map) + format_paragraphs_and_headings(composer_app.doc, one.appendix_heading_map) # Merge docs composer_main.doc.add_page_break() composer_main.append(composer_app.doc) # Format all paragraphs - format_paragraphs_and_headings(composer_main.doc, one_doc.paragraph_style_map) + format_paragraphs_and_headings(composer_main.doc, one.paragraph_style_map) # Apply last minute fixes fix_headers_after_compose(composer_main.doc) @@ -62,94 +85,57 @@ def convert_to_docx(self, output_name, dest_file): docx_update(str(dest_file)) - def identify_tables(self, doc: Document): - prev_table = False + def format_tables(self, composer_doc: Document, is_appendix): + for i, docx_tbl in enumerate(self.get_all_tables(composer_doc)): + cell0 = docx_tbl.get_content_cell0_pg() + tbl_name = cell0.text + tbl = self.one_doc.tables.get(tbl_name, None) + if tbl is None: + raise ValueError("Unable to retrieve originally parsed table") + + docx_tbl.table_ref = tbl + docx_tbl.substitute_back_temp_var() + if is_appendix and i == 0: + restart_caption_num = True + else: + restart_caption_num = False + docx_tbl.format_table(is_appendix, should_restart_caption_numbering=restart_caption_num) + + def get_all_tables(self, doc: Document): tables = [] - current_table = DocXTableRef() - for block in iter_block_items(doc): - if type(block) == DocxTable: - current_table.docx_table = block - prev_table = True - continue - - if block.style.name == "Table Caption": - if "using solid elements" in block.text: - print("sd") - current_table.docx_caption = block - - if type(block) == Paragraph and prev_table is True: - prev_table = False - current_table.docx_following_pg = block - - if current_table.is_complete(): - source_table = self.get_related_table(current_table) - if source_table is not None: - current_table.table_ref = source_table - tables.append(current_table) - else: - logging.error(f'Unable to find table with caption "{current_table.docx_caption}"') + + for i, block in enumerate(iter_block_items(doc)): + if type(block) is DocxTable: current_table = DocXTableRef() + current_table.docx_table = block + current_table.docx_caption = get_from_doc_by_index(i - 1, doc) + current_table.docx_following_pg = get_from_doc_by_index(i + 1, doc) + current_table.document_index = i + tables.append(current_table) return tables - def get_related_table(self, current_table: DocXTableRef, frac=1e-4) -> Union[Table, None]: + def _compile_docx_from_str(self, dest_file): one = self.one_doc - - # Search using Caption string - caption = current_table.docx_caption - re_cap = re.compile(r"Table\s*[0-9]{0,9}:(.*)") - for key, tbl in one.tables.items(): - if "Table" in caption.text: - m = re_cap.search(caption.text) - if m is None: - raise ValueError() - caption_text = str(m.group(1).strip()) - else: - caption_text = str(caption.text) - caption_text = caption_text.replace("”", '"') - if tbl.caption == caption_text: - return tbl - - # If no match using caption string, then use contents of table - content = get_first_row_from_table(current_table.docx_table) - is_content_numeric = False - - try: - content_numeric = np.array(content, dtype=float) - is_content_numeric = True - except ValueError: - content_numeric = None - - for key, tbl in one.tables.items(): - row_1 = tbl.df.iloc[0].values - if is_content_numeric and len(content) == len(row_1): - tot = sum(row_1) - diff = sum(row_1 - content_numeric) - if abs(diff) < abs(tot) * frac: - return tbl - print("") - return None - - -def add_to_composer(source_doc, md_files: List[MarkDownFile]) -> Composer: - composer_doc = Composer(Document(source_doc)) - if source_doc == MY_DOCX_TMPL: - composer_doc.doc.add_page_break() - for i, md in enumerate(md_files): - doc_in = Document(str(md.new_file)) - doc_in.add_page_break() - composer_doc.append(doc_in) - logging.info(f"Added {md.new_file}") - return composer_doc - - -def get_first_row_from_table(docx_table: DocxTable, num_row=1): - content = [] - for i, row in enumerate(docx_table.rows): - if i == 0: - continue - for cell in row.cells: - paragraphs = cell.paragraphs - for paragraph in paragraphs: - content.append(paragraph.text.strip()) - return content + md_main_str = "\n".join([md.read_built_file() for md in one.md_files_main]) + + app_str = """\n\n\\appendix\n\n""" + + md_app_str = "\n".join([md.read_built_file() for md in one.md_files_app]) + combined_str = md_main_str + app_str + md_app_str + pypandoc.convert_text( + combined_str, + one.FORMATS.DOCX, + outputfile=str(dest_file), + format="markdown", + extra_args=[ + "-M2GB", + "+RTS", + "-K64m", + "-RTS", + f"--metadata-file={one.metadata_file}" + # f"--reference-doc={MY_DOCX_TMPL}", + ], + filters=["pandoc-crossref"], + encoding="utf8", + ) diff --git a/src/paradoc/io/word/utils.py b/src/paradoc/io/word/utils.py index 3493a78..99dfc29 100644 --- a/src/paradoc/io/word/utils.py +++ b/src/paradoc/io/word/utils.py @@ -2,9 +2,17 @@ import os import pathlib import traceback +from typing import List import pypandoc - +from docx.document import Document as ProxyDocument +from docx.oxml.table import CT_Tbl +from docx.oxml.text.paragraph import CT_P +from docx.table import Table, _Cell +from docx.text.paragraph import Paragraph +from docxcompose.composer import Composer + +from paradoc.common import MY_DOCX_TMPL, MarkDownFile from paradoc.utils import get_list_of_files @@ -41,11 +49,6 @@ def open_word_win32(): def docx_update(docx_file): - """ - - :param docx_file: - :return: - """ word = open_word_win32() if word is None: return @@ -65,14 +68,7 @@ def docx_update(docx_file): word.Quit() -def close_word_docs_by_name(names): - """ - - :param names: List of word document basenames (basenames e.g. "something.docx"). - :type names: list - :return: - """ - +def close_word_docs_by_name(names: list) -> None: word = open_word_win32() if word is None: return @@ -97,10 +93,6 @@ def iter_block_items(parent): also works for a _Cell object, which itself can contain paragraphs and tables. """ from docx.document import Document - from docx.oxml.table import CT_Tbl - from docx.oxml.text.paragraph import CT_P - from docx.table import Table, _Cell - from docx.text.paragraph import Paragraph if isinstance(parent, Document): parent_elm = parent.element.body @@ -115,7 +107,7 @@ def iter_block_items(parent): elif isinstance(child, CT_Tbl): yield Table(child, parent) else: - logging.error(f"Unrecognized child element type {type(child)}") + logging.debug(f"Unrecognized child element type {type(child)}") def convert_markdown_dir_to_docx(source, dest, dest_format, extra_args, style_doc=None): @@ -169,3 +161,23 @@ def convert_markdown_dir_to_docx(source, dest, dest_format, extra_args, style_do # logging.info(f"Added {files[i]}") composer.save(str(dest)) + + +def get_from_doc_by_index(index: int, doc: ProxyDocument): + for i, block in enumerate(iter_block_items(doc)): + if i == index: + return block + + +def add_to_composer(source_doc, md_files: List[MarkDownFile]) -> Composer: + from docx import Document + + composer_doc = Composer(Document(str(source_doc))) + if source_doc == MY_DOCX_TMPL: + composer_doc.doc.add_page_break() + for i, md in enumerate(md_files): + doc_in = Document(str(md.new_file)) + doc_in.add_page_break() + composer_doc.append(doc_in) + logging.info(f"Added {md.new_file}") + return composer_doc diff --git a/src/paradoc/utils.py b/src/paradoc/utils.py index 77410c2..fc3b0fb 100644 --- a/src/paradoc/utils.py +++ b/src/paradoc/utils.py @@ -5,6 +5,8 @@ import pypandoc +from .common import Equation, MarkDownFile, Table + def func_to_eq(func): """ @@ -122,23 +124,7 @@ def basic_equation_compiler(f, print_latex=False, print_formula=False): return eq_latex -def variable_sub(md_doc_str, variable_dict): - from .common import Equation, Table - - def sub_table(tbl: Table, flags) -> str: - return tbl.to_markdown(False, flags=flags) - - def sub_equation(eq: Equation, flags) -> str: - return eq.to_latex(flags=flags) - - def convert_variable(value, flags) -> str: - if type(value) is Table: - value_str = sub_table(value, flags) - elif type(value) is Equation: - value_str = sub_equation(value, flags) - else: - value_str = str(value) - return value_str +def variable_sub(md_doc_str, variable_dict, md_file: MarkDownFile): key_re = re.compile("{{(.*)}}") for m in key_re.finditer(md_doc_str): @@ -165,3 +151,21 @@ def make_df(inputs, header, func): df.columns = df.iloc[0] df = df.drop(df.index[0]) return df + + +def sub_table(tbl: Table, flags) -> str: + return tbl.to_markdown(False, flags=flags) + + +def sub_equation(eq: Equation, flags) -> str: + return eq.to_latex(flags=flags) + + +def convert_variable(value, flags) -> str: + if type(value) is Table: + value_str = sub_table(value, flags) + elif type(value) is Equation: + value_str = sub_equation(value, flags) + else: + value_str = str(value) + return value_str diff --git a/tests/test_doc_math.py b/tests/test_doc_math.py index 25d7916..d40b6ee 100644 --- a/tests/test_doc_math.py +++ b/tests/test_doc_math.py @@ -4,11 +4,12 @@ from ex_funcs import my_calc_example_1, my_calc_example_2 from paradoc import OneDoc +from paradoc.exceptions import LatexNotInstalled from paradoc.utils import make_df class MathDocTests(unittest.TestCase): - def test_math_doc(self): + def setUp(self) -> None: report_dir = files_dir / "doc_math" inputs = [(0, 0), (1, 1), (2, 1), (2, 2)] @@ -17,13 +18,22 @@ def test_math_doc(self): one = OneDoc(report_dir, work_dir=test_dir / "doc_math") - one.add_equation("my_equation", my_calc_example_1) + one.add_equation("my_equation_1", my_calc_example_1, include_python_code=True) one.add_equation("my_equation_2", my_calc_example_2) one.add_table("results", df1, "Results from Equation my_equation") one.add_table("results_2", df2, "Results from Equation my_equation_2") - one.compile("MathDoc") + self.one = one + + def test_math_docx(self): + self.one.compile("MathDoc", export_format=OneDoc.FORMATS.DOCX) + + def test_math_pdf_latex_installation(self): + try: + self.one.compile("MathDoc", export_format=OneDoc.FORMATS.PDF) + except LatexNotInstalled as e: + print(e) if __name__ == "__main__": diff --git a/tests/test_tables.py b/tests/test_tables.py index 77437b9..0cb1277 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -15,6 +15,9 @@ def test_table(self): one.add_table("my_table", df, "A basic table") one.add_table("my_table_2", df, "A slightly smaller table", TableFormat(font_size=8)) + one.add_table("my_table_3", df, "No Space 1") + one.add_table("my_table_4", df, "No Space 2") + one.add_table("my_table_5", df, "No Space 3") one.compile("TableDoc")