From e409b5d5f198800edb8c8b52d13cce04cc7a4e2d Mon Sep 17 00:00:00 2001
From: fonhorst
Date: Mon, 25 Dec 2023 11:44:13 +0300
Subject: [PATCH] add comfortable debug script for measuring execution time of
 cooc dict building

---
 examples/preparation_pipeline.py | 62 +++++++++++++++++++++++++-------
 1 file changed, 50 insertions(+), 12 deletions(-)

diff --git a/examples/preparation_pipeline.py b/examples/preparation_pipeline.py
index 9168c88..cbd3f88 100644
--- a/examples/preparation_pipeline.py
+++ b/examples/preparation_pipeline.py
@@ -1,6 +1,8 @@
 import logging
+import os.path
 
-from autotm.preprocessing.dictionaries_preparation import prepare_all_artifacts
+from autotm.preprocessing import PREPOCESSED_DATASET_FILENAME
+from autotm.preprocessing.dictionaries_preparation import prepare_all_artifacts, prepearing_cooc_dict
 from autotm.preprocessing.text_preprocessing import process_dataset
 
 logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.DEBUG, datefmt='%Y-%m-%d %H:%M:%S')
@@ -18,17 +20,53 @@
 lang = "ru" # available languages: ru, en
 min_tokens_num = 3 # the minimal amount of tokens after processing to save the result
 
-if __name__ == "__main__":
-    logger.info("Stage 1: Dataset preparation")
-    process_dataset(
-        PATH_TO_DATASET,
-        col_to_process,
-        SAVE_PATH,
-        lang,
-        min_tokens_count=min_tokens_num,
+def prepare_all_artifacts_debug(save_path: str):
+    DATASET_PATH = os.path.join(save_path, PREPOCESSED_DATASET_FILENAME)
+    BATCHES_DIR = os.path.join(save_path, "batches")
+    WV_PATH = os.path.join(save_path, "test_set_data_voc.txt")
+    COOC_DICTIONARY_PATH = os.path.join(save_path, "cooc_dictionary.txt")
+    DICTIONARY_PATH = os.path.join(save_path, "dictionary.txt")
+    VOCAB_PATH = os.path.join(save_path, "vocab.txt")
+    cooc_file_path_df = os.path.join(save_path, "cooc_df.txt")
+    cooc_file_path_tf = os.path.join(save_path, "cooc_tf.txt")
+    ppmi_dict_df = os.path.join(save_path, "ppmi_df.txt")
+    ppmi_dict_tf = os.path.join(save_path, "ppmi_tf.txt")
+    MUTUAL_INFO_DICT_PATH = os.path.join(save_path, "mutual_info_dict.pkl")
+    DOCUMENTS_TO_BATCH_PATH = os.path.join(save_path, PREPOCESSED_DATASET_FILENAME)
+
+    logger.debug("Cooc dictionary preparing...")
+    prepearing_cooc_dict(
+        BATCHES_DIR,
+        WV_PATH,
+        VOCAB_PATH,
+        COOC_DICTIONARY_PATH,
+        DATASET_PATH,
+        cooc_file_path_tf,
+        cooc_file_path_df,
+        ppmi_dict_tf,
+        ppmi_dict_df,
     )
 
-    logger.info("Stage 2: Prepare all artefacts")
-    prepare_all_artifacts(SAVE_PATH)
 
-    logger.info("All finished")
+if __name__ == "__main__":
+    prepare_all_artifacts_debug(SAVE_PATH)
+
+
+# Normal version. DO NOT DELETE!!!
+# if __name__ == "__main__":
+#     logger.info("Stage 1: Dataset preparation")
+#     if not os.path.exists(SAVE_PATH):
+#         process_dataset(
+#             PATH_TO_DATASET,
+#             col_to_process,
+#             SAVE_PATH,
+#             lang,
+#             min_tokens_count=min_tokens_num,
+#         )
+#     else:
+#         logger.info("The preprocessed dataset already exists. Found files on path: %s" % SAVE_PATH)
+#
+#     logger.info("Stage 2: Prepare all artefacts")
+#     prepare_all_artifacts(SAVE_PATH)
+#
+#     logger.info("All finished")