Skip to content

Commit

Permalink
add comfortable debug script for measuring execution time of cooc di…
Browse files Browse the repository at this point in the history
…ct building
  • Loading branch information
fonhorst committed Dec 25, 2023
1 parent b696156 commit e409b5d
Showing 1 changed file with 50 additions and 12 deletions.
62 changes: 50 additions & 12 deletions examples/preparation_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import logging
import os.path

from autotm.preprocessing.dictionaries_preparation import prepare_all_artifacts
from autotm.preprocessing import PREPOCESSED_DATASET_FILENAME
from autotm.preprocessing.dictionaries_preparation import prepare_all_artifacts, prepearing_cooc_dict
from autotm.preprocessing.text_preprocessing import process_dataset

logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.DEBUG, datefmt='%Y-%m-%d %H:%M:%S')
Expand All @@ -18,17 +20,53 @@
lang = "ru" # available languages: ru, en
min_tokens_num = 3 # the minimal amount of tokens after processing to save the result

if __name__ == "__main__":
logger.info("Stage 1: Dataset preparation")
process_dataset(
PATH_TO_DATASET,
col_to_process,
SAVE_PATH,
lang,
min_tokens_count=min_tokens_num,
def prepare_all_artifacts_debug(save_path: str) -> None:
    """Run only the co-occurrence dictionary step for debugging.

    Every path handed to ``prepearing_cooc_dict`` is derived from
    ``save_path``, so this single step can be executed (and timed) without
    going through the rest of the artifact preparation.

    Args:
        save_path: Directory under which the dataset file, the batches
            directory, and the vocabulary / cooc / ppmi file paths are built.
            Presumably this directory must already contain the preprocessed
            dataset -- confirm against ``prepearing_cooc_dict``.
    """
    dataset_path = os.path.join(save_path, PREPOCESSED_DATASET_FILENAME)
    batches_dir = os.path.join(save_path, "batches")
    wv_path = os.path.join(save_path, "test_set_data_voc.txt")
    cooc_dictionary_path = os.path.join(save_path, "cooc_dictionary.txt")
    vocab_path = os.path.join(save_path, "vocab.txt")
    cooc_file_path_df = os.path.join(save_path, "cooc_df.txt")
    cooc_file_path_tf = os.path.join(save_path, "cooc_tf.txt")
    ppmi_dict_df = os.path.join(save_path, "ppmi_df.txt")
    ppmi_dict_tf = os.path.join(save_path, "ppmi_tf.txt")

    logger.debug("Cooc dictionary preparing...")
    # Arguments are positional: keep this exact order (note: tf before df).
    prepearing_cooc_dict(
        batches_dir,
        wv_path,
        vocab_path,
        cooc_dictionary_path,
        dataset_path,
        cooc_file_path_tf,
        cooc_file_path_df,
        ppmi_dict_tf,
        ppmi_dict_df,
    )

logger.info("Stage 2: Prepare all artefacts")
prepare_all_artifacts(SAVE_PATH)

logger.info("All finished")
# Debug entry point: runs only prepare_all_artifacts_debug on SAVE_PATH.
# The regular two-stage pipeline is kept commented out below.
if __name__ == "__main__":
    prepare_all_artifacts_debug(SAVE_PATH)


# Normal version. DO NOT DELETE!!!
# if __name__ == "__main__":
# logger.info("Stage 1: Dataset preparation")
# if not os.path.exists(SAVE_PATH):
# process_dataset(
# PATH_TO_DATASET,
# col_to_process,
# SAVE_PATH,
# lang,
# min_tokens_count=min_tokens_num,
# )
# else:
# logger.info("The preprocessed dataset already exists. Found files on path: %s" % SAVE_PATH)
#
# logger.info("Stage 2: Prepare all artefacts")
# prepare_all_artifacts(SAVE_PATH)
#
# logger.info("All finished")

0 comments on commit e409b5d

Please sign in to comment.