
Commit

add basic test
fonhorst committed Jan 3, 2024
1 parent 24a4202 commit bc8032d
Showing 15 changed files with 35,805 additions and 51 deletions.
102 changes: 51 additions & 51 deletions examples/preparation_pipeline.py
@@ -9,47 +9,47 @@
logger = logging.getLogger(__name__)


# PATH_TO_DATASET = "../data/sample_corpora/sample_dataset_lenta.csv" # dataset with corpora to be processed
# SAVE_PATH = "../data/processed_sample_corpora" # place where all the artifacts will be stored
PATH_TO_DATASET = "../data/sample_corpora/sample_dataset_lenta.csv" # dataset with corpora to be processed
SAVE_PATH = "../data/processed_sample_corpora" # place where all the artifacts will be stored

PATH_TO_DATASET = "../tmp/train-00000-of-00001.csv" # dataset with corpora to be processed
SAVE_PATH = "../tmp/train-00000-of-00001-processed-corpora" # place where all the artifacts will be stored
# PATH_TO_DATASET = "../tmp/train-00000-of-00001.csv" # dataset with corpora to be processed
# SAVE_PATH = "../tmp/train-00000-of-00001-processed-corpora" # place where all the artifacts will be stored

col_to_process = "text"
dataset_name = "lenta_sample"
lang = "ru" # available languages: ru, en
min_tokens_num = 3  # the minimum number of tokens required after processing for the result to be saved

def prepare_all_artifacts_debug(save_path: str):
    DATASET_PATH = os.path.join(save_path, PREPOCESSED_DATASET_FILENAME)
    BATCHES_DIR = os.path.join(save_path, "batches")
    WV_PATH = os.path.join(save_path, "test_set_data_voc.txt")
    COOC_DICTIONARY_PATH = os.path.join(save_path, "cooc_dictionary.txt")
    DICTIONARY_PATH = os.path.join(save_path, "dictionary.txt")
    VOCAB_PATH = os.path.join(save_path, "vocab.txt")
    cooc_file_path_df = os.path.join(save_path, "cooc_df.txt")
    cooc_file_path_tf = os.path.join(save_path, "cooc_tf.txt")
    ppmi_dict_df = os.path.join(save_path, "ppmi_df.txt")
    ppmi_dict_tf = os.path.join(save_path, "ppmi_tf.txt")
    MUTUAL_INFO_DICT_PATH = os.path.join(save_path, "mutual_info_dict.pkl")
    DOCUMENTS_TO_BATCH_PATH = os.path.join(save_path, PREPOCESSED_DATASET_FILENAME)

    logger.debug("Preparing cooc dictionary...")
    prepearing_cooc_dict(
        BATCHES_DIR,
        WV_PATH,
        VOCAB_PATH,
        COOC_DICTIONARY_PATH,
        DATASET_PATH,
        cooc_file_path_tf,
        cooc_file_path_df,
        ppmi_dict_tf,
        ppmi_dict_df,
    )


if __name__ == "__main__":
    prepare_all_artifacts_debug(SAVE_PATH)
# def prepare_all_artifacts_debug(save_path: str):
#     DATASET_PATH = os.path.join(save_path, PREPOCESSED_DATASET_FILENAME)
#     BATCHES_DIR = os.path.join(save_path, "batches")
#     WV_PATH = os.path.join(save_path, "test_set_data_voc.txt")
#     COOC_DICTIONARY_PATH = os.path.join(save_path, "cooc_dictionary.txt")
#     DICTIONARY_PATH = os.path.join(save_path, "dictionary.txt")
#     VOCAB_PATH = os.path.join(save_path, "vocab.txt")
#     cooc_file_path_df = os.path.join(save_path, "cooc_df.txt")
#     cooc_file_path_tf = os.path.join(save_path, "cooc_tf.txt")
#     ppmi_dict_df = os.path.join(save_path, "ppmi_df.txt")
#     ppmi_dict_tf = os.path.join(save_path, "ppmi_tf.txt")
#     MUTUAL_INFO_DICT_PATH = os.path.join(save_path, "mutual_info_dict.pkl")
#     DOCUMENTS_TO_BATCH_PATH = os.path.join(save_path, PREPOCESSED_DATASET_FILENAME)
#
#     logger.debug("Preparing cooc dictionary...")
#     prepearing_cooc_dict(
#         BATCHES_DIR,
#         WV_PATH,
#         VOCAB_PATH,
#         COOC_DICTIONARY_PATH,
#         DATASET_PATH,
#         cooc_file_path_tf,
#         cooc_file_path_df,
#         ppmi_dict_tf,
#         ppmi_dict_df,
#     )
#
#
# if __name__ == "__main__":
#     prepare_all_artifacts_debug(SAVE_PATH)

# Run bigartm from the CLI:
# bigartm \
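#
# The full command is collapsed in this diff view. Below is a minimal sketch of
# a co-occurrence gathering call, assuming BigARTM's standard CLI options
# (--cooc-window, --cooc-min-tf/--cooc-min-df, --write-cooc-*/--write-ppmi-*;
# verify the flags and thresholds against the installed bigartm version) and
# the file names used in prepare_all_artifacts_debug above:
#
# bigartm -c test_set_data_voc.txt -v vocab.txt \
#     --cooc-window 10 \
#     --cooc-min-tf 1 --write-cooc-tf cooc_tf.txt \
#     --cooc-min-df 1 --write-cooc-df cooc_df.txt \
#     --write-ppmi-tf ppmi_tf.txt --write-ppmi-df ppmi_df.txt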
@@ -69,20 +69,20 @@ def prepare_all_artifacts_debug(save_path: str):


# Normal version. DO NOT DELETE!!!
# if __name__ == "__main__":
#     logger.info("Stage 1: Dataset preparation")
#     if not os.path.exists(SAVE_PATH):
#         process_dataset(
#             PATH_TO_DATASET,
#             col_to_process,
#             SAVE_PATH,
#             lang,
#             min_tokens_count=min_tokens_num,
#         )
#     else:
#         logger.info("The preprocessed dataset already exists. Found files at path: %s", SAVE_PATH)
#
#     logger.info("Stage 2: Prepare all artifacts")
#     prepare_all_artifacts(SAVE_PATH)
#
#     logger.info("All finished")
if __name__ == "__main__":
    logger.info("Stage 1: Dataset preparation")
    if not os.path.exists(SAVE_PATH):
        process_dataset(
            PATH_TO_DATASET,
            col_to_process,
            SAVE_PATH,
            lang,
            min_tokens_count=min_tokens_num,
        )
    else:
        logger.info("The preprocessed dataset already exists. Found files at path: %s", SAVE_PATH)

    logger.info("Stage 2: Prepare all artifacts")
    prepare_all_artifacts(SAVE_PATH)

    logger.info("All finished")
5 changes: 5 additions & 0 deletions examples/test.py
@@ -0,0 +1,5 @@
# Basic smoke test: check that BigARTM's protobuf bindings import cleanly
# and that a parser-config message can be instantiated.
from artm.wrapper.messages_pb2 import CollectionParserConfig

parser_config = CollectionParserConfig()

k = 0  # no-op placeholder statement
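
The stub above could later be turned into an actual assertion. A minimal sketch (the test name is hypothetical; it relies only on SerializeToString, which every generated protobuf message exposes):

    from artm.wrapper.messages_pb2 import CollectionParserConfig

    def test_collection_parser_config_instantiates():
        # A freshly constructed message should serialize without raising.
        config = CollectionParserConfig()
        assert isinstance(config.SerializeToString(), bytes)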
