-
Notifications
You must be signed in to change notification settings - Fork 75
/
01_index.py
47 lines (40 loc) · 1.84 KB
/
01_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import json
import argparse
import mlflow
from azureml.pipeline import initialise_mlflow_client
from rag_experiment_accelerator.checkpoint import init_checkpoint
from rag_experiment_accelerator.run.index import run
from rag_experiment_accelerator.config.config import Config
from rag_experiment_accelerator.config.environment import Environment
from rag_experiment_accelerator.config.paths import get_all_file_paths, mlflow_run_name
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the indexing job.

    Args:
        argv: Argument list to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``.

    Returns:
        Namespace exposing ``config_path``, ``data_dir`` and ``sampling``.
        Arguments this script does not declare are ignored, not rejected.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config_path", type=str, help="input: path to the config file"
    )
    parser.add_argument("--data_dir", type=str, help="input: path to the input data")
    parser.add_argument(
        "-s",
        "--sampling",
        action="store_true",
        help="input: run sampling. Avoid running on distributed compute",
    )
    # parse_known_args (rather than parse_args) tolerates extra arguments
    # that are passed on the command line but not declared above.
    args, _ = parser.parse_known_args(argv)
    return args


def main() -> None:
    """Run the index step once per flattened index config and record the names.

    Loads the environment and config, initialises checkpointing and the
    MLflow client, calls ``run`` inside a separate MLflow run for every entry
    of ``config.index.flatten()``, and finally writes the collected index
    names as JSON to ``config.path.generated_index_names_file``.
    """
    args = parse_args()

    environment = Environment.from_env_or_keyvault()
    config = Config.from_path(environment, args.config_path, args.data_dir)
    init_checkpoint(config)
    mlflow_client = initialise_mlflow_client(environment, config)
    mlflow.set_experiment(config.experiment_name)
    do_sample = args.sampling

    index_dict = {"indexes": []}
    # List the input files once; the same list is handed to every index run.
    file_paths = get_all_file_paths(config.path.data_dir)
    for index_config in config.index.flatten():
        with mlflow.start_run(run_name=mlflow_run_name(f"index_job_{config.job_name}")):
            index_name = run(
                environment, config, index_config, file_paths, mlflow_client, do_sample
            )
            index_dict["indexes"].append(index_name)

    # saves the list of index names locally, not used afterwards
    with open(
        config.path.generated_index_names_file, "w", encoding="utf-8"
    ) as index_names_file:
        json.dump(index_dict, index_names_file, indent=4)


if __name__ == "__main__":
    main()