Merge pull request #8 from anyangml/chore/integration-cleanup

Chore: Integration cleanup
anyangml · Oct 31, 2024 · 8accd89 · 8accd89
2 parents bea8c65 + e206c1b
commit 8accd89
Show file tree

Hide file tree

Showing 10 changed files with 178 additions and 99 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@ __pycache__/
 .installed.cfg
 *.egg
 .env
+*.jpg
diff --git a/README.md b/README.md
@@ -1 +1,17 @@
 # LAMstare
+<p align="center">
+<img width="125" alt="image" src="https://github.com/user-attachments/assets/e4949ee3-1ec6-4eab-b817-b1596035bcc2">
+</p>
+
+LAMstare is an MLOps tool developed for monitoring the training process of Large Atomic Models (LAM) within the deepmd-kit framework. It serves two primary functions:
+- Monitoring the training process (for both single-task and multi-task scenarios) with a learning curve and on-the-fly in-distribution (ID) tests.
+- Conducting thorough evaluations of the model on out-of-distribution (OOD) tests.
+
+
+
+## How to use
+
+The MLOps pipeline is designed to be as straightforward as possible. To monitor the training process, you can set up cron jobs. The less frequent OOD tests should be triggered manually.
+
+- To monitor the training process, set a cron job for `LAMstare/lamstare/experiments/cron_job.sh`.
+- Update the experiments you wish to monitor in `LAMstare/lamstare/utils/dlc_submit.py`.
diff --git a/lamstare/experiments/cron_job.sh b/lamstare/experiments/cron_job.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
 
-source /mnt/workspace/public/Miniconda/etc/profile.d/conda.sh # may need update
-conda activate openlam_db # may need update
-python /mnt/data_nas/public/multitask/eval_scripts/run_dptest.py # may need update --> update to call dlc_submit
+source /mnt/data_nas/public/Miniconda/etc/profile.d/conda.sh # may need update
+conda activate lamsatre # may need update
+bash /mnt/data_nas/penganyang/renew_dlc.sh # to renew dlc credential
+python /mnt/data_nas/public/multitask/LAMstare/lamstare/utils/dlc_submit.py # may need update --> update to call dlc_submit
diff --git a/lamstare/experiments/cron_job_sendimg.sh b/lamstare/experiments/cron_job_sendimg.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+source /mnt/workspace/public/Miniconda/etc/profile.d/conda.sh
+conda activate lamstare
+python /mnt/workspace/public/multitask/LAMstare/lamstare/experiments/plt_lcurve.py
+python /mnt/workspace/public/multitask/LAMstare/lamstare/experiments/plt_test.py
diff --git a/lamstare/experiments/plt_lcurve.py b/lamstare/experiments/plt_lcurve.py
@@ -15,46 +15,68 @@
 
 
 def main(exp_path:str, roll:int=50):
-    weights = get_head_weights(exp_path)
-    heads = list(get_head_weights(exp_path).keys())
+    run_id=exp_path.split("/")[-1] # Get basename as id
+    try:
+        weights = get_head_weights(exp_path)
+        heads = list(get_head_weights(exp_path).keys())
+    except KeyError:
+        heads = [""] # single task
+
     n_heads = len(heads)
     mult_hist = fetch_lcurve(exp_path)
     fig, ax = plt.subplots(n_heads, 3, figsize=(12,2*n_heads+1),sharex=True)
-    for i, head in enumerate(heads):
-        ax[i][0].loglog(mult_hist["step"], mult_hist[f"rmse_e_trn_{head}"].rolling(roll).mean(), linestyle='-',color="blue")
-        ax[i][0].loglog(mult_hist["step"], mult_hist[f"rmse_e_val_{head}"].rolling(roll).mean(), linestyle='-.',color="blue")
-        ax[i][0].set_ylabel(f"rmse_e_{head}")
-
-        ax[i][1].loglog(mult_hist["step"], mult_hist[f"rmse_f_trn_{head}"].rolling(roll).mean(), linestyle='-',color="blue")
-        ax[i][1].loglog(mult_hist["step"], mult_hist[f"rmse_f_val_{head}"].rolling(roll).mean(), linestyle='-.',color="blue")
-        ax[i][1].set_ylabel(f"rmse_f_{head}")
-
-        ax[i][2].loglog(mult_hist["step"], mult_hist[f"rmse_v_trn_{head}"].rolling(roll).mean(), linestyle='-',color="blue")
-        ax[i][2].loglog(mult_hist["step"], mult_hist[f"rmse_v_val_{head}"].rolling(roll).mean(), linestyle='-.',color="blue")
-        ax[i][2].set_ylabel(f"rmse_v_{head}")
-
-        if head in BASELINE_MAP:
-            baseline_hist = fetch_lcurve(BASELINE_MAP[head])
-            STEP_NORMAL_PREF = sum(weights.values())/weights[head]*128/120 # need to adjust this value
-
-            ax[i][0].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_e_trn"].rolling(1000).mean(), linestyle='-',color="red")
-            ax[i][0].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_e_val"].rolling(1000).mean(), linestyle='-.',color="red")
-            ax[i][1].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_f_trn"].rolling(1000).mean(), linestyle='-',color="red")
-            ax[i][1].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_f_val"].rolling(1000).mean(), linestyle='-.',color="red")
-            if "rmse_v_val" in baseline_hist:
-                ax[i][2].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_v_trn"].rolling(1000).mean(), linestyle='-',color="red")
-                ax[i][2].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_v_val"].rolling(1000).mean(), linestyle='-.',color="red")
-        if head in PREVIOUS_BASELINE:
-            ax[i][0].axhline(PREVIOUS_BASELINE[head]["rmse_e"],color="green", linestyle="-.")
-            ax[i][0].axhline(PREVIOUS_BASELINE[head]["e_std"],color="purple", linestyle="-.")
-            ax[i][1].axhline(PREVIOUS_BASELINE[head]["rmse_f"],color="green", linestyle="-.")
-            ax[i][1].axhline(PREVIOUS_BASELINE[head]["f_std"],color="purple", linestyle="-.")
-            ax[i][2].axhline(PREVIOUS_BASELINE[head]["rmse_v"],color="green", linestyle="-.")
-            ax[i][2].axhline(PREVIOUS_BASELINE[head]["v_std"],color="purple", linestyle="-.")
+
+    if n_heads == 1:
+        ax[0].loglog(mult_hist["step"], mult_hist[f"rmse_e_trn"].rolling(roll).mean(), linestyle='-',color="blue")
+        ax[0].loglog(mult_hist["step"], mult_hist[f"rmse_e_val"].rolling(roll).mean(), linestyle='-.',color="blue")
+        ax[0].set_ylabel(f"rmse_e")
+
+        ax[1].loglog(mult_hist["step"], mult_hist[f"rmse_f_trn"].rolling(roll).mean(), linestyle='-',color="blue")
+        ax[1].loglog(mult_hist["step"], mult_hist[f"rmse_f_val"].rolling(roll).mean(), linestyle='-.',color="blue")
+        ax[1].set_ylabel(f"rmse_f")
+
+        ax[2].loglog(mult_hist["step"], mult_hist[f"rmse_v_trn"].rolling(roll).mean(), linestyle='-',color="blue")
+        ax[2].loglog(mult_hist["step"], mult_hist[f"rmse_v_val"].rolling(roll).mean(), linestyle='-.',color="blue")
+        ax[2].set_ylabel(f"rmse_v")
+        fig.suptitle(run_id)
+    else:
+        for i, head in enumerate(heads):
+            ax[i][0].loglog(mult_hist["step"], mult_hist[f"rmse_e_trn_{head}"].rolling(roll).mean(), linestyle='-',color="blue")
+            ax[i][0].loglog(mult_hist["step"], mult_hist[f"rmse_e_val_{head}"].rolling(roll).mean(), linestyle='-.',color="blue")
+            ax[i][0].set_ylabel(f"rmse_e_{head}")
+
+            ax[i][1].loglog(mult_hist["step"], mult_hist[f"rmse_f_trn_{head}"].rolling(roll).mean(), linestyle='-',color="blue")
+            ax[i][1].loglog(mult_hist["step"], mult_hist[f"rmse_f_val_{head}"].rolling(roll).mean(), linestyle='-.',color="blue")
+            ax[i][1].set_ylabel(f"rmse_f_{head}")
+
+            ax[i][2].loglog(mult_hist["step"], mult_hist[f"rmse_v_trn_{head}"].rolling(roll).mean(), linestyle='-',color="blue")
+            ax[i][2].loglog(mult_hist["step"], mult_hist[f"rmse_v_val_{head}"].rolling(roll).mean(), linestyle='-.',color="blue")
+            ax[i][2].set_ylabel(f"rmse_v_{head}")
+
+            if head in BASELINE_MAP:
+                baseline_hist = fetch_lcurve(BASELINE_MAP[head])
+                STEP_NORMAL_PREF = sum(weights.values())/weights[head]*128/120 # need to adjust this value
+
+                ax[i][0].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_e_trn"].rolling(1000).mean(), linestyle='-',color="red")
+                ax[i][0].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_e_val"].rolling(1000).mean(), linestyle='-.',color="red")
+                ax[i][1].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_f_trn"].rolling(1000).mean(), linestyle='-',color="red")
+                ax[i][1].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_f_val"].rolling(1000).mean(), linestyle='-.',color="red")
+                if "rmse_v_val" in baseline_hist:
+                    ax[i][2].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_v_trn"].rolling(1000).mean(), linestyle='-',color="red")
+                    ax[i][2].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_v_val"].rolling(1000).mean(), linestyle='-.',color="red")
+            if head in PREVIOUS_BASELINE:
+                ax[i][0].axhline(PREVIOUS_BASELINE[head]["rmse_e"],color="green", linestyle="-.")
+                ax[i][0].axhline(PREVIOUS_BASELINE[head]["e_std"],color="purple", linestyle="-.")
+                ax[i][1].axhline(PREVIOUS_BASELINE[head]["rmse_f"],color="green", linestyle="-.")
+                ax[i][1].axhline(PREVIOUS_BASELINE[head]["f_std"],color="purple", linestyle="-.")
+                ax[i][2].axhline(PREVIOUS_BASELINE[head]["rmse_v"],color="green", linestyle="-.")
+                ax[i][2].axhline(PREVIOUS_BASELINE[head]["v_std"],color="purple", linestyle="-.")
 
     plt.tight_layout()
     fig.savefig("lcurve.jpg")
-    sendimg(["lcurve.jpg"])
+    sendimg(["lcurve.jpg"], run_id)
 
 if __name__ == "__main__":
-    main("/mnt/data_nas/public/multitask/training_exps/1018_b4_medium_l6_atton_37head_linear_fitting_tanh")
+    for exp_path in ["/mnt/data_nas/penganyang/experiments/1029_omat_batch128_medium_test"]:
+        main(exp_path)
+    # main("/mnt/data_nas/public/multitask/training_exps/1018_b4_medium_l6_atton_37head_linear_fitting_tanh")
diff --git a/lamstare/experiments/plt_test.py b/lamstare/experiments/plt_test.py
@@ -61,65 +61,86 @@ def parse_record_dict(all_records:dict) -> dict:
 
 def main(exp_path:str):
     run_id=exp_path.split("/")[-1] # Get basename as id
-    weights = get_head_weights(exp_path)
-    heads = list(weights.keys())
+    try:
+        weights = get_head_weights(exp_path)
+        heads = list(get_head_weights(exp_path).keys())
+    except KeyError:
+        heads = [""] # single task
+
     n_heads = len(heads)
-    fig, ax = plt.subplots(n_heads+1, 3, figsize=(12,2*n_heads+3),sharex=True)
+
     all_records = fetch_dptest_res(run_id)
     all_records = parse_record_dict(all_records)
-    weighted_dptest = []
-    for i, head in enumerate(heads):
-        if head in all_records:
-            ax[i][0].loglog(all_records[head]["step"], all_records[head]["ener_mae"],"bo-",label="MAE")
-            ax[i][0].loglog(all_records[head]["step"], all_records[head]["ener_rmse"],"ro-",label="RMSE")
-            ax[i][0].set_ylabel(f"Energy_{head}")
-
-            ax[i][1].loglog(all_records[head]["step"], all_records[head]["force_mae"],"bo-",label="MAE")
-            ax[i][1].loglog(all_records[head]["step"], all_records[head]["force_rmse"],"ro-",label="RMSE")
-            ax[i][1].set_ylabel(f"Force_{head}")
-
-            ax[i][2].loglog(all_records[head]["step"], all_records[head]["virial_mae"],"bo-",label="MAE")
-            ax[i][2].loglog(all_records[head]["step"], all_records[head]["virial_rmse"],"ro-",label="RMSE")
-            ax[i][2].set_ylabel(f"Virial_{head}")
-            if head in PREVIOUS_BASELINE:
-                ax[i][0].axhline(PREVIOUS_BASELINE[head]["rmse_e"],color="red", linestyle="-.")
-                ax[i][1].axhline(PREVIOUS_BASELINE[head]["rmse_f"],color="red", linestyle="-.")
-                ax[i][2].axhline(PREVIOUS_BASELINE[head]["rmse_v"],color="red", linestyle="-.")
-                if "mae_e" in PREVIOUS_BASELINE[head]:
-                    ax[i][0].axhline(PREVIOUS_BASELINE[head]["mae_e"],color="blue", linestyle="-.")
-                    ax[i][1].axhline(PREVIOUS_BASELINE[head]["mae_f"],color="blue", linestyle="-.")
-                    ax[i][2].axhline(PREVIOUS_BASELINE[head]["mae_v"],color="blue", linestyle="-.")
-
-            weighted_dptest.append(np.array([
-                all_records[head]["ener_mae"],
-                all_records[head]["ener_rmse"],
-                all_records[head]["force_mae"],
-                all_records[head]["force_rmse"],
-                all_records[head]["virial_mae"],
-                all_records[head]["virial_rmse"]
-            ])*weights[head])
-    n_ckpt_to_weight = weighted_dptest[-1].shape[1]
-    weighted_dptest = [rr[:,:n_ckpt_to_weight] for rr in weighted_dptest]
-    weighted_dptest = np.sum(weighted_dptest, axis=0)/sum(weights.values())
-
-    ax[-1][0].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[0],"bo-",label="MAE")
-    ax[-1][0].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[1],"ro-",label="RMSE")
-    ax[-1][0].set_ylabel(f"Energy_Weighted")
-
-    ax[-1][1].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[2],"bo-",label="MAE")
-    ax[-1][1].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[3],"ro-",label="RMSE")
-    ax[-1][1].set_ylabel(f"Force_Weighted")
-
-    ax[-1][2].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[4],"bo-",label="MAE")
-    ax[-1][2].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[5],"ro-",label="RMSE")
-    ax[-1][2].set_ylabel(f"Virial_Weighted")
+    if n_heads == 1:
+        fig, ax = plt.subplots(1, 3, figsize=(12,5),sharex=True)
+        ax[0].loglog(all_records[""]["step"], all_records[""]["ener_mae"],"bo-",label="MAE")
+        ax[0].loglog(all_records[""]["step"], all_records[""]["ener_rmse"],"ro-",label="RMSE")
+        ax[0].set_ylabel(f"Energy")
+
+        ax[1].loglog(all_records[""]["step"], all_records[""]["force_mae"],"bo-",label="MAE")
+        ax[1].loglog(all_records[""]["step"], all_records[""]["force_rmse"],"ro-",label="RMSE")
+        ax[1].set_ylabel(f"Force")
+
+        ax[2].loglog(all_records[""]["step"], all_records[""]["virial_mae"],"bo-",label="MAE")
+        ax[2].loglog(all_records[""]["step"], all_records[""]["virial_rmse"],"ro-",label="RMSE")
+        ax[2].set_ylabel(f"Virial")
+        fig.suptitle(run_id)
+    else:
+        fig, ax = plt.subplots(n_heads+1, 3, figsize=(12,2*n_heads+3),sharex=True)
+        weighted_dptest = []
+        for i, head in enumerate(heads):
+            if head in all_records:
+                ax[i][0].loglog(all_records[head]["step"], all_records[head]["ener_mae"],"bo-",label="MAE")
+                ax[i][0].loglog(all_records[head]["step"], all_records[head]["ener_rmse"],"ro-",label="RMSE")
+                ax[i][0].set_ylabel(f"Energy_{head}")
+
+                ax[i][1].loglog(all_records[head]["step"], all_records[head]["force_mae"],"bo-",label="MAE")
+                ax[i][1].loglog(all_records[head]["step"], all_records[head]["force_rmse"],"ro-",label="RMSE")
+                ax[i][1].set_ylabel(f"Force_{head}")
+
+                ax[i][2].loglog(all_records[head]["step"], all_records[head]["virial_mae"],"bo-",label="MAE")
+                ax[i][2].loglog(all_records[head]["step"], all_records[head]["virial_rmse"],"ro-",label="RMSE")
+                ax[i][2].set_ylabel(f"Virial_{head}")
+                if head in PREVIOUS_BASELINE:
+                    ax[i][0].axhline(PREVIOUS_BASELINE[head]["rmse_e"],color="red", linestyle="-.")
+                    ax[i][1].axhline(PREVIOUS_BASELINE[head]["rmse_f"],color="red", linestyle="-.")
+                    ax[i][2].axhline(PREVIOUS_BASELINE[head]["rmse_v"],color="red", linestyle="-.")
+                    if "mae_e" in PREVIOUS_BASELINE[head]:
+                        ax[i][0].axhline(PREVIOUS_BASELINE[head]["mae_e"],color="blue", linestyle="-.")
+                        ax[i][1].axhline(PREVIOUS_BASELINE[head]["mae_f"],color="blue", linestyle="-.")
+                        ax[i][2].axhline(PREVIOUS_BASELINE[head]["mae_v"],color="blue", linestyle="-.")
+
+                weighted_dptest.append(np.array([
+                    all_records[head]["ener_mae"],
+                    all_records[head]["ener_rmse"],
+                    all_records[head]["force_mae"],
+                    all_records[head]["force_rmse"],
+                    all_records[head]["virial_mae"],
+                    all_records[head]["virial_rmse"]
+                ])*weights[head])
+        n_ckpt_to_weight = weighted_dptest[-1].shape[1]
+        weighted_dptest = [rr[:,:n_ckpt_to_weight] for rr in weighted_dptest]
+        weighted_dptest = np.sum(weighted_dptest, axis=0)/sum(weights.values())
+
+        ax[-1][0].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[0],"bo-",label="MAE")
+        ax[-1][0].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[1],"ro-",label="RMSE")
+        ax[-1][0].set_ylabel(f"Energy_Weighted")
+
+        ax[-1][1].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[2],"bo-",label="MAE")
+        ax[-1][1].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[3],"ro-",label="RMSE")
+        ax[-1][1].set_ylabel(f"Force_Weighted")
+
+        ax[-1][2].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[4],"bo-",label="MAE")
+        ax[-1][2].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[5],"ro-",label="RMSE")
+        ax[-1][2].set_ylabel(f"Virial_Weighted")
 
     plt.tight_layout()
     fig.savefig("dptest.jpg")
-    sendimg(["dptest.jpg"])
+    sendimg(["dptest.jpg"], run_id)
 
 
 if __name__ == "__main__":
-
-    main("/mnt/data_nas/public/multitask/training_exps/1018_b4_medium_l6_atton_37head_linear_fitting_tanh")
+    for exp_path in ["/mnt/data_nas/penganyang/experiments/1029_omat_batch128_medium_test"]:
+        main(exp_path)
+    # main("/mnt/data_nas/public/multitask/training_exps/1018_b4_medium_l6_atton_37head_linear_fitting_tanh")
     # main("1015_37head_multitask_1gpu_test")
diff --git a/lamstare/experiments/run_test.py b/lamstare/experiments/run_test.py
@@ -8,6 +8,7 @@
 import numpy as np
 from dotenv import load_dotenv # type: ignore
 import os
+import sys
 load_dotenv()
 
 temp_file_path = os.environ.get("TEMP_FILE_DIR")
@@ -84,5 +85,6 @@ def main(exp_path:str, freq:int=200000):
         print("No new ckpt to test.\n")
 
 if __name__ == "__main__":
-    main("/mnt/workspace/cc/multitask/training_exps/1015_37head_multitask_1gpu_test") # multi task test data
+    main(sys.argv[1])
+    # main("/mnt/workspace/cc/multitask/training_exps/1015_37head_multitask_1gpu_test") # multi task test data
     # main("/mnt/workspace/penganyang/experiments/1018_mptrj_l6_atton_b256_test") # single task test data
diff --git a/lamstare/utils/dlc_submit.py b/lamstare/utils/dlc_submit.py
@@ -5,7 +5,8 @@
 import sys
 load_dotenv()
 
-def submit_dptest_job_to_dlc(job_name:str):
+def submit_dptest_job_to_dlc(exp_path:str):
+    job_name = exp_path.split("/")[-1]
     docker_image = os.environ.get("DOCKER_IMAGE")
     data_sources = os.environ.get("DATA_SOURCES")
     workspace_id = os.environ.get("WORKSPACE_ID")
@@ -23,7 +24,7 @@ def submit_dptest_job_to_dlc(job_name:str):
             f". /mnt/data_nas/public/.bashrc \n" \
             f"conda activate /mnt/data_nas/public/Miniconda/envs/{venv} \n" \
             f"cd {lamstare_path} \n" \
-            f"python lamstare/experiments/run_dptest.py \n" \
+            f"python lamstare/experiments/run_test.py {exp_path} \n" \
 
 
     cmd = ['/mnt/data_nas/penganyang/dlc', 'submit', 'pytorchjob']
@@ -46,4 +47,5 @@ def submit_dptest_job_to_dlc(job_name:str):
         raise RuntimeError(f"{job_name} failed")
 
 if __name__ == "__main__":
-    submit_dptest_job_to_dlc(sys.argv[1])
+    for exp_path in ["/mnt/data_nas/penganyang/experiments/1029_omat_batch128_medium_test"]:
+        submit_dptest_job_to_dlc(exp_path) # abosulute path to the experiment folder
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,3 +5,4 @@ __pycache__/ @@
     .installed.cfg
     *.egg
     .env
+    *.jpg