Skip to content

Commit

Permalink
Merge pull request #8 from anyangml/chore/integration-cleanup
Browse files Browse the repository at this point in the history
Chore: Integration cleanup
  • Loading branch information
anyangml authored Oct 31, 2024
2 parents bea8c65 + e206c1b commit 8accd89
Show file tree
Hide file tree
Showing 10 changed files with 178 additions and 99 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ __pycache__/
.installed.cfg
*.egg
.env
*.jpg
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1 +1,17 @@
# LAMstare
<p align="center">
<img width="125" alt="image" src="https://github.com/user-attachments/assets/e4949ee3-1ec6-4eab-b817-b1596035bcc2">
</p>

LAMstare is an MLOps tool developed for monitoring the training process of Large Atomic Models (LAM) within the deepmd-kit framework. It serves two primary functions:
- Monitoring the training process (for both single-task and multi-task scenarios) with a learning curve and on-the-fly in-distribution (ID) tests.
- Conducting thorough evaluations of the model on out-of-distribution (OOD) tests.



## How to use

The MLOps pipeline is designed to be as straightforward as possible. To monitor the training process, you can set up cron jobs. The less frequent OOD tests should be triggered manually.

- To monitor the training process, set a cron job for `LAMstare/lamstare/experiments/cron_job.sh`.
- Update the experiments you wish to monitor in `LAMstare/lamstare/utils/dlc_submit.py`.
7 changes: 4 additions & 3 deletions lamstare/experiments/cron_job.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/bin/bash

source /mnt/workspace/public/Miniconda/etc/profile.d/conda.sh # may need update
conda activate openlam_db # may need update
python /mnt/data_nas/public/multitask/eval_scripts/run_dptest.py # may need update --> update to call dlc_submit
source /mnt/data_nas/public/Miniconda/etc/profile.d/conda.sh # may need update
conda activate lamsatre # may need update
bash /mnt/data_nas/penganyang/renew_dlc.sh # to renew dlc credential
python /mnt/data_nas/public/multitask/LAMstare/lamstare/utils/dlc_submit.py # may need update --> update to call dlc_submit
6 changes: 6 additions & 0 deletions lamstare/experiments/cron_job_sendimg.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash

source /mnt/workspace/public/Miniconda/etc/profile.d/conda.sh
conda activate lamstare
python /mnt/workspace/public/multitask/LAMstare/lamstare/experiments/plt_lcurve.py
python /mnt/workspace/public/multitask/LAMstare/lamstare/experiments/plt_test.py
92 changes: 57 additions & 35 deletions lamstare/experiments/plt_lcurve.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,46 +15,68 @@


def main(exp_path:str, roll:int=50):
weights = get_head_weights(exp_path)
heads = list(get_head_weights(exp_path).keys())
run_id=exp_path.split("/")[-1] # Get basename as id
try:
weights = get_head_weights(exp_path)
heads = list(get_head_weights(exp_path).keys())
except KeyError:
heads = [""] # single task

n_heads = len(heads)
mult_hist = fetch_lcurve(exp_path)
fig, ax = plt.subplots(n_heads, 3, figsize=(12,2*n_heads+1),sharex=True)
for i, head in enumerate(heads):
ax[i][0].loglog(mult_hist["step"], mult_hist[f"rmse_e_trn_{head}"].rolling(roll).mean(), linestyle='-',color="blue")
ax[i][0].loglog(mult_hist["step"], mult_hist[f"rmse_e_val_{head}"].rolling(roll).mean(), linestyle='-.',color="blue")
ax[i][0].set_ylabel(f"rmse_e_{head}")

ax[i][1].loglog(mult_hist["step"], mult_hist[f"rmse_f_trn_{head}"].rolling(roll).mean(), linestyle='-',color="blue")
ax[i][1].loglog(mult_hist["step"], mult_hist[f"rmse_f_val_{head}"].rolling(roll).mean(), linestyle='-.',color="blue")
ax[i][1].set_ylabel(f"rmse_f_{head}")

ax[i][2].loglog(mult_hist["step"], mult_hist[f"rmse_v_trn_{head}"].rolling(roll).mean(), linestyle='-',color="blue")
ax[i][2].loglog(mult_hist["step"], mult_hist[f"rmse_v_val_{head}"].rolling(roll).mean(), linestyle='-.',color="blue")
ax[i][2].set_ylabel(f"rmse_v_{head}")

if head in BASELINE_MAP:
baseline_hist = fetch_lcurve(BASELINE_MAP[head])
STEP_NORMAL_PREF = sum(weights.values())/weights[head]*128/120 # need to adjust this value

ax[i][0].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_e_trn"].rolling(1000).mean(), linestyle='-',color="red")
ax[i][0].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_e_val"].rolling(1000).mean(), linestyle='-.',color="red")
ax[i][1].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_f_trn"].rolling(1000).mean(), linestyle='-',color="red")
ax[i][1].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_f_val"].rolling(1000).mean(), linestyle='-.',color="red")
if "rmse_v_val" in baseline_hist:
ax[i][2].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_v_trn"].rolling(1000).mean(), linestyle='-',color="red")
ax[i][2].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_v_val"].rolling(1000).mean(), linestyle='-.',color="red")
if head in PREVIOUS_BASELINE:
ax[i][0].axhline(PREVIOUS_BASELINE[head]["rmse_e"],color="green", linestyle="-.")
ax[i][0].axhline(PREVIOUS_BASELINE[head]["e_std"],color="purple", linestyle="-.")
ax[i][1].axhline(PREVIOUS_BASELINE[head]["rmse_f"],color="green", linestyle="-.")
ax[i][1].axhline(PREVIOUS_BASELINE[head]["f_std"],color="purple", linestyle="-.")
ax[i][2].axhline(PREVIOUS_BASELINE[head]["rmse_v"],color="green", linestyle="-.")
ax[i][2].axhline(PREVIOUS_BASELINE[head]["v_std"],color="purple", linestyle="-.")

if n_heads == 1:
ax[0].loglog(mult_hist["step"], mult_hist[f"rmse_e_trn"].rolling(roll).mean(), linestyle='-',color="blue")
ax[0].loglog(mult_hist["step"], mult_hist[f"rmse_e_val"].rolling(roll).mean(), linestyle='-.',color="blue")
ax[0].set_ylabel(f"rmse_e")

ax[1].loglog(mult_hist["step"], mult_hist[f"rmse_f_trn"].rolling(roll).mean(), linestyle='-',color="blue")
ax[1].loglog(mult_hist["step"], mult_hist[f"rmse_f_val"].rolling(roll).mean(), linestyle='-.',color="blue")
ax[1].set_ylabel(f"rmse_f")

ax[2].loglog(mult_hist["step"], mult_hist[f"rmse_v_trn"].rolling(roll).mean(), linestyle='-',color="blue")
ax[2].loglog(mult_hist["step"], mult_hist[f"rmse_v_val"].rolling(roll).mean(), linestyle='-.',color="blue")
ax[2].set_ylabel(f"rmse_v")
fig.suptitle(run_id)
else:
for i, head in enumerate(heads):
ax[i][0].loglog(mult_hist["step"], mult_hist[f"rmse_e_trn_{head}"].rolling(roll).mean(), linestyle='-',color="blue")
ax[i][0].loglog(mult_hist["step"], mult_hist[f"rmse_e_val_{head}"].rolling(roll).mean(), linestyle='-.',color="blue")
ax[i][0].set_ylabel(f"rmse_e_{head}")

ax[i][1].loglog(mult_hist["step"], mult_hist[f"rmse_f_trn_{head}"].rolling(roll).mean(), linestyle='-',color="blue")
ax[i][1].loglog(mult_hist["step"], mult_hist[f"rmse_f_val_{head}"].rolling(roll).mean(), linestyle='-.',color="blue")
ax[i][1].set_ylabel(f"rmse_f_{head}")

ax[i][2].loglog(mult_hist["step"], mult_hist[f"rmse_v_trn_{head}"].rolling(roll).mean(), linestyle='-',color="blue")
ax[i][2].loglog(mult_hist["step"], mult_hist[f"rmse_v_val_{head}"].rolling(roll).mean(), linestyle='-.',color="blue")
ax[i][2].set_ylabel(f"rmse_v_{head}")

if head in BASELINE_MAP:
baseline_hist = fetch_lcurve(BASELINE_MAP[head])
STEP_NORMAL_PREF = sum(weights.values())/weights[head]*128/120 # need to adjust this value

ax[i][0].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_e_trn"].rolling(1000).mean(), linestyle='-',color="red")
ax[i][0].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_e_val"].rolling(1000).mean(), linestyle='-.',color="red")
ax[i][1].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_f_trn"].rolling(1000).mean(), linestyle='-',color="red")
ax[i][1].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_f_val"].rolling(1000).mean(), linestyle='-.',color="red")
if "rmse_v_val" in baseline_hist:
ax[i][2].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_v_trn"].rolling(1000).mean(), linestyle='-',color="red")
ax[i][2].loglog([s * STEP_NORMAL_PREF for s in baseline_hist["step"]], baseline_hist[f"rmse_v_val"].rolling(1000).mean(), linestyle='-.',color="red")
if head in PREVIOUS_BASELINE:
ax[i][0].axhline(PREVIOUS_BASELINE[head]["rmse_e"],color="green", linestyle="-.")
ax[i][0].axhline(PREVIOUS_BASELINE[head]["e_std"],color="purple", linestyle="-.")
ax[i][1].axhline(PREVIOUS_BASELINE[head]["rmse_f"],color="green", linestyle="-.")
ax[i][1].axhline(PREVIOUS_BASELINE[head]["f_std"],color="purple", linestyle="-.")
ax[i][2].axhline(PREVIOUS_BASELINE[head]["rmse_v"],color="green", linestyle="-.")
ax[i][2].axhline(PREVIOUS_BASELINE[head]["v_std"],color="purple", linestyle="-.")

plt.tight_layout()
fig.savefig("lcurve.jpg")
sendimg(["lcurve.jpg"])
sendimg(["lcurve.jpg"], run_id)

if __name__ == "__main__":
main("/mnt/data_nas/public/multitask/training_exps/1018_b4_medium_l6_atton_37head_linear_fitting_tanh")
for exp_path in ["/mnt/data_nas/penganyang/experiments/1029_omat_batch128_medium_test"]:
main(exp_path)
# main("/mnt/data_nas/public/multitask/training_exps/1018_b4_medium_l6_atton_37head_linear_fitting_tanh")
125 changes: 73 additions & 52 deletions lamstare/experiments/plt_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,65 +61,86 @@ def parse_record_dict(all_records:dict) -> dict:

def main(exp_path:str):
run_id=exp_path.split("/")[-1] # Get basename as id
weights = get_head_weights(exp_path)
heads = list(weights.keys())
try:
weights = get_head_weights(exp_path)
heads = list(get_head_weights(exp_path).keys())
except KeyError:
heads = [""] # single task

n_heads = len(heads)
fig, ax = plt.subplots(n_heads+1, 3, figsize=(12,2*n_heads+3),sharex=True)

all_records = fetch_dptest_res(run_id)
all_records = parse_record_dict(all_records)
weighted_dptest = []
for i, head in enumerate(heads):
if head in all_records:
ax[i][0].loglog(all_records[head]["step"], all_records[head]["ener_mae"],"bo-",label="MAE")
ax[i][0].loglog(all_records[head]["step"], all_records[head]["ener_rmse"],"ro-",label="RMSE")
ax[i][0].set_ylabel(f"Energy_{head}")

ax[i][1].loglog(all_records[head]["step"], all_records[head]["force_mae"],"bo-",label="MAE")
ax[i][1].loglog(all_records[head]["step"], all_records[head]["force_rmse"],"ro-",label="RMSE")
ax[i][1].set_ylabel(f"Force_{head}")

ax[i][2].loglog(all_records[head]["step"], all_records[head]["virial_mae"],"bo-",label="MAE")
ax[i][2].loglog(all_records[head]["step"], all_records[head]["virial_rmse"],"ro-",label="RMSE")
ax[i][2].set_ylabel(f"Virial_{head}")
if head in PREVIOUS_BASELINE:
ax[i][0].axhline(PREVIOUS_BASELINE[head]["rmse_e"],color="red", linestyle="-.")
ax[i][1].axhline(PREVIOUS_BASELINE[head]["rmse_f"],color="red", linestyle="-.")
ax[i][2].axhline(PREVIOUS_BASELINE[head]["rmse_v"],color="red", linestyle="-.")
if "mae_e" in PREVIOUS_BASELINE[head]:
ax[i][0].axhline(PREVIOUS_BASELINE[head]["mae_e"],color="blue", linestyle="-.")
ax[i][1].axhline(PREVIOUS_BASELINE[head]["mae_f"],color="blue", linestyle="-.")
ax[i][2].axhline(PREVIOUS_BASELINE[head]["mae_v"],color="blue", linestyle="-.")

weighted_dptest.append(np.array([
all_records[head]["ener_mae"],
all_records[head]["ener_rmse"],
all_records[head]["force_mae"],
all_records[head]["force_rmse"],
all_records[head]["virial_mae"],
all_records[head]["virial_rmse"]
])*weights[head])
n_ckpt_to_weight = weighted_dptest[-1].shape[1]
weighted_dptest = [rr[:,:n_ckpt_to_weight] for rr in weighted_dptest]
weighted_dptest = np.sum(weighted_dptest, axis=0)/sum(weights.values())

ax[-1][0].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[0],"bo-",label="MAE")
ax[-1][0].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[1],"ro-",label="RMSE")
ax[-1][0].set_ylabel(f"Energy_Weighted")

ax[-1][1].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[2],"bo-",label="MAE")
ax[-1][1].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[3],"ro-",label="RMSE")
ax[-1][1].set_ylabel(f"Force_Weighted")

ax[-1][2].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[4],"bo-",label="MAE")
ax[-1][2].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[5],"ro-",label="RMSE")
ax[-1][2].set_ylabel(f"Virial_Weighted")
if n_heads == 1:
fig, ax = plt.subplots(1, 3, figsize=(12,5),sharex=True)
ax[0].loglog(all_records[""]["step"], all_records[""]["ener_mae"],"bo-",label="MAE")
ax[0].loglog(all_records[""]["step"], all_records[""]["ener_rmse"],"ro-",label="RMSE")
ax[0].set_ylabel(f"Energy")

ax[1].loglog(all_records[""]["step"], all_records[""]["force_mae"],"bo-",label="MAE")
ax[1].loglog(all_records[""]["step"], all_records[""]["force_rmse"],"ro-",label="RMSE")
ax[1].set_ylabel(f"Force")

ax[2].loglog(all_records[""]["step"], all_records[""]["virial_mae"],"bo-",label="MAE")
ax[2].loglog(all_records[""]["step"], all_records[""]["virial_rmse"],"ro-",label="RMSE")
ax[2].set_ylabel(f"Virial")
fig.suptitle(run_id)
else:
fig, ax = plt.subplots(n_heads+1, 3, figsize=(12,2*n_heads+3),sharex=True)
weighted_dptest = []
for i, head in enumerate(heads):
if head in all_records:
ax[i][0].loglog(all_records[head]["step"], all_records[head]["ener_mae"],"bo-",label="MAE")
ax[i][0].loglog(all_records[head]["step"], all_records[head]["ener_rmse"],"ro-",label="RMSE")
ax[i][0].set_ylabel(f"Energy_{head}")

ax[i][1].loglog(all_records[head]["step"], all_records[head]["force_mae"],"bo-",label="MAE")
ax[i][1].loglog(all_records[head]["step"], all_records[head]["force_rmse"],"ro-",label="RMSE")
ax[i][1].set_ylabel(f"Force_{head}")

ax[i][2].loglog(all_records[head]["step"], all_records[head]["virial_mae"],"bo-",label="MAE")
ax[i][2].loglog(all_records[head]["step"], all_records[head]["virial_rmse"],"ro-",label="RMSE")
ax[i][2].set_ylabel(f"Virial_{head}")
if head in PREVIOUS_BASELINE:
ax[i][0].axhline(PREVIOUS_BASELINE[head]["rmse_e"],color="red", linestyle="-.")
ax[i][1].axhline(PREVIOUS_BASELINE[head]["rmse_f"],color="red", linestyle="-.")
ax[i][2].axhline(PREVIOUS_BASELINE[head]["rmse_v"],color="red", linestyle="-.")
if "mae_e" in PREVIOUS_BASELINE[head]:
ax[i][0].axhline(PREVIOUS_BASELINE[head]["mae_e"],color="blue", linestyle="-.")
ax[i][1].axhline(PREVIOUS_BASELINE[head]["mae_f"],color="blue", linestyle="-.")
ax[i][2].axhline(PREVIOUS_BASELINE[head]["mae_v"],color="blue", linestyle="-.")

weighted_dptest.append(np.array([
all_records[head]["ener_mae"],
all_records[head]["ener_rmse"],
all_records[head]["force_mae"],
all_records[head]["force_rmse"],
all_records[head]["virial_mae"],
all_records[head]["virial_rmse"]
])*weights[head])
n_ckpt_to_weight = weighted_dptest[-1].shape[1]
weighted_dptest = [rr[:,:n_ckpt_to_weight] for rr in weighted_dptest]
weighted_dptest = np.sum(weighted_dptest, axis=0)/sum(weights.values())

ax[-1][0].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[0],"bo-",label="MAE")
ax[-1][0].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[1],"ro-",label="RMSE")
ax[-1][0].set_ylabel(f"Energy_Weighted")

ax[-1][1].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[2],"bo-",label="MAE")
ax[-1][1].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[3],"ro-",label="RMSE")
ax[-1][1].set_ylabel(f"Force_Weighted")

ax[-1][2].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[4],"bo-",label="MAE")
ax[-1][2].loglog(all_records[head]["step"][:n_ckpt_to_weight], weighted_dptest[5],"ro-",label="RMSE")
ax[-1][2].set_ylabel(f"Virial_Weighted")

plt.tight_layout()
fig.savefig("dptest.jpg")
sendimg(["dptest.jpg"])
sendimg(["dptest.jpg"], run_id)


if __name__ == "__main__":

main("/mnt/data_nas/public/multitask/training_exps/1018_b4_medium_l6_atton_37head_linear_fitting_tanh")
for exp_path in ["/mnt/data_nas/penganyang/experiments/1029_omat_batch128_medium_test"]:
main(exp_path)
# main("/mnt/data_nas/public/multitask/training_exps/1018_b4_medium_l6_atton_37head_linear_fitting_tanh")
# main("1015_37head_multitask_1gpu_test")
4 changes: 3 additions & 1 deletion lamstare/experiments/run_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import numpy as np
from dotenv import load_dotenv # type: ignore
import os
import sys
load_dotenv()

temp_file_path = os.environ.get("TEMP_FILE_DIR")
Expand Down Expand Up @@ -84,5 +85,6 @@ def main(exp_path:str, freq:int=200000):
print("No new ckpt to test.\n")

if __name__ == "__main__":
main("/mnt/workspace/cc/multitask/training_exps/1015_37head_multitask_1gpu_test") # multi task test data
main(sys.argv[1])
# main("/mnt/workspace/cc/multitask/training_exps/1015_37head_multitask_1gpu_test") # multi task test data
# main("/mnt/workspace/penganyang/experiments/1018_mptrj_l6_atton_b256_test") # single task test data
8 changes: 5 additions & 3 deletions lamstare/utils/dlc_submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
import sys
load_dotenv()

def submit_dptest_job_to_dlc(job_name:str):
def submit_dptest_job_to_dlc(exp_path:str):
job_name = exp_path.split("/")[-1]
docker_image = os.environ.get("DOCKER_IMAGE")
data_sources = os.environ.get("DATA_SOURCES")
workspace_id = os.environ.get("WORKSPACE_ID")
Expand All @@ -23,7 +24,7 @@ def submit_dptest_job_to_dlc(job_name:str):
f". /mnt/data_nas/public/.bashrc \n" \
f"conda activate /mnt/data_nas/public/Miniconda/envs/{venv} \n" \
f"cd {lamstare_path} \n" \
f"python lamstare/experiments/run_dptest.py \n" \
f"python lamstare/experiments/run_test.py {exp_path} \n" \


cmd = ['/mnt/data_nas/penganyang/dlc', 'submit', 'pytorchjob']
Expand All @@ -46,4 +47,5 @@ def submit_dptest_job_to_dlc(job_name:str):
raise RuntimeError(f"{job_name} failed")

if __name__ == "__main__":
submit_dptest_job_to_dlc(sys.argv[1])
for exp_path in ["/mnt/data_nas/penganyang/experiments/1029_omat_batch128_medium_test"]:
submit_dptest_job_to_dlc(exp_path) # abosulute path to the experiment folder
Loading

0 comments on commit 8accd89

Please sign in to comment.