From aa1bb735c6ef7882803579681ac2b34f1956f299 Mon Sep 17 00:00:00 2001 From: knc6 Date: Wed, 27 Nov 2024 16:21:58 -0500 Subject: [PATCH] Add database generator. --- .../examples/inverse_model_multi/config.json | 1 + atomgpt/forward_models/forward_models.py | 7 +- atomgpt/inverse_models/database_generator.py | 110 ++++++++ atomgpt/inverse_models/inference.py | 106 -------- atomgpt/inverse_models/inverse_models.py | 254 +++++++++++------- atomgpt/inverse_models/saver.py | 63 ----- setup.py | 2 +- 7 files changed, 272 insertions(+), 271 deletions(-) create mode 100644 atomgpt/inverse_models/database_generator.py delete mode 100644 atomgpt/inverse_models/inference.py delete mode 100644 atomgpt/inverse_models/saver.py diff --git a/atomgpt/examples/inverse_model_multi/config.json b/atomgpt/examples/inverse_model_multi/config.json index a417edf..fa3d2e5 100644 --- a/atomgpt/examples/inverse_model_multi/config.json +++ b/atomgpt/examples/inverse_model_multi/config.json @@ -21,6 +21,7 @@ "csv_out": "AI-AtomGen-prop-dft_3d-test-rmse.csv", "chem_info": "formula", "max_seq_length": 2048, + "prop": "multival", "dtype": null, "load_in_4bit": true, "instruction": "", diff --git a/atomgpt/forward_models/forward_models.py b/atomgpt/forward_models/forward_models.py index 817cdb5..86e447b 100644 --- a/atomgpt/forward_models/forward_models.py +++ b/atomgpt/forward_models/forward_models.py @@ -280,10 +280,15 @@ def __getitem__(self, idx): # Example usage -def main(config_file="config.json"): +def main(config_file=None): figlet = get_figlet() print(figlet) print("Running AtomGPT prop predictor.") + if config_file is None: + + args = parser.parse_args(sys.argv[1:]) + config_file = args.config_name + # run_path = os.path.abspath(config_file).split("config.json")[0] config = loadjson(config_file) config = TrainingPropConfig(**config) diff --git a/atomgpt/inverse_models/database_generator.py b/atomgpt/inverse_models/database_generator.py new file mode 100644 index 0000000..71c18b5 --- /dev/null +++ b/atomgpt/inverse_models/database_generator.py @@ -0,0 +1,110 @@ +from jarvis.core.specie import atomic_numbers_to_symbols +import numpy as np +from jarvis.db.jsonutils import loadjson, dumpjson +from jarvis.core.composition import Composition +from atomgpt.inverse_models.inverse_models import ( + load_model, + get_input, + batch_evaluate, +) +import time +from itertools import combinations_with_replacement, permutations +from jarvis.core.atoms import Atoms + + +class AtomicDBGenerator: + def __init__( + self, + max_atomic_number=100, + max_stoichiometry=2, + elements=None, + model_path="", + config=None, + tokenizer=None, + model=None, + target=10, + batch_size=2, + ): + self.max_atomic_number = max_atomic_number + self.max_stoichiometry = ( + max_stoichiometry # Maximum number of elements in a compound + ) + self.model_path = model_path + self.elements = elements or [] + self.target = str(target) + self.batch_size = batch_size + if not self.elements: + Z = np.arange(max_atomic_number) + 1 + self.elements = atomic_numbers_to_symbols(Z) + self.elements = list(set(self.elements)) + self.model = model + self.config = config + self.tokenizer = tokenizer + + if self.model_path == "" and self.model is None: + raise ValueError("Provide model_path") + if self.model is None: + model, tokenizer, config = load_model(path=self.model_path) + self.model = model + self.tokenizer = tokenizer + self.config = config + + def generate_samples(self): + t1 = time.time() + mem = {} + inputs = set() # Use a set to ensure uniqueness + compositions = set() # To keep track of unique compositions + + for stoich_count in range( + 1, self.max_stoichiometry + 1 + ): # From unary to desired stoichiometry + for comb in combinations_with_replacement( + self.elements, stoich_count + ): + for perm in permutations(comb): # To generate all orderings + try: + comp_dict = {el: perm.count(el) for el in set(perm)} + comp = Composition.from_dict(comp_dict) + reduced_formula = comp.reduced_formula + + if reduced_formula not in compositions: + compositions.add(reduced_formula) + inp = get_input( + config=self.config, + chem=reduced_formula, + val=self.target, + ) + inputs.add( + inp + ) # Add to inputs to ensure uniqueness + except Exception as exp: + print("Exp", exp) + pass + + mem["inputs"] = list(inputs) + mem["outputs"] = batch_evaluate( + prompts=list(inputs), + model=self.model, + tokenizer=self.tokenizer, + csv_out="out.csv", + config=self.config, + batch_size=self.batch_size, + ) + # for i,j in mem.items(): + # print(i,j) + fname = f"materials_stoichiometry_{self.max_stoichiometry}.json" + t2 = time.time() + mem["time"] = t2 - t1 + dumpjson(data=mem, filename=fname) + print(f"Time taken for up to {self.max_stoichiometry}-ary: {t2 - t1}") + return mem + + +if __name__ == "__main__": + gen = AtomicDBGenerator( + elements=["Mg", "B", "C"], + max_stoichiometry=2, # Can be set to any desired order + model_path="/wrk/knc6/Software/atomgpt_opt/atomgpt/lora_model_m/", + batch_size=10, + ) + gen.generate_samples() diff --git a/atomgpt/inverse_models/inference.py b/atomgpt/inverse_models/inference.py deleted file mode 100644 index e8135cd..0000000 --- a/atomgpt/inverse_models/inference.py +++ /dev/null @@ -1,106 +0,0 @@ -"""Module for inference.""" -from jarvis.db.jsonutils import loadjson -from unsloth import FastLanguageModel -import torch -from datasets import load_dataset -from trl import SFTTrainer -from transformers import TrainingArguments -from jarvis.core.atoms import Atoms -from jarvis.db.figshare import data -from jarvis.db.jsonutils import loadjson, dumpjson -import numpy as np -from jarvis.core.atoms import Atoms -from jarvis.core.lattice import Lattice -from tqdm import tqdm -from jarvis.io.vasp.inputs import Poscar - -import os -#os.environ['CUDA_VISIBLE_DEVICES']='0' -#torch.cuda.is_available = lambda : False -alpaca_prompt = """Below is a description of a superconductor material.. - -### Instruction: -{} - -### Input: -{} - -### Output: -{}""" - -alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. - -### Instruction: -{} - -### Input: -{} - -### Response: -{}""" - -max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally! -dtype = None # -load_in_4bit = True -model, tokenizer = FastLanguageModel.from_pretrained( - model_name = "/wrk/knc6/AtomGPT/SuperCon/atomgpt_bulk_gen_formation_energy_peratom/lora_model_m", # YOUR MODEL YOU USED FOR TRAINING - #model_name = "lora_model_mo", # YOUR MODEL YOU USED FOR TRAINING - max_seq_length = max_seq_length, - dtype = dtype, - load_in_4bit = load_in_4bit, - device_map="auto" - -) -FastLanguageModel.for_inference(model) # Enable native 2x faster inference - - -def text2atoms(response): - tmp_atoms_array = response.split('\n') - - lat_lengths = np.array(tmp_atoms_array[1].split(),dtype='float') - lat_angles = np.array(tmp_atoms_array[2].split(),dtype='float') - - lat = Lattice.from_parameters(lat_lengths[0], lat_lengths[1], lat_lengths[2], lat_angles[0], lat_angles[1], lat_angles[2]) - elements=[] - coords=[] - for ii,i in enumerate(tmp_atoms_array): - if ii>2 and ii", "").split("### Output:")[1] - for output in outputs_decoded - ] - print("outputs_decoded", outputs_decoded) - f.write("id,target,prediction\n") - - for ii, i in tqdm(enumerate(test_set), total=len(test_set)): - # try: - - gen_mat = ( - Poscar(text2atoms(outputs_decoded[ii])) - .to_string() - .replace("\n", "\\n") + print("Testing\n", len(prompts)) + if batch_size is None: + batch_size = len(prompts) + outputs_decoded = [] + for batch_start in tqdm(range(0, len(prompts), batch_size)): + batch_end = min(batch_start + batch_size, len(prompts)) + batch_prompts = prompts[batch_start:batch_end] + + # Tokenize and prepare inputs + inputs = tokenizer( + [ + config.alpaca_prompt.format(config.instruction, msg, "") + for msg in batch_prompts + ], + return_tensors="pt", + padding=True, + truncation=True, + ).to("cuda") + + # Generate outputs using the model + outputs = model.generate( + **inputs, + max_new_tokens=config.max_seq_length, + use_cache=True, ) - if target_exists: - target_mat = ( - Poscar(text2atoms("\n" + i["output"])) - .to_string() - .replace("\n", "\\n") + + # Decode outputs + outputs_decoded_temp = tokenizer.batch_decode(outputs) + # print('outputs_decoded_temp',outputs_decoded_temp) + for output in outputs_decoded_temp: + outputs_decoded.append( + output.replace("", "") + .split("### Output:")[1] + .strip("") ) - else: - target_mat = "" - print("target_mat", target_mat) - print("genmat", gen_mat) - line = ids[ii] + "," + target_mat + "," + gen_mat + "\n" - f.write(line) - print() - # except Exception as exp: - # print("Error", exp) - # pass + + # print("outputs_decoded", outputs_decoded) + f.write("id,target,prediction\n") + + for ii, i in tqdm(enumerate(outputs_decoded), total=len(outputs_decoded)): + try: + # print("outputs_decoded[ii]",i) + atoms = text2atoms(i) + gen_mat = Poscar(atoms).to_string().replace("\n", "\\n") + gen_atoms.append(atoms.to_dict()) + if target_exists: + target_mat = ( + Poscar(text2atoms("\n" + i["output"])) + .to_string() + .replace("\n", "\\n") + ) + else: + target_mat = "" + # print("target_mat", target_mat) + # print("genmat", gen_mat) + line = ids[ii] + "," + target_mat + "," + gen_mat + "\n" + f.write(line) + # print() + except Exception as exp: + print("Error", exp) + pass f.close() + return gen_atoms -def main(config_file="config.json"): +def main(config_file=None): + if config_file is None: + + args = parser.parse_args(sys.argv[1:]) + config_file = args.config_name if not torch.cuda.is_available(): raise ValueError("Currently model training is possible with GPU only.") figlet = get_figlet() print(figlet) t1 = time.time() + print("config_file", config_file) config = loadjson(config_file) config = TrainingPropConfig(**config) pprint.pprint(config.dict()) @@ -271,10 +321,10 @@ def main(config_file="config.json"): if not os.path.exists(config.model_save_path): os.makedirs(config.model_save_path) tmp = config.dict() - f = open(os.path.join(config.output_dir, "config.json"), "w") + f = open(os.path.join(config.output_dir, "atomgpt_config.json"), "w") f.write(json.dumps(tmp, indent=4)) f.close() - f = open(os.path.join(config.model_save_path, "config.json"), "w") + f = open(os.path.join(config.model_save_path, "atomgpt_config.json"), "w") f.write(json.dumps(tmp, indent=4)) f.close() id_prop_path = config.id_prop_path @@ -295,7 +345,7 @@ def main(config_file="config.json"): info["id"] = i[0] ids.append(i[0]) tmp = [float(j) for j in i[1:]] - print("tmp", tmp) + # print("tmp", tmp) if len(tmp) == 1: tmp = str(float(tmp[0])) else: @@ -305,7 +355,7 @@ def main(config_file="config.json"): # tmp = "\n".join([str(round(float(j), 2)) for j in i[1].split(";")]) # else: # tmp = str(round(float(i[1]), 3)) - info["prop"] = ( + info[config.prop] = ( tmp # float(i[1]) # [float(j) for j in i[1:]] # float(i[1] ) pth = os.path.join(run_path, info["id"]) @@ -328,10 +378,11 @@ def main(config_file="config.json"): m_train = make_alpaca_json( dataset=dat, jids=train_ids, - prop=config.property_name, - instruction=config.instruction, - chem_info=config.chem_info, - output_prompt=config.output_prompt, + config=config, + # prop=config.property_name, + # instruction=config.instruction, + # chem_info=config.chem_info, + # output_prompt=config.output_prompt, ) dumpjson(data=m_train, filename="alpaca_prop_train.json") print("Sample:\n", m_train[0]) @@ -339,11 +390,12 @@ def main(config_file="config.json"): m_test = make_alpaca_json( dataset=dat, jids=test_ids, - prop="prop", + config=config, + # prop="prop", include_jid=True, - instruction=config.instruction, - chem_info=config.chem_info, - output_prompt=config.output_prompt, + # instruction=config.instruction, + # chem_info=config.chem_info, + # output_prompt=config.output_prompt, ) dumpjson(data=m_test, filename="alpaca_prop_test.json") @@ -434,12 +486,13 @@ def main(config_file="config.json"): load_in_4bit=config.load_in_4bit, ) FastLanguageModel.for_inference(model) # Enable native 2x faster inference + model, tokenizer, config = load_model(path=config.model_save_path) # batch_evaluate( - # prompts=[i["input"] for i in m_test], - # model=model, - # tokenizer=tokenizer, - # csv_out=config.csv_out, - # config=config, + # prompts=[i["input"] for i in m_test], + # model=model, + # tokenizer=tokenizer, + # csv_out=config.csv_out, + # config=config, # ) # t1 = time.time() # batch_evaluate( @@ -469,3 +522,4 @@ def main(config_file="config.json"): main(config_file=args.config_name) # config_file="config.json" # ) + # x=load_model(path="/wrk/knc6/Software/atomgpt_opt/atomgpt/lora_model_m/") diff --git a/atomgpt/inverse_models/saver.py b/atomgpt/inverse_models/saver.py deleted file mode 100644 index d678531..0000000 --- a/atomgpt/inverse_models/saver.py +++ /dev/null @@ -1,63 +0,0 @@ -from atomgpt.inverse_models.loader import FastLanguageModel -from atomgpt.inverse_models.save import save_to_gguf,unsloth_save_pretrained_gguf -import json -fourbit_models = [ - "unsloth/mistral-7b-bnb-4bit", - "unsloth/mistral-7b-instruct-v0.2-bnb-4bit", - "unsloth/llama-2-7b-bnb-4bit", - "unsloth/llama-2-13b-bnb-4bit", - "unsloth/codellama-34b-bnb-4bit", - "unsloth/tinyllama-bnb-4bit", -] # More models at https://huggingface.co/unsloth - -nm = "unsloth/mistral-7b-bnb-4bit" -nm = fourbit_models[-2] -nm = fourbit_models[0] -max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally! -dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+ -load_in_4bit = ( - True # Use 4bit quantization to reduce memory usage. Can be False. -) -model, tokenizer = FastLanguageModel.from_pretrained( - model_name=nm, # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B - max_seq_length=max_seq_length, - dtype=dtype, - load_in_4bit=load_in_4bit, - # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf -) -model = FastLanguageModel.get_peft_model( - model, - r=16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128 - target_modules=[ - "q_proj", - "k_proj", - "v_proj", - "o_proj", - "gate_proj", - "up_proj", - "down_proj", - ], - lora_alpha=16, - lora_dropout=0, # Supports any, but = 0 is optimized - bias="none", # Supports any, but = "none" is optimized - use_gradient_checkpointing=True, - random_state=3407, - use_rslora=False, # We support rank stabilized LoRA - loftq_config=None, # And LoftQ -) -model.save_pretrained("unsloth_finetuned_model") -tokenizer.save_pretrained("unsloth_finetuned_model") -#model.save_pretrained_gguf("xyz",tokenizer=tokenizer) -max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally! -dtype = None # -load_in_4bit = True -model, tokenizer = FastLanguageModel.from_pretrained( - model_name = "xyz-unsloth.Q8_0.gguf", # YOUR MODEL YOU USED FOR TRAINING - max_seq_length = max_seq_length, - dtype = dtype, - load_in_4bit = load_in_4bit, - device_map="auto" - -) -FastLanguageModel.for_inference(model) # Enable native 2x faster inference -print(model) diff --git a/setup.py b/setup.py index 09310ab..5f7e4c3 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ "protobuf", # "alignn", ], - scripts=["atomgpt/train_prop.py"], + # scripts=["atomgpt/train_prop.py"], entry_points={ "console_scripts": [ "atomgpt_forward=atomgpt.forward_models.forward_models:main",