From 29cdd9a46ac27e1692b7a10a4837781f31ae38c1 Mon Sep 17 00:00:00 2001
From: pacman000
Date: Sat, 30 Nov 2024 21:34:13 -0500
Subject: [PATCH] evaluation final

---
 src/abstractions/model.py      |  8 ++--
 src/evaluation/quantify.py     | 68 ++++++++++++++++++++++------------
 src/evaluation/test_eval_01.py | 19 +++++-----
 src/evaluation/test_eval_02.py | 48 ++++++++++++------------
 src/evaluation/utils.py        | 13 +++++--
 5 files changed, 94 insertions(+), 62 deletions(-)

diff --git a/src/abstractions/model.py b/src/abstractions/model.py
index 57158d9..3b2fc42 100644
--- a/src/abstractions/model.py
+++ b/src/abstractions/model.py
@@ -999,7 +999,7 @@ def __inference_serial(
             else list(result_data.all_passages())
         )

-    def evaluate(self, logprobs = False, method: Literal["fast", "dummy"] = "fast") -> np.ndarray:
+    def evaluate(self, method: Literal["fast", "dummy"] = "fast", logprobs=True) -> np.ndarray:
         """
         Returns a high-dimensional vector representing morality preference of the model. Choose "dummy" for fast debugging runs.
         """
@@ -1012,7 +1012,7 @@ def evaluate(self, logprobs = False, method: Literal["fast", "dummy"] = "fast")
                 f'Method {method} not recognized. Options are "fast" and "dummy".'
             )

-    def __evaluate_fast(self, logprobs = False) -> np.ndarray:
+    def __evaluate_fast(self, logprobs=True) -> np.ndarray:
         if self.template_type != "alpaca":
             raise NotImplementedError(
                 "Fast evaluation is only supported for models using alpaca template."
@@ -1030,7 +1030,7 @@ def __evaluate_fast(self, logprobs = False) -> np.ndarray:
             os.mkdir(experiment_directory)

         if logprobs:
-            evaluation_input = eval_utils.regenerate_inputs(logprobs = True)
+            evaluation_input = eval_utils.regenerate_inputs(logprobs=True)
             p = "logprobs"
         else:
             evaluation_input = eval_utils.regenerate_inputs()
@@ -1052,7 +1052,7 @@ def __evaluate_fast(self, logprobs = False) -> np.ndarray:
         ) as f:
             json.dump(raw_stats, f)
             print("raw results saved")
-        vec = calculate_model(experiment_directory, self.model_name)
+        vec = calculate_model(experiment_directory, self.model_name, logprob=logprobs)
         return vec

     def __evaluate_slow_moralchoice(self) -> np.ndarray:
diff --git a/src/evaluation/quantify.py b/src/evaluation/quantify.py
index d4020bd..dbfbaed 100644
--- a/src/evaluation/quantify.py
+++ b/src/evaluation/quantify.py
@@ -38,7 +38,7 @@
     os.makedirs("logs/eval")


-def _calculate_model(test_name, high_or_low, model_name):
+def __calculate_model(test_name, high_or_low, model_name):
     # test_name, high_or_low, model_name = sys.argv[1], sys.argv[2], sys.argv[3]
     # assert high_or_low in ['low', 'high']
     scenario = (
@@ -150,7 +150,7 @@ def normalize_by_sum(vector_list):
     return normalized_vectors


-def calculate_model(test_dir, model_name):
+def calculate_model(test_dir, model_name, logprob=False):
     raw_dir = os.path.join(test_dir, model_name + "_raw.json")
     with open(raw_dir, "r") as f:
         raw_dict = json.load(f)
@@ -182,11 +182,18 @@ def calculate_model(test_dir, model_name):
        if valid_cnt <= 0:
            invalid[0] += 1
            continue
-        mal = (
-            entry["ab"][0] / (max(entry["ab"][-1], 1))
-            + entry["repeat"][0] / (max(entry["repeat"][-1], 1))
-            + entry["compare"][0] / (max(entry["compare"][-1], 1))
-        )
+        if not logprob:
+            mal = (
+                entry["ab"][0] / (max(entry["ab"][-1], 1))
+                + entry["repeat"][0] / (max(entry["repeat"][-1], 1))
+                + entry["compare"][0] / (max(entry["compare"][-1], 1))
+            )
+        else:
+            mal = (
+                entry["ab"][0] / (entry["ab"][-1])
+                + entry["repeat"][0] / (entry["repeat"][-1])
+                + entry["compare"][0] / (entry["compare"][-1])
+            )
        mal /= 3
        template = np.zeros(
            10
@@ -231,13 +238,24 @@ def calculate_model(test_dir, model_name):
                continue
            if key not in mrl_vec[num].keys():
                mrl_vec[num][key] = np.zeros((lambda x: 5 if x == 1 else 4)(num))
-            mrl_vec[2][key] += np.array(entry["4c_fav"][:4]) / (
-                max(entry["4c_fav"][-1], 1)
-            )
-            mrl_vec[2][key] += np.array(entry["repeat2_fav"][:4]) / (
-                max(entry["repeat2_fav"][-1], 1)
-            )
-            mrl_vec[2][key] /= 2
+            if not logprob:
+                mrl_vec[2][key] += np.array(entry["4c_fav"][:4]) / (
+                    max(entry["4c_fav"][-1], 1)
+                )
+                mrl_vec[2][key] += np.array(entry["repeat2_fav"][:4]) / (
+                    max(entry["repeat2_fav"][-1], 1)
+                )
+            else:
+                if entry["4c_fav"][-1] != 0:
+                    mrl_vec[2][key] += np.array(entry["4c_fav"][:4]) / (
+                        entry["4c_fav"][-1]
+                    )
+                if entry["repeat2_fav"][-1] != 0:
+                    mrl_vec[2][key] += np.array(entry["repeat2_fav"][:4]) / (
+                        entry["repeat2_fav"][-1]
+                    )
+            mrl_vec[2][key] /= max(int(entry["4c_fav"][-1] != 0) + int(entry["repeat2_fav"][-1] != 0), 1)
+
        if num == 1:
            # ref_dict = csv_to_dict_list(ref_dir[1], ['scenario_id', 'generation_theme'])
            ref_dict = csv_to_dict(ref_dir[1], ["generation_theme"])
@@ -269,11 +287,18 @@ def calculate_model(test_dir, model_name):
                continue
            if key not in mrl_vec[num].keys():
                mrl_vec[num][key] = np.zeros((lambda x: 5 if x == 1 else 4)(num))
-            mal = (
-                entry["ab"][0] / (max(entry["ab"][-1], 1))
-                + entry["repeat"][0] / (max(entry["repeat"][-1], 1))
-                + entry["compare"][0] / (max(entry["compare"][-1], 1))
-            )
+            if not logprob:
+                mal = (
+                    entry["ab"][0] / (max(entry["ab"][-1], 1))
+                    + entry["repeat"][0] / (max(entry["repeat"][-1], 1))
+                    + entry["compare"][0] / (max(entry["compare"][-1], 1))
+                )
+            else:
+                mal = (
+                    entry["ab"][0] / (entry["ab"][-1])
+                    + entry["repeat"][0] / (entry["repeat"][-1])
+                    + entry["compare"][0] / (entry["compare"][-1])
+                )
            mal /= 3
            context_matching = {
                "Harm_Care": 0,
@@ -284,10 +309,6 @@ def calculate_model(test_dir, model_name):
            }
            theme = ref_dict[key]["generation_theme"].strip()
            mrl_vec[1][key][context_matching[theme]] += mal
-    """
-    with open(os.path.join(test_dir, model_name + '_results.json'), 'w') as f:
-        json.dump(mrl_vec, f)
-    """
    # assert len(mrl_vec[1]) > 0
    if not (len(mrl_vec[0]) > 0 and len(mrl_vec[1]) > 0 and len(mrl_vec[2]) > 0):
        return np.zeros(19)
@@ -304,6 +325,7 @@ def calculate_model(test_dir, model_name):
        print("invalid rate:", invalid)
        res = np.concatenate(avg_vec)
    except:
+        print("returning zeros")
        res = np.zeros(19)
    return res

diff --git a/src/evaluation/test_eval_01.py b/src/evaluation/test_eval_01.py
index 849cf95..be5cf63 100644
--- a/src/evaluation/test_eval_01.py
+++ b/src/evaluation/test_eval_01.py
@@ -13,6 +13,7 @@
 """
 if __name__ == "__main__":
     freeze_support()
+
     set_model = [
         "8B-C013-instruct",
         "8B-C014-instruct",
@@ -22,30 +23,30 @@
         "8B-C018-instruct",
         "8B-C019-instruct",
         "8B-C020-instruct",
-        "8B-C021-instruct",
+        "8B-C021-instruct"
     ]
-    set_model = set_model[:2]
+    #set_model = ["8B-C018-instruct"]
     vec = []
     for m in set_model:
-        boi = Model(m)
-        v = boi.evaluate(method="fast", logprobs = True)
-        '''
+        #boi = Model(m)
+        #v = boi.evaluate(method="fast", logprobs = True)
+
         with open("output/datasets/evaluation_output_mc_" + m + ".json", 'r') as f:
             d = json.load(f)
         raw = _collect(d)
         with open('output/evaluation_results/' + m + '_single/' + m + '_raw.json', 'w') as f:
             json.dump(raw, f)
         v = qt.calculate_model('output/evaluation_results/' + m + '_single/', m)
-        '''
+
         vec.append(v)
-    test_name = "logprob_test"
+    test_name = "8b_13to21"
     with open("output/evaluation_results/" + test_name + ".json", "w") as f:
         lst = [list(boi) for boi in vec]
         json.dump(lst, f)
     vec = np.array(vec)
-    # qt.analyze_vectors_quadratic(vec)
+    qt.analyze_vectors_quadratic(vec)
     # vec = json.load(open("output/evaluation_results/" + test_name + ".json", "r"))
     # qt.plot_parallel_coordinates(vec)
     qt.plot_heatmap(vec[:, 10:15], test_name + '_foundation', label_set = 2, norm = "group")
     qt.plot_heatmap(vec[:, 15:19], test_name + '_view',label_set = 3, norm = "group")
-    qt.plot_heatmap(vec[:, :10], test_name + '_morality', label_set = 1, norm = "group")
+    qt.plot_heatmap(vec[:, :10], test_name + '_morality', label_set = 1, norm = "column")
diff --git a/src/evaluation/test_eval_02.py b/src/evaluation/test_eval_02.py
index 3a04a65..68fdaa5 100644
--- a/src/evaluation/test_eval_02.py
+++ b/src/evaluation/test_eval_02.py
@@ -4,33 +4,35 @@
 from multiprocessing import freeze_support
 import src.evaluation.quantify as qt
 import numpy as np
-"""
-generate_alpaca('mc', os.path.join('src', 'evaluation', 'raw_dataset', 'moralchoice'))
-generate_alpaca('views', os.path.join('src', 'evaluation', 'raw_dataset', 'views'))
-generate_alpaca('foundation', os.path.join('src', 'evaluation', 'raw_dataset', 'foundation'))
-"""
+import random
+
 if __name__ == "__main__":
-    os.environ["CUDA_VISIBLE_DEVICE"] = "0"
     freeze_support()
     set_model = [
         "8B-C013-instruct",
         "8B-C014-instruct",
-        "8B-C015-instruct"
+        "8B-C015-instruct",
+        "8B-C016-instruct",
+        "8B-C017-instruct",
+        "8B-C018-instruct",
+        "8B-C019-instruct",
+        "8B-C020-instruct",
+        "8B-C021-instruct"
     ]
-    vec = []
+    with open('src/evaluation/assets/input_alpaca.json', 'r') as f:
+        ref = json.load(f)
+    display = []
     for m in set_model:
-        with open("output/datasets/evaluation_output_mc_" + m + ".json", 'r') as f:
-            d = json.load(f)
-        raw = collect(d, logprobs = True)
-        with open('output/evaluation_results/' + m + '_single/' + m + '_raw.json', 'w') as f:
-            json.dump(raw, f)
-        v = qt.calculate_model('output/evaluation_results/' + m + '_single/', m)
-        vec.append(v)
-    test_name = "8b_all_fixed"
-    vec = np.array(vec)
-    with open("output/evaluation_results/" + test_name + ".json", "w") as f:
-        lst = [list(boi) for boi in vec]
-        json.dump(lst, f)
-    qt.plot_heatmap(vec[:, 10:15], test_name + '_foundation', label_set = 2, norm = "group")
-    qt.plot_heatmap(vec[:, 15:19], test_name + '_view',label_set = 3, norm = "group")
-    qt.plot_heatmap(vec[:, :10], test_name + '_morality', label_set = 1, norm = "group")
+        with open('output/evaluation_results/' + m + '_single/' + m + '_raw.json', 'r') as f:
+            data = json.load(f)
+        inputs = random.sample(ref, 3)
+        for input in inputs:
+            s, q, t, map, predicts = input['scenario_id'], input['question_type'], input['input'], input["mapping"], input["predict"]
+            #probs = list(data[s][q][:-1] / data[s][q][-1])
+            probs = [x / data[s][q][-1] for x in data[s][q][:-1]]
+            probs = [probs[i-1] for i in map]
+            display.append({"model": m, "question": t, "probs": probs})
+    with open('output/evaluation_results/display.json', 'w') as f:
+        json.dump(display, f)
+
+
\ No newline at end of file
diff --git a/src/evaluation/utils.py b/src/evaluation/utils.py
index 2149abf..4626956 100644
--- a/src/evaluation/utils.py
+++ b/src/evaluation/utils.py
@@ -253,7 +253,10 @@ def semantic_matching(item, mapping, four=False, verbal=False):

 def _collect(output_data):
     output = {}
+    invalid = 0
     for entry in output_data:
+        if "logprob" not in entry.keys():
+            continue
         s_id = entry["scenario_id"]
         q_type = entry["question_type"]
         mapping = entry["mapping"]
@@ -270,9 +273,13 @@ def _collect(output_data):
             "compare": [0, 0, 0],
             "repeat": [0, 0, 0],
         }
-        for i, x in enumerate(logprob):
-            output[s_id][q_type][mapping[i] - 1] = x
-            output[s_id][q_type][-1] += x
+        if isinstance(logprob, list):
+            for i, x in enumerate(logprob):
+                output[s_id][q_type][mapping[i] - 1] += np.exp(x)
+                output[s_id][q_type][-1] += np.exp(x)
+        else:
+            invalid += 1
+    print(invalid)
     return output
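
The logprob path added above works as follows: _collect exponentiates each option's log-probability and accumulates the resulting probability mass per canonical option, keeping the total mass in the entry's last slot, and calculate_model then divides by that mass rather than by max(count, 1). Below is a minimal sketch of that aggregation for one hypothetical "ab" entry; the log-probabilities and the mapping are made-up illustration values, not project data.

    import numpy as np

    # Hypothetical per-option log-probabilities, in the order the options were shown.
    logprob = [-0.4, -1.6]
    # mapping[i] is the 1-based canonical option index of the i-th shown option.
    mapping = [2, 1]

    # _collect-style accumulation: the last slot holds the total probability mass.
    ab = [0.0, 0.0, 0.0]
    for i, x in enumerate(logprob):
        ab[mapping[i] - 1] += np.exp(x)
        ab[-1] += np.exp(x)

    # calculate_model-style normalization on the logprob path: share of mass on
    # canonical option 1, instead of a vote count divided by max(count, 1).
    mal_ab = ab[0] / ab[-1]
    print(round(mal_ab, 3))  # ~0.231

With vote counting, the slots hold integer counts and the old max(..., 1) denominator is safe; with probability mass the total can legitimately fall below 1, which is why the logprob branches divide by the raw total instead.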