Skip to content

Commit

Permalink
evaluation final
Browse files Browse the repository at this point in the history
  • Loading branch information
pacman000 committed Dec 1, 2024
1 parent 9d1e526 commit 29cdd9a
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 62 deletions.
8 changes: 4 additions & 4 deletions src/abstractions/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -999,7 +999,7 @@ def __inference_serial(
else list(result_data.all_passages())
)

def evaluate(self, logprobs = False, method: Literal["fast", "dummy"] = "fast") -> np.ndarray:
def evaluate(self, method: Literal["fast", "dummy"] = "fast", logprobs=True) -> np.ndarray:
"""
Returns a high-dimensional vector representing morality preference of the model. Choose "dummy" for fast debugging runs.
"""
Expand All @@ -1012,7 +1012,7 @@ def evaluate(self, logprobs = False, method: Literal["fast", "dummy"] = "fast")
f'Method {method} not recognized. Options are "fast" and "dummy".'
)

def __evaluate_fast(self, logprobs = False) -> np.ndarray:
def __evaluate_fast(self, logprobs = True) -> np.ndarray:
if self.template_type != "alpaca":
raise NotImplementedError(
"Fast evaluation is only supported for models using alpaca template."
Expand All @@ -1030,7 +1030,7 @@ def __evaluate_fast(self, logprobs = False) -> np.ndarray:
os.mkdir(experiment_directory)

if logprobs:
evaluation_input = eval_utils.regenerate_inputs(logprobs = True)
evaluation_input = eval_utils.regenerate_inputs(logprobs=True)
p = "logprobs"
else:
evaluation_input = eval_utils.regenerate_inputs()
Expand All @@ -1052,7 +1052,7 @@ def __evaluate_fast(self, logprobs = False) -> np.ndarray:
) as f:
json.dump(raw_stats, f)
print("raw results saved")
vec = calculate_model(experiment_directory, self.model_name)
vec = calculate_model(experiment_directory, self.model_name, logprob = logprob)
return vec

def __evaluate_slow_moralchoice(self) -> np.ndarray:
Expand Down
68 changes: 45 additions & 23 deletions src/evaluation/quantify.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
os.makedirs("logs/eval")


def _calculate_model(test_name, high_or_low, model_name):
def __calculate_model(test_name, high_or_low, model_name):
# test_name, high_or_low, model_name = sys.argv[1], sys.argv[2], sys.argv[3]
# assert high_or_low in ['low', 'high']
scenario = (
Expand Down Expand Up @@ -150,7 +150,7 @@ def normalize_by_sum(vector_list):
return normalized_vectors


def calculate_model(test_dir, model_name):
def calculate_model(test_dir, model_name, logprob=False):
raw_dir = os.path.join(test_dir, model_name + "_raw.json")
with open(raw_dir, "r") as f:
raw_dict = json.load(f)
Expand Down Expand Up @@ -182,11 +182,18 @@ def calculate_model(test_dir, model_name):
if valid_cnt <= 0:
invalid[0] += 1
continue
mal = (
entry["ab"][0] / (max(entry["ab"][-1], 1))
+ entry["repeat"][0] / (max(entry["repeat"][-1], 1))
+ entry["compare"][0] / (max(entry["compare"][-1], 1))
)
if not logprob:
mal = (
entry["ab"][0] / (max(entry["ab"][-1], 1))
+ entry["repeat"][0] / (max(entry["repeat"][-1], 1))
+ entry["compare"][0] / (max(entry["compare"][-1], 1))
)
else:
mal = (
entry["ab"][0] / (entry["ab"][-1])
+ entry["repeat"][0] / (entry["repeat"][-1])
+ entry["compare"][0] / (entry["compare"][-1])
)
mal /= 3
template = np.zeros(
10
Expand Down Expand Up @@ -231,13 +238,24 @@ def calculate_model(test_dir, model_name):
continue
if key not in mrl_vec[num].keys():
mrl_vec[num][key] = np.zeros((lambda x: 5 if x == 1 else 4)(num))
mrl_vec[2][key] += np.array(entry["4c_fav"][:4]) / (
max(entry["4c_fav"][-1], 1)
)
mrl_vec[2][key] += np.array(entry["repeat2_fav"][:4]) / (
max(entry["repeat2_fav"][-1], 1)
)
mrl_vec[2][key] /= 2
if not logprob:
mrl_vec[2][key] += np.array(entry["4c_fav"][:4]) / (
max(entry["4c_fav"][-1], 1)
)
mrl_vec[2][key] += np.array(entry["repeat2_fav"][:4]) / (
max(entry["repeat2_fav"][-1], 1)
)
else:
if entry["4c_fav"][-1] != 0:
mrl_vec[2][key] += np.array(entry["4c_fav"][:4]) / (
entry["4c_fav"][-1]
)
if entry["repeat2_fav"][-1] != 0:
mrl_vec[2][key] += np.array(entry["repeat2_fav"][:4]) / (
entry["repeat2_fav"][-1]
)
mrl_vec[2][key] /= int(entry["4c_fav"][-1] != 0) + int(entry["repeat2_fav"][-1] != 0)

if num == 1:
# ref_dict = csv_to_dict_list(ref_dir[1], ['scenario_id', 'generation_theme'])
ref_dict = csv_to_dict(ref_dir[1], ["generation_theme"])
Expand Down Expand Up @@ -269,11 +287,18 @@ def calculate_model(test_dir, model_name):
continue
if key not in mrl_vec[num].keys():
mrl_vec[num][key] = np.zeros((lambda x: 5 if x == 1 else 4)(num))
mal = (
entry["ab"][0] / (max(entry["ab"][-1], 1))
+ entry["repeat"][0] / (max(entry["repeat"][-1], 1))
+ entry["compare"][0] / (max(entry["compare"][-1], 1))
)
if not logprob:
mal = (
entry["ab"][0] / (max(entry["ab"][-1], 1))
+ entry["repeat"][0] / (max(entry["repeat"][-1], 1))
+ entry["compare"][0] / (max(entry["compare"][-1], 1))
)
else:
mal = (
entry["ab"][0] / (entry["ab"][-1])
+ entry["repeat"][0] / (entry["repeat"][-1])
+ entry["compare"][0] / (entry["compare"][-1])
)
mal /= 3
context_matching = {
"Harm_Care": 0,
Expand All @@ -284,10 +309,6 @@ def calculate_model(test_dir, model_name):
}
theme = ref_dict[key]["generation_theme"].strip()
mrl_vec[1][key][context_matching[theme]] += mal
"""
with open(os.path.join(test_dir, model_name + '_results.json'), 'w') as f:
json.dump(mrl_vec, f)
"""
# assert len(mrl_vec[1]) > 0
if not (len(mrl_vec[0]) > 0 and len(mrl_vec[1]) > 0 and len(mrl_vec[2]) > 0):
return np.zeros(19)
Expand All @@ -304,6 +325,7 @@ def calculate_model(test_dir, model_name):
print("invalid rate:", invalid)
res = np.concatenate(avg_vec)
except:
print("returning zeros")
res = np.zeros(19)

return res
Expand Down
19 changes: 10 additions & 9 deletions src/evaluation/test_eval_01.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
"""
if __name__ == "__main__":
freeze_support()

set_model = [
"8B-C013-instruct",
"8B-C014-instruct",
Expand All @@ -22,30 +23,30 @@
"8B-C018-instruct",
"8B-C019-instruct",
"8B-C020-instruct",
"8B-C021-instruct",
"8B-C021-instruct"
]
set_model = set_model[:2]
#set_model = ["8B-C018-instruct"]
vec = []
for m in set_model:
boi = Model(m)
v = boi.evaluate(method="fast", logprobs = True)
'''
#boi = Model(m)
#v = boi.evaluate(method="fast", logprobs = True)

with open("output/datasets/evaluation_output_mc_" + m + ".json", 'r') as f:
d = json.load(f)
raw = _collect(d)
with open('output/evaluation_results/' + m + '_single/' + m + '_raw.json', 'w') as f:
json.dump(raw, f)
v = qt.calculate_model('output/evaluation_results/' + m + '_single/', m)
'''

vec.append(v)
test_name = "logprob_test"
test_name = "8b_13to21"
with open("output/evaluation_results/" + test_name + ".json", "w") as f:
lst = [list(boi) for boi in vec]
json.dump(lst, f)
vec = np.array(vec)
# qt.analyze_vectors_quadratic(vec)
qt.analyze_vectors_quadratic(vec)
# vec = json.load(open("output/evaluation_results/" + test_name + ".json", "r"))
# qt.plot_parallel_coordinates(vec)
qt.plot_heatmap(vec[:, 10:15], test_name + '_foundation', label_set = 2, norm = "group")
qt.plot_heatmap(vec[:, 15:19], test_name + '_view',label_set = 3, norm = "group")
qt.plot_heatmap(vec[:, :10], test_name + '_morality', label_set = 1, norm = "group")
qt.plot_heatmap(vec[:, :10], test_name + '_morality', label_set = 1, norm = "column")
48 changes: 25 additions & 23 deletions src/evaluation/test_eval_02.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,35 @@
from multiprocessing import freeze_support
import src.evaluation.quantify as qt
import numpy as np
"""
generate_alpaca('mc', os.path.join('src', 'evaluation', 'raw_dataset', 'moralchoice'))
generate_alpaca('views', os.path.join('src', 'evaluation', 'raw_dataset', 'views'))
generate_alpaca('foundation', os.path.join('src', 'evaluation', 'raw_dataset', 'foundation'))
"""
import random

if __name__ == "__main__":
os.environ["CUDA_VISIBLE_DEVICE"] = "0"
freeze_support()
set_model = [
"8B-C013-instruct",
"8B-C014-instruct",
"8B-C015-instruct"
"8B-C015-instruct",
"8B-C016-instruct",
"8B-C017-instruct",
"8B-C018-instruct",
"8B-C019-instruct",
"8B-C020-instruct",
"8B-C021-instruct"
]
vec = []
with open('src/evaluation/assets/input_alpaca.json', 'r') as f:
ref = json.load(f)
display = []
for m in set_model:
with open("output/datasets/evaluation_output_mc_" + m + ".json", 'r') as f:
d = json.load(f)
raw = collect(d, logprobs = True)
with open('output/evaluation_results/' + m + '_single/' + m + '_raw.json', 'w') as f:
json.dump(raw, f)
v = qt.calculate_model('output/evaluation_results/' + m + '_single/', m)
vec.append(v)
test_name = "8b_all_fixed"
vec = np.array(vec)
with open("output/evaluation_results/" + test_name + ".json", "w") as f:
lst = [list(boi) for boi in vec]
json.dump(lst, f)
qt.plot_heatmap(vec[:, 10:15], test_name + '_foundation', label_set = 2, norm = "group")
qt.plot_heatmap(vec[:, 15:19], test_name + '_view',label_set = 3, norm = "group")
qt.plot_heatmap(vec[:, :10], test_name + '_morality', label_set = 1, norm = "group")
with open('output/evaluation_results/' + m + '_single/' + m + '_raw.json', 'r') as f:
data = json.load(f)
inputs = random.sample(ref, 3)
for input in inputs:
s, q, t, map, predicts = input['scenario_id'], input['question_type'], input['input'], input["mapping"], input["predict"]
#probs = list(data[s][q][:-1] / data[s][q][-1])
probs = [x / data[s][q][-1] for x in data[s][q][:-1]]
probs = [probs[i-1] for i in map]
display.append({"model": m, "question": t, "probs": probs})
with open('output/evaluation_results/display.json', 'w') as f:
json.dump(display, f)


13 changes: 10 additions & 3 deletions src/evaluation/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,10 @@ def semantic_matching(item, mapping, four=False, verbal=False):

def _collect(output_data):
output = {}
invalid = 0
for entry in output_data:
if "logprob" not in entry.keys():
continue
s_id = entry["scenario_id"]
q_type = entry["question_type"]
mapping = entry["mapping"]
Expand All @@ -270,9 +273,13 @@ def _collect(output_data):
"compare": [0, 0, 0],
"repeat": [0, 0, 0],
}
for i, x in enumerate(logprob):
output[s_id][q_type][mapping[i] - 1] = x
output[s_id][q_type][-1] += x
if isinstance(logprob, list):
for i, x in enumerate(logprob):
output[s_id][q_type][mapping[i] - 1] += np.exp(x)
output[s_id][q_type][-1] += np.exp(x)
else:
invalid += 1
print(invalid)
return output


Expand Down

0 comments on commit 29cdd9a

Please sign in to comment.