defmodule LLMClassifierTest do
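  @moduledoc """
  A small DSL for exercising an LLM-backed text classifier against
  per-category positive and negative examples.

  A minimal usage sketch (`MyModel.classify/3` is a hypothetical function
  returning the list of category names the model assigned to the text):

      defmodule SentimentClassifierTest do
        use LLMClassifierTest,
          prompt_name: "sentiment_v1",
          model_function: &MyModel.classify/3

        category "positive_sentiment" do
          positive "I love this product!"
          negative "This is terrible.", "negative_sentiment"
        end
      end

      SentimentClassifierTest.run_all_tests("my-model")
  """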
  defmacro __using__(opts) do
    quote do
      import LLMClassifierTest

      @prompt_name unquote(opts[:prompt_name] || "default_prompt")
      @categories []
      # Fall back to the generated `default_model_function/3`. A remote
      # capture is used because local captures are not allowed in a module
      # body and anonymous functions cannot be stored in module attributes.
      @model_function unquote(opts[:model_function]) ||
                        (&__MODULE__.default_model_function/3)
      @before_compile LLMClassifierTest
    end
  end
  defmacro __before_compile__(_env) do
    quote do
      def run_all_tests(model_name) do
        results =
          @categories
          # `category/2` prepends, so reverse to run in declaration order.
          |> Enum.reverse()
          |> Enum.map(fn {name, tests} ->
            run_category_tests(name, tests, model_name, @prompt_name, @model_function)
          end)

        print_overall_summary(results)
      end

      # Mock implementation; override by passing `model_function:` to `use`.
      # Public so the remote capture stored in @model_function can call it.
      def default_model_function(_text, _model_name, _prompt_name) do
        ["default_category"]
      end
    end
  end
  defmacro category(name, do: block) do
    # A `do` block with several tests arrives as a `:__block__` AST node and
    # would otherwise evaluate to just its last expression; flatten it into a
    # list so every `positive`/`negative` test in the block is kept.
    tests =
      case block do
        {:__block__, _meta, exprs} -> exprs
        single -> [single]
      end

    quote do
      @categories [{unquote(name), unquote(tests)} | @categories]
    end
  end
  defmacro positive(text) do
    quote do
      {:positive, unquote(text)}
    end
  end

  defmacro negative(text, expected_category \\ nil) do
    quote do
      {:negative, unquote(text), unquote(expected_category)}
    end
  end
  def run_category_tests(category_name, tests, model_name, prompt_name, model_fun) do
    IO.puts("Running tests for category: #{category_name}")
    IO.puts("Model: #{model_name}, Prompt: #{prompt_name}")

    initial = %{positive: %{passed: 0, failed: 0}, negative: %{passed: 0, failed: 0}}

    results =
      Enum.reduce(tests, initial, fn test, acc ->
        case test do
          {:positive, text} ->
            run_positive_test(category_name, text, model_name, prompt_name, model_fun, acc)

          {:negative, text, expected_category} ->
            run_negative_test(category_name, text, expected_category, model_name, prompt_name, model_fun, acc)
        end
      end)

    print_category_summary(category_name, results)
    {category_name, results}
  end
  defp run_positive_test(category_name, text, model_name, prompt_name, model_fun, results) do
    # The model function is passed in explicitly: these are plain functions on
    # LLMClassifierTest, so the caller's @model_function attribute is not
    # visible here. `in` is also moved out of the guard position, where it
    # only accepts compile-time lists.
    categories = model_fun.(text, model_name, prompt_name)

    if category_name in categories do
      IO.puts("  ✅ Positive: #{text}")
      update_in(results, [:positive, :passed], &(&1 + 1))
    else
      IO.puts("  ❌ Positive: #{text}")
      IO.puts("     Expected #{category_name}, got #{Enum.join(categories, ", ")}")
      update_in(results, [:positive, :failed], &(&1 + 1))
    end
  end
  defp run_negative_test(category_name, text, expected_category, model_name, prompt_name, model_fun, results) do
    categories = model_fun.(text, model_name, prompt_name)

    cond do
      category_name in categories ->
        IO.puts("  ❌ Negative: #{text}")
        IO.puts("     Expected #{category_name} not to be present, got #{Enum.join(categories, ", ")}")
        update_in(results, [:negative, :failed], &(&1 + 1))

      is_nil(expected_category) or expected_category in categories ->
        IO.puts("  ✅ Negative: #{text}")
        update_in(results, [:negative, :passed], &(&1 + 1))

      true ->
        IO.puts("  ❌ Negative: #{text}")
        IO.puts("     Expected #{expected_category}, got #{Enum.join(categories, ", ")}")
        update_in(results, [:negative, :failed], &(&1 + 1))
    end
  end
  # Minimal placeholder: the original implementation was elided
  # ("implementation remains the same").
  defp print_category_summary(category_name, %{positive: pos, negative: neg}) do
    IO.puts(
      "Summary for #{category_name}: " <>
        "positive #{pos.passed}/#{pos.passed + pos.failed} passed, " <>
        "negative #{neg.passed}/#{neg.passed + neg.failed} passed"
    )
  end

  # Public because the generated `run_all_tests/1` calls it via import
  # (private functions are not importable); minimal placeholder, as above.
  def print_overall_summary(results) do
    {passed, failed} =
      Enum.reduce(results, {0, 0}, fn {_name, %{positive: pos, negative: neg}}, {p, f} ->
        {p + pos.passed + neg.passed, f + pos.failed + neg.failed}
      end)

    IO.puts("Overall: #{passed} passed, #{failed} failed")
  end
end
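
# A sketch of what a real model function might look like. This is an
# assumption for illustration: the endpoint URL, request payload, and
# response shape are hypothetical, and `Req` is a third-party HTTP client
# that would need to be added as a dependency.
#
#     defmodule MyModel do
#       def classify(text, model_name, prompt_name) do
#         response =
#           Req.post!("https://example.com/v1/classify",
#             json: %{text: text, model: model_name, prompt: prompt_name}
#           )
#
#         response.body["categories"]
#       end
#     end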