# run_benchmark.py
"""Test our agent against a benchmark dataset.
This uses Langsmith. Please set your LangSmith API key. See
create_benchmark to create the benchmark dataset.
"""
import os
from config import set_environment
from langchain.chains import LLMChain
from langchain.smith import RunEvalConfig, run_on_dataset
from langchain_openai import ChatOpenAI
from langsmith import Client
set_environment()
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "My Project"
client = Client()
shared_dataset_name = "Reasoning and Bias"
llm = ChatOpenAI(model="gpt-4", temperature=0.0)


# Use a constructor function to initialize a fresh chain for each input:
def construct_chain():
    return LLMChain.from_string(
        llm,
        template="Help out as best you can.\nQuestion: {input}\nResponse: ",
    )


evaluation_config = RunEvalConfig(
    evaluators=[
        # Arbitrary criterion as a key: value pair in the criteria dict:
        RunEvalConfig.Criteria({"helpfulness": "Is the response helpful?"}),
        RunEvalConfig.Criteria({"insightful": "Is the response carefully thought out?"}),
    ]
)

prototype_results = run_on_dataset(
    client=client,
    dataset_name=shared_dataset_name,
    llm_or_chain_factory=construct_chain,
    evaluation=evaluation_config,
    verbose=True,
)
prototype_project_name = prototype_results["project_name"]

if __name__ == "__main__":
    pass
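
# ---------------------------------------------------------------------------
# For reference: a minimal, commented-out sketch of how a dataset named
# "Reasoning and Bias" could be registered with LangSmith, assuming the
# client's create_dataset/create_example methods. This is not the actual
# create_benchmark script referenced in the docstring; the description and
# example question below are placeholders, not real benchmark data.
#
# client = Client()
# dataset = client.create_dataset(
#     dataset_name="Reasoning and Bias",
#     description="Questions probing reasoning and bias in model responses.",
# )
# client.create_example(
#     inputs={"input": "A placeholder reasoning question."},
#     dataset_id=dataset.id,
# )
# ---------------------------------------------------------------------------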