Bug Description
I tried to obtain Answer Relevance, Groundedness, Context Relevance, and Ground Truth Agreement metrics for a RAG chain (built with LangChain) and for a query engine (built with LlamaIndex) using TruLens, but I keep getting the following warning:
"Failed to process and remove trivial statements. Proceeding with all statements."
I built the RAG chain with LangChain and, as the evaluator, tried both LiteLLM with Ollama and OpenAI from langchain_community.llms, but I get the same warning with every evaluator.
I also built a query engine with LlamaIndex and tried to obtain the RAG triad metrics with TruLlama; that setup gave the same warning.
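If it helps with triage, the groundedness call can be exercised on its own with a sketch like the one below (the source/statement strings are made up; the provider is built the same way as in evaluate_rag further down):
# Sketch only: isolate the groundedness feedback that emits the warning.
# The example source/statement pair is made up; the provider mirrors the
# construction used in evaluate_rag below.
from langchain_community.llms import OpenAI
from trulens.providers.langchain import Langchain

llm = OpenAI(model="gpt-4o-mini-2024-07-18", temperature=0)
provider = Langchain(chain=llm)
score, reasons = provider.groundedness_measure_with_cot_reasons(
    source="The Eiffel Tower is located in Paris and was completed in 1889.",
    statement="The Eiffel Tower is in Paris.",
)
print(score, reasons)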
To Reproduce
Here is the code that I used:
import os
import gc
import numpy as np
import json
import wandb
import torch
from tqdm.auto import tqdm
wandb.login(key = "KEY")
os.environ['OPENAI_API_KEY'] = 'KEY'
%%
Logging the raw dataset as an artifact.
with wandb.init(job_type='upload_docs', project='rag-tracking') as run:
    # you can also write wandb.init(entity='', project='', job_type='')...
    artifact = wandb.Artifact('raw_doc', type='dataset')
    artifact.add_dir(local_path='./docs', name='docs_sample')
    run.log_artifact(artifact)
%%
from langchain_community.document_loaders import PDFPlumberLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
def create_chunks(artifact_path, run):
    """ Retrieves the dataset, parses the contents and
    splits the text into chunks to be sent.
    Arguments:
    artifact_path -- string: Document version
    run -- wandb run object
    Returns:
    List
    """
    # Use the artifact
    artifact = run.use_artifact(f'rag-tracking/{artifact_path}', type='dataset')
    # Download the artifact directory
    artifact_dir = artifact.download()
    # Initialize a list to hold all document sections
    all_document_sections = []
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=20)
    # Loop over each PDF file in the downloaded artifact directory
    for filename in os.listdir(f'{artifact_dir}/docs_sample'):
        if filename.endswith('.pdf'):
            # Initialize the PDFPlumberLoader for each PDF file
            loader = PDFPlumberLoader(os.path.join(f'{artifact_dir}/docs_sample', filename))
            # Load the document
            docs = loader.load()
            # Split the document into sections
            document_sections = text_splitter.split_documents(docs)
            # Extend the list with the sections from this document
            all_document_sections.extend(document_sections)
    return all_document_sections
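As a usage sketch (the job_type name here is only illustrative; the artifact name matches the doc_version value used in the sweep config further down):
# Sketch: create_chunks needs an active run so it can resolve the artifact.
with wandb.init(job_type='create_chunks', project='rag-tracking') as run:
    sections = create_chunks('raw_doc:latest', run)
    print(f'{len(sections)} chunks created')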
%%
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
def prepare_retriever(embedding_method, document_sections, top_k=3):
    """ Initiates a vector database depending on the
    specified embedding method and documents
    and prepares a retriever that fetches the top_k
    chunks
    Arguments:
    embedding_method -- string: 'base' or 'small'
    document_sections -- list: document sections from create_chunks
    top_k -- int: number of chunks to be retrieved
    Returns:
    Chroma retriever object, depending on the config
    """
    if embedding_method in ['base', 'small']:
        if embedding_method == 'base':
            embedder = HuggingFaceEmbeddings(
                model_name="BAAI/bge-base-en-v1.5"
            )
            vector_store = Chroma(persist_directory='./chroma1', embedding_function=embedder)
        elif embedding_method == 'small':
            embedder = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
            vector_store = Chroma(persist_directory='./chroma2', embedding_function=embedder)
        # vector_store = Chroma.from_documents(document_sections, embedder)
        retriever = vector_store.as_retriever(
            search_type="similarity", search_kwargs={"k": top_k}
        )
    return retriever
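A quick sketch of exercising the retriever on its own (assumes `sections` was produced by create_chunks above; the query string is made up):
# Sketch: build a retriever and fetch the top-3 chunks for a made-up query.
retriever = prepare_retriever('base', sections, top_k=3)
for doc in retriever.invoke("What is the refund policy?"):
    print(doc.metadata.get('page'), doc.page_content[:80])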
%%
Prompt version 1
prompt_template = '''
You are an assistant for question-answering tasks. Use the following pieces of
retrieved context to answer the question. If you don't know the answer, just
say that you don't know.
Context: {context}
Question: {question}
Answer:
'''
file_name = 'prompt_template.txt'
with open(file_name, 'w') as file:
    file.write(prompt_template)
%%
Prompt version 2
prompt_template2 = '''Use the following pieces of retrieved
context to answer the question. If you don't know the answer, just say
that you don't know. Keep the answer concise. Always cite which page
of the document certain statements can be traced back to.
Context: {context}
Question: {question}
Answer:
'''
file_name = 'prompt_template2.txt'
with open(file_name, 'w') as file:
    file.write(prompt_template2)
Logging prompt templates on W&B.
Notice how the same "prompt_template" artifact is logged twice, with
different versions saved in our local directory. W&B handles the
versioning by itself, denoting the first as "prompt_template:v0" and
iterating from there. (You can always access the latest version by referring to
"prompt_template:latest".)
with wandb.init(job_type='upload_prompt_template', project='rag-tracking') as run:
    artifact = wandb.Artifact('prompt_template', type='dataset')
    artifact.add_file(local_path='prompt_template.txt', name='prompt_template.txt')
    run.log_artifact(artifact)
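For reference, pulling a specific prompt version back down inside a later run looks roughly like this (sketch; the file name is assumed to match the one added above):
# Sketch: inside a run, fetch a specific prompt version; 'prompt_template:v1'
# could equally be 'prompt_template:latest'.
artifact = run.use_artifact('rag-tracking/prompt_template:v1', type='dataset')
artifact_dir = artifact.download()
with open(os.path.join(artifact_dir, 'prompt_template.txt')) as f:
    prompt_template = f.read()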
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
def format_docs(docs):
    """ Formats retrieved documents to be a
    single string. Each section is augmented
    with the page number at the end.
    Arguments:
    docs -- list of Document objects
    Returns:
    str: each document merged into one
    """
    return '\n\n'.join(f"{doc.page_content} at page:{doc.metadata['page']} " for doc in docs)
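For illustration, format_docs produces something like the following on two made-up Document objects:
# Sketch: made-up documents to show the merged string format_docs returns.
from langchain_core.documents import Document

docs = [
    Document(page_content="Refunds are issued within 14 days.", metadata={"page": 3}),
    Document(page_content="Warranty covers manufacturing defects.", metadata={"page": 7}),
]
print(format_docs(docs))
# Roughly:
# Refunds are issued within 14 days. at page:3
#
# Warranty covers manufacturing defects. at page:7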
def load_chain(wandb_run: wandb.run, config):
    """ According to the sweep config, defines the rag chain """
    ...

def garbage_disposal(config):
    """ Frees up unallocated GPU memory """
    # os.system(f'ollama stop {config.model_name}');
    # print(f'Ollama {config.model_name} service was stopped.')
    gc.collect()
    torch.cuda.empty_cache()
    print('Embedder copies purged from memory.')
from trulens.providers.litellm import LiteLLM
from trulens.apps.langchain import TruChain
from trulens.core import Feedback
from trulens.core import TruSession
from trulens.feedback import GroundTruthAgreement
from trulens.providers.langchain import Langchain
from langchain_community.llms import OpenAI
def evaluate_rag(rag_chain, config):
    """ Given the defined rag system, evaluates the model
    on a subset of evaluation Q&A pairs using TruLens.
    Also handles aggregating results and logging to W&B.
    Arguments:
    rag_chain -- Langchain object
    config -- sweep config
    """
    gpt4_llm = OpenAI(model="gpt-4o-mini-2024-07-18", temperature=0)
    provider = Langchain(chain=gpt4_llm)
    with open('responses.json') as f:
        eval_questions = json.load(f)['responses']
    subset = eval_questions
    session = TruSession()
    session.reset_database()
    # provider = LiteLLM(
    #     model_engine='ollama/llama3.1', api_base='http://localhost:11435'
    # )
    context = TruChain.select_context(rag_chain)
    f_qa_relevance = (
        Feedback(provider.relevance_with_cot_reasons,
                 name='Answer Relevance')
        .on_input()
        .on_output()
    )
    f_qs_relevance = (
        Feedback(provider.context_relevance_with_cot_reasons,
                 name='Context Relevance')
        .on_input()
        .on(context)
        .aggregate(np.mean)
    )
    f_groundedness = (
        Feedback(provider.groundedness_measure_with_cot_reasons,
                 name="Groundedness")
        .on(context.collect())
        .on_output()
    )
    f_ground_truth = (
        Feedback(GroundTruthAgreement(subset, provider=provider).agreement_measure,
                 name='Ground Truth Agreement')
        .on_input_output()
    )
    tru_recorder = TruChain(
        rag_chain,
        app_name="ChatApplication",
        feedbacks=[
            f_qa_relevance, f_qs_relevance,
            f_groundedness, f_ground_truth
        ]
    )
    with tru_recorder as recording:
        for row in tqdm(subset):
            rag_chain.invoke(row['query'])
    records, feedback = session.get_records_and_feedback()
    score_means = [np.mean(records[x]) for x in feedback]
    score_std = [np.std(records[x]) for x in feedback]
    score_dict = {x: y for x, y in zip(feedback, score_means)}
    score_dict.update({f'{x}_std': y for x, y in zip(feedback, score_std)})
    score_dict.update({
        'Mean Score': np.mean(score_means),
        'Latency': np.mean(records.latency)
    })
    wandb.log(score_dict)
    columns = [
        'input',
        'output',
        'Answer Relevance_calls',
        'Context Relevance_calls',
        'Groundedness_calls',
        'Ground Truth Agreement_calls',
        'latency'
    ]
    try:
        wandb.log({'eval_table': wandb.Table(dataframe=records[columns])})
    except:
        remaining = set(records.columns) & set(columns)
        wandb.log({'eval_table': wandb.Table(dataframe=records[list(remaining)])})
    garbage_disposal(config)
%%
Sweep configuration is defined as a nested dictionary.
Sweep methods can be: grid, random or bayesian.
For bayesian search to actually maximise the metric,
the name has to match the key in the logged dictionary.
sweep_config = {
    'method': 'random',
    'metric': {
        'name': 'Mean Score',
        'goal': 'maximize'
    },
}
parameters_dict = {
    'model_name': {
        'values': ["llama3.2", "llama3.1"]
    },
    'prompt_version': {
        'values': ['prompt_template:v0',
                   'prompt_template:v1']
    },
    'doc_version': {
        'value': 'raw_doc:latest'
    },
    'embedding_method': {
        'values': ['base', 'small']
    }
}
sweep_config['parameters'] = parameters_dict
%%
Define the function to be run by the sweep agent.
Here you can set a certain number of runs for random
and bayesian search algorithms.
def model_pipeline_sweep(config=None):
    """ Initialize a new run with the config given by the agent """
    with wandb.init(config=config, project='rag-tracking') as run:
        config = wandb.config
        rag_chain = load_chain(run, config)
        evaluate_rag(rag_chain, config)

sweep_id = wandb.sweep(sweep_config)
wandb.agent(sweep_id=sweep_id, function=model_pipeline_sweep, count=1)
Expected behavior
I want to obtain the RAG triad metrics (plus Ground Truth Agreement) for my RAG chain, without this warning being raised for every record.
Relevant Logs/Tracebacks
Here is the error:
c:\users\pc\desktop\llm\rag_wandb_sweep.py:250: LangChainDeprecationWarning: The class `OpenAI` was deprecated in LangChain 0.0.10 and will be removed in 1.0. An updated version of the class exists in the `langchain-openai` package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAI`.
gpt4_llm = OpenAI(model="gpt-4o-mini-2024-07-18", temperature=0)
🦑 Initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the database_redact_keys option of `TruSession` to prevent this.
Updating app_name and app_version in apps table: 0it [00:00, ?it/s]
Updating app_id in records table: 0it [00:00, ?it/s]
Updating app_json in apps table: 0it [00:00, ?it/s]
✅ In Answer Relevance, input prompt will be set to record.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to record.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to record.main_input or `Select.RecordInput` .
✅ In Context Relevance, input context will be set to record.app.first.steps__.context.first.invoke.rets[:].page_content .
✅ In Groundedness, input source will be set to record.app.first.steps__.context.first.invoke.rets[:].page_content.collect() .
✅ In Groundedness, input statement will be set to record.main_output or `Select.RecordOutput` .
✅ In Ground Truth Agreement, input prompt will be set to record.main_input or `Select.RecordInput` .
✅ In Ground Truth Agreement, input response will be set to record.main_output or `Select.RecordOutput` .
0%| | 0/179 [00:00<?, ?it/s]
C:\Users\PC\anaconda3\Lib\site-packages\trulens\feedback\llm_provider.py:1521: UserWarning: Failed to process and remove trivial statements. Proceeding with all statements.
warnings.warn(
C:\Users\PC\anaconda3\Lib\site-packages\trulens\feedback\llm_provider.py:1521: UserWarning: Failed to process and remove trivial statements. Proceeding with all statements.
warnings.warn(
C:\Users\PC\anaconda3\Lib\site-packages\trulens\feedback\llm_provider.py:1521: UserWarning: Failed to process and remove trivial statements. Proceeding with all statements.
Environment:
OS: Windows 11 Pro
Python Version: 3.12.7
TruLens version: 1.2.10