added custom criteria (#30)
oindrillac authored Jan 25, 2024
1 parent b93c425 commit 6d05625
Showing 3 changed files with 79 additions and 40 deletions.
69 changes: 33 additions & 36 deletions app/app.py
@@ -5,6 +5,7 @@
generate_text_using_OpenAI,
eval_using_model,
indicate_key_presence,
eval_using_langchain,
)
from feedback import store_feedback
import os
@@ -136,7 +137,8 @@ def OPENAI_API_KEY() -> str:

instruction = st.text_area(
"Instruction",
"""You are an AI system specialized at generating API documentation for the provided Python code. You will be provided functions, classes, or Python scripts. Your documentation should include:
"""
You are an AI system specialized at generating API documentation for the provided Python code. You will be provided functions, classes, or Python scripts. Your documentation should include:
1. Introduction: Briefly describe the purpose of the API and its intended use.
2. Functions: Document each API function, including:
@@ -222,7 +224,6 @@ def main(prompt_success: bool, prompt_diff: int, actual_doc: str):
top_p,
GENAI_KEY(),
)

col1, col2, col3 = st.columns([1.5, 1.5, 0.5])

with col1:
@@ -243,22 +244,38 @@ def main(prompt_success: bool, prompt_diff: int, actual_doc: str):

with col3:
st.subheader("Evaluation Metrics")
# rouge score addition
scorer = rouge_scorer.RougeScorer(
["rouge1", "rouge2", "rougeL"], use_stemmer=True
st.markdown(
"**GenAI evaluation on Overall Quality:**",
help="Use OpenAI GPT 3 to evaluate the result of the generated API doc",
)

score = eval_using_model(result, openai_key=OPENAI_API_KEY())
st.write(score)

st.markdown(
"**LangChain evaluation on grammar, descriptiveness and helpfulness:**",
help="Use Langchain to evaluate on cutsom criteria (this list can be updated based on what we are looking to see from the generated docs"
)
rouge_scores = scorer.score(actual_doc, result)

lc_score = eval_using_langchain(result, prompt)
st.markdown(
f"ROUGE-1 Score:{rouge_scores['rouge1'].fmeasure:.2f}",
help="ROUGE-1 refers to the overlap of unigrams (each word) between the system and reference summaries",
f"Grammatical: {lc_score[0]['score']}",
help="Checks if the output grammatically correct. Binary integer 0 to 1, where 1 would mean that the output is gramatically accurate and 0 means it is not",
)

st.markdown(
f"ROUGE-2 Score: {rouge_scores['rouge2'].fmeasure:.2f}",
help="ROUGE-2 refers to the overlap of bigrams between the system and reference summaries",
f"Descriptiveness: {lc_score[1]['score']}",
help="Checks if the output descriptive. Binary integer 0 to 1, where 1 would mean that the output is descriptive and 0 means it is not",
)

st.markdown(
f"Helpfulness: {lc_score[2]['score']}",
help="Checks if the output helpful for the end user. Binary integer 0 to 1, where 1 would mean that the output is helpful and 0 means it is not"
)
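Each entry in lc_score is the result dict returned by a LangChain criteria evaluator; evaluate_strings typically returns a reasoning string, a Y/N verdict, and a numeric score, and the list order follows the order in which eval_using_langchain appends the criteria, so lc_score[0], lc_score[1], and lc_score[2] map to grammatical, descriptive, and helpfulness. An illustrative (not actual) entry:

# Illustrative shape of a single lc_score entry (example values, not real output)
{
    "reasoning": "The submission is free of grammatical errors...",  # model-written explanation
    "value": "Y",   # Y/N verdict for the criterion
    "score": 1,     # 1 if the criterion is met, 0 otherwise
}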

st.markdown(
f"ROUGE-L Score: {rouge_scores['rougeL'].fmeasure:.2f}",
help="Longest common subsequence problem takes into account sentence-level structure similarity naturally and identifies longest co-occurring in sequence n-grams automatically",
"**Consistency:**",
help="Evaluate how similar or divergent the generated document is to the actual documentation",
)

# calc cosine similarity
@@ -270,39 +287,19 @@ def main(prompt_success: bool, prompt_diff: int, actual_doc: str):
help="0 cosine similarity means no similarity between generated and actual API documentation, 1 means they are same",
)
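The cosine-similarity computation itself is collapsed in this view; a minimal sketch of how such a consistency score could be computed with scikit-learn (already listed in app/requirements.txt) is shown below. The doc_similarity helper and the TF-IDF representation are assumptions for illustration, not necessarily what app.py actually does:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def doc_similarity(generated_doc: str, actual_doc: str) -> float:
    # Hypothetical helper: vectorize both documents and return their cosine similarity (0 to 1).
    vectors = TfidfVectorizer().fit_transform([generated_doc, actual_doc])
    return float(cosine_similarity(vectors[0:1], vectors[1:2])[0][0])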
st.markdown("###") # add a line break

st.markdown(
"**GenAI evaluation scores:**",
help="Use OpenAI GPT 3 to evaluate the result of the generated API doc",
"**Readability Scores:**",
help="Evaluate how readable the generated text is",
)
score = eval_using_model(result, openai_key=OPENAI_API_KEY())
st.write(score)

# Readability Scores
st.subheader("Readability Metrics")


# Flesch Reading Ease
flesch_reading_ease = textstat.flesch_reading_ease(result)
st.markdown(
f"Flesch Reading Ease: {flesch_reading_ease:.2f}",
help="Flesch Reading Ease measures how easy a text is to read. Higher scores indicate easier readability. Ranges 0-100 and a negative score indicates a more challenging text.",
)

# Dale Chall Readability
dale_chall_readability = textstat.dale_chall_readability_score(result)
st.markdown(
f"Dale Chall Readability: {dale_chall_readability:.2f}",
help="The Dale-Chall Formula is a readability formula based on the use of familiar words, rather than syllable or letter counts. Lower scores mean more difficult words. No fixed ranges.",
)

# Automated Readability Index (ARI)
ari = textstat.automated_readability_index(result)
st.markdown(
f"ARI (Automated Readability Index): {ari:.2f}",
help="ARI relies on a factor of characters per word, instead of the usual syllables per word. ARI corresponds to a U.S. grade level. Higher scores indicate more advanced reading levels.",
)


if st.button("Generate API Documentation"):
if model_id != "OpenAI/gpt3.5":
prompt_success, prompt_diff = check_prompt_token_limit(
3 changes: 2 additions & 1 deletion app/requirements.txt
@@ -10,4 +10,5 @@ py-readability-metrics
openai
textstat
scikit-learn
streamlit-feedback
streamlit-feedback
langchain
47 changes: 44 additions & 3 deletions app/utils.py
@@ -1,13 +1,19 @@
import json
import os
from openai import OpenAI
from genai import Credentials, Client
from genai.text.generation import TextGenerationParameters
from genai.text.tokenization import (
TextTokenizationParameters,
TextTokenizationReturnOptions,
TextTokenizationCreateResults,
)
from langchain.evaluation import (
Criteria,
load_evaluator,
EvaluatorType
)
import os
import json
from openai import OpenAI
from langchain_community.chat_models import ChatOpenAI


def generate_prompt(
@@ -261,3 +267,38 @@ def indicate_key_presence(env: str) -> str:
return "*" * len(key)
else:
return ""

def eval_using_langchain(prediction: str, query: str):
    """Evaluate generated documentation against custom criteria (grammatical,
    descriptive, helpfulness) using LangChain criteria evaluators.
    Returns a list with one result dict per criterion."""

    evaluation = []
    llm = ChatOpenAI(model="gpt-4", temperature=0)

    # Each criterion gets its own evaluator so every score stays interpretable;
    # bundling several criteria into a single evaluator is generally not recommended.
    custom_criterion_1 = {
        "grammatical": "Is the output grammatically correct?",
    }
    eval_chain = load_evaluator(EvaluatorType.CRITERIA, llm=llm, criteria=custom_criterion_1)
    eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)
    evaluation.append(eval_result)

    custom_criterion_2 = {
        "descriptive": "Does the output describe a piece of code and its intended functionality?"
    }
    eval_chain = load_evaluator(EvaluatorType.CRITERIA, llm=llm, criteria=custom_criterion_2)
    eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)
    evaluation.append(eval_result)

    # "helpfulness" is one of LangChain's built-in criteria, so it can be
    # referenced by name rather than defined as a custom criterion.
    evaluator = load_evaluator("criteria", llm=llm, criteria="helpfulness")
    eval_result = evaluator.evaluate_strings(prediction=prediction, input=query)
    evaluation.append(eval_result)

    return evaluation
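
As a standalone usage sketch of the new evaluator (the sample strings below are made up for illustration, and OPENAI_API_KEY must already be set in the environment for ChatOpenAI to authenticate):

# Hypothetical smoke test for eval_using_langchain; strings are illustrative only.
sample_prompt = "Generate API documentation for: def add(a, b): return a + b"
sample_doc = "add(a, b): Returns the sum of the two arguments a and b."

results = eval_using_langchain(prediction=sample_doc, query=sample_prompt)
for name, result in zip(["grammatical", "descriptive", "helpful"], results):
    print(f"{name}: {result['score']}")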



