Week 4 #27

Open
wants to merge 11 commits into main
28 changes: 23 additions & 5 deletions course/week2/monitor_project/monitor/metrics.py
@@ -25,7 +25,9 @@ def get_ks_score(tr_probs, te_probs):
# te_probs: torch.Tensor
# predicted probabilities from the test set
# score: float - between 0 and 1
pass # remove me
tr_probs_np = tr_probs.numpy()
te_probs_np = te_probs.numpy()
_, score = ks_2samp(tr_probs_np, te_probs_np)
# ============================
return score
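For reference, tuple-unpacking `scipy.stats.ks_2samp` yields `(statistic, pvalue)`, so the `score` returned above is the p-value of the two-sample Kolmogorov-Smirnov test: it stays near 1 when the train and test probability distributions look alike and drops toward 0 under drift. A minimal standalone sketch with synthetic tensors (all names here are illustrative):

import torch
from scipy.stats import ks_2samp

tr_probs = torch.rand(1000)                # "training" predicted probabilities
te_probs = torch.rand(1000) * 0.5 + 0.5    # shifted "test" probabilities
statistic, p_value = ks_2samp(tr_probs.numpy(), te_probs.numpy())
print(statistic, p_value)                  # large statistic and tiny p-value indicate drift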

@@ -68,7 +70,16 @@ def get_hist_score(tr_probs, te_probs, bins=10):
#
# Read the documentation for `np.histogram` carefully, in
# particular what `bin_edges` represent.
pass # remove me
tr_heights, bin_edges = np.histogram(tr_probs.numpy(), bins=bins, density=True)
te_heights, _ = np.histogram(te_probs.numpy(), bins=bin_edges, density=True)

score = 0
for i in range(len(bin_edges) - 1):
    bin_diff = bin_edges[i+1] - bin_edges[i]
    tr_area = bin_diff * tr_heights[i]
    te_area = bin_diff * te_heights[i]
    intersect = min(tr_area, te_area)
    score += intersect
# ============================
return score
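A quick sanity check of the intersection logic above: with `density=True` the bar areas of each histogram sum to 1, so the score lies in [0, 1] and reaches 1.0 when the two samples are identical. A small self-contained sketch (the helper name is illustrative):

import numpy as np

def hist_intersection(a, b, bins=10):
    a_heights, edges = np.histogram(a, bins=bins, density=True)
    b_heights, _ = np.histogram(b, bins=edges, density=True)
    widths = np.diff(edges)
    # bin-by-bin overlap of the two normalized histograms
    return float(np.minimum(a_heights * widths, b_heights * widths).sum())

x = np.random.rand(1000)
print(hist_intersection(x, x))                            # 1.0 for identical samples
print(hist_intersection(x, np.random.rand(1000) * 0.5))   # noticeably below 1.0 under shift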

@@ -97,7 +108,12 @@ def get_vocab_outlier(tr_vocab, te_vocab):
# te_vocab: dict[str, int]
# Map from word to count for test examples
# score: float (between 0 and 1)
pass # remove me
num_seen = sum(1 for word in te_vocab if word in tr_vocab)
num_total = len(te_vocab)
if num_total == 0:
    score = 0  # If test vocab is empty, we consider no outliers
else:
    score = 1 - (num_seen / num_total)
# ============================
return score
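A tiny worked example of the outlier score above, with made-up word counts: only one of the two test words appears in the training vocabulary, so the score is 0.5.

tr_vocab = {'hello': 3, 'world': 1}
te_vocab = {'hello': 2, 'zebra': 5}
num_seen = sum(1 for word in te_vocab if word in tr_vocab)   # 1
print(1 - num_seen / len(te_vocab))                          # 0.5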

@@ -132,7 +148,9 @@ def calibrate(self, tr_probs, tr_labels, te_probs):
# it to a torch.Tensor.
#
# `te_probs_cal`: torch.Tensor
pass # remove me
iso_reg = IsotonicRegression(out_of_bounds='clip')
tr_probs_cal = torch.tensor(iso_reg.fit_transform(tr_probs.numpy(), tr_labels.numpy()))
te_probs_cal = torch.tensor(iso_reg.predict(te_probs.numpy()))
# ============================
return tr_probs_cal, te_probs_cal
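For context, `IsotonicRegression(out_of_bounds='clip')` fits a monotone mapping from raw probabilities to observed label frequencies and clips test inputs that fall outside the fitted range. A minimal sketch with synthetic data (assuming scikit-learn, which this module already relies on):

import torch
from sklearn.isotonic import IsotonicRegression

tr_probs = torch.rand(500)
tr_labels = (torch.rand(500) < tr_probs).long()    # labels loosely consistent with the probabilities
te_probs = torch.rand(100)

iso = IsotonicRegression(out_of_bounds='clip')
tr_cal = torch.tensor(iso.fit_transform(tr_probs.numpy(), tr_labels.numpy()))
te_cal = torch.tensor(iso.predict(te_probs.numpy()))
print(te_cal.min().item(), te_cal.max().item())    # calibrated scores stay within [0, 1]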

@@ -149,4 +167,4 @@ def monitor(self, te_vocab, te_probs):
'hist_score': hist_score,
'outlier_score': outlier_score,
}
return metrics
return metrics
2 changes: 1 addition & 1 deletion course/week2/testing_project/configs/test.json
@@ -1,4 +1,4 @@
{
"data": "mnist",
"model": "linear.ckpt"
"model": "mlp.ckpt"
}
11 changes: 11 additions & 0 deletions course/week2/testing_project/images/integration/labels.csv
@@ -0,0 +1,11 @@
path,label
zero.png,0
one.png,1
two.png,2
three.png,3
four.png,4
five.png,5
six.png,6
seven.png,7
eight.png,8
nine.png,9
Binary file not shown.
5 changes: 3 additions & 2 deletions course/week2/testing_project/testing/directionality.py
@@ -146,7 +146,7 @@ def test(self, trainer, system):
preds_raw = torch.argmax(logits_raw, dim=1)
preds_transformed = torch.argmax(logits_transformed, dim=1)

batch_metric = 0 # store metric here
#batch_metric = 0 # store metric here
# ================================
# FILL ME OUT
#
@@ -166,7 +166,8 @@ def test(self, trainer, system):
# --
# batch_metric: float (not torch.Tensor!)
# Metric computed on a minibatch
pass # remove me

batch_metric = (preds_raw == preds_transformed).float().mean().item()
# ================================
metric.append(batch_metric)
pbar.update()
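A tiny worked example of the consistency metric above: if `preds_raw = [1, 2, 3, 4]` and `preds_transformed = [1, 2, 0, 4]`, three of the four predictions agree, so `batch_metric` is 0.75; `.item()` turns the 0-dim tensor into the plain Python float the test expects.

import torch
preds_raw = torch.tensor([1, 2, 3, 4])
preds_transformed = torch.tensor([1, 2, 0, 4])
print((preds_raw == preds_transformed).float().mean().item())   # 0.75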
3 changes: 2 additions & 1 deletion course/week2/testing_project/testing/integration.py
@@ -64,7 +64,8 @@ def test(self, trainer, system):
# Notes:
# --
# Nothing to return here
pass # remove me
loader = self.get_dataloader()
trainer.test(system, dataloaders=loader)
# ================================


4 changes: 3 additions & 1 deletion course/week2/testing_project/testing/regression.py
@@ -99,7 +99,9 @@ def build_regression_test(system, loader):
# batch_is_correct: List[int] (not a torch.Tensor!)
# List of integers - 1 if the model got that element correct
# - 0 if the model got that element incorrect
pass # remove me
batch_is_correct = (preds == labels).long().numpy().tolist()
batch_loss = F.cross_entropy(logits, labels, reduction='none')
batch_loss = batch_loss.numpy().tolist()
# ================================
losses.extend(batch_loss)
is_correct.extend(batch_is_correct)
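For reference, `reduction='none'` makes `F.cross_entropy` return one loss per example instead of a single averaged scalar, which is what lets the regression test track each element individually. A small sketch with illustrative logits and labels:

import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 0.1], [0.2, 1.5], [1.5, 0.5]])
labels = torch.tensor([0, 1, 0])
per_example = F.cross_entropy(logits, labels, reduction='none')
print(per_example.shape)                                          # torch.Size([3]), one loss per example
print((torch.argmax(logits, dim=1) == labels).long().tolist())    # [1, 1, 1]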
@@ -0,0 +1 @@
{"id": "318bdee7-354b-4de8-830d-1e3025073ba1", "name": "dcdl-week4-michael-goeliza-new-all-minilm-l6-v2-hyde", "dimensionality": 384}
@@ -0,0 +1 @@
{"id": "f9e8731e-6cd7-4a2e-9b7b-9951dfd3bb07", "name": "dcdl-week4-michael-goeliza-new-all-minilm-l6-v2", "dimensionality": 384}
@@ -0,0 +1 @@
{"id": "3616e187-136f-4d6c-a09e-8d4c97c2603d", "name": "dcdl-week4-michael-goeliza-new-thenlper-gte-small-hyde", "dimensionality": 384}
@@ -0,0 +1 @@
{"id": "142478b1-118e-4e44-8867-f39d507b7eb9", "name": "dcdl-week4-michael-goeliza-new-thenlper-gte-small", "dimensionality": 384}
830 changes: 830 additions & 0 deletions course/week4/data/questions/questions.csv

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions course/week4/scripts/build_eval_set.py
@@ -65,6 +65,11 @@ def write_questions(self):
# Save the generated question (as a string) into the `question` variable.
# TODO
# ===========================
question = query_openai(
    self.openai_api_key,
    get_question_prompt(chunk)
)

assert len(question) > 0, f"Did you complete the coding section in `write_questions`?"
questions.append(question)
doc_ids.append(doc_id) # save the doc id for each
@@ -94,6 +99,15 @@ def grade_questions(self):
# Set the rating to 0 if integer casting fails.
# TODO
# ===========================
try:
    response = query_openai(
        self.openai_api_key,
        get_question_judge_prompt(self.contexts[i], self.questions[i])
    )
    rating = int(response)
except ValueError:
    rating = 0

assert rating >= 0, f"Did you complete the coding section in `grade_questions`?"
ratings.append(rating)
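A small illustration of the integer-cast guard above: the judge prompt is expected to return a bare number, but if the model replies with extra text the `int(...)` cast raises `ValueError` and the rating falls back to 0 (the `parse_rating` helper below is hypothetical, for illustration only):

def parse_rating(response: str) -> int:
    try:
        return int(response)
    except ValueError:
        return 0

print(parse_rating("4"))             # 4
print(parse_rating("Rating: 4/5"))   # 0, casting fails so we fall back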

@@ -118,6 +132,11 @@ def write_hypothetical_answers(self):
# See `rag/prompts` for a bank of relevant prompts to use. You may edit any prompts in there.
# TODO
# ===========================
hypo_answer = query_openai(
    self.openai_api_key,
    get_hyde_response_prompt(self.questions[i])
)

assert len(hypo_answer) > 0, f"Did you complete the coding section in `write_hypothetical_answers`?"
hypo_answers.append(hypo_answer)

123 changes: 65 additions & 58 deletions course/week4/scripts/insert_docs.py
@@ -1,76 +1,83 @@
from tqdm import tqdm
from os.path import join
from sentence_transformers import SentenceTransformer

from rag.llm import embedding_name_to_dim
from rag.vector import get_my_collection_name, insert_documents
from rag.dataset import load_documents
from rag.paths import DATA_DIR


def main(args):
r"""Inserts new documents in the collection.
"""
collection_name = get_my_collection_name(
env['GITHUB_USERNAME'],
embedding=args.embedding,
hyde=args.hyde,
)
r"""Inserts new documents in the collection.
"""
collection_name = get_my_collection_name(
env['GITHUB_USERNAME'],
embedding=args.embedding,
hyde=args.hyde,
)

# Load raw documents as a Pandas Dataframe with two columns
# - doc_id: Document ID
# - text: Content for the document
raw = load_documents(override_doc_dir=args.doc_dir)
print(f'Found {len(raw)} documents to upload.')
# Load raw documents as a Pandas Dataframe with two columns
# - doc_id: Document ID
# - text: Content for the document
raw = load_documents(override_doc_dir=args.doc_dir)
print(f'Found {len(raw)} documents to upload.')

# Use the embedding model to embed docs
embedding_dim = embedding_name_to_dim(args.embedding)
embedding_model = SentenceTransformer(args.embedding)
print(f'Loaded the {args.embedding} model.')
# Use the embedding model to embed docs
embedding_dim = embedding_name_to_dim(args.embedding)
embedding_model = SentenceTransformer(args.embedding)
print(f'Loaded the {args.embedding} model.')

documents = []
for i in tqdm(range(len(raw)), desc='Inserting into db'):
doc = ""
# ===========================
# FILL ME OUT
# Prepare the documents to be inserted into the vector db
# You will need to compute embeddings. Make sure to cast the embedding to a list.
# Please refer to `config.json` for which embedding to use:
# Example document:
# {
# "embeddings": {
# "values": [0.1, 0.2, 0.3, 0.4, 0.5],
# "dimensionality": 5,
# }, # single vector document
# "metadata": {
# "doc_id": "...",
# }
# }
# Please add the document ID to the metadata under the key `doc_id`.
# Please see docs here: https://docs.starpoint.ai/create-documents
# TODO
# ===========================
assert len(doc) > 0, f"Did you complete the code in `insert_docs.py`?"
documents.append(doc)
documents = []
for i in tqdm(range(len(raw)), desc='Inserting into db'):
    doc = {}
    # ===========================
    # FILL ME OUT
    # Prepare the documents to be inserted into the vector db
    # You will need to compute embeddings. Make sure to cast the embedding to a list.
    # Please refer to `config.json` for which embedding to use:
    # Example document:
    # {
    #   "embeddings": {
    #     "values": [0.1, 0.2, 0.3, 0.4, 0.5],
    #     "dimensionality": 5,
    #   },  # single vector document
    #   "metadata": {
    #     "doc_id": "...",
    #   }
    # }
    # Please add the document ID to the metadata under the key `doc_id`.
    # Please see docs here: https://docs.starpoint.ai/create-documents
    # TODO
    # ===========================
    doc = {
        "embeddings": {
            "values": embedding_model.encode(raw.iloc[i]['text']).tolist(),
            "dimensionality": embedding_dim,
        },
        "metadata": {
            "doc_id": raw.iloc[i]['doc_id'],
            "text": raw.iloc[i]['text']
        }
    }
    assert len(doc) > 0, f"Did you complete the code in `insert_docs.py`?"
    documents.append(doc)

assert len(documents) > 0, f"Please remember to append to the documents array"

print(f'Inserting documents into Starpoint collection {collection_name}')
insert_documents(args.starpoint_api_key, collection_name, documents)
print(f'Done. {len(documents)} inserted.')
assert len(documents) > 0, f"Please remember to append to the documents array"

print(f'Inserting documents into Starpoint collection {collection_name}')
insert_documents(args.starpoint_api_key, collection_name, documents)
print(f'Done. {len(documents)} inserted.')

if __name__ == "__main__":
from os import environ as env
from dotenv import load_dotenv
load_dotenv()
from os import environ as env
from dotenv import load_dotenv
load_dotenv()
import argparse

import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--embedding', type=str, default='all-MiniLM-L6-v2', help='Embedding to use (default: all-MiniLM-L6-v2)')
parser.add_argument('--hyde', action='store_true', default=False, help='Use hyde embeddings (default: False)')
parser.add_argument('--doc-dir', type=str, default=join(DATA_DIR, 'documents/summer'), help='Document directory')
parser.add_argument('--starpoint-api-key', type=str, default=env['STARPOINT_API_KEY'], help='Starpoint API key')
args = parser.parse_args()
parser = argparse.ArgumentParser()
parser.add_argument('--embedding', type=str, default='all-MiniLM-L6-v2', help='Embedding to use (default: all-MiniLM-L6-v2)')
parser.add_argument('--hyde', action='store_true', default=False, help='Use hyde embeddings (default: False)')
parser.add_argument('--doc-dir', type=str, default=join(DATA_DIR, 'documents/winter'), help='Document directory')
parser.add_argument('--starpoint-api-key', type=str, default=env['STARPOINT_API_KEY'], help='Starpoint API key')
args = parser.parse_args()

main(args)
main(args)
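As a side note on the document-building loop above: `SentenceTransformer.encode` returns a NumPy array by default, so the `.tolist()` call converts it into a plain Python list that can be serialized in the insert payload. A quick check, assuming the sentence-transformers package and the script's default all-MiniLM-L6-v2 model:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
vec = model.encode('hello world')
print(type(vec), vec.shape)     # <class 'numpy.ndarray'> (384,)
print(type(vec.tolist()))       # <class 'list'>, ready for a JSON payload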
46 changes: 46 additions & 0 deletions course/week4/scripts/optimize_params.py
Original file line number Diff line number Diff line change
@@ -71,6 +71,22 @@ def get_search_space(self):
# over 100,000s, but this should illustrate the point.
# TODO
# ===========================

hparams: List[DotMap] = []
embedding_models = ["all-MiniLM-L6-v2", "thenlper/gte-small"]
text_search_weights = [0, 0.5]
hyde_embeddings_options = [False, True]

for embedding in embedding_models:
    for weight in text_search_weights:
        for hyde in hyde_embeddings_options:
            hparam = DotMap({
                "embedding": embedding,
                "text_search_weight": weight,
                "hyde_embeddings": hyde,
            })
            hparams.append(hparam)

assert len(hparams) > 0, "Remember to complete the code in `get_search_space`"
assert len(hparams) == 8, "You should have 8 configurations"
self.hparams = hparams
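The same 2 x 2 x 2 grid can be written more compactly with `itertools.product`; this sketch is equivalent to the nested loops above (assuming `DotMap` comes from the `dotmap` package used by this script):

from itertools import product
from dotmap import DotMap

hparams = [
    DotMap({"embedding": e, "text_search_weight": w, "hyde_embeddings": h})
    for e, w, h in product(
        ["all-MiniLM-L6-v2", "thenlper/gte-small"], [0, 0.5], [False, True]
    )
]
assert len(hparams) == 8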
@@ -108,6 +124,36 @@ def optimize(self):
# +1 to `hits` if it does. +0 to `hits` if not.
# TODO
# ===========================
hits = 0
for i in tqdm(range(len(questions))):
    question = questions.question.iloc[i]
    gt_id = questions.doc_id.iloc[i]

    # Compute embedding
    if self.input.hyde_embeddings:
        hypo_answer = questions.hypo_answers.iloc[i]
        embedding = embedding_model.encode(hypo_answer).tolist()
    else:
        embedding = embedding_model.encode(question).tolist()

    # Retrieve documents
    results = retrieve_documents(
        self.starpoint_api_key,
        collection_name=collection_name,
        query=question,
        query_embedding=embedding,
        top_k=3,
        text_search_weight=self.input.text_search_weight,
    )

    # Check if the correct document is in the top 3
    retrieved_ids = [result['metadata']['doc_id'] for result in results]
    if gt_id in retrieved_ids:
        hits += 1

hit_rate = hits / float(len(questions))
self.hit_rate = hit_rate  # save to class
self.hparam = self.input
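The hit rate above is a recall-at-3 style metric over the generated questions; a quick arithmetic check:

hits, total = 72, 100           # e.g. 72 of 100 questions retrieved their source doc in the top 3
print(hits / float(total))      # 0.72; the configuration with the highest hit rate wins the sweep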