Week 4 #27

Open
wants to merge 11 commits into main
28 changes: 23 additions & 5 deletions course/week2/monitor_project/monitor/metrics.py
@@ -25,7 +25,9 @@ def get_ks_score(tr_probs, te_probs):
# te_probs: torch.Tensor
# predicted probabilities from the test set
# score: float - between 0 and 1
pass # remove me
tr_probs_np = tr_probs.numpy()
te_probs_np = te_probs.numpy()
_, score = ks_2samp(tr_probs_np, te_probs_np)
# ============================
return score
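For reference, tuple-unpacking `scipy.stats.ks_2samp` yields `(statistic, pvalue)`, so the `score` returned above is the p-value of the two-sample Kolmogorov-Smirnov test: it stays near 1 when the train and test probability distributions look alike and drops toward 0 under drift. A minimal standalone sketch with synthetic tensors (all names here are illustrative):

import torch
from scipy.stats import ks_2samp

tr_probs = torch.rand(1000)                # "training" predicted probabilities
te_probs = torch.rand(1000) * 0.5 + 0.5    # shifted "test" probabilities
statistic, p_value = ks_2samp(tr_probs.numpy(), te_probs.numpy())
print(statistic, p_value)                  # large statistic and tiny p-value indicate drift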

@@ -68,7 +70,16 @@ def get_hist_score(tr_probs, te_probs, bins=10):
#
# Read the documentation for `np.histogram` carefully, in
# particular what `bin_edges` represent.
pass # remove me
tr_heights, bin_edges = np.histogram(tr_probs.numpy(), bins=bins, density=True)
te_heights, _ = np.histogram(te_probs.numpy(), bins=bin_edges, density=True)

score = 0
for i in range(len(bin_edges) - 1):
    bin_diff = bin_edges[i+1] - bin_edges[i]
    tr_area = bin_diff * tr_heights[i]
    te_area = bin_diff * te_heights[i]
    intersect = min(tr_area, te_area)
    score += intersect
# ============================
return score
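A quick sanity check of the intersection logic above: with `density=True` the bar areas of each histogram sum to 1, so the score lies in [0, 1] and reaches 1.0 when the two samples are identical. A small self-contained sketch (the helper name is illustrative):

import numpy as np

def hist_intersection(a, b, bins=10):
    a_heights, edges = np.histogram(a, bins=bins, density=True)
    b_heights, _ = np.histogram(b, bins=edges, density=True)
    widths = np.diff(edges)
    # bin-by-bin overlap of the two normalized histograms
    return float(np.minimum(a_heights * widths, b_heights * widths).sum())

x = np.random.rand(1000)
print(hist_intersection(x, x))                            # 1.0 for identical samples
print(hist_intersection(x, np.random.rand(1000) * 0.5))   # noticeably below 1.0 under shift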

@@ -97,7 +108,12 @@ def get_vocab_outlier(tr_vocab, te_vocab):
# te_vocab: dict[str, int]
# Map from word to count for test examples
# score: float (between 0 and 1)
pass # remove me
num_seen = sum(1 for word in te_vocab if word in tr_vocab)
num_total = len(te_vocab)
if num_total == 0:
    score = 0  # If test vocab is empty, we consider no outliers
else:
    score = 1 - (num_seen / num_total)
# ============================
return score
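A tiny worked example of the outlier score above, with made-up word counts: only one of the two test words appears in the training vocabulary, so the score is 0.5.

tr_vocab = {'hello': 3, 'world': 1}
te_vocab = {'hello': 2, 'zebra': 5}
num_seen = sum(1 for word in te_vocab if word in tr_vocab)   # 1
print(1 - num_seen / len(te_vocab))                          # 0.5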

@@ -132,7 +148,9 @@ def calibrate(self, tr_probs, tr_labels, te_probs):
# it to a torch.Tensor.
#
# `te_probs_cal`: torch.Tensor
pass # remove me
iso_reg = IsotonicRegression(out_of_bounds='clip')
tr_probs_cal = torch.tensor(iso_reg.fit_transform(tr_probs.numpy(), tr_labels.numpy()))
te_probs_cal = torch.tensor(iso_reg.predict(te_probs.numpy()))
# ============================
return tr_probs_cal, te_probs_cal
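For context, `IsotonicRegression(out_of_bounds='clip')` fits a monotone mapping from raw probabilities to observed label frequencies and clips test inputs that fall outside the fitted range. A minimal sketch with synthetic data (assuming scikit-learn, which this module already relies on):

import torch
from sklearn.isotonic import IsotonicRegression

tr_probs = torch.rand(500)
tr_labels = (torch.rand(500) < tr_probs).long()    # labels loosely consistent with the probabilities
te_probs = torch.rand(100)

iso = IsotonicRegression(out_of_bounds='clip')
tr_cal = torch.tensor(iso.fit_transform(tr_probs.numpy(), tr_labels.numpy()))
te_cal = torch.tensor(iso.predict(te_probs.numpy()))
print(te_cal.min().item(), te_cal.max().item())    # calibrated scores stay within [0, 1]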

@@ -149,4 +167,4 @@ def monitor(self, te_vocab, te_probs):
'hist_score': hist_score,
'outlier_score': outlier_score,
}
return metrics
return metrics
2 changes: 1 addition & 1 deletion course/week2/testing_project/configs/test.json
@@ -1,4 +1,4 @@
{
"data": "mnist",
"model": "linear.ckpt"
"model": "mlp.ckpt"
}
11 changes: 11 additions & 0 deletions course/week2/testing_project/images/integration/labels.csv
@@ -0,0 +1,11 @@
path,label
zero.png,0
one.png,1
two.png,2
three.png,3
four.png,4
five.png,5
six.png,6
seven.png,7
eight.png,8
nine.png,9
Binary file not shown.
5 changes: 3 additions & 2 deletions course/week2/testing_project/testing/directionality.py
@@ -146,7 +146,7 @@ def test(self, trainer, system):
preds_raw = torch.argmax(logits_raw, dim=1)
preds_transformed = torch.argmax(logits_transformed, dim=1)

batch_metric = 0 # store metric here
#batch_metric = 0 # store metric here
# ================================
# FILL ME OUT
#
@@ -166,7 +166,8 @@ def test(self, trainer, system):
# --
# batch_metric: float (not torch.Tensor!)
# Metric computed on a minibatch
pass # remove me

batch_metric = (preds_raw == preds_transformed).float().mean().item()
# ================================
metric.append(batch_metric)
pbar.update()
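A tiny worked example of the consistency metric above: if `preds_raw = [1, 2, 3, 4]` and `preds_transformed = [1, 2, 0, 4]`, three of the four predictions agree, so `batch_metric` is 0.75; `.item()` turns the 0-dim tensor into the plain Python float the test expects.

import torch
preds_raw = torch.tensor([1, 2, 3, 4])
preds_transformed = torch.tensor([1, 2, 0, 4])
print((preds_raw == preds_transformed).float().mean().item())   # 0.75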
3 changes: 2 additions & 1 deletion course/week2/testing_project/testing/integration.py
@@ -64,7 +64,8 @@ def test(self, trainer, system):
# Notes:
# --
# Nothing to return here
pass # remove me
loader = self.get_dataloader()
trainer.test(system, dataloaders=loader)
# ================================


4 changes: 3 additions & 1 deletion course/week2/testing_project/testing/regression.py
@@ -99,7 +99,9 @@ def build_regression_test(system, loader):
# batch_is_correct: List[int] (not a torch.Tensor!)
# List of integers - 1 if the model got that element correct
# - 0 if the model got that element incorrect
pass # remove me
batch_is_correct = (preds == labels).long().numpy().tolist()
batch_loss = F.cross_entropy(logits, labels, reduction='none')
batch_loss = batch_loss.numpy().tolist()
# ================================
losses.extend(batch_loss)
is_correct.extend(batch_is_correct)
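For reference, `reduction='none'` makes `F.cross_entropy` return one loss per example instead of a single averaged scalar, which is what lets the regression test track each element individually. A small sketch with illustrative logits and labels:

import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 0.1], [0.2, 1.5], [1.5, 0.5]])
labels = torch.tensor([0, 1, 0])
per_example = F.cross_entropy(logits, labels, reduction='none')
print(per_example.shape)                                          # torch.Size([3]), one loss per example
print((torch.argmax(logits, dim=1) == labels).long().tolist())    # [1, 1, 1]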
@@ -0,0 +1 @@
{"id": "318bdee7-354b-4de8-830d-1e3025073ba1", "name": "dcdl-week4-michael-goeliza-new-all-minilm-l6-v2-hyde", "dimensionality": 384}
@@ -0,0 +1 @@
{"id": "f9e8731e-6cd7-4a2e-9b7b-9951dfd3bb07", "name": "dcdl-week4-michael-goeliza-new-all-minilm-l6-v2", "dimensionality": 384}
@@ -0,0 +1 @@
{"id": "3616e187-136f-4d6c-a09e-8d4c97c2603d", "name": "dcdl-week4-michael-goeliza-new-thenlper-gte-small-hyde", "dimensionality": 384}
@@ -0,0 +1 @@
{"id": "142478b1-118e-4e44-8867-f39d507b7eb9", "name": "dcdl-week4-michael-goeliza-new-thenlper-gte-small", "dimensionality": 384}
830 changes: 830 additions & 0 deletions course/week4/data/questions/questions.csv

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions course/week4/scripts/build_eval_set.py
@@ -65,6 +65,11 @@ def write_questions(self):
# Save the generated question (as a string) into the `question` variable.
# TODO
# ===========================
question = query_openai(
    self.openai_api_key,
    get_question_prompt(chunk)
)

assert len(question) > 0, f"Did you complete the coding section in `write_questions`?"
questions.append(question)
doc_ids.append(doc_id) # save the doc id for each
@@ -94,6 +99,15 @@ def grade_questions(self):
# Set the rating to 0 if integer casting fails.
# TODO
# ===========================
try:
    response = query_openai(
        self.openai_api_key,
        get_question_judge_prompt(self.contexts[i], self.questions[i])
    )
    rating = int(response)
except ValueError:
    rating = 0

assert rating >= 0, f"Did you complete the coding section in `grade_questions`?"
ratings.append(rating)
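A small illustration of the integer-cast guard above: the judge prompt is expected to return a bare number, but if the model replies with extra text the `int(...)` cast raises `ValueError` and the rating falls back to 0 (the `parse_rating` helper below is hypothetical, for illustration only):

def parse_rating(response: str) -> int:
    try:
        return int(response)
    except ValueError:
        return 0

print(parse_rating("4"))             # 4
print(parse_rating("Rating: 4/5"))   # 0, casting fails so we fall back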

@@ -118,6 +132,11 @@ def write_hypothetical_answers(self):
# See `rag/prompts` for a bank of relevant prompts to use. You may edit any prompts in there.
# TODO
# ===========================
hypo_answer = query_openai(
    self.openai_api_key,
    get_hyde_response_prompt(self.questions[i])
)

assert len(hypo_answer) > 0, f"Did you complete the coding section in `write_hypothetical_answers`?"
hypo_answers.append(hypo_answer)

123 changes: 65 additions & 58 deletions course/week4/scripts/insert_docs.py
@@ -1,76 +1,83 @@
from tqdm import tqdm
from os.path import join
from sentence_transformers import SentenceTransformer

from rag.llm import embedding_name_to_dim
from rag.vector import get_my_collection_name, insert_documents
from rag.dataset import load_documents
from rag.paths import DATA_DIR


def main(args):
r"""Inserts new documents in the collection.
"""
collection_name = get_my_collection_name(
env['GITHUB_USERNAME'],
embedding=args.embedding,
hyde=args.hyde,
)
r"""Inserts new documents in the collection.
"""
collection_name = get_my_collection_name(
env['GITHUB_USERNAME'],
embedding=args.embedding,
hyde=args.hyde,
)

# Load raw documents as a Pandas Dataframe with two columns
# - doc_id: Document ID
# - text: Content for the document
raw = load_documents(override_doc_dir=args.doc_dir)
print(f'Found {len(raw)} documents to upload.')
# Load raw documents as a Pandas Dataframe with two columns
# - doc_id: Document ID
# - text: Content for the document
raw = load_documents(override_doc_dir=args.doc_dir)
print(f'Found {len(raw)} documents to upload.')

# Use the embedding model to embed docs
embedding_dim = embedding_name_to_dim(args.embedding)
embedding_model = SentenceTransformer(args.embedding)
print(f'Loaded the {args.embedding} model.')
# Use the embedding model to embed docs
embedding_dim = embedding_name_to_dim(args.embedding)
embedding_model = SentenceTransformer(args.embedding)
print(f'Loaded the {args.embedding} model.')

documents = []
for i in tqdm(range(len(raw)), desc='Inserting into db'):
doc = ""
# ===========================
# FILL ME OUT
# Prepare the documents to be inserted into the vector db
# You will need to compute embeddings. Make sure to cast the embedding to a list.
# Please refer to `config.json` for which embedding to use:
# Example document:
# {
# "embeddings": {
# "values": [0.1, 0.2, 0.3, 0.4, 0.5],
# "dimensionality": 5,
# }, # single vector document
# "metadata": {
# "doc_id": "...",
# }
# }
# Please add the document ID to the metadata under the key `doc_id`.
# Please see docs here: https://docs.starpoint.ai/create-documents
# TODO
# ===========================
assert len(doc) > 0, f"Did you complete the code in `insert_docs.py`?"
documents.append(doc)
documents = []
for i in tqdm(range(len(raw)), desc='Inserting into db'):
    doc = {}
    # ===========================
    # FILL ME OUT
    # Prepare the documents to be inserted into the vector db
    # You will need to compute embeddings. Make sure to cast the embedding to a list.
    # Please refer to `config.json` for which embedding to use:
    # Example document:
    # {
    #   "embeddings": {
    #     "values": [0.1, 0.2, 0.3, 0.4, 0.5],
    #     "dimensionality": 5,
    #   },  # single vector document
    #   "metadata": {
    #     "doc_id": "...",
    #   }
    # }
    # Please add the document ID to the metadata under the key `doc_id`.
    # Please see docs here: https://docs.starpoint.ai/create-documents
    # TODO
    # ===========================
    doc = {
        "embeddings": {
            "values": embedding_model.encode(raw.iloc[i]['text']).tolist(),
            "dimensionality": embedding_dim,
        },
        "metadata": {
            "doc_id": raw.iloc[i]['doc_id'],
            "text": raw.iloc[i]['text']
        }
    }
    assert len(doc) > 0, f"Did you complete the code in `insert_docs.py`?"
    documents.append(doc)

assert len(documents) > 0, f"Please remember to append to the documents array"

print(f'Inserting documents into Starpoint collection {collection_name}')
insert_documents(args.starpoint_api_key, collection_name, documents)
print(f'Done. {len(documents)} inserted.')
assert len(documents) > 0, f"Please remember to append to the documents array"

print(f'Inserting documents into Starpoint collection {collection_name}')
insert_documents(args.starpoint_api_key, collection_name, documents)
print(f'Done. {len(documents)} inserted.')

if __name__ == "__main__":
from os import environ as env
from dotenv import load_dotenv
load_dotenv()
from os import environ as env
from dotenv import load_dotenv
load_dotenv()
import argparse

import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--embedding', type=str, default='all-MiniLM-L6-v2', help='Embedding to use (default: all-MiniLM-L6-v2)')
parser.add_argument('--hyde', action='store_true', default=False, help='Use hyde embeddings (default: False)')
parser.add_argument('--doc-dir', type=str, default=join(DATA_DIR, 'documents/summer'), help='Document directory')
parser.add_argument('--starpoint-api-key', type=str, default=env['STARPOINT_API_KEY'], help='Starpoint API key')
args = parser.parse_args()
parser = argparse.ArgumentParser()
parser.add_argument('--embedding', type=str, default='all-MiniLM-L6-v2', help='Embedding to use (default: all-MiniLM-L6-v2)')
parser.add_argument('--hyde', action='store_true', default=False, help='Use hyde embeddings (default: False)')
parser.add_argument('--doc-dir', type=str, default=join(DATA_DIR, 'documents/winter'), help='Document directory')
parser.add_argument('--starpoint-api-key', type=str, default=env['STARPOINT_API_KEY'], help='Starpoint API key')
args = parser.parse_args()

main(args)
main(args)
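As a side note on the document-building loop above: `SentenceTransformer.encode` returns a NumPy array by default, so the `.tolist()` call converts it into a plain Python list that can be serialized in the insert payload. A quick check, assuming the sentence-transformers package and the script's default all-MiniLM-L6-v2 model:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
vec = model.encode('hello world')
print(type(vec), vec.shape)     # <class 'numpy.ndarray'> (384,)
print(type(vec.tolist()))       # <class 'list'>, ready for a JSON payload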
46 changes: 46 additions & 0 deletions course/week4/scripts/optimize_params.py
Original file line number Diff line number Diff line change
@@ -71,6 +71,22 @@ def get_search_space(self):
# over 100,000s, but this should illustrate the point.
# TODO
# ===========================

hparams: List[DotMap] = []
embedding_models = ["all-MiniLM-L6-v2", "thenlper/gte-small"]
text_search_weights = [0, 0.5]
hyde_embeddings_options = [False, True]

for embedding in embedding_models:
    for weight in text_search_weights:
        for hyde in hyde_embeddings_options:
            hparam = DotMap({
                "embedding": embedding,
                "text_search_weight": weight,
                "hyde_embeddings": hyde,
            })
            hparams.append(hparam)

assert len(hparams) > 0, "Remember to complete the code in `get_search_space`"
assert len(hparams) == 8, "You should have 8 configurations"
self.hparams = hparams
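The same 2 x 2 x 2 grid can be written more compactly with `itertools.product`; this sketch is equivalent to the nested loops above (assuming `DotMap` comes from the `dotmap` package used by this script):

from itertools import product
from dotmap import DotMap

hparams = [
    DotMap({"embedding": e, "text_search_weight": w, "hyde_embeddings": h})
    for e, w, h in product(
        ["all-MiniLM-L6-v2", "thenlper/gte-small"], [0, 0.5], [False, True]
    )
]
assert len(hparams) == 8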
@@ -108,6 +124,36 @@ def optimize(self):
# +1 to `hits` if it does. +0 to `hits` if not.
# TODO
# ===========================
hits = 0
for i in tqdm(range(len(questions))):
    question = questions.question.iloc[i]
    gt_id = questions.doc_id.iloc[i]

    # Compute embedding
    if self.input.hyde_embeddings:
        hypo_answer = questions.hypo_answers.iloc[i]
        embedding = embedding_model.encode(hypo_answer).tolist()
    else:
        embedding = embedding_model.encode(question).tolist()

    # Retrieve documents
    results = retrieve_documents(
        self.starpoint_api_key,
        collection_name=collection_name,
        query=question,
        query_embedding=embedding,
        top_k=3,
        text_search_weight=self.input.text_search_weight,
    )

    # Check if the correct document is in the top 3
    retrieved_ids = [result['metadata']['doc_id'] for result in results]
    if gt_id in retrieved_ids:
        hits += 1

hit_rate = hits / float(len(questions))
self.hit_rate = hit_rate  # save to class
self.hparam = self.input
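The hit rate above is a recall-at-3 style metric over the generated questions; a quick arithmetic check:

hits, total = 72, 100           # e.g. 72 of 100 questions retrieved their source doc in the top 3
print(hits / float(total))      # 0.72; the configuration with the highest hit rate wins the sweep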