
Commit

Merge pull request #2 from pbansal5/debugging
Debugging
Hprairie authored Mar 20, 2024
2 parents 6c5bfa9 + 3f240c3 commit e61791f
Showing 21 changed files with 306 additions and 249 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1 +1,2 @@
src/.ipynb_checkpoints
src/.ipynb_checkpoints
jsons/
125 changes: 0 additions & 125 deletions jsons/wikitext_forbidden_titles.txt

This file was deleted.

19 changes: 19 additions & 0 deletions logs/baseline/args.txt
@@ -0,0 +1,19 @@
**************** Benchmark Params ****************
auth_token None
cache_dir None
dataset_name wikitext-103-v1
dataset_path wikitext
dataset_split validation
load_from hf
max_length 32
model_layer None
model_name gpt2
model_parallelism False
normalization_level word
num_docs_to_rank -1
output_dir logs/baseline
ranking_logprob_past_tokens 16
ranking_strategy first
retrieved_file None
retrieved_max_length 256
stride 4
1 change: 1 addition & 0 deletions logs/baseline/eval.json
@@ -0,0 +1 @@
{"eval_perplexity": 1.0391204357147217}
Binary file added logs/baseline/ppls.pkl
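For context (not part of this commit): an eval_perplexity value like the one in eval.json above is conventionally exp of the mean per-token negative log-likelihood. A minimal sketch, assuming ppls.pkl stores a flat list of per-token NLLs (the pickle layout is not documented in this commit and may differ):

import math
import pickle

# Hypothetical sketch: assumes ppls.pkl holds a flat list of per-token
# negative log-likelihoods; the actual layout written by eval_lm.py may differ.
with open("logs/baseline/ppls.pkl", "rb") as f:
    token_nlls = pickle.load(f)

eval_perplexity = math.exp(sum(token_nlls) / len(token_nlls))
print({"eval_perplexity": eval_perplexity})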
40 changes: 40 additions & 0 deletions logs/results-gp2-bert-base/args.txt
@@ -0,0 +1,40 @@
**************** Benchmark Params ****************
auth_token None
cache_dir None
dataset_name wikitext-103-v1
dataset_path wikitext
dataset_split validation
load_from hf
max_length 32
model_layer 12
model_name gpt2
model_parallelism False
normalization_level word
num_docs_to_rank 16
output_dir logs/results-gp2-bert-base
ranking_logprob_past_tokens 16
ranking_strategy colbert
retrieved_file jsons/bert_reranked_wikitext_rql_32_rs_4_topK_16.json
retrieved_max_length 32
stride 4



**************** BM25 Logging INFO ****************
data_dir
forbidden_titles jsons/wikitext_forbidden_titles.txt
output_file jsons/wikitext_rql_32_rs_4_topK_100.json
query_corpus wikitext
retrieval_corpus wikipedia-dpr-100w
retrieval_query_length 32
retrieval_stride 4
tokenizer gpt2
topK 100



**************** Reranking Logging INFO ****************
bm25_file jsons/wikitext_rql_32_rs_4_topK_100.json
data_dir
max_length 256
rerank_model bert-base-uncased
reranked_file jsons/bert_reranked_wikitext_rql_32_rs_4_topK_16.json
retrieval_corpus wikipedia-dpr-100w
topK 16
1 change: 1 addition & 0 deletions logs/results-gp2-bert-base/eval.json
@@ -0,0 +1 @@
{"eval_perplexity": 1.1255316734313965, "num_input_no_retrieval": 0}
Binary file added logs/results-gp2-bert-base/ppls.pkl
11 changes: 11 additions & 0 deletions scripts/baseline.sh
@@ -0,0 +1,11 @@
export MODEL_NAME='gpt2'
export OUTPUT_DIR='logs/baseline'

python3 -m benchmark.eval_lm \
--model_name $MODEL_NAME \
--dataset_path wikitext \
--dataset_name wikitext-103-v1 \
--dataset_split 'validation' \
--output_dir $OUTPUT_DIR \
--stride 4 \
--max_length 32 \
18 changes: 18 additions & 0 deletions scripts/benchmark-perplexity.sh
@@ -0,0 +1,18 @@
export MODEL_NAME='gpt2'
export OUTPUT_DIR='logs/results-gp2-bert-base'
export RETRIEVAL_FILE='jsons/bert_reranked_wikitext_rql_32_rs_4_topK_16.json'

python3 -m benchmark.eval_lm \
--model_name $MODEL_NAME \
--dataset_path wikitext \
--dataset_name wikitext-103-v1 \
--dataset_split 'validation' \
--output_dir $OUTPUT_DIR \
--stride 4 \
--max_length 32 \
--retrieved_file $RETRIEVAL_FILE \
--ranking_strategy 'colbert' \
--num_docs_to_rank 16 \
--ranking_logprob_past_tokens 16 \
--retrieved_max_length 32 \
--model_layer 12
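For reference, a rough sketch of what one entry in the RETRIEVAL_FILE consumed by this script might look like, inferred only from the fields the updated eval_lm.py below accesses (begin_location, end_location, retrieved_docs, reranked_retrieved_docs, docid, rank); the concrete values and the "title"/"text" keys are assumptions, not a documented schema:

# Hypothetical single entry of the retrieval JSON, shown as a Python literal.
# Only the field names accessed in the eval_lm.py diff are grounded;
# everything else here is illustrative.
retrieval_entry = {
    "begin_location": 0,      # token offset where this query window starts
    "end_location": 32,       # token offset where it ends
    "retrieved_docs": [
        {"title": "Some article", "text": "Retrieved passage text ..."},
    ],
    "reranked_retrieved_docs": {
        "layer12": [
            # the 'colbert' strategy keeps the entry with the largest 'rank'
            {"docid": 0, "rank": 15.3},
        ],
    },
}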
Binary file added src/benchmark/__pycache__/__init__.cpython-38.pyc
Binary file added src/benchmark/__pycache__/eval_lm.cpython-38.pyc
70 changes: 43 additions & 27 deletions src/benchmark/eval_lm.py
@@ -6,6 +6,7 @@
import numpy as np
import torch
import transformers
import torch
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
from datasets import load_dataset
@@ -27,27 +28,30 @@ def evaluate_logprob_with_retrieved_docs(
num_tokens_to_rank,
retrieval_max_length,
num_docs=-1,
model_layer=None,
*args
):
input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)

match ranking_strategy:
case 'first':
assert num_docs in [-1, 1], f"In 'first' ranking strategy, unexpected number of docs to rank: {num_docs}"
num_docs = 1
chosen_doc_id = 0
case 'random':
chosen_doc_id = np.random.randint(num_docs)
retrieved_item["retrieved_docs"] = [retrieved_item["retrieved_docs"][chosen_doc_id]]
num_docs = 1
case 'colbert':
best_doc = None
for doc_info in retrieved_item["reranked_retrieved_docs"][f'layer{args.layer}']:
if best_doc is None or doc_info['rank'] > best_doc['rank']:
best_doc = doc_info
chosen_doc_id = best_doc['docid']
num_docs = 1
retrieved_item["retrieved_docs"] = [best_doc]
if ranking_strategy == 'first':
assert num_docs in [-1, 1], f"In 'first' ranking strategy, unexpected number of docs to rank: {num_docs}"
num_docs = 1
chosen_doc_id = 0
elif ranking_strategy == 'random':
chosen_doc_id = np.random.randint(num_docs)
retrieved_item["retrieved_docs"] = [retrieved_item["retrieved_docs"][chosen_doc_id]]
num_docs = 1
elif ranking_strategy == 'colbert':
assert model_layer is not None, "ColBERT was selected, but no model layer was specified"
best_doc = None
for doc_info in retrieved_item["reranked_retrieved_docs"][f'layer{model_layer}']:
if best_doc is None or doc_info['rank'] > best_doc['rank']:
best_doc = doc_info
chosen_doc_id = best_doc['docid']
num_docs = 1
retrieved_item["retrieved_docs"] = [best_doc]
else:
raise NotImplementedError('Unknown Reranking Strategy')


num_docs_in_retrieved = len(retrieved_item["retrieved_docs"])
@@ -58,6 +62,7 @@ def evaluate_logprob_with_retrieved_docs(
labels_for_ranking = input_ids.clone()
assert input_ids.size() == (num_docs, end_loc-begin_loc)


for doc_id in range(num_docs):
retrieved_example = retrieved_item["retrieved_docs"][doc_id]

@@ -67,7 +72,10 @@ def evaluate_logprob_with_retrieved_docs(
doc_text = doc_title + "\n" + doc_text
encoded_retrieved_text = tokenizer.encode(doc_text, max_length=retrieval_max_length, truncation=True)

# Changing this
input_ids[doc_id, :len(encoded_retrieved_text)] = torch.tensor(encoded_retrieved_text, device=device)
# to this
#input_ids[doc_id].concat(torch.tensor(encoded_retrieved_text, device=device))

loss_fct = CrossEntropyLoss(reduction="none")

@@ -96,7 +104,7 @@ def evaluate_logprob_with_retrieved_docs(

# Calculate logprob of the chosen doc:
lm_logits = lm_logits[batch_doc_id, -trg_len-1:-1, :]
labels = target_ids[batch_doc_id, -trg_len:]
labels = target_ids[batch_doc_id, -trg_len:] # Changed this
loss = loss_fct(lm_logits, labels)
token_ppls = loss.cpu()
tokens_to_predict = labels.view(-1).cpu().tolist()
@@ -118,7 +126,8 @@ def eval_dataset(
retrieval_max_length=256,
ranking_strategy="first",
num_docs_to_rank=1,
num_tokens_to_rank_logprob=16
num_tokens_to_rank_logprob=16,
model_layer=None # Used for ColBERT
):
encodings = tokenizer(dataset, add_special_tokens=False, return_tensors="pt")

@@ -138,8 +147,9 @@ def eval_dataset(


# Get the retrieved dataset
if retrieved_info:
retrieval_dataset = retrieved_info['query_to_retrieved_docs']
retrieval_dataset = None
if retrieval_info:
retrieval_dataset = retrieval_info['query_to_retrieved_docs']

nlls = []
prev_end_loc = 0
@@ -149,20 +159,22 @@ def eval_dataset(
all_tokens_to_predict = []
all_chosen_doc_ids = [None]
num_inputs_no_retrieval = 0
for begin_loc in tqdm(range(0, dataset_len, stride)):
for begin_loc in tqdm(range(0, dataset_len, stride)[:500]): # Change this before benchmarking
end_loc = min(begin_loc + max_length, dataset_len)
trg_len = end_loc - prev_end_loc # may be different from stride on last loop
if idx > 0 and retrieval_dataset is not None and len(retrieval_dataset[idx]["retrieved_docs"]) > 0:
retrieved_example = retrieval_dataset[idx]
assert retrieved_example["begin_location"] == prev_end_loc
assert retrieved_example["end_location"] == end_loc
assert retrieved_example["begin_location"] == begin_loc, f"{retrieved_example['begin_location']} is different from {prev_end_loc}"
assert retrieved_example["end_location"] == end_loc, f"{retrieved_example['end_location']} is different from {end_loc}"
#print(retrieved_example["begin_location"], retrieved_example["end_location"], begin_loc, end_loc, prev_end_loc)

neg_log_likelihood, chosen_doc_id, token_ppls, tokens_to_predict = evaluate_logprob_with_retrieved_docs(
model, tokenizer, device, encodings, begin_loc, end_loc, trg_len, retrieved_example,
ranking_strategy=ranking_strategy,
num_tokens_to_rank=num_tokens_to_rank_logprob,
retrieval_max_length=retrieval_max_length,
num_docs=num_docs_to_rank
num_docs=num_docs_to_rank,
model_layer=model_layer
)
all_chosen_doc_ids.append(chosen_doc_id)
else:
@@ -195,7 +207,7 @@ def eval_dataset(
all_tokens_to_predict.append(tokens_to_predict)
assert len(all_token_ppls) == len(all_tokens_to_predict)

prev_end_loc = end_loc
prev_end_loc = end_loc
idx += 1
if end_loc == dataset_len:
break
@@ -265,6 +277,7 @@ def main(args):
ranking_strategy=args.ranking_strategy,
num_docs_to_rank=args.num_docs_to_rank,
num_tokens_to_rank_logprob=args.ranking_logprob_past_tokens,
model_layer=args.model_layer
)


@@ -291,10 +304,13 @@ def main(args):
# retrieval params
parser.add_argument("--retrieved_file", type=str, default=None)
parser.add_argument("--retrieved_max_length", type=int, default=256)
parser.add_argument("--ranking_strategy", type=str, choices=["first", "logprob", "oracle", "random"], default="first")
parser.add_argument("--ranking_strategy", type=str, choices=["first", "logprob", "oracle", "random", "colbert"], default="first")
parser.add_argument("--num_docs_to_rank", type=int, default=-1)
parser.add_argument("--ranking_logprob_past_tokens", type=int, default=16)

# ColBERT params
parser.add_argument("--model_layer", type=int, default=None, help='Which layer to use from the reranker')

args = parser.parse_args()

main(args)
Binary file modified src/cbralm/__pycache__/__init__.cpython-38.pyc
Binary file modified src/cbralm/__pycache__/bm25_retrieval.cpython-38.pyc
Binary file added src/cbralm/__pycache__/model_utils.cpython-38.pyc