RAG implementation #168

Open · wants to merge 1 commit into base: main
33 changes: 33 additions & 0 deletions rag/db.py
@@ -0,0 +1,33 @@
from datasets import load_dataset

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

### Build and embed a dataset into the FAISS vector db

dataset = load_dataset("databricks/databricks-dolly-15k")

# Join with a separator so contexts from different rows don't run together.
contexts = '\n\n'.join(dataset['train']['context'])

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

docs = text_splitter.create_documents([contexts])

embedder_path = "rag-embed-model"

embedder = HuggingFaceEmbeddings(
    model_name=embedder_path,
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': False},
)

db = FAISS.from_documents(docs, embedder)

db.save_local("faiss_index")
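
A quick way to sanity-check the result (not part of this diff) is to reload the saved index and run a similarity search against it; a minimal sketch, assuming db.py above has been run from the rag/ directory:

```python
# Sketch only: reload the index built by db.py and inspect the top hits.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embedder = HuggingFaceEmbeddings(model_name="rag-embed-model")
db = FAISS.load_local("faiss_index", embedder)

# Arbitrary test query; the Dolly-15k contexts cover many topics.
for hit in db.similarity_search("university architecture", k=2):
    print(hit.page_content[:200], "\n---")
```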
Binary file added rag/faiss_index/index.faiss
Binary file not shown.
Binary file added rag/faiss_index/index.pkl
Binary file not shown.
46 changes: 46 additions & 0 deletions rag/query.py
@@ -0,0 +1,46 @@
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

embedder_path = "rag-embed-model"

embedder = HuggingFaceEmbeddings(
    model_name=embedder_path,
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': False},
)

# Note: newer langchain releases may require allow_dangerous_deserialization=True here.
db = FAISS.load_local("faiss_index", embedder)
retriever = db.as_retriever(search_kwargs={"k": 2})


#################
query = "What is on top of the Main Building at the University of Notre Dame?"
model_name = "Intel/dynamic_tinybert"
#################


tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

def run(query):
    # Join retrieved chunks with a space so they don't run together.
    context = ' '.join(c.page_content for c in retriever.get_relevant_documents(query))
    # Truncate to the model's max sequence length; two 1000-char chunks can exceed it.
    inputs = tokenizer(query, context, return_tensors="pt", truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)

    # Get the most likely answer span (start and end token positions)
    start_index = torch.argmax(outputs.start_logits)
    end_index = torch.argmax(outputs.end_logits)

    # Decode the answer from the token indices
    answer = tokenizer.decode(inputs["input_ids"][0, start_index:end_index + 1])

    return answer

print(run(query))
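
The manual span decoding in run() could also be delegated to the transformers question-answering pipeline, which handles tokenization, truncation, and start/end selection internally. A sketch of that variant (an alternative, not what this PR implements):

```python
from transformers import pipeline

# Extractive QA with built-in span decoding; reuses the retriever defined above.
qa = pipeline("question-answering", model="Intel/dynamic_tinybert")

def run_with_pipeline(query):
    context = ' '.join(c.page_content for c in retriever.get_relevant_documents(query))
    result = qa(question=query, context=context)
    return result["answer"]  # result also includes a confidence 'score'
```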
9 changes: 9 additions & 0 deletions rag/rag-embed-model/1_Pooling/config.json
@@ -0,0 +1,9 @@
{
  "word_embedding_dimension": 384,
  "pooling_mode_cls_token": false,
  "pooling_mode_mean_tokens": true,
  "pooling_mode_max_tokens": false,
  "pooling_mode_mean_sqrt_len_tokens": false,
  "pooling_mode_weightedmean_tokens": false,
  "pooling_mode_lasttoken": false
}
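
This config selects mean pooling: token embeddings are averaged (masking out padding) to produce one 384-dimensional sentence vector. A minimal sketch of the computation, for reference:

```python
import torch

def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # token_embeddings: (batch, seq, 384); attention_mask: (batch, seq)
    mask = attention_mask.unsqueeze(-1).float()      # (batch, seq, 1)
    summed = (token_embeddings * mask).sum(dim=1)    # sum over real tokens only
    counts = mask.sum(dim=1).clamp(min=1e-9)         # avoid division by zero
    return summed / counts                           # (batch, 384)
```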
92 changes: 92 additions & 0 deletions rag/rag-embed-model/README.md
@@ -0,0 +1,92 @@
---
library_name: sentence-transformers
pipeline_tag: sentence-similarity
tags:
- sentence-transformers
- feature-extraction
- sentence-similarity

---

# {MODEL_NAME}

This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for tasks like clustering or semantic search.

<!--- Describe your model here -->

## Usage (Sentence-Transformers)

Using this model is straightforward once you have [sentence-transformers](https://www.SBERT.net) installed:

```
pip install -U sentence-transformers
```

Then you can use the model like this:

```python
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('{MODEL_NAME}')
embeddings = model.encode(sentences)
print(embeddings)
```
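
Since the model targets sentence similarity, a typical next step is scoring pairs with cosine similarity; continuing the snippet above:

```python
from sentence_transformers import util

# Pairwise cosine similarity between the two example embeddings.
print(util.cos_sim(embeddings[0], embeddings[1]))
```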



## Evaluation Results

<!--- Describe how your model was evaluated -->

For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})


## Training
The model was trained with the parameters:

**DataLoader**:

`torch.utils.data.dataloader.DataLoader` of length 5475 with parameters:
```
{'batch_size': 16, 'sampler': 'torch.utils.data.sampler.RandomSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
```

**Loss**:

`sentence_transformers.losses.ContrastiveLoss.ContrastiveLoss` with parameters:
```
{'distance_metric': 'SiameseDistanceMetric.COSINE_DISTANCE', 'margin': 0.5, 'size_average': True}
```

Parameters of the fit()-Method:
```
{
    "epochs": 1,
    "evaluation_steps": 0,
    "evaluator": "NoneType",
    "max_grad_norm": 1,
    "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
    "optimizer_params": {
        "lr": 2e-05
    },
    "scheduler": "WarmupLinear",
    "steps_per_epoch": null,
    "warmup_steps": 100,
    "weight_decay": 0.01
}
```
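
These parameters correspond to a model.fit(...) call in the sentence-transformers v2 API. A hedged reconstruction of the training script (the actual pair data is not included in this PR, so the examples below are placeholders):

```python
from torch.utils.data import DataLoader
from sentence_transformers import InputExample, SentenceTransformer, losses

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Placeholder pairs; the real training set is not part of this diff.
train_examples = [InputExample(texts=["sentence a", "sentence b"], label=1)]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# Defaults match the card above: cosine distance, margin 0.5.
train_loss = losses.ContrastiveLoss(model=model, margin=0.5)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
    weight_decay=0.01,
    optimizer_params={"lr": 2e-05},
)
model.save("rag-embed-model")
```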


## Full Model Architecture
```
SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
  (2): Normalize()
)
```
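
Note the final Normalize() module: embeddings from this model come out unit-length even though db.py passes normalize_embeddings: False (that flag only controls an extra normalization pass in the encode call). A quick check, as a sketch:

```python
import numpy as np
from langchain_community.embeddings import HuggingFaceEmbeddings

embedder = HuggingFaceEmbeddings(model_name="rag-embed-model")
print(np.linalg.norm(embedder.embed_query("test sentence")))  # ≈ 1.0
```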

## Citing & Authors

<!--- Describe where people can find more information -->
26 changes: 26 additions & 0 deletions rag/rag-embed-model/config.json
@@ -0,0 +1,26 @@
{
  "_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.37.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
7 changes: 7 additions & 0 deletions rag/rag-embed-model/config_sentence_transformers.json
@@ -0,0 +1,7 @@
{
  "__version__": {
    "sentence_transformers": "2.0.0",
    "transformers": "4.6.1",
    "pytorch": "1.8.1"
  }
}
Binary file added rag/rag-embed-model/model.safetensors
Binary file not shown.
20 changes: 20 additions & 0 deletions rag/rag-embed-model/modules.json
@@ -0,0 +1,20 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  },
  {
    "idx": 2,
    "name": "2",
    "path": "2_Normalize",
    "type": "sentence_transformers.models.Normalize"
  }
]
4 changes: 4 additions & 0 deletions rag/rag-embed-model/sentence_bert_config.json
@@ -0,0 +1,4 @@
{
  "max_seq_length": 256,
  "do_lower_case": false
}
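
Worth noting: max_seq_length is 256 tokens, while db.py splits at 1000 characters, so longer chunks are silently truncated at embedding time. A sketch for estimating how many tokens a chunk actually uses (assumes the saved model directory includes its tokenizer files):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("rag-embed-model")
sample_chunk = "some retrieved chunk text " * 40  # stand-in for a ~1000-char chunk
print(len(tok.encode(sample_chunk)), "tokens; anything past 256 is dropped by the embedder")
```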
37 changes: 37 additions & 0 deletions rag/rag-embed-model/special_tokens_map.json
@@ -0,0 +1,37 @@
{
  "cls_token": {
    "content": "[CLS]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "[MASK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "[PAD]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "[SEP]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "[UNK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}