-
Notifications
You must be signed in to change notification settings - Fork 3
/
5.deprecated_Evaluating_Embeddings.py
384 lines (274 loc) · 11.5 KB
/
5.deprecated_Evaluating_Embeddings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
# Databricks notebook source
# MAGIC %md
# MAGIC # Understanding Embeddings
# MAGIC Embeddings are just vectors and we can visualise and analyse them as such \
# MAGIC A common way to look at and explore embeddings is to use TSNE visualisations. \
# MAGIC This can be applied to our VectorDB Data too.
# MAGIC
# MAGIC See: https://www.kaggle.com/code/colinmorris/visualizing-embeddings-with-t-sne
# MAGIC
# MAGIC An open source tool that you might want to investigate for this as well is Arize Phoenix \
# MAGIC See: https://docs.arize.com/phoenix/
# COMMAND ----------
# "arize-phoenix[experimental]" pandas==1.5.3
# Install the third-party packages used by this notebook.
# llama_index is pinned to 0.8.54; every other package is left unpinned.
# NOTE(review): later cells import `umap`, which is not in this list —
# presumably it is already available on the cluster; confirm.
%pip install -U llama_index==0.8.54 faiss-cpu datashader bokeh holoviews scikit-image colorcet "arize-phoenix[experimental]"
# COMMAND ----------
# Restart the Python process after the install above.
dbutils.library.restartPython()
# COMMAND ----------
# MAGIC %md
# MAGIC # Setup configs
# COMMAND ----------
# MAGIC %run ./utils
# COMMAND ----------
import os
import numpy as np
# COMMAND ----------
# DBTITLE 1,Configurations
# Path of the sample PDF that is loaded and indexed further down.
# Alternative sample, kept for reference (`dbfs_source_docs` is not defined in
# this file — presumably it comes from ./utils):
# test_pdf = f'{dbfs_source_docs}/2010.11934.pdf'
test_pdf = '/dbfs/bootcamp_data/pdf_data/2302.09419.pdf'
# Bare expression so the notebook echoes the configured path as the cell output.
test_pdf
# COMMAND ----------
# MAGIC %md
# MAGIC
# MAGIC # Understanding Embeddings
# MAGIC
# MAGIC Let's explore how data embeds a bit more in order to see how we can improve retrieval \
# MAGIC We will use a model deployed on Databricks Model Serving
# COMMAND ----------
# DBTITLE 1,Setup some embedding algorithms
# Read the workspace host name and an API token from the notebook context.
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
browser_host = notebook_context.browserHostName().get()
db_token = notebook_context.apiToken().get()
db_host = f"https://{browser_host}"

# Names of the serving endpoints and the invocation URLs derived from them.
serving_uri = 'vicuna_13b'
embedding_uri = 'brian_embedding_endpoint'
serving_model_uri = f"{db_host}/serving-endpoints/{serving_uri}/invocations"
embedding_model_uri = f"{db_host}/serving-endpoints/{embedding_uri}/invocations"

# ServingEndpointLLM and ModelServingEndpointEmbeddings are not defined in this
# file; presumably they are provided by the `%run ./utils` cell.
llm_model = ServingEndpointLLM(endpoint_url=serving_model_uri, token=db_token)
# NOTE(review): embedding_model_uri is built above but never passed to the
# embeddings wrapper — confirm how it picks its endpoint.
embeddings = ModelServingEndpointEmbeddings(db_api_token=db_token)
# COMMAND ----------
# MAGIC %md
# MAGIC ## Simple Exploration w ReRank
# COMMAND ----------
# most vector stores use cosine_similarity
import faiss

# Small toy corpus: every sentence mentions Australia, roughly half are about
# kangaroos and half about the human population.
example_sentences = ["The kangaroo population in Australia is declining due to habitat loss and hunting.",
                     "Australia has a diverse population of people from all over the world.",
                     "The kangaroo is a symbol of Australia and appears on its coat of arms.",
                     "The population of Australia is projected to reach 50 million by 2050.",
                     "Kangaroos are unique to Australia and can only be found in the wild there.",
                     "The indigenous population of Australia has been marginalized for centuries.",
                     "Australia is home to a variety of fascinating animals, including the kangaroo.",
                     "The population density of Australia is relatively low compared to other countries.",
                     "Kangaroos play an important role in maintaining the ecosystem balance in Australia.",
                     "Australia has strict laws regulating the hunting and trade of kangaroos to protect their population."]

# Embed each sentence and stack the results into a float32 matrix
# (one row per sentence), which is the dtype faiss works with.
encoded_sentences = [embeddings.embed_query(sentence) for sentence in example_sentences]
vector_format_encode = np.array(encoded_sentences, dtype=np.float32)
# L2-normalise every row so that the inner product used by IndexFlatIP
# is the cosine similarity.
vector_format_encode /= np.linalg.norm(vector_format_encode, axis=1, keepdims=True)

# we will create a vector index
vector_index = faiss.IndexFlatIP(vector_format_encode.shape[1])
vector_index.add(vector_format_encode)

test_question = "What is affecting the population of kangaroos?"
# The query must be prepared exactly like the indexed rows: float32 and
# unit length. Otherwise the scores returned by the inner-product index are
# not cosine similarities (and a float64 query does not match the index dtype).
embedded_query = np.array(embeddings.embed_query(test_question), dtype=np.float32)
embedded_query /= np.linalg.norm(embedded_query)
# COMMAND ----------
# we can look at the retrieved entries and how it has been processed
k = 4
# faiss searches a float32 matrix of shape (n_queries, dim); cast explicitly
# so the call does not depend on the dtype embedded_query happens to have.
query_matrix = np.asarray([embedded_query], dtype=np.float32)
scores, index = vector_index.search(query_matrix, k)

# look up the index for sentences
# faiss pads the id list with -1 when fewer than k vectors are available;
# skip those so that -1 is not mistaken for "last sentence".
top_sentences = [example_sentences[i] for i in index[0] if i != -1]

# Pair each score (kept as a 1-element row) with its sentence; zip stops at
# the shorter sequence, so padded scores are dropped as well.
human_readable_result = list(zip(scores.reshape(-1, 1), top_sentences))
for score, sentence in human_readable_result:
    print(f"Score: {score[0]:.4f}, Sentence: {sentence}")
# COMMAND ----------
# we can use a rerank to try to improve the result
# Number the retrieved sentences 1..n so the LLM can refer to each one
# individually in its answer (previously every entry was labelled
# "Document 1", which made the requested "Doc: <n>" output meaningless).
format_top = [
    f"Document {position}:\n"
    f"{sentence}"
    for position, sentence in enumerate(top_sentences, start=1)
]
context_str = "\n\n".join(format_top)

## Our Reranking prompt
# Few-shot style instruction: explains the task, shows the expected
# input/output layout, then appends the real documents and question.
rerank_prompt = ("A list of documents is shown below. Each document has a number next to it along "
                 "with a summary of the document. A question is also provided. \n"
                 "Respond with the numbers of the documents "
                 "you should consult to answer the question, in order of relevance, as well \n"
                 "as the relevance score. The relevance score is a number from 1-10 based on "
                 "how relevant you think the document is to the question.\n"
                 "Do not include any documents that are not relevant to the question. \n"
                 "Example format: \n"
                 "Document 1:\n<summary of document 1>\n\n"
                 "Document 2:\n<summary of document 2>\n\n"
                 "...\n\n"
                 "Document 10:\n<summary of document 10>\n\n"
                 "Question: <question>\n"
                 "Answer:\n"
                 "Doc: 9, Relevance: 7\n"
                 "Doc: 3, Relevance: 4\n"
                 "Doc: 7, Relevance: 3\n\n"
                 "Let's try this now: \n\n"
                 f"{context_str}\n"
                 f"Question: {test_question}\n"
                 "Answer:\n")

# Send the prompt to the serving-endpoint LLM and show its raw answer.
reranked_result = llm_model(rerank_prompt)
print(reranked_result)
# COMMAND ----------
# MAGIC %md
# MAGIC ### Visualising Embeddings
# COMMAND ----------
# So we can use reranking in order to better craft our results.
# Can we also look at our embeddings to understand the content?
# We will use umap and bokeh for this
import pandas as pd
import umap
from umap import plot
import plotly.express as px
from bokeh.resources import CDN
from bokeh.embed import file_html
# Fit a 2-D UMAP reducer on the normalised sentence embeddings; the fixed
# random_state keeps the layout stable between runs.
# (A 3-D variant would use n_components=3.)
umap_2d = umap.UMAP(
    n_components=2,
    init='random',
    random_state=0,
)
proj_2d = umap_2d.fit(vector_format_encode)

# Lookup table from point index to the sentence it represents.
hover_data = pd.DataFrame(
    {
        'index': np.arange(len(example_sentences)),
        'text': example_sentences,
    }
)
# COMMAND ----------
# Prepare notebook output for the umap.plot figures created below.
plot.output_notebook()
# COMMAND ----------
# MAGIC %md
# MAGIC We can now visualise the data, note that we don't have a lot of datapoints \
# MAGIC so there aren't any obvious patterns in these but as you add more points patterns should appear
# COMMAND ----------
# Build the interactive scatter plot from the fitted UMAP object.
# Hover tooltips are currently switched off; the disabled argument was:
# hover_data=hover_data,
p = plot.interactive(proj_2d, point_size=10)
# Render the figure to an HTML document titled "Sample Sentences", using the
# CDN resources imported from bokeh.resources.
html = file_html(p, CDN, "Sample Sentences")
# Show the generated HTML in the notebook output.
displayHTML(html)
# COMMAND ----------
# MAGIC %md
# MAGIC # Embeddings with Whole Document
# MAGIC
# COMMAND ----------
# MAGIC %md ## Setup Service Context
# MAGIC By default, llama_index assumes that OpenAI is the service context. \
# MAGIC We are using models deployed on Databricks Model Serving instead, so the setup is a little different. \
# MAGIC Two pieces are needed: an embedder and the LLM itself. \
# MAGIC We will use the embeddings object created earlier in this notebook \
# MAGIC to do the embeddings for our vector store, \
# MAGIC whilst the LLM serving endpoint (`vicuna_13b`) provides the brains.
# COMMAND ----------
from llama_index import (
ServiceContext,
set_global_service_context,
LLMPredictor
)
from llama_index.embeddings import LangchainEmbedding
from llama_index.callbacks import CallbackManager, OpenInferenceCallbackHandler, LlamaDebugHandler
# Callback plumbing: the OpenInference handler is attached through a
# callback manager so it can be handed to the service context below.
callback_handler = OpenInferenceCallbackHandler()
callback_manager = CallbackManager([callback_handler])

# Wrap the serving-endpoint LLM so llama_index can use it.
llm_predictor = LLMPredictor(llm=llm_model)

# Bundle the LLM, the embedding model and the callbacks into one context.
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    embed_model=embeddings,
    callback_manager=callback_manager,
)

# we can now set this context to be a global default
set_global_service_context(service_context)
# COMMAND ----------
# MAGIC %md
# MAGIC ## Load and Chunk Document
# MAGIC We will load a sample doc to test on, firstly with a naive default chunking strategy
# COMMAND ----------
# DBTITLE 1,Create Index
# chunk the output
from llama_index import (
download_loader, VectorStoreIndex
)
from llama_index.evaluation import DatasetGenerator
from pathlib import Path
# Fetch the PDFReader loader class by name and instantiate it.
PDFReader = download_loader('PDFReader')
loader = PDFReader()
# This produces a list of llama_index document objects
documents = loader.load_data(file=Path(test_pdf))
# we are just setting up a simple in memory Vectorstore here
# NOTE: this rebinds `index`, which the earlier faiss search cell used for
# the returned ids; from here on `index` is the VectorStoreIndex.
index = VectorStoreIndex.from_documents(documents)
# COMMAND ----------
# Let's have a quick look at the embeddings
# Pull the raw text out of every loaded document object.
text_obj = [doc.text for doc in documents]
# One embedding call per text, in document order.
encoded_chunks = list(map(embeddings.embed_query, text_obj))
# Stack into a float32 matrix and scale each row to unit length.
vector_chunks = np.array(encoded_chunks, dtype=np.float32)
vector_chunks /= np.linalg.norm(vector_chunks, axis=1, keepdims=True)
# COMMAND ----------
# DBTITLE 1,Examine Chunk text
# Widen the column display so long chunk texts are not truncated.
pd.set_option('display.max_colwidth', 1000)
# Build the table from the document texts extracted above. Without this the
# cell would show the stale `hover_data` of the example sentences, because
# the chunk version was only created in the following cell.
hover_data = pd.DataFrame({'index': np.arange(len(text_obj)),
                           'text': text_obj})
hover_data
# COMMAND ----------
# DBTITLE 1,Visualise Chunk Text
# Same projection recipe as for the example sentences, now applied to the
# document embeddings. (A 3-D variant would use n_components=3.)
umap_2d = umap.UMAP(
    n_components=2,
    init='random',
    random_state=0,
)
proj_2d = umap_2d.fit(vector_chunks)

# Lookup table from point index to the text it represents.
hover_data = pd.DataFrame(
    {
        'index': np.arange(len(text_obj)),
        'text': text_obj,
    }
)

# Hover tooltips stay disabled here as well (hover_data is not passed).
p = plot.interactive(proj_2d, point_size=10)
html = file_html(p, CDN, "Research Doc")
displayHTML(html)
# COMMAND ----------
# MAGIC %md TODO BEIR comparison of embedding algorithms
# COMMAND ----------
# DBTITLE 1,Create Sample Questions
import nest_asyncio

# Patch asyncio to allow nested event loops before the generator is used.
nest_asyncio.apply()

# Query engine on top of the in-memory index built earlier.
query_engine = index.as_query_engine()

# Question generator over the loaded documents. Note that it has additional
# settings to customise the prompt etc.
data_generator = DatasetGenerator.from_documents(
    documents=documents,
    service_context=service_context,
)

# This call asks the generator to produce the evaluation questions.
eval_questions = data_generator.generate_questions_from_nodes()
# COMMAND ----------
# MAGIC %md
# MAGIC # (WIP) Create Phoenix Visualisations
# MAGIC TODO We are working with the Arize team to make Phoenix work \
# MAGIC till that happens this code will not be of use for now
# COMMAND ----------
# Extract out nodes
# Walk every node kept in the index's docstore and collect, per node,
# its hash (used as the document ID), its text and its stored embedding.
docstore = index.storage_context.docstore
vector_store = index.storage_context.vector_store
stored_nodes = list(docstore.docs.items())

document_ids = [stored.hash for _, stored in stored_nodes]
document_texts = [stored.text for _, stored in stored_nodes]
document_embeddings = [
    np.array(vector_store.get(stored_id)) for stored_id, _ in stored_nodes
]

# One row per node; "text_vector" holds the embedding as a numpy array.
dataset_df = pd.DataFrame(
    {
        "document_id": document_ids,
        "text": document_texts,
        "text_vector": document_embeddings,
    }
)
# COMMAND ----------
# create the query frame
from llama_index.callbacks.open_inference_callback import as_dataframe

# Flush the buffer of the `callback_handler` that was registered with the
# service context's callback manager. Creating a fresh
# OpenInferenceCallbackHandler here (as was done before) yields a handler
# that is attached to nothing, so its buffer is always empty.
# NOTE(review): the buffer presumably only fills once queries are actually
# run through `query_engine` — confirm that happens before this cell.
query_data_buffer = callback_handler.flush_query_data_buffer()
sample_query_df = as_dataframe(query_data_buffer)
# Bare expression so the notebook displays the resulting frame.
sample_query_df
# COMMAND ----------
# NOTE: this rebinds `px`, which an earlier cell bound to plotly.express;
# from here on `px` refers to phoenix.
import phoenix as px
### Create the schema for the documents
# Maps the columns of dataset_df: "document_id" is used as the id column,
# "text_vector" as the embedding vector and "text" as the raw text.
# NOTE(review): whether prediction_id_column_name / prompt_column_names are
# the right schema fields for a corpus dataset depends on the installed
# Phoenix version — confirm against its docs.
database_schema = px.Schema(
    prediction_id_column_name="document_id",
    prompt_column_names=px.EmbeddingColumnNames(
        vector_column_name="text_vector",
        raw_data_column_name="text",
    ),
)
# Wrap the node dataframe and its schema into a Phoenix dataset called "database".
database_ds = px.Dataset(
    dataframe=dataset_df,
    schema=database_schema,
    name="database",
)
# Build the query-side dataset from the OpenInference query dataframe.
query_ds = px.Dataset.from_open_inference(sample_query_df)
# COMMAND ----------
# MAGIC %md
# MAGIC # Start Visualisation App
# COMMAND ----------
# Start the Phoenix app with the query dataset as primary and the document
# dataset as corpus, listening on all interfaces.
# The port is passed as an integer: a TCP port is a number, and the string
# '10101' used before is not what the server expects for binding.
session = px.launch_app(primary=query_ds, corpus=database_ds, host='0.0.0.0', port=10101)