-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalysis_visualization.py
91 lines (78 loc) · 3.64 KB
/
analysis_visualization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Analysis and visualization helpers (embedding search terms, querying the FAISS index, plotting); further cleanup is possible.
import numpy as np
import faiss
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import logging
import time
import torch
import pandas as pd
import plotly.express as px
from scraping_processing import search_for_query_embeddings, ensure_dimension
import PyPDF2
def extract_title_from_pdf(pdf_path):
    """Return a title for the PDF at *pdf_path*, or ``None`` if none is found.

    The ``/Title`` entry of the document information is preferred. When it
    is missing or empty, pages are scanned in order and the first non-blank
    line of extracted text is returned instead.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The title, or ``None`` when neither the metadata nor any page
        yields usable text.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # ``reader.metadata`` and ``page.extract_text()`` are the current
        # spellings of the legacy ``getDocumentInfo()`` / ``extractText()``
        # calls, which PyPDF2 3.x no longer accepts.
        metadata = reader.metadata
        # A PDF without an info dictionary yields ``None`` for the metadata.
        title = metadata.get('/Title') if metadata is not None else None
        if title:
            return title
        # Fall back to the first non-empty line of text found on any page.
        for page in reader.pages:
            text = (page.extract_text() or '').strip()
            for line in text.split('\n'):
                stripped = line.strip()
                if stripped:
                    return stripped
    return None
def search_with_embeddings(query_embeddings, faissindex, k=5):
    """Query *faissindex* with *query_embeddings* and return its result pair.

    Args:
        query_embeddings: Query vectors; they are passed through
            ``ensure_dimension`` before searching.
        faissindex: Object exposing ``search(queries, k)`` (presumably a
            FAISS index -- confirm against the caller).
        k: Number of results requested per query (default 5).

    Returns:
        The two values returned by ``faissindex.search`` as a tuple
        ``(D, I)``; by FAISS convention these are distances and indices.
    """
    prepared_queries = ensure_dimension(query_embeddings)
    distances, indices = faissindex.search(prepared_queries, k)
    return distances, indices
def embed_search_terms(search_terms, bert_tokenizer, bert_model):
    """Embed every search term with the given tokenizer/model pair.

    Each term is tokenized on its own, run through the model with gradient
    tracking disabled, and reduced by averaging ``last_hidden_state`` over
    axis 1 (presumably the token axis -- confirm for the model in use).
    The averaged vector is then passed through ``ensure_dimension``.

    Args:
        search_terms: Iterable of terms to embed.
        bert_tokenizer: Callable tokenizer returning PyTorch tensors.
        bert_model: Model whose output exposes ``last_hidden_state``.

    Returns:
        A NumPy array built from the per-term embeddings, in input order.
    """

    def _embed(term):
        # Tokenize a single term and run it through the model without
        # recording gradients.
        encoded = bert_tokenizer(term, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            model_output = bert_model(**encoded)
        hidden_states = model_output.last_hidden_state.cpu().numpy()
        pooled = np.mean(hidden_states, axis=1)
        return ensure_dimension(pooled)

    # Wrap in np.array so callers receive an array rather than a list.
    return np.array([_embed(term) for term in search_terms])
def visualize_embeddings(embeddings, titles=None, *, perplexity=1, random_state=0):
    """Reduce embeddings to three t-SNE components and show a 3D scatter plot.

    Args:
        embeddings: Array of embeddings, or a list/tuple of arrays that is
            stacked row-wise with ``np.vstack`` first.
        titles: Optional labels, one per embedding row, shown on hover.
        perplexity: t-SNE perplexity. Defaults to 1, the value that used to
            be hard-coded; scikit-learn requires it to be smaller than the
            number of samples, so larger datasets usually want a larger
            value.
        random_state: Seed passed to t-SNE for reproducible layouts
            (default 0, the previously hard-coded value).

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    if isinstance(embeddings, (list, tuple)):
        embeddings = np.vstack(embeddings)
    print("Embeddings shape:", embeddings.shape)

    # Apply t-SNE for dimensionality reduction to 3 components.
    tsne = TSNE(n_components=3, perplexity=perplexity, random_state=random_state)
    reduced_embeddings = tsne.fit_transform(embeddings)

    # Hold the reduced coordinates (and optional titles) in a DataFrame so
    # plotly can address them by column name.
    df_embeddings = pd.DataFrame(
        reduced_embeddings,
        columns=['Component 1', 'Component 2', 'Component 3'],
    )
    hover_data = None
    if titles is not None:
        df_embeddings['Title'] = titles
        hover_data = ['Title']

    fig = px.scatter_3d(
        df_embeddings,
        x='Component 1',
        y='Component 2',
        z='Component 3',
        hover_data=hover_data,
    )
    fig.show()
def analyze_and_visualize(embeddings, search_terms, pdf_urls, bert_tokenizer, bert_model, faissindex, k=5):
    """Embed the search terms, query the index, and plot the embeddings.

    Args:
        embeddings: Embeddings handed to ``visualize_embeddings``.
        search_terms: Terms embedded via ``embed_search_terms``.
        pdf_urls: Not used by this function at present.
        bert_tokenizer: Tokenizer forwarded to ``embed_search_terms``.
        bert_model: Model forwarded to ``embed_search_terms``.
        faissindex: Index forwarded to ``search_for_query_embeddings``.
        k: Number of results requested per query (default 5).

    Returns:
        None.
    """
    logging.debug('analyze and visualize function')

    # Turn the search terms into query vectors, then run them against the
    # existing index.
    query_vectors = embed_search_terms(search_terms, bert_tokenizer, bert_model)
    # NOTE(review): the search results are not used after this point (nor
    # is pdf_urls) -- confirm whether they should be displayed or returned.
    _distances, _indices = search_for_query_embeddings(query_vectors, faissindex, k)

    visualize_embeddings(embeddings)
def display_results(D, I, pdf_urls):
    """Print the documents matched for the first query of a search result.

    Args:
        D: Per-query score rows, parallel to *I*; only ``D[0]`` is read.
        I: Per-query rows of document indices; only ``I[0]`` is read.
        pdf_urls: Sequence mapping a document index to its URL.

    Negative indices are skipped: FAISS pads a result row with ``-1`` when
    fewer than ``k`` neighbours exist, and ``pdf_urls[-1]`` would otherwise
    silently report the last URL as a match. An empty result set prints a
    zero-count header instead of raising ``IndexError``.

    Raises:
        IndexError: If a non-negative index is outside ``pdf_urls``, which
            indicates the index and the URL list are out of sync.
    """
    logging.debug('display results function')

    # ``len`` rather than truthiness: I may be a NumPy array.
    if len(I) == 0:
        print("Top 0 documents related to the query:")
        return

    hits = [(idx, score) for idx, score in zip(I[0], D[0]) if idx >= 0]
    print(f"Top {len(hits)} documents related to the query:")
    for idx, likelihood_score in hits:
        print(f" - Document {idx}: {pdf_urls[idx]}, Likelihood Score: {likelihood_score}")