# GNNstuff
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# A function that takes a page URL and returns the PDF links found in its table
def get_pdf_urls(url):
    # Fetch the HTML content
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find the table (you might need to adjust this based on the table's HTML structure)
    table = soup.find('table')
    # Extract PDF links from the table, resolving relative hrefs against the page URL
    pdf_urls = [urljoin(url, a['href']) for a in table.find_all('a', href=True) if 'pdf' in a['href']]
    return pdf_urls
# Function for reading the PDFs
import PyPDF2
from io import BytesIO

def read_pdf(pdf_url):
    # Read the PDF from the URL
    response = requests.get(pdf_url)
    response.raise_for_status()
    pdf_reader = PyPDF2.PdfReader(BytesIO(response.content))
    # Extract text from every page
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text
# Gather PDF links from the paper-list repo
url = 'https://github.com/epowell101/graph-fraud-detection-papers'
pdf_urls = get_pdf_urls(url)
# Print the first 5 URLs
print(pdf_urls[:5])
# Summarization using BART (not BERT)
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

def summarize_and_extract_embeddings(text):
    # Tokenize and summarize (BART's encoder is capped at 1024 tokens, so longer papers are truncated)
    inputs = tokenizer([text], max_length=1024, return_tensors='pt', truncation=True)
    summary_ids = model.generate(inputs['input_ids'], num_beams=4, min_length=30, max_length=150, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    # Extract embeddings from the encoder, mean-pooled over the token dimension
    with torch.no_grad():
        encoder_outputs = model.get_encoder()(inputs['input_ids'], attention_mask=inputs['attention_mask'])
        embeddings = encoder_outputs.last_hidden_state.mean(dim=1)
    return summary, embeddings
# Process embeddings - may want to shift to max pooling, and elsewhere we will want to look at keywords
def process_embeddings(embeddings):
    # The token dimension was already averaged in the encoder step above, so
    # this collapses the leading batch dimension into a single vector for the entire text
    return embeddings.mean(dim=0)
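# A minimal sketch of the max-pooling alternative mentioned above. It would
# operate on the encoder's per-token hidden states (i.e. before the mean in
# summarize_and_extract_embeddings), so this helper is hypothetical and unused below:
def process_embeddings_maxpool(last_hidden_state):
    # Element-wise maximum over the token dimension (dim=1) instead of the mean
    return last_hidden_state.max(dim=1).values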
# Use FAISS as a vector DB and add the embeddings to the index
import faiss
import numpy as np

d = 1024  # Hidden size of facebook/bart-large-cnn, i.e. the embedding dimension
# Create a flat L2 (exact nearest-neighbour) index
index = faiss.IndexFlatL2(d)

recent_summaries = []
for pdf_url in pdf_urls[-5:]:  # Limiting to the last 5 for printing recent summaries
    # Read the PDF
    pdf_text = read_pdf(pdf_url)
    # Summarize and extract embeddings
    summary, embeddings = summarize_and_extract_embeddings(pdf_text)
    # Store the summary
    recent_summaries.append((pdf_url, summary))
    # Store embeddings in FAISS; the index expects a float32 NumPy array, not a torch tensor
    processed_embeddings = process_embeddings(embeddings)
    processed_embeddings = processed_embeddings.reshape(1, -1).numpy().astype(np.float32)
    index.add(processed_embeddings)
# Print recent summaries
for url, summary in recent_summaries:
    print(f"URL: {url}")
    print(f"Summary: {summary}\n")