# GNNpdf.py
import requests
from bs4 import BeautifulSoup
import PyPDF2
from io import BytesIO
import torch
from transformers import BartForConditionalGeneration, BartTokenizer
import faiss
from collections import defaultdict
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
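
# Pipeline overview:
#   1. Scrape PDF links from the paper-list page (assumes the page renders a table of
#      links whose hrefs contain 'pdf').
#   2. Download each PDF and extract its text.
#   3. Summarize the text with BART and mean-pool the encoder states into a 1024-d vector.
#   4. Store the document vectors in a FAISS L2 index.
#   5. Classify the summaries with simple keyword heuristics and plot the keyword counts.

# get_pdf_urls: collect every link whose href contains 'pdf' from the first table on the page.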
def get_pdf_urls(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    pdf_urls = [a['href'] for a in table.find_all('a', href=True) if 'pdf' in a['href']]
    return pdf_urls
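
# read_pdf: download a single PDF and concatenate the extracted text of all pages;
# pages with no extractable text (e.g. scanned images) contribute an empty string.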
def read_pdf(pdf_url):
    response = requests.get(pdf_url)
    pdf_reader = PyPDF2.PdfReader(BytesIO(response.content))
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
    return text
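
# summarize_and_extract_embeddings: generate an abstractive summary with beam search
# (30-150 tokens) and mean-pool the encoder's final hidden states over the token
# dimension into a single 1024-d document embedding. The input is truncated to the
# first 1024 tokens, so only the beginning of each paper is summarized and embedded.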
def summarize_and_extract_embeddings(text):
    inputs = tokenizer([text], max_length=1024, return_tensors='pt', truncation=True)
    summary_ids = model.generate(inputs['input_ids'], num_beams=4, min_length=30,
                                 max_length=150, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    encoder_outputs = model.get_encoder()(inputs['input_ids'])
    embeddings = encoder_outputs.last_hidden_state.mean(dim=1)
    return summary, embeddings
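
# process_embeddings: detach the embedding from the autograd graph, average over the
# batch dimension, save it to disk as a NumPy array, and return it shaped (1, d) so it
# can be added directly to the FAISS index.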
def process_embeddings(embeddings, save_path=r'c:\cloudODC\PDPdata\processed_embeddings.npy'):
    processed_embeddings = embeddings.mean(dim=0).cpu().detach().numpy().reshape(1, -1)
    np.save(save_path, processed_embeddings)
    return processed_embeddings
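
# Keyword heuristics: a text is labelled 'directed', 'heterogeneous' or 'continuous'
# when the corresponding phrases occur more than threshold_X times in total.
# These are rough lexical signals, not a trained classifier.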
def is_directed(text, threshold_X=2):
    keywords = ["directed graph", "directed edges", "directed nodes"]
    count = sum(text.lower().count(keyword) for keyword in keywords)
    return count > threshold_X

def is_heterogeneous(text, threshold_X=2):
    keywords = ["heterogeneous graph", "mixed types", "multiple types"]
    count = sum(text.lower().count(keyword) for keyword in keywords)
    return count > threshold_X

def is_continuous(text, threshold_X=2):
    keywords = ["continuous graph", "continuous data", "continuous nodes"]
    count = sum(text.lower().count(keyword) for keyword in keywords)
    return count > threshold_X
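
# get_documents_from_url: convenience wrapper that downloads every linked PDF up front
# and returns a list of {'url', 'text'} dicts.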
def get_documents_from_url(url):
    pdf_urls = get_pdf_urls(url)
    documents = [{'url': pdf_url, 'text': read_pdf(pdf_url)} for pdf_url in pdf_urls]
    return documents
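
# Model and index setup: facebook/bart-large-cnn has a hidden size of 1024, matching the
# FAISS index dimension d below; IndexFlatL2 performs exact (brute-force) L2 search over
# the stored document vectors.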
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
# FAISS index
d = 1024
index = faiss.IndexFlatL2(d)
# Gather based on URL
url = 'https://github.com/epowell101/graph-fraud-detection-papers'
pdf_urls = get_pdf_urls(url)
recent_summaries = []
for pdf_url in tqdm(pdf_urls[-5:]):  # Limiting to last 5 for printing recent summaries
    # Read the PDF
    pdf_text = read_pdf(pdf_url)
    # Summarize and extract embeddings
    summary, embeddings = summarize_and_extract_embeddings(pdf_text)
    # Store the summary
    recent_summaries.append((pdf_url, summary))
    # Detach, mean-pool and reshape the embeddings, then store them in FAISS
    processed_embeddings = process_embeddings(embeddings)
    index.add(processed_embeddings)
# Print recent summaries
for url, summary in recent_summaries:
    print(f"URL: {url}")
    print(f"Summary: {summary}\n")
# URL containing PDF links
url = 'https://github.com/epowell101/graph-fraud-detection-papers'

# Gather and process documents
documents = get_documents_from_url(url)
recent_summaries = []
for doc in documents[-5:]:
    summary, embeddings = summarize_and_extract_embeddings(doc['text'])
    recent_summaries.append((doc['url'], summary))
    processed_embeddings = process_embeddings(embeddings)
    index.add(processed_embeddings)
# Analyze and print results
counts = defaultdict(int)
classified_docs = defaultdict(list)
for url, summary in recent_summaries:
    for func, criteria in [(is_directed, 'directed'),
                           (is_heterogeneous, 'heterogeneous'),
                           (is_continuous, 'continuous')]:
        if func(summary):
            counts[criteria] += 1
            classified_docs[criteria].append(url)

print("Criteria\tCount\tLinks")
for criteria, count in counts.items():
    links = ", ".join(classified_docs[criteria])
    print(f"{criteria}\t{count}\t{links}")
def count_keywords(text, keywords):
    counts = {keyword: text.lower().count(keyword) for keyword in keywords}
    return counts
# List of keywords to count
keywords = ["directed graph", "directed edges", "directed nodes",
            "heterogeneous graph", "mixed types", "multiple types",
            "continuous graph", "continuous data", "continuous nodes"]

# Collect counts across all summaries
all_counts = defaultdict(int)
for url, summary in recent_summaries:
    counts = count_keywords(summary, keywords)
    for keyword, count in counts.items():
        all_counts[keyword] += count
# Plotting the histogram
plt.bar(list(all_counts.keys()), list(all_counts.values()))
plt.xticks(rotation=90)
plt.xlabel('Keywords')
plt.ylabel('Counts')
plt.title('Keyword Counts in Summaries')
plt.tight_layout()
plt.show()
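
# Example query against the index: a minimal sketch that reuses the last
# processed_embeddings from the loop above as the query vector and retrieves
# the nearest stored documents by L2 distance.
if index.ntotal > 0:
    k = min(3, index.ntotal)                        # number of neighbours to retrieve
    query = processed_embeddings.astype('float32')  # FAISS expects float32, shape (n, d)
    distances, indices = index.search(query, k)     # exact L2 search over stored vectors
    print(f"Nearest stored vectors: {indices[0]}, L2 distances: {distances[0]}")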