# (Overall_pipeline)_SLS_on_eng.py
#! pip install transformers
#! pip install -U sentence-transformers
#! pip install sentencepiece
#! pip install faiss-gpu
#! pip install funcy pickle5
######################################
# STEP 1 : Load Dataset & PLMs (Eng) #
######################################
import pandas as pd
# 1. Load the arXiv dataset (Cornell University, 2022)
df = pd.read_csv('./data/arxiv_meta.csv')
print(">> arxiv-meta data size : ", len(df))
# 2. Select a pre-trained language model for English text
my_plms = "all-mpnet-base-v2"
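# NOTE: "all-mpnet-base-v2" is a Sentence-Transformers checkpoint that encodes
# text into 768-dimensional dense vectors. Optional standalone sanity check
# (uses only the packages installed above; not required by the pipeline):
# from sentence_transformers import SentenceTransformer
# emb = SentenceTransformer(my_plms).encode(df['abstract'].head(2).tolist())
# print(emb.shape)  # expected: (2, 768)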
#####################################################
# STEP 2 : Parallel Clustering-based Topic Modeling #
#####################################################
from models.parallel_clustering_TM import *
# 1. Obtain Embeddings
target_text = 'abstract'
cluster = ParallelCluster(
    dataframe=df,
    tgt_col=target_text,
    model_name=my_plms,
    use_sentence_bert=True
)
# 2. Parallel Clustering
clusters, unclusters = cluster.parallel_cluster(
    clusters=None,
    threshold=0.52,
    page_size=2000,
    iterations=20
)
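# `clusters` is expected to hold the groups of documents whose embeddings met
# the 0.52 cosine-similarity threshold, and `unclusters` the documents still
# unassigned after 20 iterations. Optional check (assumes both support len()):
# print(">> clustered groups :", len(clusters), "| unclustered :", len(unclusters))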
# 3. Stack the clustered results in order of cluster size
col_list = ['title', 'abstract', 'year']
new_df = cluster.cluster_stack(
    col_list=col_list,
    clusters=clusters,
    unclusters=unclusters
)
# 4. Extract keywords from each document
top_n_words = cluster.extract_top_n_words_per_topic(
    dataframe=new_df,
    n=20,
    en=True
)
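# `top_n_words` is assumed to map each topic id to its top-20 keyword strings,
# which are joined below into one comma-separated field per document.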
new_df['keywords'] = [', '.join(top_n_words[i]) for i in new_df['Topic'].values]
# 5. Save the parallel-clustered dataset
new_df.to_csv("./data/clusted_arxiv_df.csv", sep=',', na_rep="NaN")
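# The pipeline can be resumed from this checkpoint without re-clustering:
# new_df = pd.read_csv("./data/clusted_arxiv_df.csv", index_col=0)  # index_col=0 recovers the saved index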
##################################################################################
# STEP 3 : Embedding modelization (Split-merge) and Scoring (Multi-interactions) #
##################################################################################
from models.semantic_searcher_eng import *
# 1. Load the SLS framework
sls = SLS(
    dataframe=new_df,
    doc_col='abstract',
    key_col='keywords',
    model_name=my_plms,
    use_sentence_bert=True,
    split_and_merge=True,
    multi_inter=True,
)
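# As the Step 3 banner describes, `split_and_merge` enables the split-merge
# embedding modelization and `multi_inter` enables multi-interaction scoring;
# both are switched on here so the full SLS framework is used.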
# 2. Build the Index
# (Strategy 1) : All Distance Metric
all_index = sls.all_distance_metric()
# (Strategy 2) : Restricted Distance Metric
#restricted_index = sls.restricted_distance_metric(nlist = 200, nprobe = 6)
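# Strategy 1 compares the query against every document embedding exhaustively,
# which is exact but slower on large corpora. Strategy 2 is assumed to build an
# IVF-style FAISS index, where `nlist` sets the number of partitions trained
# over the corpus and `nprobe` how many partitions are visited per query,
# trading a little recall for much faster search.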
#####################################
# STEP 4 : Semantic search with SLS #
#####################################
# 3. Semantic document search (question answering)
my_query = "Research about the Transformer network architecture, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely."
outputs, _ = sls.semantic_search(
    user_query=my_query,
    top_k=10,
    index=all_index,
    print_results=True,
)
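# `outputs` holds the retrieved top-10 documents (already printed above via
# print_results=True). A hedged sketch for downstream use, assuming it is an
# iterable of result records:
# for rank, hit in enumerate(outputs, start=1):
#     print(rank, hit)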