@@ -7,7 +7,7 @@
 import functools
 import orgparse

-from nltk.stem import SnowballStemmer
+from nltk.stem import SnowballStemmer, api

 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
@@ -23,6 +23,7 @@ def get_stopwords() -> list:
         stopwords = f.read().split("\n")
     return stopwords

+
 def get_junkchars() -> list:
     """Get a list of junk characters from JUNKCHARS file."""
     with open(SCRIPT_PATH / JUNKCHARS_FILENAME, "r") as f:
@@ -89,15 +90,21 @@ def parse_args():
     return p.parse_args()


-def read_file(filename: Path) -> str:
-    """Safely reads a filename and returns its content."""
-    with open(filename, "r") as open_file:
-        return open_file.read()
-
-
-def get_tokens(text, stemmer, junkchars: list, stopwords: list) -> list:
+def get_tokens(
+    text: str, stemmer: api.StemmerI, junkchars: list, stopwords: list
+) -> list:
     """
     Preprocess a text and return a list of tokens.
+
+    Args:
+        text (str): Text from which the tokens will be extracted.
+        stemmer (api.StemmerI): Stemmer provided by the nltk API.
+        junkchars (list): List of junk characters to be stripped from the text.
+        stopwords (list): List of stopwords to be removed from the text.
+
+    Returns:
+        List of tokens after the text has been pre-processed.
+
     """
     text = text.lower()
     # Replaces the stupid apostrophe with a normal one.
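The new signature types `stemmer` against nltk's abstract `api.StemmerI` interface, so any concrete nltk stemmer can be passed in. A minimal sketch of a call, not part of the commit — the "english" language choice and the toy junkchars/stopwords lists are illustrative assumptions:

    from nltk.stem import SnowballStemmer

    stemmer = SnowballStemmer("english")  # any api.StemmerI implementation works
    tokens = get_tokens(
        "The stemmer reduces inflected words...",
        stemmer=stemmer,
        junkchars=["(", ")", "..."],
        stopwords=["the"],
    )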
@@ -122,13 +129,28 @@ def get_tokens(text, stemmer, junkchars: list, stopwords: list) -> list:
     return tokens


-def get_scores(input_filename, target_filenames, stemmer):
+def get_scores(
+    input_filename: Path, target_filenames: Path, stemmer: api.StemmerI
+):
     """Create a document similarity table based on TF-IDF and cosine dist.

     This function scans a directory for org files and creates a sparse
     matrix with all found tokens via the TF-IDF algorithm (short for term
     frequency-inverse document frequency), which penalizes words that appear too
-    often in a text."""
+    often in a text.
+
+    Args:
+        input_filename (Path): Path to the file that will be used as
+            reference.
+        target_filenames (Path): Glob containing the paths to the documents
+            whose similarity with the input file will be estimated.
+        stemmer (api.StemmerI): Instance of an nltk stemmer provided by the
+            nltk API.
+
+    Returns:
+        List of similarity scores with the same number of documents in
+        target_filenames plus one (accounting for the input_filename).
+    """
     stopwords = get_stopwords()
     junkchars = get_junkchars()
     base_document = orgparse.load(input_filename).get_body(format="plain")
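To see what the new docstring describes, here is a standalone sketch of the same TF-IDF plus cosine-similarity pipeline on toy strings instead of org files (illustrative only, not part of the commit):

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    documents = [
        "org roam note taking",       # reference document, inserted at index 0
        "note taking with org roam",  # similar target
        "gardening tips for spring",  # unrelated target
    ]
    embeddings = TfidfVectorizer().fit_transform(documents)
    # One score per target: the reference row compared against every other row.
    scores = cosine_similarity(embeddings[0], embeddings[1:]).flatten()
    # scores[0] is high; scores[1] is zero, since no terms are shared.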
@@ -138,7 +160,9 @@ def get_scores(input_filename, target_filenames, stemmer):
     # To make uniform vectors, both documents need to be combined first.
     documents.insert(0, base_document)

-    tokenizer = functools.partial(get_tokens, stemmer=stemmer, junkchars=junkchars, stopwords=stopwords)
+    tokenizer = functools.partial(
+        get_tokens, stemmer=stemmer, junkchars=junkchars, stopwords=stopwords
+    )
     vectorizer = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None)
     embeddings = vectorizer.fit_transform(documents)
     scores = cosine_similarity(embeddings[0], embeddings[1:]).flatten()
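Two details worth noting in the reformatted block: `functools.partial` freezes the stemmer, junkchars, and stopwords arguments so the resulting callable takes a single string, which is the shape `TfidfVectorizer` expects from its `tokenizer` parameter, and `token_pattern=None` silences sklearn's warning that the default token pattern goes unused when a custom tokenizer is supplied. A reduced illustration of the partial mechanics, with a toy stand-in for get_tokens (not from the commit):

    import functools

    def tokenize(text, stopwords):
        # Stand-in for get_tokens: split on whitespace and drop stopwords.
        return [w for w in text.lower().split() if w not in stopwords]

    tokenizer = functools.partial(tokenize, stopwords={"the", "a"})
    print(tokenizer("The TF-IDF scores the documents"))
    # ['tf-idf', 'scores', 'documents']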
@@ -154,7 +178,26 @@ def format_results(
     id_links: bool,
     show_scores: bool,
 ) -> list:
-    """Format results in an org-compatible format with links."""
+    """Format results in an org-compatible format with links.
+
+    Args:
+        input_filename (Path): Path to the file that will be used as
+            reference.
+        target_filenames (Path): Glob containing the paths to the documents
+            whose similarity with the input file will be estimated.
+        scores (array-like): List of similarity scores with the same number of
+            documents in target_filenames plus one (accounting for the
+            input_filename).
+        num_results (int): How many similar entries to list at the end of the buffer.
+        id_links (bool): Whether the resulting list of similar documents will
+            point to the ID property or the filename. Setting it to True is
+            recommended if you use `org-roam' v2.
+        show_scores (bool): Whether to prepend the results with the similarity score.
+
+    Returns:
+        List of org-formatted links to the most similar documents, sorted in
+        descending order of similarity.
+    """
     results = zip(scores, targets)
     sorted_results = sorted(results, key=lambda x: x[0], reverse=True)
     valid_results = sorted_results[:num_results]
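The three lines after the docstring do the actual ranking: pair each score with its target, sort the pairs by score in descending order, and keep the top `num_results`. On toy data (illustrative values):

    scores = [0.12, 0.87, 0.45]
    targets = ["a.org", "b.org", "c.org"]
    sorted_results = sorted(zip(scores, targets), key=lambda x: x[0], reverse=True)
    print(sorted_results[:2])  # [(0.87, 'b.org'), (0.45, 'c.org')]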