
Commit 8797d73

Author: Bruno Arine
Commit message: Add docstrings to Python file

1 parent b9fe55c commit 8797d73

File tree: 1 file changed (+55 -12 lines)


orgsimilarity/__main__.py (+55 -12)
@@ -7,7 +7,7 @@
 import functools
 import orgparse
 
-from nltk.stem import SnowballStemmer
+from nltk.stem import SnowballStemmer, api
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 
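(Note: the added api import refers to nltk.stem.api, home of the StemmerI abstract class that every nltk stemmer implements; judging from the diff, it is brought in only for the new type hints below.)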

@@ -23,6 +23,7 @@ def get_stopwords() -> list:
         stopwords = f.read().split("\n")
     return stopwords
 
+
 def get_junkchars() -> list:
     """Get a list of junk characters from JUNKCHARS file."""
     with open(SCRIPT_PATH / JUNKCHARS_FILENAME, "r") as f:
@@ -89,15 +90,21 @@ def parse_args():
     return p.parse_args()
 
 
-def read_file(filename: Path) -> str:
-    """Safely reads a filename and returns its content."""
-    with open(filename, "r") as open_file:
-        return open_file.read()
-
-
-def get_tokens(text, stemmer, junkchars: list, stopwords: list) -> list:
+def get_tokens(
+    text: str, stemmer: api.StemmerI, junkchars: list, stopwords: list
+) -> list:
     """
     Preprocess a text and returns a list of tokens.
+
+    Args:
+        text (str): Text whose tokens will be extracted from.
+        stemmer (nltk's stemmer): Stemmer provided by the nltk API.
+        junkchars (list): List of junk characters to be stripped from the text.
+        stopwords (list): List of stopwords to be removed from the text.
+
+    Returns:
+        List of tokens after the text has been pre-processed.
+
     """
     text = text.lower()
     # Replaces the stupid apostrophe with a normal one.
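A usage sketch of the newly annotated get_tokens, assuming get_stopwords() and get_junkchars() work as documented earlier in this file (the input text and the resulting tokens are illustrative, not taken from the project):

from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")  # SnowballStemmer implements api.StemmerI
tokens = get_tokens(
    "The plots don't resemble each other",
    stemmer=stemmer,
    junkchars=get_junkchars(),
    stopwords=get_stopwords(),
)
# Roughly ["plot", "resembl"]: stopwords dropped, remaining words stemmed.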
@@ -122,13 +129,28 @@ def get_tokens(text, stemmer, junkchars: list, stopwords: list) -> list:
     return tokens
 
 
-def get_scores(input_filename, target_filenames, stemmer):
+def get_scores(
+    input_filename: Path, target_filenames: Path, stemmer: api.StemmerI
+):
     """Create a document similarity table based on TF-IDF and cosine dist.
 
     This function scans the a directory for org files and creates a sparse
     matrix with all found tokens via tf-idf algorithm (short for term
     frequency-inverse document frequency), which penalizes words that appear too
-    often in a text."""
+    often in a text.
+
+    Args:
+        input_filename (Path): path to the filename that will be used as
+            reference.
+        target_filenames (Path): Glob containing the path to the documents
+            whose similarity with the input filename will be estimated.
+        stemmer (nltk stemmer): Instance of an nltk stemmer provided by the
+            nltk API.
+
+    Returns:
+        List of similarity scores with the same number of documents in
+        target_filenames plus one (accounting for the input_filename).
+    """
     stopwords = get_stopwords()
     junkchars = get_junkchars()
     base_document = orgparse.load(input_filename).get_body(format="plain")
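For reference, with TfidfVectorizer's default settings (smooth_idf=True, L2 normalization) the penalty this docstring describes comes from the smoothed idf factor, computed over n documents as

    tfidf(t, d) = tf(t, d) * (ln((1 + n) / (1 + df(t))) + 1)

so a term present in every document keeps almost no discriminative weight, and the score returned further down is the cosine of the two normalized vectors, cos(u, v) = u·v / (|u||v|).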
@@ -138,7 +160,9 @@ def get_scores(input_filename, target_filenames, stemmer):
     # To make uniformed vectors, both documents need to be combined first.
     documents.insert(0, base_document)
 
-    tokenizer = functools.partial(get_tokens, stemmer=stemmer, junkchars=junkchars, stopwords=stopwords)
+    tokenizer = functools.partial(
+        get_tokens, stemmer=stemmer, junkchars=junkchars, stopwords=stopwords
+    )
     vectorizer = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None)
     embeddings = vectorizer.fit_transform(documents)
     scores = cosine_similarity(embeddings[0], embeddings[1:]).flatten()
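Here functools.partial freezes the stemmer, junkchars, and stopwords keyword arguments, leaving a one-argument callable, which is the shape TfidfVectorizer expects from its tokenizer; token_pattern=None just silences the warning that the default pattern goes unused. A self-contained sketch of the same pattern with a toy tokenizer and toy documents (none of these names or values come from the project):

import functools

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def tokenize(text: str, stopwords: set) -> list:
    # Stand-in for get_tokens: lowercase, split on whitespace, drop stopwords.
    return [word for word in text.lower().split() if word not in stopwords]


tokenizer = functools.partial(tokenize, stopwords={"the", "a"})
documents = ["the cat sat", "a cat slept", "dogs bark"]

vectorizer = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None)
embeddings = vectorizer.fit_transform(documents)

# Row 0 is the base document, so scores[i] compares it with documents[i + 1].
scores = cosine_similarity(embeddings[0], embeddings[1:]).flatten()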
@@ -154,7 +178,26 @@ def format_results(
     id_links: bool,
     show_scores: bool,
 ) -> list:
-    """Format results in an org-compatible format with links."""
+    """Format results in an org-compatible format with links.
+
+    Args:
+        input_filename (Path): path to the filename that will be used as
+            reference.
+        target_filenames (Path): Glob containing the path to the documents
+            whose similarity with the input filename will be estimated.
+        scores (array-like): List of similarity scores with the same number of
+            documents in target_filenames plus one (accounting for the
+            input_filename).
+        num_results (int): How many similar entries to list at the end of the buffer.
+        id_links (bool): Whether the resulting list of similar documents will
+            point to ID property or filename. Recommend setting it to True
+            if you use `org-roam' v2.
+        show_scores (bool): Whether to prepend the results with the similarity score.
+
+    Returns:
+        List of org formatted links to the most similar documents, sorted in descending
+        order of similarity.
+    """
     results = zip(scores, targets)
     sorted_results = sorted(results, key=lambda x: x[0], reverse=True)
     valid_results = sorted_results[:num_results]
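For context, the two link styles that the id_links flag toggles between are plain org-mode syntax; with purely illustrative targets, an ID link (the org-roam v2 convention, resolved through the ID property) looks like [[id:2f9a1c][Some note]], while a file link looks like [[file:some-note.org][Some note]].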
