Skip to content

Commit f5b46cf

Browse files
authored
Merge pull request #4 from brunoarine/feat/ignore-front-matter
feat: ignore front matter
2 parents b410c17 + bc188af commit f5b46cf

9 files changed

+227
-79
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ Here's the breakdown of the available options:
116116
| `-H, --heading TEXT` | Text to show as the list heading. Default is "". Example: `findlike reference_file.txt -H "## Similar files"` |
117117
| `-F, --format [plain, json]` | This option sets the output format. Default is "plain". Example: `findlike reference_file.txt -F json` |
118118
| `-t, --threshold FLOAT` | Similarity score threshold. All results whose scores are below the determined threshold will be omitted. Default is 0.05. Example: `findlike reference_file.txt -t 0` |
119+
| `-i, --ignore-front-matter` | Tries to strip the front matter from supported markup files (currently Org-mode). |
119120

120121
## Examples
121122

findlike/cli.py

+28-17
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@
66
from nltk.stem import SnowballStemmer
77
from stop_words import get_stop_words
88

9+
from .constants import ALGORITHM_CLASSES, FORMATTER_CLASSES, TEXT_FILE_EXT
910
from .preprocessing import (
1011
Corpus,
1112
Processor,
1213
)
13-
from .utils import try_read_file, collect_paths
14-
from .constants import FORMATTER_CLASSES, ALGORITHM_CLASSES, TEXT_FILE_EXT
14+
from .utils import collect_paths
1515

1616

1717
@click.command()
@@ -115,6 +115,13 @@
115115
help="remove REFERENCE_FILE from results",
116116
required=False,
117117
)
118+
@click.option(
119+
"--ignore-front-matter",
120+
"-i",
121+
is_flag=True,
122+
help="ignore front-matter from supported markup languages",
123+
required=False,
124+
)
118125
@click.option(
119126
"--heading",
120127
"-H",
@@ -159,6 +166,7 @@ def cli(
159166
format,
160167
threshold,
161168
absolute_paths,
169+
ignore_front_matter,
162170
):
163171
"""'findlike' is a program that scans a given directory and returns the most
164172
similar documents in relation to REFERENCE_FILE or --query QUERY.
@@ -172,38 +180,41 @@ def cli(
172180
$ findlike -q "There is only one good, knowledge, and one evil, ignorance"
173181
"""
174182

175-
# Set up the reference text.
176-
if reference_file:
177-
reference_content = try_read_file(Path(reference_file))
178-
elif query:
179-
reference_content = query
180-
else:
181-
raise click.UsageError(
182-
"Neither REFERENCE_FILE nor --query QUERY was provided."
183-
)
184-
185183
# Put together the list of documents to be analyzed.
186184
directory_path = Path(directory)
187-
extensions: list[str] = [filename_pattern] if filename_pattern else TEXT_FILE_EXT
185+
extensions: list[str] = (
186+
[filename_pattern] if filename_pattern else TEXT_FILE_EXT
187+
)
188188
document_paths = collect_paths(
189189
directory=directory_path, extensions=extensions, recursive=recursive
190190
)
191191

192192
# Create a corpus with the collected documents.
193-
corpus = Corpus(paths=document_paths, min_chars=min_chars)
194-
corpus.add_document(document=reference_content)
193+
corpus = Corpus(
194+
paths=document_paths,
195+
min_chars=min_chars,
196+
ignore_front_matter=ignore_front_matter,
197+
)
198+
if reference_file:
199+
corpus.add_from_file(path=Path(reference_file), is_reference=True)
200+
elif query:
201+
corpus.add_from_query(query=query)
202+
else:
203+
raise click.UsageError(
204+
"Neither REFERENCE_FILE nor --query QUERY was provided."
205+
)
195206

196207
# Set up the documents pre-processor.
197208
stemmer = SnowballStemmer(language).stem
198209
processor = Processor(
199210
stopwords=get_stop_words(language=language),
200211
stemmer=stemmer,
201212
)
202-
213+
203214
# Set up the similarity model.
204215
model = ALGORITHM_CLASSES[algorithm](processor=processor)
205216
model.fit(corpus.documents_) # Add reference to avoid zero division
206-
scores = model.get_scores(source=reference_content)
217+
scores = model.get_scores(source=corpus.reference_)
207218

208219
# Format and print results.
209220
formatter = FORMATTER_CLASSES[format](

findlike/constants.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@
198198
"*.nuspec",
199199
"*.nvmrc",
200200
"*.ops",
201-
"org",
201+
"*.org",
202202
"*.pas",
203203
"*.pasm",
204204
"*.patch",

findlike/markup.py

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import re
2+
3+
class Markup:
4+
def __init__(self, extension: str):
5+
self.extension = extension
6+
self._MARKUP_EXTENSIONS = {
7+
".org": self._strip_org_frontmatter
8+
}
9+
10+
def strip_frontmatter(self, text: str) -> str:
11+
if self.extension in self._MARKUP_EXTENSIONS.keys():
12+
return self._MARKUP_EXTENSIONS[self.extension](text)
13+
else:
14+
return text
15+
16+
def _strip_org_frontmatter(self, content: str) -> str:
17+
"""
18+
Remove front matter from a string representing an Org-mode file.
19+
20+
This function removes all lines from `:PROPERTIES:` to `:END:`
21+
and any lines starting with `#+` from the given content string.
22+
23+
Args:
24+
content (str): The content of an Org-mode file as a string.
25+
26+
Returns:
27+
str: The content with the front matter removed.
28+
29+
Example:
30+
>>> content = '''
31+
... :PROPERTIES:
32+
... :ID: 123
33+
... :END:
34+
... #+TITLE: Example
35+
... This is some text.
36+
... ** A heading
37+
... Some more text.
38+
... '''
39+
>>> cleaned_content = remove_front_matter(content)
40+
>>> print(cleaned_content)
41+
This is some text.
42+
** A heading
43+
Some more text.
44+
"""
45+
# Remove :PROPERTIES: to :END: block
46+
content = re.sub(r':PROPERTIES:(.|\n)*?:END:', '', content)
47+
48+
# Remove lines starting with #+
49+
pattern = r'^\s*#\+[a-zA-Z0-9_]+.*?$'
50+
content = re.sub(pattern, '', content, flags=re.MULTILINE)
51+
52+
return content.strip()

findlike/preprocessing.py

+42-36
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
from pathlib import Path
55
from typing import Callable
66

7-
from .utils import try_read_file, compress
7+
from .markup import Markup
8+
from .utils import compress, try_read_file
89

910
WORD_RE = re.compile(r"(?u)\b\w{2,}\b")
1011
URL_RE = re.compile(r"\S*https?:\S*")
@@ -59,7 +60,6 @@ def _stemmize(self, tokens: list[str]) -> list[str]:
5960
"""Get only the stems from a list of words."""
6061
return [self.stemmer(w) for w in tokens]
6162

62-
6363
class Corpus:
6464
"""This wrapper provides easy access to a filtered corpus.
6565
@@ -77,46 +77,52 @@ def __init__(
7777
self,
7878
paths: list[Path],
7979
min_chars: int,
80+
ignore_front_matter: bool = False,
8081
):
8182
self.paths = paths
8283
self.min_chars = min_chars
84+
self.ignore_front_matter = ignore_front_matter
8385

84-
self._loaded_documents: list[str | None]
85-
86-
self.documents_: list[str]
87-
self.paths_: list[Path]
86+
self.documents_: list[str] = []
87+
self.paths_: list[Path] = []
88+
self.reference_: str| None = None
8889

89-
self._load_documents()
90-
if min_chars:
91-
self._apply_min_chars_filter()
92-
self._prune_documents()
93-
self._prune_paths()
90+
self.add_from_paths()
9491

95-
def add_document(self, document: str|None):
96-
"""Add a document to the current corpus.
92+
def add_from_file(self, path: Path, is_reference: bool = False):
93+
"""Adds the contents of a file to the corpus.
9794
9895
Args:
99-
document (str): Document to be added.
100-
101-
Returns:
102-
list[str]: The new corpus after the document has been added.
96+
path (Path): The path to the file.
97+
is_reference (bool, optional): Indicates if the file is a reference file.
98+
Defaults to False.
99+
100+
Notes:
101+
- The file content is added to the corpus if it meets the minimum character
102+
length requirement.
103+
- If front matter stripping is enabled, the file content is stripped of its
104+
front matter before being added to the corpus.
103105
"""
104-
if document:
105-
self.documents_.append(document)
106-
107-
def _load_documents(self):
108-
self._loaded_documents = [try_read_file(p) for p in self.paths]
109-
110-
def _prune_paths(self):
111-
self.paths_ = compress(self.paths, self.documents_)
112-
113-
def _prune_documents(self):
114-
self.documents_ = [x for x in self._loaded_documents if x]
115-
116-
def _apply_min_chars_filter(self):
117-
"""Apply min chars filter in both documents and documents paths"""
118-
self._loaded_documents = [
119-
doc if doc and len(doc) >= self.min_chars else None
120-
for doc in self._loaded_documents
121-
]
122-
return self
106+
loaded_doc = try_read_file(path)
107+
if loaded_doc and len(loaded_doc) >= self.min_chars:
108+
if self.ignore_front_matter:
109+
loaded_doc = self.strip_front_matter(
110+
loaded_doc, extension=path.suffix
111+
)
112+
self.documents_.append(loaded_doc)
113+
if is_reference:
114+
self.reference_ = loaded_doc
115+
else:
116+
self.paths_.append(path)
117+
118+
def add_from_query(self, query: str):
119+
self.documents_.append(query)
120+
121+
def add_from_paths(self) -> list[str | None]:
122+
"""Load document contents from the specified paths."""
123+
return [self.add_from_file(p) for p in self.paths]
124+
125+
def strip_front_matter(self, document: str, extension: str) -> str:
126+
"""Strip front-matter from the loaded documents."""
127+
markup = Markup(extension=extension)
128+
return markup.strip_frontmatter(document)

findlike/pytest.ini

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[pytest]
2+
testpaths =
3+
tests

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "findlike"
3-
version = "1.3.1"
3+
version = "1.4.0"
44
authors = [{ name = "Bruno Arine", email = "[email protected]" }]
55
description = "findlike is a package to retrieve similar documents"
66
readme = "README.md"

tests/test_corpus.py

+67-24
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
1-
import pytest
1+
import tempfile
22
from pathlib import Path
3+
from textwrap import dedent
4+
5+
import pytest
6+
37
from findlike.preprocessing import Corpus
48
from findlike.utils import try_read_file
59

@@ -26,25 +30,6 @@ def sample_paths(tmp_path):
2630
return [path1, path2, path3]
2731

2832

29-
def test_loading_documents(sample_paths):
30-
corpus = Corpus(sample_paths, min_chars=0)
31-
assert len(corpus._loaded_documents) == 3
32-
33-
34-
def test_min_chars_filter(sample_paths):
35-
corpus = Corpus(sample_paths, min_chars=30)
36-
filtered_docs = corpus.documents_
37-
filtered_paths = corpus.paths_
38-
39-
assert len(filtered_docs) == 2
40-
assert len(filtered_paths) == 2
41-
42-
43-
def test_pruning_documents(sample_paths):
44-
corpus = Corpus(sample_paths, min_chars=30)
45-
assert all(doc is not None for doc in corpus.documents_)
46-
47-
4833
def test_pruning_paths(sample_paths):
4934
corpus = Corpus(sample_paths, min_chars=30)
5035
filtered_paths = corpus.paths_
@@ -66,7 +51,65 @@ def test_try_read_file(sample_paths):
6651
try_read_file(invalid_path)
6752

6853

69-
def test_empty_paths_list():
70-
corpus = Corpus([], min_chars=0)
71-
assert len(corpus.documents_) == 0
72-
assert len(corpus.paths_) == 0
54+
class TestCorpus:
55+
# Fixture for creating temporary files with random content
56+
@pytest.fixture
57+
def temp_files(self):
58+
with tempfile.TemporaryDirectory() as tmpdir:
59+
tmp_path = Path(tmpdir)
60+
file1 = tmp_path / "file1.txt"
61+
file2 = tmp_path / "file2.txt"
62+
file1.write_text("This is file 1.")
63+
file2.write_text("This is file 2.")
64+
yield [file1, file2]
65+
66+
# Fixture for creating a Corpus instance
67+
@pytest.fixture
68+
def corpus(self):
69+
min_chars = 10
70+
return Corpus([], min_chars)
71+
72+
# Test add_from_file method
73+
def test_files_were_added(self, corpus, temp_files):
74+
# Add files to the corpus
75+
corpus.add_from_file(temp_files[0])
76+
corpus.add_from_file(temp_files[1])
77+
78+
# Check if documents and paths are updated correctly
79+
assert len(corpus.documents_) == 2
80+
assert len(corpus.paths_) == 2
81+
assert corpus.documents_[0] == "This is file 1."
82+
assert corpus.documents_[1] == "This is file 2."
83+
assert corpus.paths_[0] == temp_files[0]
84+
assert corpus.paths_[1] == temp_files[1]
85+
86+
# Test add_from_query method
87+
def test_add_from_query(self, corpus):
88+
# Add query to the corpus
89+
corpus.add_from_query("This is a query.")
90+
91+
# Check if the query is added to the documents
92+
assert len(corpus.documents_) == 1
93+
assert len(corpus.paths_) == 0
94+
assert corpus.documents_[0] == "This is a query."
95+
96+
# Test _strip_front_matter method
97+
def test_strip_front_matter(self, corpus):
98+
# Test with front matter stripping disabled
99+
document = "This is a document."
100+
stripped_document = corpus.strip_front_matter(document, extension=".txt")
101+
assert stripped_document == document
102+
103+
# Test with front matter stripping enabled
104+
document = """
105+
:PROPERTIES:
106+
:ID: 123
107+
:END:
108+
#+TITLE: Example
109+
This is some text.
110+
** A heading
111+
Some more text.
112+
"""
113+
extension = ".org"
114+
expected = "This is some text.\n** A heading\nSome more text."
115+
assert corpus.strip_front_matter(dedent(document), extension) == expected

0 commit comments

Comments
 (0)