Skip to content

Commit

Permalink
Merge pull request #4 from brunoarine/feat/ignore-front-matter
Browse files Browse the repository at this point in the history
feat: ignore front matter
  • Loading branch information
brunoarine authored Jul 11, 2023
2 parents b410c17 + bc188af commit f5b46cf
Show file tree
Hide file tree
Showing 9 changed files with 227 additions and 79 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ Here's the breakdown of the available options:
| `-H, --heading TEXT` | Text to show as the list heading. Default is "". Example: `findlike reference_file.txt -H "## Similar files"` |
| `-F, --format [plain, json]` | This option sets the output format. Default is "plain". Example: `findlike reference_file.txt -F json` |
| `-t, --threshold FLOAT` | Similarity score threshold. All results whose score is below the threshold will be omitted. Default is 0.05. Example: `findlike reference_file.txt -t 0` |
| `-i, --ignore-front-matter` | Tries to strip the front-matter from markup files like Markdown and Org-mode. |

## Examples

Expand Down
45 changes: 28 additions & 17 deletions findlike/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
from nltk.stem import SnowballStemmer
from stop_words import get_stop_words

from .constants import ALGORITHM_CLASSES, FORMATTER_CLASSES, TEXT_FILE_EXT
from .preprocessing import (
Corpus,
Processor,
)
from .utils import try_read_file, collect_paths
from .constants import FORMATTER_CLASSES, ALGORITHM_CLASSES, TEXT_FILE_EXT
from .utils import collect_paths


@click.command()
Expand Down Expand Up @@ -115,6 +115,13 @@
help="remove REFERENCE_FILE from results",
required=False,
)
@click.option(
"--ignore-front-matter",
"-i",
is_flag=True,
help="ignore front-matter from supported markup languages",
required=False,
)
@click.option(
"--heading",
"-H",
Expand Down Expand Up @@ -159,6 +166,7 @@ def cli(
format,
threshold,
absolute_paths,
ignore_front_matter,
):
"""'findlike' is a program that scans a given directory and returns the most
similar documents in relation to REFERENCE_FILE or --query QUERY.
Expand All @@ -172,38 +180,41 @@ def cli(
$ findlike -q "There is only one good, knowledge, and one evil, ignorance"
"""

# Set up the reference text.
if reference_file:
reference_content = try_read_file(Path(reference_file))
elif query:
reference_content = query
else:
raise click.UsageError(
"Neither REFERENCE_FILE nor --query QUERY was provided."
)

# Put together the list of documents to be analyzed.
directory_path = Path(directory)
extensions: list[str] = [filename_pattern] if filename_pattern else TEXT_FILE_EXT
extensions: list[str] = (
[filename_pattern] if filename_pattern else TEXT_FILE_EXT
)
document_paths = collect_paths(
directory=directory_path, extensions=extensions, recursive=recursive
)

# Create a corpus with the collected documents.
corpus = Corpus(paths=document_paths, min_chars=min_chars)
corpus.add_document(document=reference_content)
corpus = Corpus(
paths=document_paths,
min_chars=min_chars,
ignore_front_matter=ignore_front_matter,
)
if reference_file:
corpus.add_from_file(path=Path(reference_file), is_reference=True)
elif query:
corpus.add_from_query(query=query)
else:
raise click.UsageError(
"Neither REFERENCE_FILE nor --query QUERY was provided."
)

# Set up the documents pre-processor.
stemmer = SnowballStemmer(language).stem
processor = Processor(
stopwords=get_stop_words(language=language),
stemmer=stemmer,
)

# Set up the similarity model.
model = ALGORITHM_CLASSES[algorithm](processor=processor)
model.fit(corpus.documents_) # Add reference to avoid zero division
scores = model.get_scores(source=reference_content)
scores = model.get_scores(source=corpus.reference_)

# Format and print results.
formatter = FORMATTER_CLASSES[format](
Expand Down
2 changes: 1 addition & 1 deletion findlike/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@
"*.nuspec",
"*.nvmrc",
"*.ops",
"org",
"*.org",
"*.pas",
"*.pasm",
"*.patch",
Expand Down
52 changes: 52 additions & 0 deletions findlike/markup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import re

class Markup:
    """Strip front matter from markup documents, dispatching on file extension.

    Extensions without a registered handler are passed through untouched,
    so callers can feed any file type safely.
    """

    # Org-mode :PROPERTIES: ... :END: drawer; DOTALL lets ".*?" span lines
    # (equivalent to the "(.|\n)*?" form, but idiomatic and faster).
    _ORG_PROPERTIES_RE = re.compile(r":PROPERTIES:.*?:END:", re.DOTALL)
    # Org-mode keyword lines such as "#+TITLE:" or "#+OPTIONS:".
    _ORG_KEYWORD_RE = re.compile(r"^\s*#\+[a-zA-Z0-9_]+.*?$", re.MULTILINE)
    # YAML front matter: a "---" fence on the first line up to the next "---".
    _YAML_FRONT_MATTER_RE = re.compile(r"\A---\s*\n.*?\n---\s*\n?", re.DOTALL)

    def __init__(self, extension: str):
        """
        Args:
            extension (str): File extension including the leading dot,
                e.g. ".org" or ".md".
        """
        self.extension = extension
        # Map of supported extensions to their stripping callables.
        self._MARKUP_EXTENSIONS = {
            ".org": self._strip_org_frontmatter,
            ".md": self._strip_yaml_frontmatter,
            ".markdown": self._strip_yaml_frontmatter,
        }

    def strip_frontmatter(self, text: str) -> str:
        """Return `text` with its front matter removed when the extension
        has a registered handler; otherwise return `text` unchanged."""
        handler = self._MARKUP_EXTENSIONS.get(self.extension)
        return handler(text) if handler else text

    def _strip_org_frontmatter(self, content: str) -> str:
        """
        Remove front matter from a string representing an Org-mode file.
        This function removes all lines from `:PROPERTIES:` to `:END:`
        and any lines starting with `#+` from the given content string.
        Args:
            content (str): The content of an Org-mode file as a string.
        Returns:
            str: The content with the front matter removed and surrounding
                whitespace stripped.
        Example:
            >>> content = '''
            ... :PROPERTIES:
            ... :ID: 123
            ... :END:
            ... #+TITLE: Example
            ... This is some text.
            ... ** A heading
            ... Some more text.
            ... '''
            >>> cleaned_content = Markup(".org")._strip_org_frontmatter(content)
            >>> print(cleaned_content)
            This is some text.
            ** A heading
            Some more text.
        """
        # Remove :PROPERTIES: to :END: block(s).
        content = self._ORG_PROPERTIES_RE.sub("", content)
        # Remove lines starting with #+ (the newline itself is kept and
        # collapsed by the final strip, matching the documented example).
        content = self._ORG_KEYWORD_RE.sub("", content)
        return content.strip()

    def _strip_yaml_frontmatter(self, content: str) -> str:
        """Remove a leading YAML front-matter block ("---" fences) as used
        by Markdown tooling (Jekyll, Hugo, etc.).
        Args:
            content (str): The content of a Markdown file as a string.
        Returns:
            str: The content without its front-matter block, whitespace
                stripped. Content with no leading fence is returned
                stripped but otherwise unchanged.
        """
        return self._YAML_FRONT_MATTER_RE.sub("", content, count=1).strip()
78 changes: 42 additions & 36 deletions findlike/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
from pathlib import Path
from typing import Callable

from .utils import try_read_file, compress
from .markup import Markup
from .utils import compress, try_read_file

WORD_RE = re.compile(r"(?u)\b\w{2,}\b")
URL_RE = re.compile(r"\S*https?:\S*")
Expand Down Expand Up @@ -59,7 +60,6 @@ def _stemmize(self, tokens: list[str]) -> list[str]:
"""Get only the stems from a list of words."""
return [self.stemmer(w) for w in tokens]


class Corpus:
"""This wrapper provides easy access to a filtered corpus.
Expand All @@ -77,46 +77,52 @@ def __init__(
self,
paths: list[Path],
min_chars: int,
ignore_front_matter: bool = False,
):
self.paths = paths
self.min_chars = min_chars
self.ignore_front_matter = ignore_front_matter

self._loaded_documents: list[str | None]

self.documents_: list[str]
self.paths_: list[Path]
self.documents_: list[str] = []
self.paths_: list[Path] = []
self.reference_: str| None = None

self._load_documents()
if min_chars:
self._apply_min_chars_filter()
self._prune_documents()
self._prune_paths()
self.add_from_paths()

def add_document(self, document: str|None):
"""Add a document to the current corpus.
def add_from_file(self, path: Path, is_reference: bool = False):
"""Adds the contents of a file to the corpus.
Args:
document (str): Document to be added.
Returns:
list[str]: The new corpus after the document has been added.
path (Path): The path to the file.
is_reference (bool, optional): Indicates if the file is a reference file.
Defaults to False.
Notes:
- The file content is added to the corpus if it meets the minimum character
length requirement.
- If front matter stripping is enabled, the file content is stripped of its
front matter before being added to the corpus.
"""
if document:
self.documents_.append(document)

def _load_documents(self):
self._loaded_documents = [try_read_file(p) for p in self.paths]

def _prune_paths(self):
self.paths_ = compress(self.paths, self.documents_)

def _prune_documents(self):
self.documents_ = [x for x in self._loaded_documents if x]

def _apply_min_chars_filter(self):
"""Apply min chars filter in both documents and documents paths"""
self._loaded_documents = [
doc if doc and len(doc) >= self.min_chars else None
for doc in self._loaded_documents
]
return self
loaded_doc = try_read_file(path)
if loaded_doc and len(loaded_doc) >= self.min_chars:
if self.ignore_front_matter:
loaded_doc = self.strip_front_matter(
loaded_doc, extension=path.suffix
)
self.documents_.append(loaded_doc)
if is_reference:
self.reference_ = loaded_doc
else:
self.paths_.append(path)

def add_from_query(self, query: str):
self.documents_.append(query)

def add_from_paths(self) -> list[str | None]:
"""Load document contents from the specified paths."""
return [self.add_from_file(p) for p in self.paths]

def strip_front_matter(self, document: str, extension: str) -> str:
"""Strip front-matter from the loaded documents."""
markup = Markup(extension=extension)
return markup.strip_frontmatter(document)
3 changes: 3 additions & 0 deletions findlike/pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[pytest]
testpaths =
tests
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "findlike"
version = "1.3.1"
version = "1.4.0"
authors = [{ name = "Bruno Arine", email = "[email protected]" }]
description = "findlike is a package to retrieve similar documents"
readme = "README.md"
Expand Down
91 changes: 67 additions & 24 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import pytest
import tempfile
from pathlib import Path
from textwrap import dedent

import pytest

from findlike.preprocessing import Corpus
from findlike.utils import try_read_file

Expand All @@ -26,25 +30,6 @@ def sample_paths(tmp_path):
return [path1, path2, path3]


def test_loading_documents(sample_paths):
corpus = Corpus(sample_paths, min_chars=0)
assert len(corpus._loaded_documents) == 3


def test_min_chars_filter(sample_paths):
corpus = Corpus(sample_paths, min_chars=30)
filtered_docs = corpus.documents_
filtered_paths = corpus.paths_

assert len(filtered_docs) == 2
assert len(filtered_paths) == 2


def test_pruning_documents(sample_paths):
corpus = Corpus(sample_paths, min_chars=30)
assert all(doc is not None for doc in corpus.documents_)


def test_pruning_paths(sample_paths):
corpus = Corpus(sample_paths, min_chars=30)
filtered_paths = corpus.paths_
Expand All @@ -66,7 +51,65 @@ def test_try_read_file(sample_paths):
try_read_file(invalid_path)


def test_empty_paths_list():
corpus = Corpus([], min_chars=0)
assert len(corpus.documents_) == 0
assert len(corpus.paths_) == 0
class TestCorpus:
    """Unit tests for the Corpus container (add_from_file, add_from_query,
    strip_front_matter)."""

    # Fixture creating two small temporary text files with fixed content.
    @pytest.fixture
    def temp_files(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_path = Path(tmpdir)
            file1 = tmp_path / "file1.txt"
            file2 = tmp_path / "file2.txt"
            file1.write_text("This is file 1.")
            file2.write_text("This is file 2.")
            # yield (not return) so the TemporaryDirectory survives the test.
            yield [file1, file2]

    # Fixture for creating an empty Corpus instance.
    @pytest.fixture
    def corpus(self):
        min_chars = 10
        return Corpus([], min_chars)

    # Test add_from_file method: both files exceed min_chars, so both are kept.
    def test_files_were_added(self, corpus, temp_files):
        # Add files to the corpus
        corpus.add_from_file(temp_files[0])
        corpus.add_from_file(temp_files[1])

        # Check if documents and paths are updated correctly
        assert len(corpus.documents_) == 2
        assert len(corpus.paths_) == 2
        assert corpus.documents_[0] == "This is file 1."
        assert corpus.documents_[1] == "This is file 2."
        assert corpus.paths_[0] == temp_files[0]
        assert corpus.paths_[1] == temp_files[1]

    # Test add_from_query method
    def test_add_from_query(self, corpus):
        # Add query to the corpus
        corpus.add_from_query("This is a query.")

        # Check if the query is added to the documents; queries carry no path.
        assert len(corpus.documents_) == 1
        assert len(corpus.paths_) == 0
        assert corpus.documents_[0] == "This is a query."

    # Test strip_front_matter method
    def test_strip_front_matter(self, corpus):
        # Unsupported extension: text must pass through unchanged.
        document = "This is a document."
        stripped_document = corpus.strip_front_matter(document, extension=".txt")
        assert stripped_document == document

        # Supported extension (.org): properties drawer and #+ lines removed.
        document = """
        :PROPERTIES:
        :ID: 123
        :END:
        #+TITLE: Example
        This is some text.
        ** A heading
        Some more text.
        """
        extension = ".org"
        expected = "This is some text.\n** A heading\nSome more text."
        assert corpus.strip_front_matter(dedent(document), extension) == expected
Loading

0 comments on commit f5b46cf

Please sign in to comment.