Skip to content

Commit

Permalink
Merge pull request #4 from brunoarine/feat/ignore-front-matter
Browse files Browse the repository at this point in the history
feat: ignore front matter
  • Loading branch information
brunoarine authored Jul 11, 2023
2 parents b410c17 + bc188af commit f5b46cf
Show file tree
Hide file tree
Showing 9 changed files with 227 additions and 79 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ Here's the breakdown of the available options:
| `-H, --heading TEXT` | Text to show as the list heading. Default is "". Example: `findlike reference_file.txt -H "## Similar files"` |
| `-F, --format [plain, json]` | This option sets the output format. Default is "plain". Example: `findlike reference_file.txt -F json` |
| `-t, --threshold FLOAT` | Similarity score threshold. All results whose score is below the threshold will be omitted. Default is 0.05. Example: `findlike reference_file.txt -t 0` |
| `-i, --ignore-front-matter` | Tries to strip the front-matter from markup files like Markdown and Org-mode. |

## Examples

Expand Down
45 changes: 28 additions & 17 deletions findlike/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
from nltk.stem import SnowballStemmer
from stop_words import get_stop_words

from .constants import ALGORITHM_CLASSES, FORMATTER_CLASSES, TEXT_FILE_EXT
from .preprocessing import (
Corpus,
Processor,
)
from .utils import try_read_file, collect_paths
from .constants import FORMATTER_CLASSES, ALGORITHM_CLASSES, TEXT_FILE_EXT
from .utils import collect_paths


@click.command()
Expand Down Expand Up @@ -115,6 +115,13 @@
help="remove REFERENCE_FILE from results",
required=False,
)
@click.option(
"--ignore-front-matter",
"-i",
is_flag=True,
help="ignore front-matter from supported markup languages",
required=False,
)
@click.option(
"--heading",
"-H",
Expand Down Expand Up @@ -159,6 +166,7 @@ def cli(
format,
threshold,
absolute_paths,
ignore_front_matter,
):
"""'findlike' is a program that scans a given directory and returns the most
similar documents in relation to REFERENCE_FILE or --query QUERY.
Expand All @@ -172,38 +180,41 @@ def cli(
$ findlike -q "There is only one good, knowledge, and one evil, ignorance"
"""

# Set up the reference text.
if reference_file:
reference_content = try_read_file(Path(reference_file))
elif query:
reference_content = query
else:
raise click.UsageError(
"Neither REFERENCE_FILE nor --query QUERY was provided."
)

# Put together the list of documents to be analyzed.
directory_path = Path(directory)
extensions: list[str] = [filename_pattern] if filename_pattern else TEXT_FILE_EXT
extensions: list[str] = (
[filename_pattern] if filename_pattern else TEXT_FILE_EXT
)
document_paths = collect_paths(
directory=directory_path, extensions=extensions, recursive=recursive
)

# Create a corpus with the collected documents.
corpus = Corpus(paths=document_paths, min_chars=min_chars)
corpus.add_document(document=reference_content)
corpus = Corpus(
paths=document_paths,
min_chars=min_chars,
ignore_front_matter=ignore_front_matter,
)
if reference_file:
corpus.add_from_file(path=Path(reference_file), is_reference=True)
elif query:
corpus.add_from_query(query=query)
else:
raise click.UsageError(
"Neither REFERENCE_FILE nor --query QUERY was provided."
)

# Set up the documents pre-processor.
stemmer = SnowballStemmer(language).stem
processor = Processor(
stopwords=get_stop_words(language=language),
stemmer=stemmer,
)

# Set up the similarity model.
model = ALGORITHM_CLASSES[algorithm](processor=processor)
model.fit(corpus.documents_) # Add reference to avoid zero division
scores = model.get_scores(source=reference_content)
scores = model.get_scores(source=corpus.reference_)

# Format and print results.
formatter = FORMATTER_CLASSES[format](
Expand Down
2 changes: 1 addition & 1 deletion findlike/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@
"*.nuspec",
"*.nvmrc",
"*.ops",
"org",
"*.org",
"*.pas",
"*.pasm",
"*.patch",
Expand Down
52 changes: 52 additions & 0 deletions findlike/markup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import re

class Markup:
    """Strip front matter from markup documents, dispatching on file extension.

    Extensions without a registered handler are passed through untouched,
    so callers can feed any file type safely.
    """

    # Org-mode :PROPERTIES: ... :END: drawer; DOTALL lets ".*?" span lines
    # (equivalent to the "(.|\n)*?" form, but idiomatic and faster).
    _ORG_PROPERTIES_RE = re.compile(r":PROPERTIES:.*?:END:", re.DOTALL)
    # Org-mode keyword lines such as "#+TITLE:" or "#+OPTIONS:".
    _ORG_KEYWORD_RE = re.compile(r"^\s*#\+[a-zA-Z0-9_]+.*?$", re.MULTILINE)
    # YAML front matter: a "---" fence on the first line up to the next "---".
    _YAML_FRONT_MATTER_RE = re.compile(r"\A---\s*\n.*?\n---\s*\n?", re.DOTALL)

    def __init__(self, extension: str):
        """
        Args:
            extension (str): File extension including the leading dot,
                e.g. ".org" or ".md".
        """
        self.extension = extension
        # Map of supported extensions to their stripping callables.
        self._MARKUP_EXTENSIONS = {
            ".org": self._strip_org_frontmatter,
            ".md": self._strip_yaml_frontmatter,
            ".markdown": self._strip_yaml_frontmatter,
        }

    def strip_frontmatter(self, text: str) -> str:
        """Return `text` with its front matter removed when the extension
        has a registered handler; otherwise return `text` unchanged."""
        handler = self._MARKUP_EXTENSIONS.get(self.extension)
        return handler(text) if handler else text

    def _strip_org_frontmatter(self, content: str) -> str:
        """
        Remove front matter from a string representing an Org-mode file.
        This function removes all lines from `:PROPERTIES:` to `:END:`
        and any lines starting with `#+` from the given content string.
        Args:
            content (str): The content of an Org-mode file as a string.
        Returns:
            str: The content with the front matter removed and surrounding
                whitespace stripped.
        Example:
            >>> content = '''
            ... :PROPERTIES:
            ... :ID: 123
            ... :END:
            ... #+TITLE: Example
            ... This is some text.
            ... ** A heading
            ... Some more text.
            ... '''
            >>> cleaned_content = Markup(".org")._strip_org_frontmatter(content)
            >>> print(cleaned_content)
            This is some text.
            ** A heading
            Some more text.
        """
        # Remove :PROPERTIES: to :END: block(s).
        content = self._ORG_PROPERTIES_RE.sub("", content)
        # Remove lines starting with #+ (the newline itself is kept and
        # collapsed by the final strip, matching the documented example).
        content = self._ORG_KEYWORD_RE.sub("", content)
        return content.strip()

    def _strip_yaml_frontmatter(self, content: str) -> str:
        """Remove a leading YAML front-matter block ("---" fences) as used
        by Markdown tooling (Jekyll, Hugo, etc.).
        Args:
            content (str): The content of a Markdown file as a string.
        Returns:
            str: The content without its front-matter block, whitespace
                stripped. Content with no leading fence is returned
                stripped but otherwise unchanged.
        """
        return self._YAML_FRONT_MATTER_RE.sub("", content, count=1).strip()
78 changes: 42 additions & 36 deletions findlike/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
from pathlib import Path
from typing import Callable

from .utils import try_read_file, compress
from .markup import Markup
from .utils import compress, try_read_file

WORD_RE = re.compile(r"(?u)\b\w{2,}\b")
URL_RE = re.compile(r"\S*https?:\S*")
Expand Down Expand Up @@ -59,7 +60,6 @@ def _stemmize(self, tokens: list[str]) -> list[str]:
"""Get only the stems from a list of words."""
return [self.stemmer(w) for w in tokens]


class Corpus:
"""This wrapper provides easy access to a filtered corpus.
Expand All @@ -77,46 +77,52 @@ def __init__(
self,
paths: list[Path],
min_chars: int,
ignore_front_matter: bool = False,
):
self.paths = paths
self.min_chars = min_chars
self.ignore_front_matter = ignore_front_matter

self._loaded_documents: list[str | None]

self.documents_: list[str]
self.paths_: list[Path]
self.documents_: list[str] = []
self.paths_: list[Path] = []
self.reference_: str| None = None

self._load_documents()
if min_chars:
self._apply_min_chars_filter()
self._prune_documents()
self._prune_paths()
self.add_from_paths()

def add_document(self, document: str|None):
"""Add a document to the current corpus.
def add_from_file(self, path: Path, is_reference: bool = False):
"""Adds the contents of a file to the corpus.
Args:
document (str): Document to be added.
Returns:
list[str]: The new corpus after the document has been added.
path (Path): The path to the file.
is_reference (bool, optional): Indicates if the file is a reference file.
Defaults to False.
Notes:
- The file content is added to the corpus if it meets the minimum character
length requirement.
- If front matter stripping is enabled, the file content is stripped of its
front matter before being added to the corpus.
"""
if document:
self.documents_.append(document)

def _load_documents(self):
self._loaded_documents = [try_read_file(p) for p in self.paths]

def _prune_paths(self):
self.paths_ = compress(self.paths, self.documents_)

def _prune_documents(self):
self.documents_ = [x for x in self._loaded_documents if x]

def _apply_min_chars_filter(self):
"""Apply min chars filter in both documents and documents paths"""
self._loaded_documents = [
doc if doc and len(doc) >= self.min_chars else None
for doc in self._loaded_documents
]
return self
loaded_doc = try_read_file(path)
if loaded_doc and len(loaded_doc) >= self.min_chars:
if self.ignore_front_matter:
loaded_doc = self.strip_front_matter(
loaded_doc, extension=path.suffix
)
self.documents_.append(loaded_doc)
if is_reference:
self.reference_ = loaded_doc
else:
self.paths_.append(path)

def add_from_query(self, query: str):
self.documents_.append(query)

def add_from_paths(self) -> list[str | None]:
"""Load document contents from the specified paths."""
return [self.add_from_file(p) for p in self.paths]

def strip_front_matter(self, document: str, extension: str) -> str:
"""Strip front-matter from the loaded documents."""
markup = Markup(extension=extension)
return markup.strip_frontmatter(document)
3 changes: 3 additions & 0 deletions findlike/pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[pytest]
testpaths =
tests
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "findlike"
version = "1.3.1"
version = "1.4.0"
authors = [{ name = "Bruno Arine", email = "[email protected]" }]
description = "findlike is a package to retrieve similar documents"
readme = "README.md"
Expand Down
91 changes: 67 additions & 24 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import pytest
import tempfile
from pathlib import Path
from textwrap import dedent

import pytest

from findlike.preprocessing import Corpus
from findlike.utils import try_read_file

Expand All @@ -26,25 +30,6 @@ def sample_paths(tmp_path):
return [path1, path2, path3]


def test_loading_documents(sample_paths):
corpus = Corpus(sample_paths, min_chars=0)
assert len(corpus._loaded_documents) == 3


def test_min_chars_filter(sample_paths):
corpus = Corpus(sample_paths, min_chars=30)
filtered_docs = corpus.documents_
filtered_paths = corpus.paths_

assert len(filtered_docs) == 2
assert len(filtered_paths) == 2


def test_pruning_documents(sample_paths):
corpus = Corpus(sample_paths, min_chars=30)
assert all(doc is not None for doc in corpus.documents_)


def test_pruning_paths(sample_paths):
corpus = Corpus(sample_paths, min_chars=30)
filtered_paths = corpus.paths_
Expand All @@ -66,7 +51,65 @@ def test_try_read_file(sample_paths):
try_read_file(invalid_path)


def test_empty_paths_list():
corpus = Corpus([], min_chars=0)
assert len(corpus.documents_) == 0
assert len(corpus.paths_) == 0
class TestCorpus:
    """Unit tests for the Corpus container (add_from_file, add_from_query,
    strip_front_matter)."""

    # Fixture creating two small temporary text files with fixed content.
    @pytest.fixture
    def temp_files(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_path = Path(tmpdir)
            file1 = tmp_path / "file1.txt"
            file2 = tmp_path / "file2.txt"
            file1.write_text("This is file 1.")
            file2.write_text("This is file 2.")
            # yield (not return) so the TemporaryDirectory survives the test.
            yield [file1, file2]

    # Fixture for creating an empty Corpus instance.
    @pytest.fixture
    def corpus(self):
        min_chars = 10
        return Corpus([], min_chars)

    # Test add_from_file method: both files exceed min_chars, so both are kept.
    def test_files_were_added(self, corpus, temp_files):
        # Add files to the corpus
        corpus.add_from_file(temp_files[0])
        corpus.add_from_file(temp_files[1])

        # Check if documents and paths are updated correctly
        assert len(corpus.documents_) == 2
        assert len(corpus.paths_) == 2
        assert corpus.documents_[0] == "This is file 1."
        assert corpus.documents_[1] == "This is file 2."
        assert corpus.paths_[0] == temp_files[0]
        assert corpus.paths_[1] == temp_files[1]

    # Test add_from_query method
    def test_add_from_query(self, corpus):
        # Add query to the corpus
        corpus.add_from_query("This is a query.")

        # Check if the query is added to the documents; queries carry no path.
        assert len(corpus.documents_) == 1
        assert len(corpus.paths_) == 0
        assert corpus.documents_[0] == "This is a query."

    # Test strip_front_matter method
    def test_strip_front_matter(self, corpus):
        # Unsupported extension: text must pass through unchanged.
        document = "This is a document."
        stripped_document = corpus.strip_front_matter(document, extension=".txt")
        assert stripped_document == document

        # Supported extension (.org): properties drawer and #+ lines removed.
        document = """
        :PROPERTIES:
        :ID: 123
        :END:
        #+TITLE: Example
        This is some text.
        ** A heading
        Some more text.
        """
        extension = ".org"
        expected = "This is some text.\n** A heading\nSome more text."
        assert corpus.strip_front_matter(dedent(document), extension) == expected
Loading

0 comments on commit f5b46cf

Please sign in to comment.