From 275915cc96c55d9847704648416beb03fe338800 Mon Sep 17 00:00:00 2001 From: Roman Parpalak Date: Sat, 10 Feb 2024 17:17:24 +0200 Subject: [PATCH] Fixed a bug with unrelated search results for a '..' query. --- src/S2/Rose/Entity/Metadata/SentenceCollection.php | 6 +++--- src/S2/Rose/Entity/SnippetLine.php | 4 ++-- src/S2/Rose/Indexer.php | 6 +++--- src/S2/Rose/Snippet/SnippetBuilder.php | 3 ++- tests/unit/Rose/Entity/QueryTest.php | 7 +++++-- tests/unit/Rose/IntegrationTest.php | 12 ++++++++---- 6 files changed, 23 insertions(+), 15 deletions(-) diff --git a/src/S2/Rose/Entity/Metadata/SentenceCollection.php b/src/S2/Rose/Entity/Metadata/SentenceCollection.php index 8554275..40d8c19 100644 --- a/src/S2/Rose/Entity/Metadata/SentenceCollection.php +++ b/src/S2/Rose/Entity/Metadata/SentenceCollection.php @@ -1,6 +1,6 @@ foundWords); + $wordPattern = implode('|', array_map(static fn(string $word) => preg_quote($word, '#'), $this->foundWords)); $wordPatternWithFormatting = '(?:\\\\[' . StringHelper::FORMATTING_SYMBOLS . '])*(?:' . $wordPattern . ')(?:\\\\[' . strtoupper(StringHelper::FORMATTING_SYMBOLS) . '])*'; $replacedLine = preg_replace_callback( '#(?:\\s|^|\p{P})\\K' . $wordPatternWithFormatting . '(?:\\s+(?:' . $wordPatternWithFormatting . '))*\\b#su', diff --git a/src/S2/Rose/Indexer.php b/src/S2/Rose/Indexer.php index 9758b34..374ae64 100644 --- a/src/S2/Rose/Indexer.php +++ b/src/S2/Rose/Indexer.php @@ -2,7 +2,7 @@ /** * Creates search index * - * @copyright 2010-2023 Roman Parpalak + * @copyright 2010-2024 Roman Parpalak * @license MIT */ @@ -207,8 +207,8 @@ private function getStemsWithComponents(array $words): array // If the word contains punctuation marks like hyphen, add a variant without it if (false !== strpbrk($stemmedWord, '-.,')) { - foreach (preg_split('#[\-.,]#', $word) as $k => $subWord) { - if ($subWord) { + foreach (preg_split('#[\p{L}\d]\K[\-.,]+|[\-.,]+(?=[\p{L}\d])#u', $word) as $k => $subWord) { + if ($subWord !== '' && $subWord !== $word) { $componentsOfCompoundWords[(string)($i + 0.001 * ($k + 1))] = $this->stemmer->stemWord($subWord, false); } } diff --git a/src/S2/Rose/Snippet/SnippetBuilder.php b/src/S2/Rose/Snippet/SnippetBuilder.php index c6b5501..5a7fbd7 100644 --- a/src/S2/Rose/Snippet/SnippetBuilder.php +++ b/src/S2/Rose/Snippet/SnippetBuilder.php @@ -1,6 +1,6 @@ stemmer->irregularWordsFromStems($stems)); $regexRules = $this->stemmer->getRegexTransformationRules(); + $regexRules['#\\.#'] = '\\.'; // escaping dot in the following preg_match_all() call $stemsForRegex = array_map(static fn(string $stem): string => preg_replace( array_keys($regexRules), array_values($regexRules), diff --git a/tests/unit/Rose/Entity/QueryTest.php b/tests/unit/Rose/Entity/QueryTest.php index 831a17a..4b7fc8c 100644 --- a/tests/unit/Rose/Entity/QueryTest.php +++ b/tests/unit/Rose/Entity/QueryTest.php @@ -1,6 +1,6 @@ assertEquals([1, 2], (new Query('1|||2'))->valueToArray()); $this->assertEquals([1, 2], (new Query('1\\\\\\2'))->valueToArray()); $this->assertEquals(['a', 'b'], (new Query('a/b'))->valueToArray()); $this->assertEquals(['a', 'b'], (new Query(' a b '))->valueToArray()); + $this->assertEquals(['..'], (new Query('..'))->valueToArray()); + $this->assertEquals(['...'], (new Query('...'))->valueToArray()); + $this->assertEquals(['a..b'], (new Query('a..b'))->valueToArray()); } } diff --git a/tests/unit/Rose/IntegrationTest.php b/tests/unit/Rose/IntegrationTest.php index 16fc955..8f97d70 100644 --- a/tests/unit/Rose/IntegrationTest.php +++ b/tests/unit/Rose/IntegrationTest.php @@ -2,7 +2,7 @@ /** @noinspection PhpComposerExtensionStubsInspection */ /** - * @copyright 2016-2023 Roman Parpalak + * @copyright 2016-2024 Roman Parpalak * @license MIT */ @@ -90,7 +90,7 @@ public function testFeatures( $this->assertEquals([ '20:id_2' => 2.5953804134970615, - '20:id_1' => 0.12828323517212156, + '20:id_1' => 0.12932092968696407, '10:id_1' => 0.08569157515491249, ], $resultSet2->getSortedRelevanceByExternalId()); @@ -112,7 +112,7 @@ public function testFeatures( $this->assertEquals([ '20:id_2' => 2.5953804134970615, - '20:id_1' => 0.12828323517212156 + '20:id_1' => 0.12932092968696407 ], $resultSet2->getSortedRelevanceByExternalId()); $this->assertEquals(3, $resultSet2->getTotalCount()); @@ -265,6 +265,10 @@ public function testFeatures( $this->assertEquals('25', $img1->getHeight()); $this->assertEquals('Alternative text', $img1->getAlt()); + // Empty result + $this->assertCount(0, $finder->find(new Query('..'))->getItems()); + $this->assertCount(0, $finder->find(new Query('...'))->getItems()); + if ($readStorage instanceof PdoStorage && strpos($GLOBALS['s2_rose_test_db']['dsn'], 'sqlite') !== 0) { $indexer->index(new Indexable('dummy', 'Dummy new', '')); $similarItems = $readStorage->getSimilar(new ExternalId('id_2', 20), false); @@ -418,7 +422,7 @@ public function indexableProvider() ->setDate(new \DateTime('2016-08-24 00:00:00')) ->setUrl('url1') , - (new Indexable('id_1', 'Another instance', 'The same id but another instance. Word "content" is present here. Twice: content.', 20)) + (new Indexable('id_1', 'Another instance', 'The same id but another instance. Word "content" is present here. Twice: content. Delimiters must be $...$ or \[...\]', 20)) , (new Indexable('id_4', 'Another instance', 'Nothing is here but images: Alternative text', 20)) ,