Skip to content

Commit

Permalink
Fixed a bug with unrelated search results for a '..' query.
Browse files Browse the repository at this point in the history
  • Loading branch information
parpalak committed Feb 10, 2024
1 parent 447246d commit 275915c
Show file tree
Hide file tree
Showing 6 changed files with 23 additions and 15 deletions.
6 changes: 3 additions & 3 deletions src/S2/Rose/Entity/Metadata/SentenceCollection.php
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?php declare(strict_types=1);
/**
* @copyright 2023 Roman Parpalak
* @copyright 2023-2024 Roman Parpalak
* @license MIT
*/

Expand Down Expand Up @@ -104,13 +104,13 @@ public static function breakIntoWords(string $content): array
// We allow letters, digits and some punctuation: ".,-"
$content = preg_replace('#[^\\-.,0-9\\p{L}^_]+#u', ' ', $content);
$content = mb_strtolower($content);
$content = str_replace([", ", ". ", "- ", 'ё'], [' ', ' ', ' ', 'е'], $content);
$content = str_replace(['ё'], ['е'], $content);

// These punctuation characters are meant to be inside words and numbers.
// Remove trailing characters when splitting the words.
$content = rtrim($content, '-.,');

$words = explode(' ', $content);
$words = preg_split('#[\\-.,]*?[ ]+#S', $content);
StringHelper::removeLongWords($words);

return $words;
Expand Down
4 changes: 2 additions & 2 deletions src/S2/Rose/Entity/SnippetLine.php
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?php declare(strict_types=1);
/**
* @copyright 2017-2023 Roman Parpalak
* @copyright 2017-2024 Roman Parpalak
* @license MIT
*/

Expand Down Expand Up @@ -87,7 +87,7 @@ public function getHighlighted(string $highlightTemplate, bool $includeFormattin
// TODO: After implementing formatting this regex became a set of crutches.
// One has to break the snippets into words, clear formatting, convert words to stems
// and detect which stems have been found. Then highlight the original text based on the words' source offsets.
$wordPattern = implode('|', $this->foundWords);
$wordPattern = implode('|', array_map(static fn(string $word) => preg_quote($word, '#'), $this->foundWords));
$wordPatternWithFormatting = '(?:\\\\[' . StringHelper::FORMATTING_SYMBOLS . '])*(?:' . $wordPattern . ')(?:\\\\[' . strtoupper(StringHelper::FORMATTING_SYMBOLS) . '])*';
$replacedLine = preg_replace_callback(
'#(?:\\s|^|\p{P})\\K' . $wordPatternWithFormatting . '(?:\\s+(?:' . $wordPatternWithFormatting . '))*\\b#su',
Expand Down
6 changes: 3 additions & 3 deletions src/S2/Rose/Indexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
/**
* Creates search index
*
* @copyright 2010-2023 Roman Parpalak
* @copyright 2010-2024 Roman Parpalak
* @license MIT
*/

Expand Down Expand Up @@ -207,8 +207,8 @@ private function getStemsWithComponents(array $words): array

// If the word contains punctuation marks like hyphen, add a variant without it
if (false !== strpbrk($stemmedWord, '-.,')) {
foreach (preg_split('#[\-.,]#', $word) as $k => $subWord) {
if ($subWord) {
foreach (preg_split('#[\p{L}\d]\K[\-.,]+|[\-.,]+(?=[\p{L}\d])#u', $word) as $k => $subWord) {
if ($subWord !== '' && $subWord !== $word) {
$componentsOfCompoundWords[(string)($i + 0.001 * ($k + 1))] = $this->stemmer->stemWord($subWord, false);
}
}
Expand Down
3 changes: 2 additions & 1 deletion src/S2/Rose/Snippet/SnippetBuilder.php
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?php declare(strict_types=1);
/**
* @copyright 2011-2023 Roman Parpalak
* @copyright 2011-2024 Roman Parpalak
* @license MIT
*/

Expand Down Expand Up @@ -84,6 +84,7 @@ public function buildSnippet(array $foundPositionsByStems, string $highlightTemp
$stems = array_merge($stems, $this->stemmer->irregularWordsFromStems($stems));

$regexRules = $this->stemmer->getRegexTransformationRules();
$regexRules['#\\.#'] = '\\.'; // escaping dot in the following preg_match_all() call
$stemsForRegex = array_map(static fn(string $stem): string => preg_replace(
array_keys($regexRules),
array_values($regexRules),
Expand Down
7 changes: 5 additions & 2 deletions tests/unit/Rose/Entity/QueryTest.php
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?php
/**
* @copyright 2016-2020 Roman Parpalak
* @copyright 2016-2024 Roman Parpalak
* @license MIT
*/

Expand All @@ -14,11 +14,14 @@
*/
class QueryTest extends Unit
{
public function testFilterInput()
public function testFilterInput(): void
{
$this->assertEquals([1, 2], (new Query('1|||2'))->valueToArray());
$this->assertEquals([1, 2], (new Query('1\\\\\\2'))->valueToArray());
$this->assertEquals(['a', 'b'], (new Query('a/b'))->valueToArray());
$this->assertEquals(['a', 'b'], (new Query(' a b '))->valueToArray());
$this->assertEquals(['..'], (new Query('..'))->valueToArray());
$this->assertEquals(['...'], (new Query('...'))->valueToArray());
$this->assertEquals(['a..b'], (new Query('a..b'))->valueToArray());
}
}
12 changes: 8 additions & 4 deletions tests/unit/Rose/IntegrationTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
/** @noinspection PhpComposerExtensionStubsInspection */

/**
* @copyright 2016-2023 Roman Parpalak
* @copyright 2016-2024 Roman Parpalak
* @license MIT
*/

Expand Down Expand Up @@ -90,7 +90,7 @@ public function testFeatures(

$this->assertEquals([
'20:id_2' => 2.5953804134970615,
'20:id_1' => 0.12828323517212156,
'20:id_1' => 0.12932092968696407,
'10:id_1' => 0.08569157515491249,
], $resultSet2->getSortedRelevanceByExternalId());

Expand All @@ -112,7 +112,7 @@ public function testFeatures(

$this->assertEquals([
'20:id_2' => 2.5953804134970615,
'20:id_1' => 0.12828323517212156
'20:id_1' => 0.12932092968696407
], $resultSet2->getSortedRelevanceByExternalId());

$this->assertEquals(3, $resultSet2->getTotalCount());
Expand Down Expand Up @@ -265,6 +265,10 @@ public function testFeatures(
$this->assertEquals('25', $img1->getHeight());
$this->assertEquals('Alternative text', $img1->getAlt());

// Empty result
$this->assertCount(0, $finder->find(new Query('..'))->getItems());
$this->assertCount(0, $finder->find(new Query('...'))->getItems());

if ($readStorage instanceof PdoStorage && strpos($GLOBALS['s2_rose_test_db']['dsn'], 'sqlite') !== 0) {
$indexer->index(new Indexable('dummy', 'Dummy new', ''));
$similarItems = $readStorage->getSimilar(new ExternalId('id_2', 20), false);
Expand Down Expand Up @@ -418,7 +422,7 @@ public function indexableProvider()
->setDate(new \DateTime('2016-08-24 00:00:00'))
->setUrl('url1')
,
(new Indexable('id_1', 'Another instance', 'The same id but another instance. Word "content" is present here. Twice: content.', 20))
(new Indexable('id_1', 'Another instance', 'The same id but another instance. Word "content" is present here. Twice: content. Delimiters must be $...$ or \[...\]', 20))
,
(new Indexable('id_4', 'Another instance', 'Nothing is here but images: <img src="1.jpg" width="10" height="15"> <img src="2.jpg" width="20" height="25" alt="Alternative text" />', 20))
,
Expand Down

0 comments on commit 275915c

Please sign in to comment.