From 80ad5bfbcd00e9d9bfb2c478d95901cdba831acd Mon Sep 17 00:00:00 2001
From: tripleee <tripleee@users.noreply.github.com>
Date: Thu, 19 May 2022 09:50:36 +0300
Subject: [PATCH] findspam.py: body_text_repeated(): phrase repeated at
 beginning of body

Second PR, had to back out the broken one I committed yesterday.

test/test_findspam.py: reinstate backed-out test case
---
 findspam.py           | 28 ++++++++++++++++++++++++++++
 test/test_findspam.py |  1 +
 2 files changed, 29 insertions(+)

diff --git a/findspam.py b/findspam.py
index c420c022b9..71605a6cd0 100644
--- a/findspam.py
+++ b/findspam.py
@@ -794,6 +794,34 @@ def misleading_link(s, site):
         return False, ''
 
 
+# noinspection PyUnusedLocal,PyMissingTypeHints,PyTypeChecker
+@create_rule("text repeated in {}", title=False, body_summary=True, max_rep=10000, max_score=10000)
+def body_text_repeated(s, site):
+    """
+    Detect a body that opens with the same phrase repeated ten or more times.
+
+    The first three words anchor the search for the rest of the phrase (1-40 characters), which limits
+    regex backtracking. Returns (True, reason) on a hit, otherwise (False, ""); site is unused.
+    """
+    s = s.rstrip("\n")
+    if s.startswith("<p>") and s.endswith("</p>"):
+        s = s[3:-4]
+    initial_words = regex.match(r"\A([^\W_]+)[\W_]+([^\W_]+)[\W_]+([^\W_]+)", s)
+    if not initial_words:
+        return False, ""
+    escaped_words = [regex.escape(x) for x in initial_words.groups()]
+    period = regex.match(
+        r"\A%s[\W_]+%s[\W_]+%s[\W_]+(.{1,40}?)%s[\W_]+%s[\W_]+%s(?=$|[\W_])" % tuple(escaped_words * 2), s)
+    if not period:
+        return False, ""
+    # Drop the "" that split() yields at separators; they used to demand a separator after the final repeat.
+    escaped_words += [regex.escape(x) for x in regex.split(r"[\W_]+", period.group(1)) if x]
+    repeats = regex.match(r"\A(" + r"[\W_]+".join(escaped_words) + r"[\W_]*){10,}", s)
+    if repeats:
+        return True, "Body contains repeated phrase '%s'" % repeats.group(1)
+    return False, ""
+
+
 # noinspection PyUnusedLocal,PyMissingTypeHints,PyTypeChecker
 @create_rule("repeating words in {}", max_rep=11, stripcodeblocks=True)
 def has_repeating_words(s, site):
diff --git a/test/test_findspam.py b/test/test_findspam.py
index 40217b743a..00e6d6cc56 100644
--- a/test/test_findspam.py
+++ b/test/test_findspam.py
@@ -136,6 +136,7 @@
     ('homoglyph phone numbers 07', '<p>Some 1-844i8O2i7S3S fbody</p>', 'a username', 'math.stackexchange.com', False, False, True),
     ('homoglyph phone numbers 08', '<p>Some 844-8O2-7S3S foobody</p>', 'a username', 'math.stackexchange.com', False, False, True),
     ('Multiple consecutive homoglyph numbers 1', '<p>SomeI-888-884-Olll 888-884-OIII +I-972-S34-S446 972-S34-S446 I-628-21S-2I66 628-21S-2l66 1-844i8O2i7S3S 844a8O2a7S3S body</p>', 'a username', 'math.stackexchange.com', False, False, True),
+    ('repeated body test', 'need enough interesting text to avoid few unique characters rule' * 15, 'luser', 'stackoverflow.com', False, False, True),
 ])
 def test_findspam(title, body, username, site, body_is_summary, is_answer, expected_spam):
     post = Post(api_response={'title': title, 'body': body,