From b2b9b4e5b5869c946316726cff54e6709d5a587b Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Tue, 19 Jul 2022 17:43:14 +0000
Subject: [PATCH] Fix ignoracle tests for wpull 2.x

---
 pipeline/archivebot/wpull/ignoracle_test.py | 92 ++++++++++++---------
 1 file changed, 54 insertions(+), 38 deletions(-)

diff --git a/pipeline/archivebot/wpull/ignoracle_test.py b/pipeline/archivebot/wpull/ignoracle_test.py
index 46ac6a43..f0b5b81f 100644
--- a/pipeline/archivebot/wpull/ignoracle_test.py
+++ b/pipeline/archivebot/wpull/ignoracle_test.py
@@ -1,12 +1,25 @@
 import unittest
-import re
+import wpull.pipeline.item
 
 from .ignoracle import Ignoracle, parameterize_record_info
 
 p1 = 'www\.example\.com/foo\.css\?'
 p2 = 'bar/.+/baz'
 
-@unittest.skip("Pending a fix for wpull 2.x interface")
+def make_url_record(url, level = 0, parent_url = None, root_url = None):
+    # Test helper: build a URLRecord. parent_url falls back to url (its own parent);
+    # root_url falls back to parent_url, hence to url when neither is passed.
+    if parent_url is None:
+        parent_url = url
+    if root_url is None:
+        root_url = parent_url
+    record = wpull.pipeline.item.URLRecord()
+    record.url = url
+    record.parent_url = parent_url
+    record.root_url = root_url
+    record.level = level
+    return record
+
 class TestIgnoracle(unittest.TestCase):
     def setUp(self):
         self.oracle = Ignoracle()
@@ -14,19 +27,19 @@ def setUp(self):
         self.oracle.set_patterns([p1, p2])
 
     def test_ignores_returns_responsible_pattern(self):
-        self.assertEqual(self.oracle.ignores('http://www.example.com/foo.css?body=1'), p1)
-        self.assertEqual(self.oracle.ignores('http://www.example.com/bar/abc/def/baz'), p2)
+        self.assertEqual(self.oracle.ignores(make_url_record('http://www.example.com/foo.css?body=1')), p1)
+        self.assertEqual(self.oracle.ignores(make_url_record('http://www.example.com/bar/abc/def/baz')), p2)
 
     def test_ignores_skips_invalid_patterns(self):
         self.oracle.set_patterns(['???', p2])
 
-        self.assertEqual(self.oracle.ignores('http://www.example.com/bar/abc/def/baz'), p2)
+        self.assertEqual(self.oracle.ignores(make_url_record('http://www.example.com/bar/abc/def/baz')), p2)
 
     def test_ignores_supports_netloc_parameterization(self):
         pattern = '{primary_netloc}/foo\.css\?'
         self.oracle.set_patterns([pattern])
 
-        result = self.oracle.ignores('http://www.example.com/foo.css?body=1', primary_netloc='www.example.com')
+        result = self.oracle.ignores(make_url_record('http://www.example.com/foo.css?body=1'))
 
         self.assertEqual(result, pattern)
 
@@ -34,7 +47,8 @@ def test_permits_empty_brace_pairs(self):
         pattern = '{primary_netloc}{}/foo\.css\?{}'
         self.oracle.set_patterns([pattern])
 
-        result = self.oracle.ignores('http://www.example.com{}/foo.css?{}body=1', primary_netloc='www.example.com')
+        record = make_url_record('http://www.example.com{}/foo.css?{}body=1', level = 1, parent_url = 'http://www.example.com/')
+        result = self.oracle.ignores(record)
 
         self.assertEqual(result, pattern)
 
@@ -42,7 +56,7 @@ def test_permits_empty_brace_pairs_and_regex_repetitions(self):
         pattern = '{primary_netloc}{1}/foo\.css\?{}'
         self.oracle.set_patterns([pattern])
 
-        result = self.oracle.ignores('http://www.example.com/foo.css?{}body=1', primary_netloc='www.example.com')
+        result = self.oracle.ignores(make_url_record('http://www.example.com/foo.css?{}body=1'))
 
         self.assertEqual(result, pattern)
 
@@ -50,7 +64,7 @@ def test_parameterization_skips_regex_ranges(self):
         pattern = '/(.*)/(\\1/){3,}'
         self.oracle.set_patterns([pattern])
 
-        result = self.oracle.ignores('http://www.example.com/foo/foo/foo/foo/foo')
+        result = self.oracle.ignores(make_url_record('http://www.example.com/foo/foo/foo/foo/foo'))
 
         self.assertEqual(result, pattern)
 
@@ -60,7 +74,7 @@ def test_parameterization_skips_pattern_with_unknown_parameter(self):
 
         self.oracle.set_patterns([wrong, right])
 
-        result = self.oracle.ignores('http://www.example.com/foo/foo/foo/foo/foo')
+        result = self.oracle.ignores(make_url_record('http://www.example.com/foo/foo/foo/foo/foo'))
 
         self.assertEqual(result, right)
 
@@ -68,7 +82,8 @@ def test_ignores_supports_url_parameterization(self):
         pattern = '{primary_url}foo\.css\?'
         self.oracle.set_patterns([pattern])
 
-        result = self.oracle.ignores('http://www.example.com/foo.css?body=1', primary_url='http://www.example.com/')
+        record = make_url_record('http://www.example.com/foo.css?body=1', level = 1, parent_url = 'http://www.example.com/')
+        result = self.oracle.ignores(record)
 
         self.assertEqual(result, pattern)
 
@@ -76,7 +91,8 @@ def test_ignores_escapes_url(self):
         pattern = '{primary_url}foo\.css\?'
         self.oracle.set_patterns([pattern])
 
-        result = self.oracle.ignores('http://www.example.com/bar.css??/foo.css?body=1', primary_url='http://www.example.com/bar.css??/')
+        record = make_url_record('http://www.example.com/bar.css??/foo.css?body=1', level = 1, parent_url = 'http://www.example.com/bar.css??/')
+        result = self.oracle.ignores(record)
 
         self.assertEqual(result, pattern)
 
@@ -85,53 +101,53 @@ def test_ignores_with_parameterized_url_replaces_none_placeholder_with_empty_str
         self.oracle.set_patterns([pattern])
 
         # This should treat the pattern as if it were "foo\.css\?"
-        result = self.oracle.ignores('http://www.example.com/foo.css?body=1')
+        record = wpull.pipeline.item.URLRecord()
+        record.url = 'http://www.example.com/foo.css?body=1'
+        record.level = 1
+        # parent_url and root_url are deliberately not assigned on this record
+        result = self.oracle.ignores(record)
 
         self.assertEqual(result, pattern)
 
     def test_ignores_returns_false_for_unsuccessful_match(self):
-        self.assertFalse(self.oracle.ignores('http://www.example.com/media/qux.jpg'))
+        self.assertFalse(self.oracle.ignores(make_url_record('http://www.example.com/media/qux.jpg')))
 
     def test_set_patterns_converts_bytes_to_utf8(self):
         self.oracle.set_patterns([b'foobar'])
 
         self.assertEqual(self.oracle.patterns[0], 'foobar')
 
-@unittest.skip("Pending a fix for wpull 2.x interface")
 class TestRecordInfoParameterization(unittest.TestCase):
-    def test_uses_top_url_if_present(self):
-        record_info = dict(
-            top_url='http://www.example.com/'
-        )
+    def test_uses_root_url(self):
+        record = make_url_record('http://www.example.com/foo', level = 1, parent_url = 'http://www.example.com/', root_url = 'https://example.org/')
 
-        result = parameterize_record_info(record_info)
+        result = parameterize_record_info(record)
 
-        self.assertEqual('http://www.example.com/', result['primary_url'])
-        self.assertEqual('www.example.com', result['primary_netloc'])
+        self.assertEqual('https://example.org/', result['primary_url'])
+        self.assertEqual('example.org', result['primary_netloc'])
 
     def test_uses_url_for_level_zero_url(self):
-        record_info = dict(
-            url='http://www.example.com/',
-            level=0
-        )
+        record = make_url_record('http://www.example.com/', level = 0, parent_url = 'http://parent.invalid/', root_url = 'http://root.invalid/')
 
-        result = parameterize_record_info(record_info)
+        result = parameterize_record_info(record)
 
         self.assertEqual('http://www.example.com/', result['primary_url'])
         self.assertEqual('www.example.com', result['primary_netloc'])
 
-    def test_missing_primary_url_results_in_no_netloc(self):
-        result = parameterize_record_info(dict())
-
-        self.assertIsNone(result['primary_url'])
-        self.assertIsNone(result['primary_netloc'])
-
     def test_includes_auth_and_port_in_primary_netloc(self):
-        record_info = dict(
-            url='http://foo:bar@www.example.com:8080/',
-            level=0
-        )
+        record = make_url_record('http://foo:bar@www.example.com:8080/')
 
-        result = parameterize_record_info(record_info)
+        result = parameterize_record_info(record)
 
         self.assertEqual('foo:bar@www.example.com:8080', result['primary_netloc'])
+
+    def test_none_if_no_root_url(self):
+        record = wpull.pipeline.item.URLRecord()
+        record.url = 'http://www.example.com/foo.css?body=1'
+        record.level = 1
+        record.parent_url = 'http://www.example.com/'
+
+        result = parameterize_record_info(record)
+
+        self.assertIsNone(result['primary_url'])
+        self.assertIsNone(result['primary_netloc'])