From 0feaa15ce79fd939aa388fe1b6ee933637247749 Mon Sep 17 00:00:00 2001
From: Adrien Perrin
Date: Fri, 12 Jan 2024 12:09:20 +0000
Subject: [PATCH 1/4] make HTMLDirectoryCrawler a concrete class

---
 geospaas_harvesting/crawlers.py | 44 ++++++++++++-----
 tests/test_generic_crawlers.py  | 84 +++++++++++++++++++++++----------
 2 files changed, 90 insertions(+), 38 deletions(-)

diff --git a/geospaas_harvesting/crawlers.py b/geospaas_harvesting/crawlers.py
index 322c7e41..b3a56fc8 100644
--- a/geospaas_harvesting/crawlers.py
+++ b/geospaas_harvesting/crawlers.py
@@ -318,7 +318,8 @@ class DirectoryCrawler(Crawler):
         f'^.*/{YEAR_PATTERN}/?{MONTH_PATTERN}/?{DAY_OF_MONTH_PATTERN}(/.*)?$')
     DAY_OF_YEAR_MATCHER = re.compile(f'^.*/{YEAR_PATTERN}/{DAY_OF_YEAR_PATTERN}(/.*)?$')
 
-    def __init__(self, root_url, time_range=(None, None), include=None, max_threads=1):
+    def __init__(self, root_url, time_range=(None, None), include=None,
+                 username=None, password=None, max_threads=1):
         """
         `root_url` is the URL of the data repository to explore.
         `time_range` is a 2-tuple of datetime.datetime objects defining the time range
@@ -330,13 +331,17 @@ def __init__(self, root_url, time_range=(None, None), include=None, max_threads=
         self.root_url = urlparse(root_url)
         self.time_range = time_range
         self.include = re.compile(include) if include else None
+        self.username = username
+        self.password = password
         self.set_initial_state()
 
     def __eq__(self, other):
         return (
             self.root_url == other.root_url and
             self.time_range == other.time_range and
-            self.include == other.include)
+            self.include == other.include and
+            self.username == other.username and
+            self.password == other.password)
 
     @property
     def base_url(self):
@@ -479,7 +484,8 @@ def _process_folder(self, folder_path):
         self.logger.debug("Looking for resources in '%s'...", folder_path)
         for path in self._list_folder_contents(folder_path):
             # skip paths that contain any of the excluded strings
-            if self.EXCLUDE and self.EXCLUDE.search(path):
+            if ((self.EXCLUDE and self.EXCLUDE.search(path)) or
+                    self.root_url.path.startswith(path.rstrip(f"{os.sep}/"))):
                 continue
             if self._is_folder(path):
                 self._add_folder_to_process(path)
@@ -514,11 +520,11 @@ def get_normalized_attributes(self, dataset_info, **kwargs):
 
 
 class HTMLDirectoryCrawler(DirectoryCrawler):
-    """Implementation of WebDirectoryCrawler for repositories exposed as HTML pages."""
+    """Implementation of DirectoryCrawler for repositories exposed as HTML pages."""
 
     logger = logging.getLogger(__name__ + '.HTMLDirectoryCrawler')
 
-    FOLDERS_SUFFIXES = None
+    FOLDERS_SUFFIXES = ('/',)
 
     # ------------- crawl ------------
     @staticmethod
@@ -527,7 +533,7 @@ def _strip_folder_page(folder_path):
         Remove the index page of a folder path.
         For example: /foo/bar/contents.html becomes /foo/bar.
         """
-        return re.sub(r'/\w+\.html?$', r'', folder_path)
+        return re.sub(r'/(\w+\.html)?$', r'', folder_path)
 
     def _is_folder(self, path):
         return path.endswith(self.FOLDERS_SUFFIXES)
@@ -557,13 +563,23 @@ def _prepend_parent_path(parent_path, paths):
         return result
 
     def _list_folder_contents(self, folder_path):
-        html = self._http_get(f"{self.base_url}{folder_path}")
+        request_parameters = {}
+        if self.username is not None and self.password is not None:
+            request_parameters['auth'] = (self.username, self.password)
+        html = self._http_get(f"{self.base_url}{folder_path}", request_parameters)
         stripped_folder_path = self._strip_folder_page(folder_path)
         return self._prepend_parent_path(stripped_folder_path, self._get_links(html))
 
     # --------- get metadata ---------
     def get_normalized_attributes(self, dataset_info, **kwargs):
-        raise NotImplementedError()
+        """Gets dataset attributes using HTTP"""
+        raw_attributes = {}
+        self.add_url(dataset_info.url, raw_attributes)
+        normalized_attributes = self._metadata_handler.get_parameters(raw_attributes)
+        # TODO: add FTP_SERVICE_NAME and FTP_SERVICE in django-geo-spaas
+        normalized_attributes['geospaas_service_name'] = catalog_managers.HTTP_SERVICE_NAME
+        normalized_attributes['geospaas_service'] = catalog_managers.HTTP_SERVICE
+        return normalized_attributes
 
 
 class OpenDAPCrawler(HTMLDirectoryCrawler):
@@ -690,16 +706,18 @@ class FTPCrawler(DirectoryCrawler):
     logger = logging.getLogger(__name__ + '.FTPCrawler')
 
     def __init__(self, root_url, time_range=(None, None), include=None,
-                 username='anonymous', password='anonymous', max_threads=1):
-
+                 username=None, password=None, max_threads=1):
         if not root_url.startswith('ftp://'):
             raise ValueError("The root url must start with 'ftp://'")
 
-        self.username = username
-        self.password = password
+        if username is None:
+            username = 'anonymous'
+        if password is None:
+            password = 'anonymous'
         self.ftp = None
 
-        super().__init__(root_url, time_range, include, max_threads=1)
+        super().__init__(root_url, time_range, include, max_threads=1,
+                         username=username, password=password)
 
     def __getstate__(self):
         """Method used to pickle the crawler"""
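
Note for reviewers (not part of the diff): the new _strip_folder_page() pattern
makes the index page name optional, so it now also strips a trailing slash,
which matters now that folders are recognized by the ('/',) suffix. A runnable
check of the behaviour the tests below rely on:

    import re

    NEW_PATTERN = r'/(\w+\.html)?$'

    for path in ('/foo/bar/contents.html', '/foo/bar/', '/foo/bar'):
        print(re.sub(NEW_PATTERN, '', path))
    # prints '/foo/bar' three times: the index page and the trailing
    # slash are removed, and an already-clean path is left untouched

One behaviour change worth noting: the old pattern r'/\w+\.html?$' also
matched '.htm' pages (the final 'l' was optional); the new one only matches
'.html'.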
""" - return re.sub(r'/\w+\.html?$', r'', folder_path) + return re.sub(r'/(\w+\.html)?$', r'', folder_path) def _is_folder(self, path): return path.endswith(self.FOLDERS_SUFFIXES) @@ -557,13 +563,23 @@ def _prepend_parent_path(parent_path, paths): return result def _list_folder_contents(self, folder_path): - html = self._http_get(f"{self.base_url}{folder_path}") + request_parameters = {} + if self.username is not None and self.password is not None: + request_parameters['auth'] = (self.username, self.password) + html = self._http_get(f"{self.base_url}{folder_path}", request_parameters) stripped_folder_path = self._strip_folder_page(folder_path) return self._prepend_parent_path(stripped_folder_path, self._get_links(html)) # --------- get metadata --------- def get_normalized_attributes(self, dataset_info, **kwargs): - raise NotImplementedError() + """Gets dataset attributes using http""" + raw_attributes = {} + self.add_url(dataset_info.url, raw_attributes) + normalized_attributes = self._metadata_handler.get_parameters(raw_attributes) + # TODO: add FTP_SERVICE_NAME and FTP_SERVICE in django-geo-spaas + normalized_attributes['geospaas_service_name'] = catalog_managers.HTTP_SERVICE_NAME + normalized_attributes['geospaas_service'] = catalog_managers.HTTP_SERVICE + return normalized_attributes class OpenDAPCrawler(HTMLDirectoryCrawler): @@ -690,16 +706,18 @@ class FTPCrawler(DirectoryCrawler): logger = logging.getLogger(__name__ + '.FTPCrawler') def __init__(self, root_url, time_range=(None, None), include=None, - username='anonymous', password='anonymous', max_threads=1): - + username=None, password=None, max_threads=1): if not root_url.startswith('ftp://'): raise ValueError("The root url must start with 'ftp://'") - self.username = username - self.password = password + if username is None: + username = 'anonymous' + if password is None: + password = 'anonymous' self.ftp = None - super().__init__(root_url, time_range, include, max_threads=1) + super().__init__(root_url, time_range, include, max_threads=1, + username=username, password=password) def __getstate__(self): """Method used to pickle the crawler""" diff --git a/tests/test_generic_crawlers.py b/tests/test_generic_crawlers.py index bc82ad56..505ce601 100644 --- a/tests/test_generic_crawlers.py +++ b/tests/test_generic_crawlers.py @@ -17,6 +17,7 @@ import requests +import geospaas.catalog.managers import geospaas_harvesting.crawlers as crawlers @@ -609,6 +610,37 @@ def test_abstract_get_normalized_attributes(self): class HTMLDirectoryCrawlerTestCase(unittest.TestCase): """Tests for the HTMLDirectoryCrawler crawler""" + def test_strip_folder_page(self): + """_strip_folder_page() should remove the index page from a + folder path + """ + self.assertEqual( + crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar/contents.html'), + '/foo/bar') + self.assertEqual( + crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar/'), + '/foo/bar') + self.assertEqual( + crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar'), + '/foo/bar') + + def test_get_right_number_of_links(self): + """Test that the crawler gets the correct number of links from a test page""" + with open(os.path.join( + os.path.dirname(__file__), 'data', 'opendap', 'root.html')) as data_file: + html = data_file.read() + self.assertEqual(len(crawlers.HTMLDirectoryCrawler._get_links(html)), 4) + + with open(os.path.join(os.path.dirname(__file__), 'data', 'empty.html')) as data_file: + html = data_file.read() + 
From df0ad62bbcc93aa5fafce84455454af693cb3074 Mon Sep 17 00:00:00 2001
From: Adrien Perrin
Date: Fri, 12 Jan 2024 13:00:45 +0000
Subject: [PATCH 2/4] add http provider

---
 geospaas_harvesting/config.py         |  2 ++
 geospaas_harvesting/providers/http.py | 25 +++++++++++++++++++++++++
 2 files changed, 27 insertions(+)
 create mode 100644 geospaas_harvesting/providers/http.py

diff --git a/geospaas_harvesting/config.py b/geospaas_harvesting/config.py
index 9bea0f0d..4c53b775 100644
--- a/geospaas_harvesting/config.py
+++ b/geospaas_harvesting/config.py
@@ -8,6 +8,7 @@
 import geospaas_harvesting.providers.resto as providers_resto
 import geospaas_harvesting.providers.earthdata_cmr as providers_earthdata_cmr
 import geospaas_harvesting.providers.ftp as providers_ftp
+import geospaas_harvesting.providers.http as providers_http
 import geospaas_harvesting.providers.jaxa as providers_jaxa
 import geospaas_harvesting.providers.local as providers_local
 import geospaas_harvesting.providers.metno as providers_metno
@@ -62,6 +63,7 @@ class ProvidersArgument(DictArgument):
         'earthdata_cmr': providers_earthdata_cmr.EarthDataCMRProvider,
         'ftp': providers_ftp.FTPProvider,
         'gportal_ftp': providers_jaxa.GPortalProvider,
+        'http': providers_http.HTTPProvider,
         'netcdf': providers_local.NetCDFProvider,
         'nansat': providers_local.NansatProvider,
         'metno': providers_metno.METNOProvider,
diff --git a/geospaas_harvesting/providers/http.py b/geospaas_harvesting/providers/http.py
new file mode 100644
index 00000000..e246eec8
--- /dev/null
+++ b/geospaas_harvesting/providers/http.py
@@ -0,0 +1,25 @@
+"""Code for searching HTTP repositories"""
+from urllib.parse import urljoin
+
+from .base import Provider, TimeFilterMixin
+from ..arguments import PathArgument, StringArgument
+from ..crawlers import HTMLDirectoryCrawler
+
+
+class HTTPProvider(TimeFilterMixin, Provider):
+    """Generic HTTP directory provider"""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.search_parameters_parser.add_arguments([
+            StringArgument('url', required=True),
+            StringArgument('include', default='.'),
+        ])
+
+    def _make_crawler(self, parameters):
+        return HTMLDirectoryCrawler(
+            parameters['url'],
+            time_range=(parameters['start_time'], parameters['end_time']),
+            username=self.username,
+            password=self.password,
+            include=parameters['include'])
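
Note between patches: a usage sketch for the new provider, mirroring the test
added in the next patch. The 'name', 'username' and 'password' constructor
arguments are assumed to be handled by the Provider base class, and
_make_crawler() is normally invoked by the provider's search machinery rather
than called directly; the repository URL and include pattern below are
hypothetical:

    from datetime import datetime, timezone

    from geospaas_harvesting.providers.http import HTTPProvider

    provider = HTTPProvider(name='my_http_repo', username='user', password='pass')
    crawler = provider._make_crawler({
        'start_time': datetime(2024, 1, 1, tzinfo=timezone.utc),
        'end_time': datetime(2024, 1, 2, tzinfo=timezone.utc),
        'url': 'http://example.com/data',  # hypothetical repository
        'include': r'\.nc$',  # only keep links to netCDF files
    })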
From 5da7f3e55b45c2c6c9d6a9f3e097f2aff62822c1 Mon Sep 17 00:00:00 2001
From: Adrien Perrin
Date: Fri, 12 Jan 2024 13:18:05 +0000
Subject: [PATCH 3/4] add tests for http provider

---
 tests/providers/test_http.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 tests/providers/test_http.py

diff --git a/tests/providers/test_http.py b/tests/providers/test_http.py
new file mode 100644
index 00000000..fb1935bf
--- /dev/null
+++ b/tests/providers/test_http.py
@@ -0,0 +1,32 @@
+# pylint: disable=protected-access
+"""Tests for the generic HTTP provider"""
+import unittest
+import unittest.mock as mock
+from datetime import datetime, timezone
+
+import geospaas_harvesting.crawlers as crawlers
+from geospaas_harvesting.providers.http import HTTPProvider
+
+
+class HTTPProviderTestCase(unittest.TestCase):
+    """Tests for HTTPProvider"""
+
+    def test_make_crawler(self):
+        """Test creating a crawler from parameters"""
+        provider = HTTPProvider(name='test', username='user', password='pass')
+        parameters = {
+            'start_time': datetime(2023, 1, 1, tzinfo=timezone.utc),
+            'end_time': datetime(2023, 1, 2, tzinfo=timezone.utc),
+            'url': 'http://foo/bar',
+            'include': '.*'
+        }
+        with mock.patch('ftplib.FTP'):
+            self.assertEqual(
+                provider._make_crawler(parameters),
+                crawlers.HTMLDirectoryCrawler(
+                    'http://foo/bar',
+                    include='.*',
+                    time_range=(datetime(2023, 1, 1, tzinfo=timezone.utc),
+                                datetime(2023, 1, 2, tzinfo=timezone.utc)),
+                    username='user',
+                    password='pass'))
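
Note between patches: the assertEqual() in the test above compares two crawler
instances, which only works because patch 1 extended DirectoryCrawler.__eq__()
to take the credentials into account. A small illustration of the comparison
this relies on:

    from geospaas_harvesting.crawlers import HTMLDirectoryCrawler

    a = HTMLDirectoryCrawler('http://foo', username='user', password='pass')
    b = HTMLDirectoryCrawler('http://foo', username='user', password='pass')
    c = HTMLDirectoryCrawler('http://foo', username='other', password='pass')

    assert a == b  # same URL, filters and credentials
    assert a != c  # credentials now take part in the comparison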
From 2e50608727d00b19b8d4130dc4b778bfe9b588f4 Mon Sep 17 00:00:00 2001
From: Adrien Perrin
Date: Fri, 12 Jan 2024 13:18:21 +0000
Subject: [PATCH 4/4] add missing tests for HTMLDirectoryCrawler

---
 tests/test_generic_crawlers.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tests/test_generic_crawlers.py b/tests/test_generic_crawlers.py
index 505ce601..c6266b08 100644
--- a/tests/test_generic_crawlers.py
+++ b/tests/test_generic_crawlers.py
@@ -665,6 +665,26 @@ def test_list_folder_contents(self):
                 crawler._list_folder_contents('/foo/contents.html'),
                 ['/foo/bar/contents.html', '/foo/baz/'])
 
+    def test_list_folder_contents_no_auth(self):
+        """If no username and password are provided, HTTP requests
+        should not have an 'auth' parameter
+        """
+        with mock.patch('geospaas_harvesting.crawlers.Crawler._http_get') as mock_http_get:
+            mock_http_get.return_value = ('')
+            crawler = crawlers.HTMLDirectoryCrawler('http://foo')
+            crawler._list_folder_contents('/bar')
+            mock_http_get.assert_called_once_with('http://foo/bar', {})
+
+    def test_list_folder_contents_with_auth(self):
+        """If a username and password are provided, HTTP requests
+        should have an 'auth' parameter
+        """
+        with mock.patch('geospaas_harvesting.crawlers.Crawler._http_get') as mock_http_get:
+            mock_http_get.return_value = ('')
+            crawler = crawlers.HTMLDirectoryCrawler('http://foo', username='user', password='pass')
+            crawler._list_folder_contents('/bar')
+            mock_http_get.assert_called_once_with('http://foo/bar', {'auth': ('user', 'pass')})
+
     def test_get_normalized_attributes(self):
         """Test that the attributes are gotten using metanorm, and the
         geospaas_service attributes are set
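
Closing note: a condensed, runnable replay of the get_normalized_attributes()
flow that the trailing test context above exercises; the metanorm output used
here ({'time_coverage_start': ...}) is a made-up stand-in, and mocking
_metadata_handler follows the same pattern as the tests in patch 1:

    from unittest import mock

    from geospaas_harvesting.crawlers import DatasetInfo, HTMLDirectoryCrawler

    crawler = HTMLDirectoryCrawler('http://foo')
    with mock.patch.object(crawler, '_metadata_handler') as mock_handler:
        mock_handler.get_parameters.return_value = {'time_coverage_start': '2024-01-01'}
        attributes = crawler.get_normalized_attributes(DatasetInfo('http://foo/dataset.nc'))
    # 'attributes' now holds the metanorm output plus the two service
    # fields set by the crawler: 'geospaas_service_name' and 'geospaas_service'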