make HTMLDirectoryCrawler a concrete class
aperrin66 committed Jan 12, 2024
1 parent 81cff2c commit 0feaa15
Showing 2 changed files with 90 additions and 38 deletions.
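
Before this commit, HTMLDirectoryCrawler was effectively abstract: FOLDERS_SUFFIXES was None and get_normalized_attributes() raised NotImplementedError, so only subclasses such as OpenDAPCrawler were usable. With this change it can crawl any plain HTML directory listing on its own. A minimal usage sketch, assuming the usual crawler iteration interface (the URL and include filter are illustrative):

    import geospaas_harvesting.crawlers as crawlers

    # any server that exposes folders as HTML pages of links should work
    crawler = crawlers.HTMLDirectoryCrawler(
        'https://example.com/data',
        include=r'\.nc$')

    # crawlers yield DatasetInfo objects for the datasets they find
    for dataset_info in crawler:
        print(dataset_info.url)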
44 changes: 31 additions & 13 deletions geospaas_harvesting/crawlers.py
@@ -318,7 +318,8 @@ class DirectoryCrawler(Crawler):
         f'^.*/{YEAR_PATTERN}/?{MONTH_PATTERN}/?{DAY_OF_MONTH_PATTERN}(/.*)?$')
     DAY_OF_YEAR_MATCHER = re.compile(f'^.*/{YEAR_PATTERN}/{DAY_OF_YEAR_PATTERN}(/.*)?$')
 
-    def __init__(self, root_url, time_range=(None, None), include=None, max_threads=1):
+    def __init__(self, root_url, time_range=(None, None), include=None,
+                 username=None, password=None, max_threads=1):
         """
         `root_url` is the URL of the data repository to explore.
         `time_range` is a 2-tuple of datetime.datetime objects defining the time range
@@ -330,13 +331,17 @@ def __init__(self, root_url, time_range=(None, None), include=None, max_threads=
         self.root_url = urlparse(root_url)
         self.time_range = time_range
         self.include = re.compile(include) if include else None
+        self.username = username
+        self.password = password
         self.set_initial_state()
 
     def __eq__(self, other):
         return (
             self.root_url == other.root_url and
             self.time_range == other.time_range and
-            self.include == other.include)
+            self.include == other.include and
+            self.username == other.username and
+            self.password == other.password)
 
     @property
     def base_url(self):
@@ -479,7 +484,8 @@ def _process_folder(self, folder_path):
         self.logger.debug("Looking for resources in '%s'...", folder_path)
         for path in self._list_folder_contents(folder_path):
             # deselect paths which contain any of the excluded strings
-            if self.EXCLUDE and self.EXCLUDE.search(path):
+            if ((self.EXCLUDE and self.EXCLUDE.search(path)) or
+                    self.root_url.path.startswith(path.rstrip(f"{os.sep}/"))):
                 continue
             if self._is_folder(path):
                 self._add_folder_to_process(path)
@@ -514,11 +520,11 @@ def get_normalized_attributes(self, dataset_info, **kwargs):
 
 
 class HTMLDirectoryCrawler(DirectoryCrawler):
-    """Implementation of WebDirectoryCrawler for repositories exposed as HTML pages."""
+    """Implementation of DirectoryCrawler for repositories exposed as HTML pages."""
 
     logger = logging.getLogger(__name__ + '.HTMLDirectoryCrawler')
 
-    FOLDERS_SUFFIXES = None
+    FOLDERS_SUFFIXES = ('/',)
 
     # ------------- crawl ------------
     @staticmethod
@@ -527,7 +533,7 @@ def _strip_folder_page(folder_path):
         Remove the index page of a folder path.
         For example: /foo/bar/contents.html becomes /foo/bar.
         """
-        return re.sub(r'/\w+\.html?$', r'', folder_path)
+        return re.sub(r'/(\w+\.html)?$', r'', folder_path)
 
     def _is_folder(self, path):
         return path.endswith(self.FOLDERS_SUFFIXES)
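
The regex change here is worth spelling out: the old pattern only stripped an explicit index page such as /foo/contents.html (and, via the optional 'l', .htm pages), while the new one also strips a bare trailing slash, which matters now that folders are recognized by the '/' suffix. A quick illustrative sketch of the difference, with made-up paths, matching the new tests added below:

    import re

    def old_strip(path):
        return re.sub(r'/\w+\.html?$', r'', path)

    def new_strip(path):
        return re.sub(r'/(\w+\.html)?$', r'', path)

    # both versions strip an index page
    assert old_strip('/foo/bar/contents.html') == '/foo/bar'
    assert new_strip('/foo/bar/contents.html') == '/foo/bar'

    # only the new version normalizes a trailing slash
    assert old_strip('/foo/bar/') == '/foo/bar/'
    assert new_strip('/foo/bar/') == '/foo/bar'

    # a plain folder path is left untouched
    assert new_strip('/foo/bar') == '/foo/bar'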
@@ -557,13 +563,23 @@ def _prepend_parent_path(parent_path, paths):
         return result
 
     def _list_folder_contents(self, folder_path):
-        html = self._http_get(f"{self.base_url}{folder_path}")
+        request_parameters = {}
+        if self.username is not None and self.password is not None:
+            request_parameters['auth'] = (self.username, self.password)
+        html = self._http_get(f"{self.base_url}{folder_path}", request_parameters)
         stripped_folder_path = self._strip_folder_page(folder_path)
         return self._prepend_parent_path(stripped_folder_path, self._get_links(html))
 
     # --------- get metadata ---------
     def get_normalized_attributes(self, dataset_info, **kwargs):
-        raise NotImplementedError()
+        """Gets dataset attributes using http"""
+        raw_attributes = {}
+        self.add_url(dataset_info.url, raw_attributes)
+        normalized_attributes = self._metadata_handler.get_parameters(raw_attributes)
+        # TODO: add FTP_SERVICE_NAME and FTP_SERVICE in django-geo-spaas
+        normalized_attributes['geospaas_service_name'] = catalog_managers.HTTP_SERVICE_NAME
+        normalized_attributes['geospaas_service'] = catalog_managers.HTTP_SERVICE
+        return normalized_attributes
 
 
 class OpenDAPCrawler(HTMLDirectoryCrawler):
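
The new request_parameters dict is handed to _http_get, which presumably forwards it to requests, so a (username, password) tuple under the 'auth' key becomes HTTP basic auth. A standalone sketch of that mechanism, assuming plain requests semantics (the URL and credentials are illustrative):

    import requests

    username, password = 'user', 'secret'  # illustrative credentials

    request_parameters = {}
    if username is not None and password is not None:
        # requests treats a 2-tuple under 'auth' as HTTP basic auth
        request_parameters['auth'] = (username, password)

    response = requests.get('https://example.com/data/contents.html', **request_parameters)
    response.raise_for_status()
    html = response.text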
@@ -690,16 +706,18 @@ class FTPCrawler(DirectoryCrawler):
     logger = logging.getLogger(__name__ + '.FTPCrawler')
 
     def __init__(self, root_url, time_range=(None, None), include=None,
-                 username='anonymous', password='anonymous', max_threads=1):
-
+                 username=None, password=None, max_threads=1):
         if not root_url.startswith('ftp://'):
             raise ValueError("The root url must start with 'ftp://'")
 
-        self.username = username
-        self.password = password
+        if username is None:
+            username = 'anonymous'
+        if password is None:
+            password = 'anonymous'
         self.ftp = None
 
-        super().__init__(root_url, time_range, include, max_threads=1)
+        super().__init__(root_url, time_range, include, max_threads=1,
+                         username=username, password=password)
 
     def __getstate__(self):
         """Method used to pickle the crawler"""
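With credentials now stored by DirectoryCrawler, FTPCrawler only normalizes missing credentials to anonymous FTP before delegating to the parent constructor, so equality checks and pickling see the effective values. A minimal sketch (note that constructing the crawler may open the FTP connection, so a reachable server is assumed; the URL is illustrative):

    import geospaas_harvesting.crawlers as crawlers

    # credentials omitted: the crawler falls back to anonymous FTP
    crawler = crawlers.FTPCrawler('ftp://example.com/data')
    assert crawler.username == 'anonymous'
    assert crawler.password == 'anonymous'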
84 changes: 59 additions & 25 deletions tests/test_generic_crawlers.py
@@ -17,6 +17,7 @@
 
 import requests
 
+import geospaas.catalog.managers
 import geospaas_harvesting.crawlers as crawlers
 
 
@@ -609,6 +610,37 @@ def test_abstract_get_normalized_attributes(self):
 class HTMLDirectoryCrawlerTestCase(unittest.TestCase):
     """Tests for the HTMLDirectoryCrawler crawler"""
 
+    def test_strip_folder_page(self):
+        """_strip_folder_page() should remove the index page from a
+        folder path
+        """
+        self.assertEqual(
+            crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar/contents.html'),
+            '/foo/bar')
+        self.assertEqual(
+            crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar/'),
+            '/foo/bar')
+        self.assertEqual(
+            crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar'),
+            '/foo/bar')
+
+    def test_get_right_number_of_links(self):
+        """Test that the crawler gets the correct number of links from a test page"""
+        with open(os.path.join(
+                os.path.dirname(__file__), 'data', 'opendap', 'root.html')) as data_file:
+            html = data_file.read()
+        self.assertEqual(len(crawlers.HTMLDirectoryCrawler._get_links(html)), 4)
+
+        with open(os.path.join(os.path.dirname(__file__), 'data', 'empty.html')) as data_file:
+            html = data_file.read()
+        self.assertEqual(len(crawlers.HTMLDirectoryCrawler._get_links(html)), 0)
+
+    def test_link_extractor_error(self):
+        """In case of error, LinkExtractor must use a logger"""
+        parser = crawlers.LinkExtractor()
+        with self.assertLogs(parser.logger, level=logging.ERROR):
+            parser.error('some message')
+
     def test_prepend_parent_path(self):
         """
         Should prepend all the paths with the parent_path, except if they already start with it
@@ -620,12 +652,34 @@ def test_prepend_parent_path(self):
             ['/foo/bar', '/foo/baz']
         )
 
-    def test_abstract_get_normalized_attributes(self):
-        """The get_normalized_attribute is abstract in
-        HTMLDirectoryCrawler
+    def test_list_folder_contents(self):
+        """Test listing a folder's contents"""
+        with mock.patch('geospaas_harvesting.crawlers.Crawler._http_get') as mock_http_get:
+            mock_http_get.return_value = (
+                '<html>'
+                '<a href="bar/contents.html">folder/</a>'
+                '<a href="baz/">folder/</a>'
+                '<html/>')
+            crawler = crawlers.HTMLDirectoryCrawler('')
+            self.assertListEqual(
+                crawler._list_folder_contents('/foo/contents.html'),
+                ['/foo/bar/contents.html', '/foo/baz/'])
+
+    def test_get_normalized_attributes(self):
+        """Test that the attributes are gotten using metanorm, and the
+        geospaas_service attributes are set
         """
-        with self.assertRaises(NotImplementedError):
-            crawlers.HTMLDirectoryCrawler('').get_normalized_attributes(None)
+        crawler = crawlers.HTMLDirectoryCrawler('http://foo')
+        with mock.patch.object(crawler, '_metadata_handler') as mock_handler:
+            mock_handler.get_parameters.return_value = {'foo': 'bar'}
+            self.assertDictEqual(
+                crawler.get_normalized_attributes(crawlers.DatasetInfo('ftp://uri')),
+                {
+                    'foo': 'bar',
+                    'geospaas_service_name': geospaas.catalog.managers.HTTP_SERVICE_NAME,
+                    'geospaas_service': geospaas.catalog.managers.HTTP_SERVICE
+                })
+            mock_handler.get_parameters.assert_called_once_with({'url': 'ftp://uri'})
 
 
 class OpenDAPCrawlerTestCase(unittest.TestCase):
@@ -739,26 +793,6 @@ def test_get_html_logs_error_on_http_status(self, mock_error_logger):
             _ = crawlers.OpenDAPCrawler._http_get(self.TEST_DATA['inexistent']['urls'][0])
         mock_error_logger.assert_called_once()
 
-    def test_get_right_number_of_links(self):
-        """Test that the crawler gets the correct number of links from a test page"""
-        links = {}
-        for sample in ('root', 'empty'):
-            data_file = open(os.path.join(
-                os.path.dirname(__file__),
-                self.TEST_DATA[sample]['file_path']))
-            html = data_file.read()
-            data_file.close()
-            links[sample] = crawlers.OpenDAPCrawler._get_links(html)
-
-        self.assertEqual(len(links['root']), 4)
-        self.assertEqual(len(links['empty']), 0)
-
-    def test_link_extractor_error(self):
-        """In case of error, LinkExtractor must use a logger"""
-        parser = crawlers.LinkExtractor()
-        with self.assertLogs(parser.logger, level=logging.ERROR):
-            parser.error('some message')
-
     def test_process_folder(self):
         """
         Explore root page and make sure the _url and _to_process attributes of the crawler have the
