diff --git a/geospaas_harvesting/crawlers.py b/geospaas_harvesting/crawlers.py
index 322c7e4..b3a56fc 100644
--- a/geospaas_harvesting/crawlers.py
+++ b/geospaas_harvesting/crawlers.py
@@ -318,7 +318,8 @@ class DirectoryCrawler(Crawler):
         f'^.*/{YEAR_PATTERN}/?{MONTH_PATTERN}/?{DAY_OF_MONTH_PATTERN}(/.*)?$')
     DAY_OF_YEAR_MATCHER = re.compile(f'^.*/{YEAR_PATTERN}/{DAY_OF_YEAR_PATTERN}(/.*)?$')
 
-    def __init__(self, root_url, time_range=(None, None), include=None, max_threads=1):
+    def __init__(self, root_url, time_range=(None, None), include=None,
+                 username=None, password=None, max_threads=1):
         """
         `root_url` is the URL of the data repository to explore.
         `time_range` is a 2-tuple of datetime.datetime objects defining the time range
@@ -330,13 +331,17 @@ def __init__(self, root_url, time_range=(None, None), include=None, max_threads=1):
         self.root_url = urlparse(root_url)
         self.time_range = time_range
         self.include = re.compile(include) if include else None
+        self.username = username
+        self.password = password
         self.set_initial_state()
 
     def __eq__(self, other):
         return (
             self.root_url == other.root_url and
             self.time_range == other.time_range and
-            self.include == other.include)
+            self.include == other.include and
+            self.username == other.username and
+            self.password == other.password)
 
     @property
     def base_url(self):
@@ -479,7 +484,8 @@ def _process_folder(self, folder_path):
         self.logger.debug("Looking for resources in '%s'...", folder_path)
         for path in self._list_folder_contents(folder_path):
             # deselect paths which contains any of the excludes strings
-            if self.EXCLUDE and self.EXCLUDE.search(path):
+            if ((self.EXCLUDE and self.EXCLUDE.search(path)) or
+                    self.root_url.path.startswith(path.rstrip(f"{os.sep}/"))):
                 continue
             if self._is_folder(path):
                 self._add_folder_to_process(path)
@@ -514,11 +520,11 @@ def get_normalized_attributes(self, dataset_info, **kwargs):
 
 
 class HTMLDirectoryCrawler(DirectoryCrawler):
-    """Implementation of WebDirectoryCrawler for repositories exposed as HTML pages."""
+    """Implementation of DirectoryCrawler for repositories exposed as HTML pages."""
 
     logger = logging.getLogger(__name__ + '.HTMLDirectoryCrawler')
 
-    FOLDERS_SUFFIXES = None
+    FOLDERS_SUFFIXES = ('/',)
 
     # ------------- crawl ------------
     @staticmethod
@@ -527,7 +533,7 @@ def _strip_folder_page(folder_path):
         Remove the index page of a folder path.
         For example: /foo/bar/contents.html becomes /foo/bar.
         """
-        return re.sub(r'/\w+\.html?$', r'', folder_path)
+        return re.sub(r'/(\w+\.html)?$', r'', folder_path)
 
     def _is_folder(self, path):
         return path.endswith(self.FOLDERS_SUFFIXES)
@@ -557,13 +563,23 @@ def _prepend_parent_path(parent_path, paths):
         return result
 
     def _list_folder_contents(self, folder_path):
-        html = self._http_get(f"{self.base_url}{folder_path}")
+        request_parameters = {}
+        if self.username is not None and self.password is not None:
+            request_parameters['auth'] = (self.username, self.password)
+        html = self._http_get(f"{self.base_url}{folder_path}", request_parameters)
         stripped_folder_path = self._strip_folder_page(folder_path)
         return self._prepend_parent_path(stripped_folder_path, self._get_links(html))
 
     # --------- get metadata ---------
     def get_normalized_attributes(self, dataset_info, **kwargs):
-        raise NotImplementedError()
+        """Gets dataset attributes using http"""
+        raw_attributes = {}
+        self.add_url(dataset_info.url, raw_attributes)
+        normalized_attributes = self._metadata_handler.get_parameters(raw_attributes)
+        # TODO: add FTP_SERVICE_NAME and FTP_SERVICE in django-geo-spaas
+        normalized_attributes['geospaas_service_name'] = catalog_managers.HTTP_SERVICE_NAME
+        normalized_attributes['geospaas_service'] = catalog_managers.HTTP_SERVICE
+        return normalized_attributes
 
 
 class OpenDAPCrawler(HTMLDirectoryCrawler):
@@ -690,16 +706,18 @@ class FTPCrawler(DirectoryCrawler):
 
     logger = logging.getLogger(__name__ + '.FTPCrawler')
 
     def __init__(self, root_url, time_range=(None, None), include=None,
-                 username='anonymous', password='anonymous', max_threads=1):
-
+                 username=None, password=None, max_threads=1):
         if not root_url.startswith('ftp://'):
             raise ValueError("The root url must start with 'ftp://'")
 
-        self.username = username
-        self.password = password
+        if username is None:
+            username = 'anonymous'
+        if password is None:
+            password = 'anonymous'
         self.ftp = None
 
-        super().__init__(root_url, time_range, include, max_threads=1)
+        super().__init__(root_url, time_range, include, max_threads=1,
+                         username=username, password=password)
 
     def __getstate__(self):
         """Method used to pickle the crawler"""
diff --git a/tests/test_generic_crawlers.py b/tests/test_generic_crawlers.py
index bc82ad5..505ce60 100644
--- a/tests/test_generic_crawlers.py
+++ b/tests/test_generic_crawlers.py
@@ -17,6 +17,7 @@
 
 import requests
 
+import geospaas.catalog.managers
 import geospaas_harvesting.crawlers as crawlers
 
 
@@ -609,6 +610,37 @@ def test_abstract_get_normalized_attributes(self):
 class HTMLDirectoryCrawlerTestCase(unittest.TestCase):
     """Tests for the HTMLDirectoryCrawler crawler"""
 
+    def test_strip_folder_page(self):
+        """_strip_folder_page() should remove the index page from a
+        folder path
+        """
+        self.assertEqual(
+            crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar/contents.html'),
+            '/foo/bar')
+        self.assertEqual(
+            crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar/'),
+            '/foo/bar')
+        self.assertEqual(
+            crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar'),
+            '/foo/bar')
+
+    def test_get_right_number_of_links(self):
+        """Test that the crawler gets the correct number of links from a test page"""
+        with open(os.path.join(
+                os.path.dirname(__file__), 'data', 'opendap', 'root.html')) as data_file:
+            html = data_file.read()
+        self.assertEqual(len(crawlers.HTMLDirectoryCrawler._get_links(html)), 4)
+
+        with open(os.path.join(os.path.dirname(__file__), 'data', 'empty.html')) as data_file:
+            html = data_file.read()
+        self.assertEqual(len(crawlers.HTMLDirectoryCrawler._get_links(html)), 0)
+
+    def test_link_extractor_error(self):
+        """In case of error, LinkExtractor must use a logger"""
+        parser = crawlers.LinkExtractor()
+        with self.assertLogs(parser.logger, level=logging.ERROR):
+            parser.error('some message')
+
     def test_prepend_parent_path(self):
         """
         Should prepend all the paths with the parent_path, except if they
         already start with it
@@ -620,12 +652,34 @@ def test_prepend_parent_path(self):
             ['/foo/bar', '/foo/baz']
         )
 
-    def test_abstract_get_normalized_attributes(self):
-        """The get_normalized_attribute is abstract in
-        HTMLDirectoryCrawler
+    def test_list_folder_contents(self):
+        """Test listing a folder's contents"""
+        with mock.patch('geospaas_harvesting.crawlers.Crawler._http_get') as mock_http_get:
+            mock_http_get.return_value = (
+                '<html><body>'
+                '<a href="bar/contents.html">folder/</a>'
+                '<a href="baz/">folder/</a>'
+                '</body></html>')
+            crawler = crawlers.HTMLDirectoryCrawler('')
+            self.assertListEqual(
+                crawler._list_folder_contents('/foo/contents.html'),
+                ['/foo/bar/contents.html', '/foo/baz/'])
+
+    def test_get_normalized_attributes(self):
+        """Test that the attributes are gotten using metanorm, and the
+        geospaas_service attributes are set
         """
-        with self.assertRaises(NotImplementedError):
-            crawlers.HTMLDirectoryCrawler('').get_normalized_attributes(None)
+        crawler = crawlers.HTMLDirectoryCrawler('http://foo')
+        with mock.patch.object(crawler, '_metadata_handler') as mock_handler:
+            mock_handler.get_parameters.return_value = {'foo': 'bar'}
+            self.assertDictEqual(
+                crawler.get_normalized_attributes(crawlers.DatasetInfo('ftp://uri')),
+                {
+                    'foo': 'bar',
+                    'geospaas_service_name': geospaas.catalog.managers.HTTP_SERVICE_NAME,
+                    'geospaas_service': geospaas.catalog.managers.HTTP_SERVICE
+                })
+            mock_handler.get_parameters.assert_called_once_with({'url': 'ftp://uri'})
 
 
 class OpenDAPCrawlerTestCase(unittest.TestCase):
@@ -739,26 +793,6 @@ def test_get_html_logs_error_on_http_status(self, mock_error_logger):
         _ = crawlers.OpenDAPCrawler._http_get(self.TEST_DATA['inexistent']['urls'][0])
         mock_error_logger.assert_called_once()
 
-    def test_get_right_number_of_links(self):
-        """Test that the crawler gets the correct number of links from a test page"""
-        links = {}
-        for sample in ('root', 'empty'):
-            data_file = open(os.path.join(
-                os.path.dirname(__file__),
-                self.TEST_DATA[sample]['file_path']))
-            html = data_file.read()
-            data_file.close()
-            links[sample] = crawlers.OpenDAPCrawler._get_links(html)
-
-        self.assertEqual(len(links['root']), 4)
-        self.assertEqual(len(links['empty']), 0)
-
-    def test_link_extractor_error(self):
-        """In case of error, LinkExtractor must use a logger"""
-        parser = crawlers.LinkExtractor()
-        with self.assertLogs(parser.logger, level=logging.ERROR):
-            parser.error('some message')
-
     def test_process_folder(self):
         """
         Explore root page and make sure the _url and _to_process attributes of the crawler have the
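
Usage note (not part of the patch): a minimal sketch of how the credential plumbing introduced above is meant to be used. The repository URL, include pattern and credentials below are hypothetical placeholders.

from geospaas_harvesting.crawlers import HTMLDirectoryCrawler

# When both username and password are given, _list_folder_contents() forwards
# them to Crawler._http_get() as an 'auth' tuple (presumably handed on to
# requests, where a (user, password) tuple means HTTP basic auth). If either
# one is None, no 'auth' parameter is sent at all.
crawler = HTMLDirectoryCrawler(
    'https://example.com/data',  # hypothetical repository root
    include=r'\.nc$',            # hypothetical filename filter
    username='jdoe',             # hypothetical credentials
    password='s3cret',
)

FTPCrawler keeps its anonymous-login behaviour: its defaults change from 'anonymous' to None, but None is mapped back to 'anonymous' before being passed to the parent constructor, so only callers who supply explicit credentials see a difference. Since username and password now take part in DirectoryCrawler.__eq__(), two crawlers that differ only in credentials no longer compare equal.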