From 0feaa15ce79fd939aa388fe1b6ee933637247749 Mon Sep 17 00:00:00 2001
From: Adrien Perrin
Date: Fri, 12 Jan 2024 12:09:20 +0000
Subject: [PATCH 1/4] make HTMLDirectoryCrawler a concrete class

---
 geospaas_harvesting/crawlers.py | 44 ++++++++++++-----
 tests/test_generic_crawlers.py  | 84 +++++++++++++++++++++++----------
 2 files changed, 90 insertions(+), 38 deletions(-)

diff --git a/geospaas_harvesting/crawlers.py b/geospaas_harvesting/crawlers.py
index 322c7e41..b3a56fc8 100644
--- a/geospaas_harvesting/crawlers.py
+++ b/geospaas_harvesting/crawlers.py
@@ -318,7 +318,8 @@ class DirectoryCrawler(Crawler):
         f'^.*/{YEAR_PATTERN}/?{MONTH_PATTERN}/?{DAY_OF_MONTH_PATTERN}(/.*)?$')
     DAY_OF_YEAR_MATCHER = re.compile(f'^.*/{YEAR_PATTERN}/{DAY_OF_YEAR_PATTERN}(/.*)?$')
 
-    def __init__(self, root_url, time_range=(None, None), include=None, max_threads=1):
+    def __init__(self, root_url, time_range=(None, None), include=None,
+                 username=None, password=None, max_threads=1):
         """
         `root_url` is the URL of the data repository to explore.
         `time_range` is a 2-tuple of datetime.datetime objects defining the time range
@@ -330,13 +331,17 @@ def __init__(self, root_url, time_range=(None, None), include=None, max_threads=
         self.root_url = urlparse(root_url)
         self.time_range = time_range
         self.include = re.compile(include) if include else None
+        self.username = username
+        self.password = password
         self.set_initial_state()
 
     def __eq__(self, other):
         return (
             self.root_url == other.root_url and
             self.time_range == other.time_range and
-            self.include == other.include)
+            self.include == other.include and
+            self.username == other.username and
+            self.password == other.password)
 
     @property
     def base_url(self):
@@ -479,7 +484,8 @@ def _process_folder(self, folder_path):
         self.logger.debug("Looking for resources in '%s'...", folder_path)
         for path in self._list_folder_contents(folder_path):
             # skip paths that contain any of the excluded strings
-            if self.EXCLUDE and self.EXCLUDE.search(path):
+            if ((self.EXCLUDE and self.EXCLUDE.search(path)) or
+                    self.root_url.path.startswith(path.rstrip(f"{os.sep}/"))):
                 continue
             if self._is_folder(path):
                 self._add_folder_to_process(path)
@@ -514,11 +520,11 @@ def get_normalized_attributes(self, dataset_info, **kwargs):
 
 
 class HTMLDirectoryCrawler(DirectoryCrawler):
-    """Implementation of WebDirectoryCrawler for repositories exposed as HTML pages."""
+    """Implementation of DirectoryCrawler for repositories exposed as HTML pages."""
 
     logger = logging.getLogger(__name__ + '.HTMLDirectoryCrawler')
 
-    FOLDERS_SUFFIXES = None
+    FOLDERS_SUFFIXES = ('/',)
 
     # ------------- crawl ------------
     @staticmethod
@@ -527,7 +533,7 @@ def _strip_folder_page(folder_path):
         Remove the index page of a folder path.
         For example: /foo/bar/contents.html becomes /foo/bar.
         """
-        return re.sub(r'/\w+\.html?$', r'', folder_path)
+        return re.sub(r'/(\w+\.html)?$', r'', folder_path)
 
     def _is_folder(self, path):
         return path.endswith(self.FOLDERS_SUFFIXES)
@@ -557,13 +563,23 @@ def _prepend_parent_path(parent_path, paths):
         return result
 
     def _list_folder_contents(self, folder_path):
-        html = self._http_get(f"{self.base_url}{folder_path}")
+        request_parameters = {}
+        if self.username is not None and self.password is not None:
+            request_parameters['auth'] = (self.username, self.password)
+        html = self._http_get(f"{self.base_url}{folder_path}", request_parameters)
         stripped_folder_path = self._strip_folder_page(folder_path)
         return self._prepend_parent_path(stripped_folder_path, self._get_links(html))
 
     # --------- get metadata ---------
     def get_normalized_attributes(self, dataset_info, **kwargs):
-        raise NotImplementedError()
+        """Gets dataset attributes using HTTP"""
+        raw_attributes = {}
+        self.add_url(dataset_info.url, raw_attributes)
+        normalized_attributes = self._metadata_handler.get_parameters(raw_attributes)
+        # TODO: add FTP_SERVICE_NAME and FTP_SERVICE in django-geo-spaas
+        normalized_attributes['geospaas_service_name'] = catalog_managers.HTTP_SERVICE_NAME
+        normalized_attributes['geospaas_service'] = catalog_managers.HTTP_SERVICE
+        return normalized_attributes
 
 
 class OpenDAPCrawler(HTMLDirectoryCrawler):
@@ -690,16 +706,18 @@ class FTPCrawler(DirectoryCrawler):
     logger = logging.getLogger(__name__ + '.FTPCrawler')
 
     def __init__(self, root_url, time_range=(None, None), include=None,
-                 username='anonymous', password='anonymous', max_threads=1):
-
+                 username=None, password=None, max_threads=1):
         if not root_url.startswith('ftp://'):
             raise ValueError("The root url must start with 'ftp://'")
 
-        self.username = username
-        self.password = password
+        if username is None:
+            username = 'anonymous'
+        if password is None:
+            password = 'anonymous'
         self.ftp = None
 
-        super().__init__(root_url, time_range, include, max_threads=1)
+        super().__init__(root_url, time_range, include, max_threads=1,
+                         username=username, password=password)
 
     def __getstate__(self):
         """Method used to pickle the crawler"""
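
Note for reviewers (not part of the diff): the new _strip_folder_page() pattern
makes the index page name optional, so it now also strips a trailing slash,
which matters now that folders are recognized by the ('/',) suffix. A runnable
check of the behaviour the tests below rely on:

    import re

    NEW_PATTERN = r'/(\w+\.html)?$'

    for path in ('/foo/bar/contents.html', '/foo/bar/', '/foo/bar'):
        print(re.sub(NEW_PATTERN, '', path))
    # prints '/foo/bar' three times: the index page and the trailing
    # slash are removed, and an already-clean path is left untouched

One behaviour change worth noting: the old pattern r'/\w+\.html?$' also
matched '.htm' pages (the final 'l' was optional); the new one only matches
'.html'.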
""" - return re.sub(r'/\w+\.html?$', r'', folder_path) + return re.sub(r'/(\w+\.html)?$', r'', folder_path) def _is_folder(self, path): return path.endswith(self.FOLDERS_SUFFIXES) @@ -557,13 +563,23 @@ def _prepend_parent_path(parent_path, paths): return result def _list_folder_contents(self, folder_path): - html = self._http_get(f"{self.base_url}{folder_path}") + request_parameters = {} + if self.username is not None and self.password is not None: + request_parameters['auth'] = (self.username, self.password) + html = self._http_get(f"{self.base_url}{folder_path}", request_parameters) stripped_folder_path = self._strip_folder_page(folder_path) return self._prepend_parent_path(stripped_folder_path, self._get_links(html)) # --------- get metadata --------- def get_normalized_attributes(self, dataset_info, **kwargs): - raise NotImplementedError() + """Gets dataset attributes using http""" + raw_attributes = {} + self.add_url(dataset_info.url, raw_attributes) + normalized_attributes = self._metadata_handler.get_parameters(raw_attributes) + # TODO: add FTP_SERVICE_NAME and FTP_SERVICE in django-geo-spaas + normalized_attributes['geospaas_service_name'] = catalog_managers.HTTP_SERVICE_NAME + normalized_attributes['geospaas_service'] = catalog_managers.HTTP_SERVICE + return normalized_attributes class OpenDAPCrawler(HTMLDirectoryCrawler): @@ -690,16 +706,18 @@ class FTPCrawler(DirectoryCrawler): logger = logging.getLogger(__name__ + '.FTPCrawler') def __init__(self, root_url, time_range=(None, None), include=None, - username='anonymous', password='anonymous', max_threads=1): - + username=None, password=None, max_threads=1): if not root_url.startswith('ftp://'): raise ValueError("The root url must start with 'ftp://'") - self.username = username - self.password = password + if username is None: + username = 'anonymous' + if password is None: + password = 'anonymous' self.ftp = None - super().__init__(root_url, time_range, include, max_threads=1) + super().__init__(root_url, time_range, include, max_threads=1, + username=username, password=password) def __getstate__(self): """Method used to pickle the crawler""" diff --git a/tests/test_generic_crawlers.py b/tests/test_generic_crawlers.py index bc82ad56..505ce601 100644 --- a/tests/test_generic_crawlers.py +++ b/tests/test_generic_crawlers.py @@ -17,6 +17,7 @@ import requests +import geospaas.catalog.managers import geospaas_harvesting.crawlers as crawlers @@ -609,6 +610,37 @@ def test_abstract_get_normalized_attributes(self): class HTMLDirectoryCrawlerTestCase(unittest.TestCase): """Tests for the HTMLDirectoryCrawler crawler""" + def test_strip_folder_page(self): + """_strip_folder_page() should remove the index page from a + folder path + """ + self.assertEqual( + crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar/contents.html'), + '/foo/bar') + self.assertEqual( + crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar/'), + '/foo/bar') + self.assertEqual( + crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar'), + '/foo/bar') + + def test_get_right_number_of_links(self): + """Test that the crawler gets the correct number of links from a test page""" + with open(os.path.join( + os.path.dirname(__file__), 'data', 'opendap', 'root.html')) as data_file: + html = data_file.read() + self.assertEqual(len(crawlers.HTMLDirectoryCrawler._get_links(html)), 4) + + with open(os.path.join(os.path.dirname(__file__), 'data', 'empty.html')) as data_file: + html = data_file.read() + 
From df0ad62bbcc93aa5fafce84455454af693cb3074 Mon Sep 17 00:00:00 2001
From: Adrien Perrin
Date: Fri, 12 Jan 2024 13:00:45 +0000
Subject: [PATCH 2/4] add http provider

---
 geospaas_harvesting/config.py         |  2 ++
 geospaas_harvesting/providers/http.py | 25 +++++++++++++++++++++++++
 2 files changed, 27 insertions(+)
 create mode 100644 geospaas_harvesting/providers/http.py

diff --git a/geospaas_harvesting/config.py b/geospaas_harvesting/config.py
index 9bea0f0d..4c53b775 100644
--- a/geospaas_harvesting/config.py
+++ b/geospaas_harvesting/config.py
@@ -8,6 +8,7 @@
 import geospaas_harvesting.providers.resto as providers_resto
 import geospaas_harvesting.providers.earthdata_cmr as providers_earthdata_cmr
 import geospaas_harvesting.providers.ftp as providers_ftp
+import geospaas_harvesting.providers.http as providers_http
 import geospaas_harvesting.providers.jaxa as providers_jaxa
 import geospaas_harvesting.providers.local as providers_local
 import geospaas_harvesting.providers.metno as providers_metno
@@ -62,6 +63,7 @@ class ProvidersArgument(DictArgument):
         'earthdata_cmr': providers_earthdata_cmr.EarthDataCMRProvider,
         'ftp': providers_ftp.FTPProvider,
         'gportal_ftp': providers_jaxa.GPortalProvider,
+        'http': providers_http.HTTPProvider,
         'netcdf': providers_local.NetCDFProvider,
         'nansat': providers_local.NansatProvider,
         'metno': providers_metno.METNOProvider,
diff --git a/geospaas_harvesting/providers/http.py b/geospaas_harvesting/providers/http.py
new file mode 100644
index 00000000..e246eec8
--- /dev/null
+++ b/geospaas_harvesting/providers/http.py
@@ -0,0 +1,25 @@
+"""Code for searching HTTP repositories"""
+from urllib.parse import urljoin
+
+from .base import Provider, TimeFilterMixin
+from ..arguments import PathArgument, StringArgument
+from ..crawlers import HTMLDirectoryCrawler
+
+
+class HTTPProvider(TimeFilterMixin, Provider):
+    """Generic HTTP directory provider"""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.search_parameters_parser.add_arguments([
+            StringArgument('url', required=True),
+            StringArgument('include', default='.'),
+        ])
+
+    def _make_crawler(self, parameters):
+        return HTMLDirectoryCrawler(
+            parameters['url'],
+            time_range=(parameters['start_time'], parameters['end_time']),
+            username=self.username,
+            password=self.password,
+            include=parameters['include'])
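
Note between patches: a usage sketch for the new provider, mirroring the test
added in the next patch. The 'name', 'username' and 'password' constructor
arguments are assumed to be handled by the Provider base class, and
_make_crawler() is normally invoked by the provider's search machinery rather
than called directly; the repository URL and include pattern below are
hypothetical:

    from datetime import datetime, timezone

    from geospaas_harvesting.providers.http import HTTPProvider

    provider = HTTPProvider(name='my_http_repo', username='user', password='pass')
    crawler = provider._make_crawler({
        'start_time': datetime(2024, 1, 1, tzinfo=timezone.utc),
        'end_time': datetime(2024, 1, 2, tzinfo=timezone.utc),
        'url': 'http://example.com/data',  # hypothetical repository
        'include': r'\.nc$',  # only keep links to netCDF files
    })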
From 5da7f3e55b45c2c6c9d6a9f3e097f2aff62822c1 Mon Sep 17 00:00:00 2001
From: Adrien Perrin
Date: Fri, 12 Jan 2024 13:18:05 +0000
Subject: [PATCH 3/4] add tests for http provider

---
 tests/providers/test_http.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 tests/providers/test_http.py

diff --git a/tests/providers/test_http.py b/tests/providers/test_http.py
new file mode 100644
index 00000000..fb1935bf
--- /dev/null
+++ b/tests/providers/test_http.py
@@ -0,0 +1,32 @@
+# pylint: disable=protected-access
+"""Tests for the generic HTTP provider"""
+import unittest
+import unittest.mock as mock
+from datetime import datetime, timezone
+
+import geospaas_harvesting.crawlers as crawlers
+from geospaas_harvesting.providers.http import HTTPProvider
+
+
+class HTTPProviderTestCase(unittest.TestCase):
+    """Tests for HTTPProvider"""
+
+    def test_make_crawler(self):
+        """Test creating a crawler from parameters"""
+        provider = HTTPProvider(name='test', username='user', password='pass')
+        parameters = {
+            'start_time': datetime(2023, 1, 1, tzinfo=timezone.utc),
+            'end_time': datetime(2023, 1, 2, tzinfo=timezone.utc),
+            'url': 'http://foo/bar',
+            'include': '.*'
+        }
+        with mock.patch('ftplib.FTP'):
+            self.assertEqual(
+                provider._make_crawler(parameters),
+                crawlers.HTMLDirectoryCrawler(
+                    'http://foo/bar',
+                    include='.*',
+                    time_range=(datetime(2023, 1, 1, tzinfo=timezone.utc),
+                                datetime(2023, 1, 2, tzinfo=timezone.utc)),
+                    username='user',
+                    password='pass'))
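
Note between patches: the assertEqual() in the test above compares two crawler
instances, which only works because patch 1 extended DirectoryCrawler.__eq__()
to take the credentials into account. A small illustration of the comparison
this relies on:

    from geospaas_harvesting.crawlers import HTMLDirectoryCrawler

    a = HTMLDirectoryCrawler('http://foo', username='user', password='pass')
    b = HTMLDirectoryCrawler('http://foo', username='user', password='pass')
    c = HTMLDirectoryCrawler('http://foo', username='other', password='pass')

    assert a == b  # same URL, filters and credentials
    assert a != c  # credentials now take part in the comparison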
From 2e50608727d00b19b8d4130dc4b778bfe9b588f4 Mon Sep 17 00:00:00 2001
From: Adrien Perrin
Date: Fri, 12 Jan 2024 13:18:21 +0000
Subject: [PATCH 4/4] add missing tests for HTMLDirectoryCrawler

---
 tests/test_generic_crawlers.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tests/test_generic_crawlers.py b/tests/test_generic_crawlers.py
index 505ce601..c6266b08 100644
--- a/tests/test_generic_crawlers.py
+++ b/tests/test_generic_crawlers.py
@@ -665,6 +665,26 @@ def test_list_folder_contents(self):
                 crawler._list_folder_contents('/foo/contents.html'),
                 ['/foo/bar/contents.html', '/foo/baz/'])
 
+    def test_list_folder_contents_no_auth(self):
+        """If no username and password are provided, HTTP requests
+        should not have an 'auth' parameter
+        """
+        with mock.patch('geospaas_harvesting.crawlers.Crawler._http_get') as mock_http_get:
+            mock_http_get.return_value = ('')
+            crawler = crawlers.HTMLDirectoryCrawler('http://foo')
+            crawler._list_folder_contents('/bar')
+            mock_http_get.assert_called_once_with('http://foo/bar', {})
+
+    def test_list_folder_contents_with_auth(self):
+        """If a username and password are provided, HTTP requests
+        should have an 'auth' parameter
+        """
+        with mock.patch('geospaas_harvesting.crawlers.Crawler._http_get') as mock_http_get:
+            mock_http_get.return_value = ('')
+            crawler = crawlers.HTMLDirectoryCrawler('http://foo', username='user', password='pass')
+            crawler._list_folder_contents('/bar')
+            mock_http_get.assert_called_once_with('http://foo/bar', {'auth': ('user', 'pass')})
+
     def test_get_normalized_attributes(self):
         """Test that the attributes are gotten using metanorm, and the
         geospaas_service attributes are set
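
Closing note: a condensed, runnable replay of the get_normalized_attributes()
flow that the trailing test context above exercises; the metanorm output used
here ({'time_coverage_start': ...}) is a made-up stand-in, and mocking
_metadata_handler follows the same pattern as the tests in patch 1:

    from unittest import mock

    from geospaas_harvesting.crawlers import DatasetInfo, HTMLDirectoryCrawler

    crawler = HTMLDirectoryCrawler('http://foo')
    with mock.patch.object(crawler, '_metadata_handler') as mock_handler:
        mock_handler.get_parameters.return_value = {'time_coverage_start': '2024-01-01'}
        attributes = crawler.get_normalized_attributes(DatasetInfo('http://foo/dataset.nc'))
    # 'attributes' now holds the metanorm output plus the two service
    # fields set by the crawler: 'geospaas_service_name' and 'geospaas_service'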