From 0feaa15ce79fd939aa388fe1b6ee933637247749 Mon Sep 17 00:00:00 2001
From: Adrien Perrin
Date: Fri, 12 Jan 2024 12:09:20 +0000
Subject: [PATCH 1/4] make HTMLDirectoryCrawler a concrete class
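
HTMLDirectoryCrawler was only usable through its subclasses (OpenDAPCrawler,
etc.); this commit makes it a concrete, usable crawler:
- DirectoryCrawler now accepts optional username and password arguments,
  which HTMLDirectoryCrawler sends as HTTP basic auth credentials with each
  request;
- paths ending with '/' are treated as folders;
- get_normalized_attributes() is implemented with metanorm, like in the
  other concrete crawlers;
- links pointing to parents of the root path are skipped to avoid crawling
  back up the directory tree.

A minimal usage sketch (the URL, credentials and include pattern below are
made up for illustration):

    from geospaas_harvesting.crawlers import HTMLDirectoryCrawler

    crawler = HTMLDirectoryCrawler(
        'http://example.com/data',
        include=r'\.nc$',
        username='user',
        password='pass')
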
---
geospaas_harvesting/crawlers.py | 44 ++++++++++++-----
tests/test_generic_crawlers.py | 84 +++++++++++++++++++++++----------
2 files changed, 90 insertions(+), 38 deletions(-)
diff --git a/geospaas_harvesting/crawlers.py b/geospaas_harvesting/crawlers.py
index 322c7e41..b3a56fc8 100644
--- a/geospaas_harvesting/crawlers.py
+++ b/geospaas_harvesting/crawlers.py
@@ -318,7 +318,8 @@ class DirectoryCrawler(Crawler):
f'^.*/{YEAR_PATTERN}/?{MONTH_PATTERN}/?{DAY_OF_MONTH_PATTERN}(/.*)?$')
DAY_OF_YEAR_MATCHER = re.compile(f'^.*/{YEAR_PATTERN}/{DAY_OF_YEAR_PATTERN}(/.*)?$')
- def __init__(self, root_url, time_range=(None, None), include=None, max_threads=1):
+ def __init__(self, root_url, time_range=(None, None), include=None,
+ username=None, password=None, max_threads=1):
"""
`root_url` is the URL of the data repository to explore.
`time_range` is a 2-tuple of datetime.datetime objects defining the time range
@@ -330,13 +331,17 @@ def __init__(self, root_url, time_range=(None, None), include=None, max_threads=
self.root_url = urlparse(root_url)
self.time_range = time_range
self.include = re.compile(include) if include else None
+ self.username = username
+ self.password = password
self.set_initial_state()
def __eq__(self, other):
return (
self.root_url == other.root_url and
self.time_range == other.time_range and
- self.include == other.include)
+ self.include == other.include and
+ self.username == other.username and
+ self.password == other.password)
@property
def base_url(self):
@@ -479,7 +484,8 @@ def _process_folder(self, folder_path):
self.logger.debug("Looking for resources in '%s'...", folder_path)
for path in self._list_folder_contents(folder_path):
-        # deselect paths which contains any of the excludes strings
-        if self.EXCLUDE and self.EXCLUDE.search(path):
+        # skip paths which match the exclude regex or are parents of
+        # the root path
+        if ((self.EXCLUDE and self.EXCLUDE.search(path)) or
+                self.root_url.path.startswith(path.rstrip(f"{os.sep}/"))):
continue
if self._is_folder(path):
self._add_folder_to_process(path)
@@ -514,11 +520,11 @@ def get_normalized_attributes(self, dataset_info, **kwargs):
class HTMLDirectoryCrawler(DirectoryCrawler):
- """Implementation of WebDirectoryCrawler for repositories exposed as HTML pages."""
+ """Implementation of DirectoryCrawler for repositories exposed as HTML pages."""
logger = logging.getLogger(__name__ + '.HTMLDirectoryCrawler')
- FOLDERS_SUFFIXES = None
+ FOLDERS_SUFFIXES = ('/',)
# ------------- crawl ------------
@staticmethod
@@ -527,7 +533,7 @@ def _strip_folder_page(folder_path):
-        Remove the index page of a folder path.
-        For example: /foo/bar/contents.html becomes /foo/bar.
+        Remove the index page or trailing slash of a folder path.
+        For example: /foo/bar/contents.html and /foo/bar/ both become /foo/bar.
"""
- return re.sub(r'/\w+\.html?$', r'', folder_path)
+        return re.sub(r'/(\w+\.html?)?$', r'', folder_path)
def _is_folder(self, path):
return path.endswith(self.FOLDERS_SUFFIXES)
@@ -557,13 +563,23 @@ def _prepend_parent_path(parent_path, paths):
return result
def _list_folder_contents(self, folder_path):
- html = self._http_get(f"{self.base_url}{folder_path}")
+ request_parameters = {}
+ if self.username is not None and self.password is not None:
+ request_parameters['auth'] = (self.username, self.password)
+ html = self._http_get(f"{self.base_url}{folder_path}", request_parameters)
stripped_folder_path = self._strip_folder_page(folder_path)
return self._prepend_parent_path(stripped_folder_path, self._get_links(html))
# --------- get metadata ---------
def get_normalized_attributes(self, dataset_info, **kwargs):
- raise NotImplementedError()
+ """Gets dataset attributes using http"""
+ raw_attributes = {}
+ self.add_url(dataset_info.url, raw_attributes)
+ normalized_attributes = self._metadata_handler.get_parameters(raw_attributes)
+ normalized_attributes['geospaas_service_name'] = catalog_managers.HTTP_SERVICE_NAME
+ normalized_attributes['geospaas_service'] = catalog_managers.HTTP_SERVICE
+ return normalized_attributes
class OpenDAPCrawler(HTMLDirectoryCrawler):
@@ -690,16 +706,18 @@ class FTPCrawler(DirectoryCrawler):
logger = logging.getLogger(__name__ + '.FTPCrawler')
def __init__(self, root_url, time_range=(None, None), include=None,
- username='anonymous', password='anonymous', max_threads=1):
-
+ username=None, password=None, max_threads=1):
if not root_url.startswith('ftp://'):
raise ValueError("The root url must start with 'ftp://'")
- self.username = username
- self.password = password
+ if username is None:
+ username = 'anonymous'
+ if password is None:
+ password = 'anonymous'
self.ftp = None
- super().__init__(root_url, time_range, include, max_threads=1)
+ super().__init__(root_url, time_range, include, max_threads=1,
+ username=username, password=password)
def __getstate__(self):
"""Method used to pickle the crawler"""
diff --git a/tests/test_generic_crawlers.py b/tests/test_generic_crawlers.py
index bc82ad56..505ce601 100644
--- a/tests/test_generic_crawlers.py
+++ b/tests/test_generic_crawlers.py
@@ -17,6 +17,7 @@
import requests
+import geospaas.catalog.managers
import geospaas_harvesting.crawlers as crawlers
@@ -609,6 +610,37 @@ def test_abstract_get_normalized_attributes(self):
class HTMLDirectoryCrawlerTestCase(unittest.TestCase):
"""Tests for the HTMLDirectoryCrawler crawler"""
+ def test_strip_folder_page(self):
+ """_strip_folder_page() should remove the index page from a
+ folder path
+ """
+ self.assertEqual(
+ crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar/contents.html'),
+ '/foo/bar')
+ self.assertEqual(
+ crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar/'),
+ '/foo/bar')
+ self.assertEqual(
+ crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar'),
+ '/foo/bar')
+
+ def test_get_right_number_of_links(self):
+ """Test that the crawler gets the correct number of links from a test page"""
+ with open(os.path.join(
+ os.path.dirname(__file__), 'data', 'opendap', 'root.html')) as data_file:
+ html = data_file.read()
+ self.assertEqual(len(crawlers.HTMLDirectoryCrawler._get_links(html)), 4)
+
+ with open(os.path.join(os.path.dirname(__file__), 'data', 'empty.html')) as data_file:
+ html = data_file.read()
+ self.assertEqual(len(crawlers.HTMLDirectoryCrawler._get_links(html)), 0)
+
+ def test_link_extractor_error(self):
+ """In case of error, LinkExtractor must use a logger"""
+ parser = crawlers.LinkExtractor()
+ with self.assertLogs(parser.logger, level=logging.ERROR):
+ parser.error('some message')
+
def test_prepend_parent_path(self):
"""
Should prepend all the paths with the parent_path, except if they already start with it
@@ -620,12 +652,34 @@ def test_prepend_parent_path(self):
['/foo/bar', '/foo/baz']
)
- def test_abstract_get_normalized_attributes(self):
- """The get_normalized_attribute is abstract in
- HTMLDirectoryCrawler
+ def test_list_folder_contents(self):
+ """Test listing a folder's contents"""
+ with mock.patch('geospaas_harvesting.crawlers.Crawler._http_get') as mock_http_get:
+ mock_http_get.return_value = (
+                '<html>'
+                '<a href="bar/contents.html">folder/</a>'
+                '<a href="baz/">folder/</a>'
+                '</html>')
+ crawler = crawlers.HTMLDirectoryCrawler('')
+ self.assertListEqual(
+ crawler._list_folder_contents('/foo/contents.html'),
+ ['/foo/bar/contents.html', '/foo/baz/'])
+
+ def test_get_normalized_attributes(self):
+ """Test that the attributes are gotten using metanorm, and the
+ geospaas_service attributes are set
"""
- with self.assertRaises(NotImplementedError):
- crawlers.HTMLDirectoryCrawler('').get_normalized_attributes(None)
+ crawler = crawlers.HTMLDirectoryCrawler('http://foo')
+ with mock.patch.object(crawler, '_metadata_handler') as mock_handler:
+ mock_handler.get_parameters.return_value = {'foo': 'bar'}
+ self.assertDictEqual(
+ crawler.get_normalized_attributes(crawlers.DatasetInfo('ftp://uri')),
+ {
+ 'foo': 'bar',
+ 'geospaas_service_name': geospaas.catalog.managers.HTTP_SERVICE_NAME,
+ 'geospaas_service': geospaas.catalog.managers.HTTP_SERVICE
+ })
+ mock_handler.get_parameters.assert_called_once_with({'url': 'ftp://uri'})
class OpenDAPCrawlerTestCase(unittest.TestCase):
@@ -739,26 +793,6 @@ def test_get_html_logs_error_on_http_status(self, mock_error_logger):
_ = crawlers.OpenDAPCrawler._http_get(self.TEST_DATA['inexistent']['urls'][0])
mock_error_logger.assert_called_once()
- def test_get_right_number_of_links(self):
- """Test that the crawler gets the correct number of links from a test page"""
- links = {}
- for sample in ('root', 'empty'):
- data_file = open(os.path.join(
- os.path.dirname(__file__),
- self.TEST_DATA[sample]['file_path']))
- html = data_file.read()
- data_file.close()
- links[sample] = crawlers.OpenDAPCrawler._get_links(html)
-
- self.assertEqual(len(links['root']), 4)
- self.assertEqual(len(links['empty']), 0)
-
- def test_link_extractor_error(self):
- """In case of error, LinkExtractor must use a logger"""
- parser = crawlers.LinkExtractor()
- with self.assertLogs(parser.logger, level=logging.ERROR):
- parser.error('some message')
-
def test_process_folder(self):
"""
Explore root page and make sure the _url and _to_process attributes of the crawler have the
From df0ad62bbcc93aa5fafce84455454af693cb3074 Mon Sep 17 00:00:00 2001
From: Adrien Perrin
Date: Fri, 12 Jan 2024 13:00:45 +0000
Subject: [PATCH 2/4] add http provider
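
Add a generic HTTPProvider which explores HTML directory listings using the
now-concrete HTMLDirectoryCrawler, and register it in the providers
configuration under the 'http' key.

A hypothetical call (the parameter values are invented), using the
_make_crawler() method defined below:

    from datetime import datetime, timezone
    from geospaas_harvesting.providers.http import HTTPProvider

    provider = HTTPProvider(name='demo', username='user', password='pass')
    crawler = provider._make_crawler({
        'url': 'http://example.com/data',
        'start_time': datetime(2023, 1, 1, tzinfo=timezone.utc),
        'end_time': datetime(2023, 1, 2, tzinfo=timezone.utc),
        'include': r'\.nc$',
    })
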
---
geospaas_harvesting/config.py | 2 ++
geospaas_harvesting/providers/http.py | 25 +++++++++++++++++++++++++
2 files changed, 27 insertions(+)
create mode 100644 geospaas_harvesting/providers/http.py
diff --git a/geospaas_harvesting/config.py b/geospaas_harvesting/config.py
index 9bea0f0d..4c53b775 100644
--- a/geospaas_harvesting/config.py
+++ b/geospaas_harvesting/config.py
@@ -8,6 +8,7 @@
import geospaas_harvesting.providers.resto as providers_resto
import geospaas_harvesting.providers.earthdata_cmr as providers_earthdata_cmr
import geospaas_harvesting.providers.ftp as providers_ftp
+import geospaas_harvesting.providers.http as providers_http
import geospaas_harvesting.providers.jaxa as providers_jaxa
import geospaas_harvesting.providers.local as providers_local
import geospaas_harvesting.providers.metno as providers_metno
@@ -62,6 +63,7 @@ class ProvidersArgument(DictArgument):
'earthdata_cmr': providers_earthdata_cmr.EarthDataCMRProvider,
'ftp': providers_ftp.FTPProvider,
'gportal_ftp': providers_jaxa.GPortalProvider,
+ 'http': providers_http.HTTPProvider,
'netcdf': providers_local.NetCDFProvider,
'nansat': providers_local.NansatProvider,
'metno': providers_metno.METNOProvider,
diff --git a/geospaas_harvesting/providers/http.py b/geospaas_harvesting/providers/http.py
new file mode 100644
index 00000000..e246eec8
--- /dev/null
+++ b/geospaas_harvesting/providers/http.py
@@ -0,0 +1,25 @@
+"""Code for searching FTP repositories"""
+
+from .base import Provider, TimeFilterMixin
+from ..arguments import StringArgument
+from ..crawlers import HTMLDirectoryCrawler
+
+
+class HTTPProvider(TimeFilterMixin, Provider):
+ """Generic HTTP directory provider"""
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.search_parameters_parser.add_arguments([
+ StringArgument('url', required=True),
+ StringArgument('include', default='.'),
+ ])
+
+ def _make_crawler(self, parameters):
+ return HTMLDirectoryCrawler(
+ parameters['url'],
+ time_range=(parameters['start_time'], parameters['end_time']),
+ username=self.username,
+ password=self.password,
+ include=parameters['include'])
From 5da7f3e55b45c2c6c9d6a9f3e097f2aff62822c1 Mon Sep 17 00:00:00 2001
From: Adrien Perrin
Date: Fri, 12 Jan 2024 13:18:05 +0000
Subject: [PATCH 3/4] add tests for http provider
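
Check that HTTPProvider._make_crawler() builds an HTMLDirectoryCrawler from
the search parameters and the provider's credentials.
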
---
tests/providers/test_http.py | 32 ++++++++++++++++++++++++++++++++
1 file changed, 32 insertions(+)
create mode 100644 tests/providers/test_http.py
diff --git a/tests/providers/test_http.py b/tests/providers/test_http.py
new file mode 100644
index 00000000..fb1935bf
--- /dev/null
+++ b/tests/providers/test_http.py
@@ -0,0 +1,32 @@
+# pylint: disable=protected-access
+"""Tests for the generic FTP provider"""
+import unittest
+from datetime import datetime, timezone
+
+import geospaas_harvesting.crawlers as crawlers
+from geospaas_harvesting.providers.http import HTTPProvider
+
+
+class HTTPProviderTestCase(unittest.TestCase):
+ """Tests for HTTPProvider"""
+
+ def test_make_crawler(self):
+ """Test creating a crawler from parameters"""
+ provider = HTTPProvider(name='test', username='user', password='pass')
+ parameters = {
+ 'start_time': datetime(2023, 1, 1, tzinfo=timezone.utc),
+ 'end_time': datetime(2023, 1, 2, tzinfo=timezone.utc),
+ 'url': 'http://foo/bar',
+ 'include': '.*'
+ }
+        self.assertEqual(
+            provider._make_crawler(parameters),
+            crawlers.HTMLDirectoryCrawler(
+                'http://foo/bar',
+                include='.*',
+                time_range=(datetime(2023, 1, 1, tzinfo=timezone.utc),
+                            datetime(2023, 1, 2, tzinfo=timezone.utc)),
+                username='user',
+                password='pass'))
From 2e50608727d00b19b8d4130dc4b778bfe9b588f4 Mon Sep 17 00:00:00 2001
From: Adrien Perrin
Date: Fri, 12 Jan 2024 13:18:21 +0000
Subject: [PATCH 4/4] add missing tests for HTMLDirectoryCrawler
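
Cover the handling of HTTP basic auth in _list_folder_contents(): the
'auth' parameter must be passed to _http_get() when credentials are
provided, and omitted otherwise.
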
---
tests/test_generic_crawlers.py | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/tests/test_generic_crawlers.py b/tests/test_generic_crawlers.py
index 505ce601..c6266b08 100644
--- a/tests/test_generic_crawlers.py
+++ b/tests/test_generic_crawlers.py
@@ -665,6 +665,26 @@ def test_list_folder_contents(self):
crawler._list_folder_contents('/foo/contents.html'),
['/foo/bar/contents.html', '/foo/baz/'])
+ def test_list_folder_contents_no_auth(self):
+ """If no username and password are provided, HTTP requests
+ should not have an 'auth' parameter
+ """
+ with mock.patch('geospaas_harvesting.crawlers.Crawler._http_get') as mock_http_get:
+ mock_http_get.return_value = ('')
+ crawler = crawlers.HTMLDirectoryCrawler('http://foo')
+ crawler._list_folder_contents('/bar')
+ mock_http_get.assert_called_once_with('http://foo/bar', {})
+
+ def test_list_folder_contents_with_auth(self):
+ """If a username and password are provided, HTTP requests
+ should have an 'auth' parameter
+ """
+ with mock.patch('geospaas_harvesting.crawlers.Crawler._http_get') as mock_http_get:
+ mock_http_get.return_value = ('')
+ crawler = crawlers.HTMLDirectoryCrawler('http://foo', username='user', password='pass')
+ crawler._list_folder_contents('/bar')
+ mock_http_get.assert_called_once_with('http://foo/bar', {'auth': ('user', 'pass')})
+
def test_get_normalized_attributes(self):
"""Test that the attributes are gotten using metanorm, and the
geospaas_service attributes are set