Merge pull request #136 from nansencenter/issue135_http_provider
Make an HTTP provider
aperrin66 authored Jan 15, 2024
2 parents 81cff2c + 2e50608 commit 9ff56c7
Showing 5 changed files with 169 additions and 38 deletions.
2 changes: 2 additions & 0 deletions geospaas_harvesting/config.py
@@ -8,6 +8,7 @@
import geospaas_harvesting.providers.resto as providers_resto
import geospaas_harvesting.providers.earthdata_cmr as providers_earthdata_cmr
import geospaas_harvesting.providers.ftp as providers_ftp
import geospaas_harvesting.providers.http as providers_http
import geospaas_harvesting.providers.jaxa as providers_jaxa
import geospaas_harvesting.providers.local as providers_local
import geospaas_harvesting.providers.metno as providers_metno
@@ -62,6 +63,7 @@ class ProvidersArgument(DictArgument):
'earthdata_cmr': providers_earthdata_cmr.EarthDataCMRProvider,
'ftp': providers_ftp.FTPProvider,
'gportal_ftp': providers_jaxa.GPortalProvider,
'http': providers_http.HTTPProvider,
'netcdf': providers_local.NetCDFProvider,
'nansat': providers_local.NansatProvider,
'metno': providers_metno.METNOProvider,
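For illustration, a minimal sketch of what the new registry entry enables: the 'http' key in the providers mapping now resolves to HTTPProvider (the provider name below is made up; the constructor arguments mirror those used in the new test file):

import geospaas_harvesting.providers.http as providers_http

# Hypothetical lookup equivalent to what ProvidersArgument does with the 'http' key
provider_types = {'http': providers_http.HTTPProvider}
provider = provider_types['http'](name='my_http_source', username='user', password='pass')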
44 changes: 31 additions & 13 deletions geospaas_harvesting/crawlers.py
@@ -318,7 +318,8 @@ class DirectoryCrawler(Crawler):
f'^.*/{YEAR_PATTERN}/?{MONTH_PATTERN}/?{DAY_OF_MONTH_PATTERN}(/.*)?$')
DAY_OF_YEAR_MATCHER = re.compile(f'^.*/{YEAR_PATTERN}/{DAY_OF_YEAR_PATTERN}(/.*)?$')

def __init__(self, root_url, time_range=(None, None), include=None, max_threads=1):
def __init__(self, root_url, time_range=(None, None), include=None,
username=None, password=None, max_threads=1):
"""
`root_url` is the URL of the data repository to explore.
`time_range` is a 2-tuple of datetime.datetime objects defining the time range
@@ -330,13 +331,17 @@ def __init__(self, root_url, time_range=(None, None), include=None, max_threads=
self.root_url = urlparse(root_url)
self.time_range = time_range
self.include = re.compile(include) if include else None
self.username = username
self.password = password
self.set_initial_state()

def __eq__(self, other):
return (
self.root_url == other.root_url and
self.time_range == other.time_range and
self.include == other.include)
self.include == other.include and
self.username == other.username and
self.password == other.password)

@property
def base_url(self):
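A minimal sketch of the effect of the new constructor arguments, assuming a made-up URL: credentials are stored on the crawler and taken into account by __eq__, so two crawlers pointing at the same root URL with different credentials no longer compare equal.

from geospaas_harvesting.crawlers import HTMLDirectoryCrawler

# constructing the crawler does not trigger any network access
crawler_with_auth = HTMLDirectoryCrawler('http://example.com/data', username='user', password='pass')
crawler_without_auth = HTMLDirectoryCrawler('http://example.com/data')
assert crawler_with_auth != crawler_without_auth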
@@ -479,7 +484,8 @@ def _process_folder(self, folder_path):
self.logger.debug("Looking for resources in '%s'...", folder_path)
for path in self._list_folder_contents(folder_path):
# deselect paths which contains any of the excludes strings
if self.EXCLUDE and self.EXCLUDE.search(path):
if ((self.EXCLUDE and self.EXCLUDE.search(path)) or
self.root_url.path.startswith(path.rstrip(f"{os.sep}/"))):
continue
if self._is_folder(path):
self._add_folder_to_process(path)
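The added condition in _process_folder skips links whose path is a prefix of the crawler's root path, which typically correspond to "parent directory" links in HTML listings; a rough illustration with made-up values:

# root_url.path of the crawler
root_path = '/data/2024'
# a "parent directory" link found on the page
candidate_path = '/data/'
# True, so the link is skipped instead of being queued for crawling
skip = root_path.startswith(candidate_path.rstrip('/'))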
@@ -514,11 +520,11 @@ def get_normalized_attributes(self, dataset_info, **kwargs):


class HTMLDirectoryCrawler(DirectoryCrawler):
"""Implementation of WebDirectoryCrawler for repositories exposed as HTML pages."""
"""Implementation of DirectoryCrawler for repositories exposed as HTML pages."""

logger = logging.getLogger(__name__ + '.HTMLDirectoryCrawler')

FOLDERS_SUFFIXES = None
FOLDERS_SUFFIXES = ('/',)

# ------------- crawl ------------
@staticmethod
@@ -527,7 +533,7 @@ def _strip_folder_page(folder_path):
Remove the index page of a folder path.
For example: /foo/bar/contents.html becomes /foo/bar.
"""
return re.sub(r'/\w+\.html?$', r'', folder_path)
return re.sub(r'/(\w+\.html)?$', r'', folder_path)

def _is_folder(self, path):
return path.endswith(self.FOLDERS_SUFFIXES)
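The updated regular expression in _strip_folder_page also strips a bare trailing slash, not only an index page; a small standalone sketch reproducing the behaviour checked by the new tests:

import re

def strip_folder_page(folder_path):
    """Remove the index page or trailing slash from a folder path"""
    return re.sub(r'/(\w+\.html)?$', r'', folder_path)

assert strip_folder_page('/foo/bar/contents.html') == '/foo/bar'
assert strip_folder_page('/foo/bar/') == '/foo/bar'
assert strip_folder_page('/foo/bar') == '/foo/bar'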
@@ -557,13 +563,23 @@ def _prepend_parent_path(parent_path, paths):
return result

def _list_folder_contents(self, folder_path):
html = self._http_get(f"{self.base_url}{folder_path}")
request_parameters = {}
if self.username is not None and self.password is not None:
request_parameters['auth'] = (self.username, self.password)
html = self._http_get(f"{self.base_url}{folder_path}", request_parameters)
stripped_folder_path = self._strip_folder_page(folder_path)
return self._prepend_parent_path(stripped_folder_path, self._get_links(html))

# --------- get metadata ---------
def get_normalized_attributes(self, dataset_info, **kwargs):
raise NotImplementedError()
"""Gets dataset attributes using http"""
raw_attributes = {}
self.add_url(dataset_info.url, raw_attributes)
normalized_attributes = self._metadata_handler.get_parameters(raw_attributes)
# TODO: add FTP_SERVICE_NAME and FTP_SERVICE in django-geo-spaas
normalized_attributes['geospaas_service_name'] = catalog_managers.HTTP_SERVICE_NAME
normalized_attributes['geospaas_service'] = catalog_managers.HTTP_SERVICE
return normalized_attributes


class OpenDAPCrawler(HTMLDirectoryCrawler):
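A hedged usage sketch of the new get_normalized_attributes() (the URL is made up; in practice the metadata handler relies on metanorm, which is why the unit tests mock it):

import unittest.mock as mock
from geospaas_harvesting.crawlers import HTMLDirectoryCrawler, DatasetInfo

crawler = HTMLDirectoryCrawler('http://example.com/data')
with mock.patch.object(crawler, '_metadata_handler') as mock_handler:
    mock_handler.get_parameters.return_value = {'foo': 'bar'}
    attributes = crawler.get_normalized_attributes(DatasetInfo('http://example.com/data/file.nc'))
# attributes now contains the handler's output plus the HTTP service name and service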
@@ -690,16 +706,18 @@ class FTPCrawler(DirectoryCrawler):
logger = logging.getLogger(__name__ + '.FTPCrawler')

def __init__(self, root_url, time_range=(None, None), include=None,
username='anonymous', password='anonymous', max_threads=1):

username=None, password=None, max_threads=1):
if not root_url.startswith('ftp://'):
raise ValueError("The root url must start with 'ftp://'")

self.username = username
self.password = password
if username is None:
username = 'anonymous'
if password is None:
password = 'anonymous'
self.ftp = None

super().__init__(root_url, time_range, include, max_threads=1)
super().__init__(root_url, time_range, include, max_threads=1,
username=username, password=password)

def __getstate__(self):
"""Method used to pickle the crawler"""
25 changes: 25 additions & 0 deletions geospaas_harvesting/providers/http.py
@@ -0,0 +1,25 @@
"""Code for searching FTP repositories"""
from urllib.parse import urljoin

from .base import Provider, TimeFilterMixin
from ..arguments import PathArgument, StringArgument
from ..crawlers import HTMLDirectoryCrawler


class HTTPProvider(TimeFilterMixin, Provider):
"""Generic HTTP directory provider"""

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.search_parameters_parser.add_arguments([
StringArgument('url', required=True),
StringArgument('include', default='.'),
])

def _make_crawler(self, parameters):
return HTMLDirectoryCrawler(
parameters['url'],
time_range=(parameters['start_time'], parameters['end_time']),
username=self.username,
password=self.password,
include=parameters['include'])
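A short usage sketch with made-up values (the parameter names match the provider's search arguments and the new test below); _make_crawler is normally invoked internally when a search is run:

from datetime import datetime, timezone
from geospaas_harvesting.providers.http import HTTPProvider

provider = HTTPProvider(name='my_http_source', username='user', password='pass')
crawler = provider._make_crawler({
    'start_time': datetime(2023, 1, 1, tzinfo=timezone.utc),
    'end_time': datetime(2023, 1, 2, tzinfo=timezone.utc),
    'url': 'http://example.com/data',
    'include': r'\.nc$',
})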
32 changes: 32 additions & 0 deletions tests/providers/test_http.py
@@ -0,0 +1,32 @@
# pylint: disable=protected-access
"""Tests for the generic FTP provider"""
import unittest
import unittest.mock as mock
from datetime import datetime, timezone

import geospaas_harvesting.crawlers as crawlers
from geospaas_harvesting.providers.http import HTTPProvider


class HTTPProviderTestCase(unittest.TestCase):
"""Tests for HTTPProvider"""

def test_make_crawler(self):
"""Test creating a crawler from parameters"""
provider = HTTPProvider(name='test', username='user', password='pass')
parameters = {
'start_time': datetime(2023, 1, 1, tzinfo=timezone.utc),
'end_time': datetime(2023, 1, 2, tzinfo=timezone.utc),
'url': 'http://foo/bar',
'include': '.*'
}
with mock.patch('ftplib.FTP'):
self.assertEqual(
provider._make_crawler(parameters),
crawlers.HTMLDirectoryCrawler(
'http://foo/bar',
include='.*',
time_range=(datetime(2023, 1, 1, tzinfo=timezone.utc),
datetime(2023, 1, 2, tzinfo=timezone.utc)),
username='user',
password='pass'))
104 changes: 79 additions & 25 deletions tests/test_generic_crawlers.py
@@ -17,6 +17,7 @@

import requests

import geospaas.catalog.managers
import geospaas_harvesting.crawlers as crawlers


@@ -609,6 +610,37 @@ def test_abstract_get_normalized_attributes(self):
class HTMLDirectoryCrawlerTestCase(unittest.TestCase):
"""Tests for the HTMLDirectoryCrawler crawler"""

def test_strip_folder_page(self):
"""_strip_folder_page() should remove the index page from a
folder path
"""
self.assertEqual(
crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar/contents.html'),
'/foo/bar')
self.assertEqual(
crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar/'),
'/foo/bar')
self.assertEqual(
crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar'),
'/foo/bar')

def test_get_right_number_of_links(self):
"""Test that the crawler gets the correct number of links from a test page"""
with open(os.path.join(
os.path.dirname(__file__), 'data', 'opendap', 'root.html')) as data_file:
html = data_file.read()
self.assertEqual(len(crawlers.HTMLDirectoryCrawler._get_links(html)), 4)

with open(os.path.join(os.path.dirname(__file__), 'data', 'empty.html')) as data_file:
html = data_file.read()
self.assertEqual(len(crawlers.HTMLDirectoryCrawler._get_links(html)), 0)

def test_link_extractor_error(self):
"""In case of error, LinkExtractor must use a logger"""
parser = crawlers.LinkExtractor()
with self.assertLogs(parser.logger, level=logging.ERROR):
parser.error('some message')

def test_prepend_parent_path(self):
"""
Should prepend all the paths with the parent_path, except if they already start with it
@@ -620,12 +652,54 @@ def test_prepend_parent_path(self):
['/foo/bar', '/foo/baz']
)

def test_abstract_get_normalized_attributes(self):
"""The get_normalized_attribute is abstract in
HTMLDirectoryCrawler
def test_list_folder_contents(self):
"""Test listing a folder's contents"""
with mock.patch('geospaas_harvesting.crawlers.Crawler._http_get') as mock_http_get:
mock_http_get.return_value = (
'<html>'
'<a href="bar/contents.html">folder/</a>'
'<a href="baz/">folder/</a>'
'<html/>')
crawler = crawlers.HTMLDirectoryCrawler('')
self.assertListEqual(
crawler._list_folder_contents('/foo/contents.html'),
['/foo/bar/contents.html', '/foo/baz/'])

def test_list_folder_contents_no_auth(self):
"""If no username and password are provided, HTTP requests
should not have an 'auth' parameter
"""
with self.assertRaises(NotImplementedError):
crawlers.HTMLDirectoryCrawler('').get_normalized_attributes(None)
with mock.patch('geospaas_harvesting.crawlers.Crawler._http_get') as mock_http_get:
mock_http_get.return_value = ('<html><html/>')
crawler = crawlers.HTMLDirectoryCrawler('http://foo')
crawler._list_folder_contents('/bar')
mock_http_get.assert_called_once_with('http://foo/bar', {})

def test_list_folder_contents_with_auth(self):
"""If a username and password are provided, HTTP requests
should have an 'auth' parameter
"""
with mock.patch('geospaas_harvesting.crawlers.Crawler._http_get') as mock_http_get:
mock_http_get.return_value = ('<html><html/>')
crawler = crawlers.HTMLDirectoryCrawler('http://foo', username='user', password='pass')
crawler._list_folder_contents('/bar')
mock_http_get.assert_called_once_with('http://foo/bar', {'auth': ('user', 'pass')})

def test_get_normalized_attributes(self):
"""Test that the attributes are gotten using metanorm, and the
geospaas_service attributes are set
"""
crawler = crawlers.HTMLDirectoryCrawler('http://foo')
with mock.patch.object(crawler, '_metadata_handler') as mock_handler:
mock_handler.get_parameters.return_value = {'foo': 'bar'}
self.assertDictEqual(
crawler.get_normalized_attributes(crawlers.DatasetInfo('ftp://uri')),
{
'foo': 'bar',
'geospaas_service_name': geospaas.catalog.managers.HTTP_SERVICE_NAME,
'geospaas_service': geospaas.catalog.managers.HTTP_SERVICE
})
mock_handler.get_parameters.assert_called_once_with({'url': 'ftp://uri'})


class OpenDAPCrawlerTestCase(unittest.TestCase):
@@ -739,26 +813,6 @@ def test_get_html_logs_error_on_http_status(self, mock_error_logger):
_ = crawlers.OpenDAPCrawler._http_get(self.TEST_DATA['inexistent']['urls'][0])
mock_error_logger.assert_called_once()

def test_get_right_number_of_links(self):
"""Test that the crawler gets the correct number of links from a test page"""
links = {}
for sample in ('root', 'empty'):
data_file = open(os.path.join(
os.path.dirname(__file__),
self.TEST_DATA[sample]['file_path']))
html = data_file.read()
data_file.close()
links[sample] = crawlers.OpenDAPCrawler._get_links(html)

self.assertEqual(len(links['root']), 4)
self.assertEqual(len(links['empty']), 0)

def test_link_extractor_error(self):
"""In case of error, LinkExtractor must use a logger"""
parser = crawlers.LinkExtractor()
with self.assertLogs(parser.logger, level=logging.ERROR):
parser.error('some message')

def test_process_folder(self):
"""
Explore root page and make sure the _url and _to_process attributes of the crawler have the
