make HTMLDirectoryCrawler a concrete class
aperrin66 committed Jan 12, 2024
1 parent 81cff2c commit 0feaa15
Showing 2 changed files with 90 additions and 38 deletions.
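
Before this commit, HTMLDirectoryCrawler was effectively abstract: FOLDERS_SUFFIXES was None and get_normalized_attributes() raised NotImplementedError, so only subclasses such as OpenDAPCrawler were usable. With this change it can crawl any plain HTML directory listing on its own. A minimal usage sketch, assuming the usual crawler iteration interface (the URL and include filter are illustrative):

    import geospaas_harvesting.crawlers as crawlers

    # any server that exposes folders as HTML pages of links should work
    crawler = crawlers.HTMLDirectoryCrawler(
        'https://example.com/data',
        include=r'\.nc$')

    # crawlers yield DatasetInfo objects for the datasets they find
    for dataset_info in crawler:
        print(dataset_info.url)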
44 changes: 31 additions & 13 deletions geospaas_harvesting/crawlers.py
@@ -318,7 +318,8 @@ class DirectoryCrawler(Crawler):
         f'^.*/{YEAR_PATTERN}/?{MONTH_PATTERN}/?{DAY_OF_MONTH_PATTERN}(/.*)?$')
     DAY_OF_YEAR_MATCHER = re.compile(f'^.*/{YEAR_PATTERN}/{DAY_OF_YEAR_PATTERN}(/.*)?$')
 
-    def __init__(self, root_url, time_range=(None, None), include=None, max_threads=1):
+    def __init__(self, root_url, time_range=(None, None), include=None,
+                 username=None, password=None, max_threads=1):
         """
         `root_url` is the URL of the data repository to explore.
         `time_range` is a 2-tuple of datetime.datetime objects defining the time range
@@ -330,13 +331,17 @@ def __init__(self, root_url, time_range=(None, None), include=None, max_threads=
         self.root_url = urlparse(root_url)
         self.time_range = time_range
         self.include = re.compile(include) if include else None
+        self.username = username
+        self.password = password
         self.set_initial_state()
 
     def __eq__(self, other):
         return (
             self.root_url == other.root_url and
             self.time_range == other.time_range and
-            self.include == other.include)
+            self.include == other.include and
+            self.username == other.username and
+            self.password == other.password)
 
     @property
     def base_url(self):
@@ -479,7 +484,8 @@ def _process_folder(self, folder_path):
         self.logger.debug("Looking for resources in '%s'...", folder_path)
         for path in self._list_folder_contents(folder_path):
             # deselect paths which contain any of the excluded strings
-            if self.EXCLUDE and self.EXCLUDE.search(path):
+            if ((self.EXCLUDE and self.EXCLUDE.search(path)) or
+                    self.root_url.path.startswith(path.rstrip(f"{os.sep}/"))):
                 continue
             if self._is_folder(path):
                 self._add_folder_to_process(path)
@@ -514,11 +520,11 @@ def get_normalized_attributes(self, dataset_info, **kwargs):
 
 
 class HTMLDirectoryCrawler(DirectoryCrawler):
-    """Implementation of WebDirectoryCrawler for repositories exposed as HTML pages."""
+    """Implementation of DirectoryCrawler for repositories exposed as HTML pages."""
 
     logger = logging.getLogger(__name__ + '.HTMLDirectoryCrawler')
 
-    FOLDERS_SUFFIXES = None
+    FOLDERS_SUFFIXES = ('/',)
 
     # ------------- crawl ------------
     @staticmethod
@@ -527,7 +533,7 @@ def _strip_folder_page(folder_path):
         Remove the index page of a folder path.
         For example: /foo/bar/contents.html becomes /foo/bar.
         """
-        return re.sub(r'/\w+\.html?$', r'', folder_path)
+        return re.sub(r'/(\w+\.html)?$', r'', folder_path)
 
     def _is_folder(self, path):
         return path.endswith(self.FOLDERS_SUFFIXES)
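
The regex change here is worth spelling out: the old pattern only stripped an explicit index page such as /foo/contents.html (and, via the optional 'l', .htm pages), while the new one also strips a bare trailing slash, which matters now that folders are recognized by the '/' suffix. A quick illustrative sketch of the difference, with made-up paths, matching the new tests added below:

    import re

    def old_strip(path):
        return re.sub(r'/\w+\.html?$', r'', path)

    def new_strip(path):
        return re.sub(r'/(\w+\.html)?$', r'', path)

    # both versions strip an index page
    assert old_strip('/foo/bar/contents.html') == '/foo/bar'
    assert new_strip('/foo/bar/contents.html') == '/foo/bar'

    # only the new version normalizes a trailing slash
    assert old_strip('/foo/bar/') == '/foo/bar/'
    assert new_strip('/foo/bar/') == '/foo/bar'

    # a plain folder path is left untouched
    assert new_strip('/foo/bar') == '/foo/bar'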
@@ -557,13 +563,23 @@ def _prepend_parent_path(parent_path, paths):
         return result
 
     def _list_folder_contents(self, folder_path):
-        html = self._http_get(f"{self.base_url}{folder_path}")
+        request_parameters = {}
+        if self.username is not None and self.password is not None:
+            request_parameters['auth'] = (self.username, self.password)
+        html = self._http_get(f"{self.base_url}{folder_path}", request_parameters)
         stripped_folder_path = self._strip_folder_page(folder_path)
         return self._prepend_parent_path(stripped_folder_path, self._get_links(html))
 
     # --------- get metadata ---------
     def get_normalized_attributes(self, dataset_info, **kwargs):
-        raise NotImplementedError()
+        """Gets dataset attributes using http"""
+        raw_attributes = {}
+        self.add_url(dataset_info.url, raw_attributes)
+        normalized_attributes = self._metadata_handler.get_parameters(raw_attributes)
+        # TODO: add FTP_SERVICE_NAME and FTP_SERVICE in django-geo-spaas
+        normalized_attributes['geospaas_service_name'] = catalog_managers.HTTP_SERVICE_NAME
+        normalized_attributes['geospaas_service'] = catalog_managers.HTTP_SERVICE
+        return normalized_attributes
 
 
 class OpenDAPCrawler(HTMLDirectoryCrawler):
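
The new request_parameters dict is handed to _http_get, which presumably forwards it to requests, so a (username, password) tuple under the 'auth' key becomes HTTP basic auth. A standalone sketch of that mechanism, assuming plain requests semantics (the URL and credentials are illustrative):

    import requests

    username, password = 'user', 'secret'  # illustrative credentials

    request_parameters = {}
    if username is not None and password is not None:
        # requests treats a 2-tuple under 'auth' as HTTP basic auth
        request_parameters['auth'] = (username, password)

    response = requests.get('https://example.com/data/contents.html', **request_parameters)
    response.raise_for_status()
    html = response.text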
@@ -690,16 +706,18 @@ class FTPCrawler(DirectoryCrawler):
     logger = logging.getLogger(__name__ + '.FTPCrawler')
 
     def __init__(self, root_url, time_range=(None, None), include=None,
-                 username='anonymous', password='anonymous', max_threads=1):
-
+                 username=None, password=None, max_threads=1):
         if not root_url.startswith('ftp://'):
             raise ValueError("The root url must start with 'ftp://'")
 
-        self.username = username
-        self.password = password
+        if username is None:
+            username = 'anonymous'
+        if password is None:
+            password = 'anonymous'
         self.ftp = None
 
-        super().__init__(root_url, time_range, include, max_threads=1)
+        super().__init__(root_url, time_range, include, max_threads=1,
+                         username=username, password=password)
 
     def __getstate__(self):
         """Method used to pickle the crawler"""
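With credentials now stored by DirectoryCrawler, FTPCrawler only normalizes missing credentials to anonymous FTP before delegating to the parent constructor, so equality checks and pickling see the effective values. A minimal sketch (note that constructing the crawler may open the FTP connection, so a reachable server is assumed; the URL is illustrative):

    import geospaas_harvesting.crawlers as crawlers

    # credentials omitted: the crawler falls back to anonymous FTP
    crawler = crawlers.FTPCrawler('ftp://example.com/data')
    assert crawler.username == 'anonymous'
    assert crawler.password == 'anonymous'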
84 changes: 59 additions & 25 deletions tests/test_generic_crawlers.py
@@ -17,6 +17,7 @@
 
 import requests
 
+import geospaas.catalog.managers
 import geospaas_harvesting.crawlers as crawlers
 
 
@@ -609,6 +610,37 @@ def test_abstract_get_normalized_attributes(self):
 class HTMLDirectoryCrawlerTestCase(unittest.TestCase):
     """Tests for the HTMLDirectoryCrawler crawler"""
 
+    def test_strip_folder_page(self):
+        """_strip_folder_page() should remove the index page from a
+        folder path
+        """
+        self.assertEqual(
+            crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar/contents.html'),
+            '/foo/bar')
+        self.assertEqual(
+            crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar/'),
+            '/foo/bar')
+        self.assertEqual(
+            crawlers.HTMLDirectoryCrawler._strip_folder_page('/foo/bar'),
+            '/foo/bar')
+
+    def test_get_right_number_of_links(self):
+        """Test that the crawler gets the correct number of links from a test page"""
+        with open(os.path.join(
+                os.path.dirname(__file__), 'data', 'opendap', 'root.html')) as data_file:
+            html = data_file.read()
+        self.assertEqual(len(crawlers.HTMLDirectoryCrawler._get_links(html)), 4)
+
+        with open(os.path.join(os.path.dirname(__file__), 'data', 'empty.html')) as data_file:
+            html = data_file.read()
+        self.assertEqual(len(crawlers.HTMLDirectoryCrawler._get_links(html)), 0)
+
+    def test_link_extractor_error(self):
+        """In case of error, LinkExtractor must use a logger"""
+        parser = crawlers.LinkExtractor()
+        with self.assertLogs(parser.logger, level=logging.ERROR):
+            parser.error('some message')
+
     def test_prepend_parent_path(self):
         """
         Should prepend all the paths with the parent_path, except if they already start with it
@@ -620,12 +652,34 @@ def test_prepend_parent_path(self):
             ['/foo/bar', '/foo/baz']
         )
 
-    def test_abstract_get_normalized_attributes(self):
-        """The get_normalized_attribute is abstract in
-        HTMLDirectoryCrawler
+    def test_list_folder_contents(self):
+        """Test listing a folder's contents"""
+        with mock.patch('geospaas_harvesting.crawlers.Crawler._http_get') as mock_http_get:
+            mock_http_get.return_value = (
+                '<html>'
+                '<a href="bar/contents.html">folder/</a>'
+                '<a href="baz/">folder/</a>'
+                '<html/>')
+            crawler = crawlers.HTMLDirectoryCrawler('')
+            self.assertListEqual(
+                crawler._list_folder_contents('/foo/contents.html'),
+                ['/foo/bar/contents.html', '/foo/baz/'])
+
+    def test_get_normalized_attributes(self):
+        """Test that the attributes are gotten using metanorm, and the
+        geospaas_service attributes are set
         """
-        with self.assertRaises(NotImplementedError):
-            crawlers.HTMLDirectoryCrawler('').get_normalized_attributes(None)
+        crawler = crawlers.HTMLDirectoryCrawler('http://foo')
+        with mock.patch.object(crawler, '_metadata_handler') as mock_handler:
+            mock_handler.get_parameters.return_value = {'foo': 'bar'}
+            self.assertDictEqual(
+                crawler.get_normalized_attributes(crawlers.DatasetInfo('ftp://uri')),
+                {
+                    'foo': 'bar',
+                    'geospaas_service_name': geospaas.catalog.managers.HTTP_SERVICE_NAME,
+                    'geospaas_service': geospaas.catalog.managers.HTTP_SERVICE
+                })
+            mock_handler.get_parameters.assert_called_once_with({'url': 'ftp://uri'})
 
 
 class OpenDAPCrawlerTestCase(unittest.TestCase):
@@ -739,26 +793,6 @@ def test_get_html_logs_error_on_http_status(self, mock_error_logger):
             _ = crawlers.OpenDAPCrawler._http_get(self.TEST_DATA['inexistent']['urls'][0])
         mock_error_logger.assert_called_once()
 
-    def test_get_right_number_of_links(self):
-        """Test that the crawler gets the correct number of links from a test page"""
-        links = {}
-        for sample in ('root', 'empty'):
-            data_file = open(os.path.join(
-                os.path.dirname(__file__),
-                self.TEST_DATA[sample]['file_path']))
-            html = data_file.read()
-            data_file.close()
-            links[sample] = crawlers.OpenDAPCrawler._get_links(html)
-
-        self.assertEqual(len(links['root']), 4)
-        self.assertEqual(len(links['empty']), 0)
-
-    def test_link_extractor_error(self):
-        """In case of error, LinkExtractor must use a logger"""
-        parser = crawlers.LinkExtractor()
-        with self.assertLogs(parser.logger, level=logging.ERROR):
-            parser.error('some message')
-
     def test_process_folder(self):
         """
         Explore root page and make sure the _url and _to_process attributes of the crawler have the
