This adds support for the Internet Archive's new, beta CDX search endpoint at `/web/timemap/cdx`. It deals with pagination much better and is eventually slated to replace the search currently at `/cdx/search/cdx`, but is a little slower and still being tested. This commit is a start, but we still need to do more detailed testing and talk more with the Wayback Machine team about things that are unclear here. I'm also not sure if `filter`, `collapse`, `resolveRevisits`, etc. are actually supported. Fixes #8.
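As a rough usage sketch (assuming a `WaybackClient` instance from this package and the `search_v2` method added below; the example URL and printed fields are illustrative, not taken from the API docs):

```python
from itertools import islice

from wayback import WaybackClient

client = WaybackClient()

# search_v2() queries the beta /web/timemap/cdx endpoint and pages through
# results on its own; to take fewer results, just stop iterating early
# (e.g. with islice).
for record in islice(client.search_v2('https://epa.gov/', match_type='prefix'), 10):
    print(record.timestamp, record.url, record.status_code)
```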
Showing 2 changed files with 246 additions and 1 deletion.
@@ -53,7 +53,7 @@
# https://github.com/internetarchive/wayback/blob/bd205b9b26664a6e2ea3c0c2a8948f0dc6ff4519/wayback-cdx-server/src/main/java/org/archive/cdxserver/format/CDX11Format.java#L13-L17 # noqa
# NOTE: the `length` and `robotflags` fields appear to always be empty
# TODO: support new/upcoming CDX API
# CDX_SEARCH_URL = 'https://web.archive.org/web/timemap/cdx'
CDX_SEARCH_2_URL = 'https://web.archive.org/web/timemap/cdx'

ARCHIVE_URL_TEMPLATE = 'https://web.archive.org/web/{timestamp}{mode}/{url}'
REDUNDANT_HTTP_PORT = re.compile(r'^(http://[^:/]+):80(.*)$')
@@ -609,6 +609,250 @@ def search(self, url, *, match_type=None, limit=1000, offset=None,

        return count

    # TODO: should we support limit (maybe only negative?) and fast_latest?
    def search_v2(self, url, *, match_type=None, from_date=None, to_date=None,
                  filter_field=None, collapse=None, resolve_revisits=True,
                  skip_malformed_results=True, page_size=5):
""" | ||
Search archive.org's *new* CDX API for all captures of a given URL. This | ||
returns an iterator of :class:`CdxRecord` objects. | ||
This is similar to :meth:`WaybackClient.search`, but uses a new, beta | ||
search API that is eventually intended to replace the main search API. | ||
It offers more predictable results without the complex issues around | ||
``limit`` and pagination involved the current search API. | ||
Results include captures with similar, but not exactly matching URLs. | ||
They are matched by a SURT-formatted, canonicalized URL that: | ||
* Does not differentiate between HTTP and HTTPS, | ||
* Is not case-sensitive, and | ||
* Treats ``www.`` and ``www*.`` subdomains the same as no subdomain at | ||
all. | ||
This will automatically page through all results for a given search. If | ||
you want fewer results, you can stop iterating early: | ||
.. code-block:: python | ||
from itertools import islice | ||
first10 = list(islice(client.search(...), 10)) | ||
Parameters | ||
---------- | ||
url : str | ||
The URL to search for captures of. | ||
Special patterns in ``url`` imply a value for the ``match_type`` | ||
parameter and match multiple URLs: | ||
* If the URL starts with `*.` (e.g. ``*.epa.gov``) OR | ||
``match_type='domain'``, the search will include all URLs at the | ||
given domain and its subdomains. | ||
* If the URL ends with `/*` (e.g. ``https://epa.gov/*``) OR | ||
``match_type='prefix'``, the search will include all URLs that | ||
start with the text up to the ``*``. | ||
* Otherwise, this returns matches just for the requeted URL. | ||
**NOTE:** if the URL includes wildcards or ``match_type`` is set to | ||
something other than ``None`` or ``'exact'``, results will not | ||
include recent captures (generally, captures from the last 3 days). | ||
match_type : str, optional | ||
Determines how to interpret the ``url`` parameter. It must be one of | ||
the following: | ||
* ``exact`` (default) returns results matching the requested URL | ||
(see notes about SURT above; this is not an exact string match of | ||
the URL you pass in). | ||
* ``prefix`` returns results that start with the requested URL. | ||
* ``host`` returns results from all URLs at the host in the | ||
requested URL. | ||
* ``domain`` returns results from all URLs at the domain or any | ||
subdomain of the requested URL. | ||
The default value is calculated based on the format of ``url``. | ||
**NOTE:** if the URL includes wildcards or ``match_type`` is set to | ||
something other than ``None`` or ``'exact'``, results will not | ||
include recent captures (generally, captures from the last 3 days). | ||
from_date : datetime or date, optional | ||
Only include captures after this date. Equivalent to the | ||
`from` argument in the CDX API. If it does not have a time zone, it | ||
is assumed to be in UTC. | ||
to_date : datetime or date, optional | ||
Only include captures before this date. Equivalent to the `to` | ||
argument in the CDX API. If it does not have a time zone, it is | ||
assumed to be in UTC. | ||
filter_field : str, optional | ||
A filter for any field in the results. Equivalent to the ``filter`` | ||
argument in the CDX API. (format: ``[!]field:regex``) | ||
collapse : str, optional | ||
Collapse consecutive results that match on a given field. (format: | ||
`fieldname` or `fieldname:N` -- N is the number of chars to match.) | ||
resolve_revisits : bool, default: True | ||
Attempt to resolve ``warc/revisit`` records to their actual content | ||
type and response code. Not supported on all CDX servers. | ||
skip_malformed_results : bool, default: True | ||
If true, don't yield records that look like they have no actual | ||
memento associated with them. Some crawlers will erroneously | ||
attempt to capture bad URLs like | ||
``http://mailto:[email protected]`` or | ||
``http://data:image/jpeg;base64,AF34...`` and so on. This is a | ||
filter performed client side and is not a CDX API argument. | ||
page_size : int, default: 5 | ||
How many *index blocks* (not results!) to search in each request. | ||
Regardless of the value set here, this function will yield results | ||
from every page. If you want to stop early, then stop iterating | ||
over the results of this method. | ||
Raises | ||
------ | ||
UnexpectedResponseFormat | ||
If the CDX response was not parseable. | ||
Yields | ||
------ | ||
version: CdxRecord | ||
A :class:`CdxRecord` encapsulating one capture or revisit | ||
References | ||
---------- | ||
* HTTP API Docs: https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server | ||
* SURT formatting: http://crawler.archive.org/articles/user_manual/glossary.html#surt | ||
* SURT implementation: https://github.com/internetarchive/surt | ||
Notes | ||
----- | ||
Several CDX API parameters are not relevant or handled automatically | ||
by this function. This does not support: `page`, `offset`, `limit`, | ||
`output`, `fl`, `showDupeCount`, `showSkipCount`, `lastSkipTimestamp`, | ||
`showNumPages`, `showPagedIndex`. | ||
""" | ||
        if (not isinstance(page_size, int)) or page_size <= 0:
            raise TypeError(f'`page_size` must be a positive integer, not {page_size!r}')

        # TODO: support args that can be set multiple times: filter, collapse
        # Should take input as a sequence and convert to repeat query args
        # TODO: Check types
        query_args = {'url': url, 'matchType': match_type, 'from': from_date,
                      'to': to_date, 'filter': filter_field,
                      'collapse': collapse, 'resolveRevisits': resolve_revisits,
                      'pageSize': page_size}

        query = {}
        for key, value in query_args.items():
            if value is not None:
                if isinstance(value, str):
                    query[key] = value
                elif isinstance(value, date):
                    query[key] = _utils.format_timestamp(value)
                else:
                    query[key] = str(value).lower()
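        # At this point every value in `query` is a string: booleans were
        # lowercased to 'true'/'false', numbers stringified (e.g. 5 -> '5'),
        # and date/datetime values formatted with _utils.format_timestamp().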

        # Since pages are a number of *blocks searched* and not results, a page
        # in the middle of the result set may have nothing in it. The only way
        # to know when to stop iterating is to check how many pages there are.
        page_count = int(self.session.request('GET', CDX_SEARCH_2_URL, params={
            **query,
            'showNumPages': 'true'
        }).text)

        page = 0
        previous_result = None
        while page < page_count:
            response = self.session.request('GET', CDX_SEARCH_2_URL, params={
                **query,
                'page': page
            })
            page += 1

            try:
                # Read/cache the response and close straightaway. If we need to
                # raise for status, we want to pre-emptively close the response
                # so a user handling the error doesn't need to worry about it.
                # If we don't raise here, we still want to close the connection
                # so it doesn't leak when we move onto the next page of results
                # or when this iterator ends.
                read_and_close(response)
                response.raise_for_status()
            except requests.exceptions.HTTPError as error:
                if 'AdministrativeAccessControlException' in response.text:
                    raise BlockedSiteError(query['url'])
                elif 'RobotAccessControlException' in response.text:
                    raise BlockedByRobotsError(query['url'])
                else:
                    raise WaybackException(str(error))

            lines = iter(response.content.splitlines())
            for line in lines:
                text = line.decode()

                if text == previous_result:
                    # This result line is a repeat. Skip it.
                    continue
                else:
                    previous_result = text

                try:
                    # The v2 search currently has a different format than v1.
                    # - 0-5 are the same as v1 (urlkey, timestamp, original,
                    #   mimetype, statuscode, digest (SHA-1))
                    # - 6-7 I haven't been able to figure out (but I understand
                    #   they are planned for removal later, so we should
                    #   probably ignore them)
                    # - 8 is `length` (field 6 in v1) (byte length of the data
                    #   IN THE WARC file, not the archived response).
                    # - 9 is the byte offset of the data in the WARC file.
                    #   (Planned for removal in the future.)
                    # - 10 is the name of the WARC file.
                    #   (Planned for removal in the future.)
                    #
                    # FIXME: if `output=json` is supported in the future, we
                    # should really use it (even though there's higher parsing
                    # overhead) since we can check field names and not risk
                    # errors if the field order changes.
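                    # Purely illustrative (hypothetical values, not taken from
                    # the API docs): a v2 line following the layout above might
                    # look roughly like:
                    #   gov,epa)/ 20200102030405 https://www.epa.gov/ text/html 200 ABC123XYZ - - 2345 6789012 example-00001.warc.gz
                    # Only fields 0-5 and 8 are used below.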
                    fields = text.split(' ')
                    data = CdxRecord(*(fields[:6]), fields[8], '', '')
                    if data.status_code == '-':
                        # the status code given for a revisit record
                        status_code = None
                    else:
                        status_code = int(data.status_code)
                    length = None if data.length == '-' else int(data.length)
                    capture_time = _utils.parse_timestamp(data.timestamp)
                except Exception as err:
                    if 'RobotAccessControlException' in text:
                        raise BlockedByRobotsError(query["url"])
                    raise UnexpectedResponseFormat(
                        f'Could not parse CDX output: "{text}" (query: {query})') from err

                clean_url = REDUNDANT_HTTPS_PORT.sub(
                    r'\1\2', REDUNDANT_HTTP_PORT.sub(
                        r'\1\2', data.url))
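                # The substitutions above strip redundant default ports, e.g.
                # 'http://example.com:80/page' -> 'http://example.com/page'
                # (REDUNDANT_HTTPS_PORT presumably does the same for ':443').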
                if skip_malformed_results and is_malformed_url(clean_url):
                    continue
                if clean_url != data.url:
                    data = data._replace(url=clean_url)

                # TODO: repeat captures have a status code of `-` and a mime
                # type of `warc/revisit`. These can only be resolved by
                # requesting the content and following redirects. Maybe nice
                # to do so automatically here.
                data = data._replace(
                    status_code=status_code,
                    length=length,
                    timestamp=capture_time,
                    raw_url=ARCHIVE_URL_TEMPLATE.format(
                        url=data.url,
                        timestamp=data.timestamp,
                        mode=Mode.original.value),
                    view_url=ARCHIVE_URL_TEMPLATE.format(
                        url=data.url,
                        timestamp=data.timestamp,
                        mode=Mode.view.value)
                )
                yield data

    def get_memento(self, url, datetime=None, mode=Mode.original, *,
                    exact=True, exact_redirects=None,
                    target_window=24 * 60 * 60, follow_redirects=True):