Skip to content

Commit

Permalink
fix: use httpx instead of requests to avoid Bandcamp blocking
Browse files Browse the repository at this point in the history
When using requests/urllib3, Bandcamp responds to all requests with
403 errors.

Investigating why, I tried:
- using curl to send the same request: it worked
- writing a tiny Python script to `GET bandcamp.com/` with requests: it
  failed with 403
- waiting a week to see if it solved itself: no luck
- changing the above-mentioned script to use http.client or httpx: it worked

I think that in this case, Bandcamp's Web Application Firewall (WAF)
blocks the requests based not on their contents but on an artifact of
how urllib3 builds/sends the data, since curl with the exact same headers
works.

Instead of trying to identify the exact reason, which is quite hard
without any info on Bandcamp's WAF, and fix/workaround that, I rewrote
the very little required HTTP code to use httpx and sidestep the issue.
  • Loading branch information
ThinkChaos committed Aug 3, 2024
1 parent 0fa90d8 commit ea78c1f
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 177 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@

### Fixed

- `search`: properly escape query strings for better results with special characters
- `search`:
- properly escape query strings for better results with special characters
- change HTTP client implementation to avoid Bandcamp "403 Forbidden" responses

## [0.19.1] 2024-05-10

Expand Down
17 changes: 4 additions & 13 deletions beetsplug/bandcamp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,15 @@
import re
from contextlib import contextmanager
from functools import lru_cache, partial
from html import unescape
from itertools import chain
from operator import itemgetter
from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Literal, Sequence

import requests
from beets import IncludeLazyConfig, __version__, config, library, plugins
from beets import IncludeLazyConfig, config, library, plugins

from beetsplug import fetchart # type: ignore[attr-defined]

from .http import HTTPError, http_get_text
from .metaguru import Metaguru
from .search import search_bandcamp

Expand All @@ -57,12 +56,6 @@

ALBUM_URL_IN_TRACK = re.compile(r'<a id="buyAlbumLink" href="([^"]+)')
LABEL_URL_IN_COMMENT = re.compile(r"Visit (https:[\w/.-]+\.[a-z]+)")
USER_AGENT = f"beets/{__version__} +http://beets.radbox.org/"


@lru_cache(maxsize=None)
def get_response(url: str) -> requests.Response:
return requests.get(url, headers={"User-Agent": USER_AGENT})


class BandcampRequestsHandler:
Expand All @@ -79,13 +72,11 @@ def _info(self, msg_template: str, *args: Sequence[str]) -> None:

def _get(self, url: str) -> str:
"""Return text contents of the url response."""
response = get_response(url)
try:
response.raise_for_status()
except requests.HTTPError as e:
return http_get_text(url)
except HTTPError as e:
self._info("{}", e)
return ""
return unescape(response.text)

def guru(self, url: str) -> Metaguru:
return Metaguru.from_html(self._get(url), config=self.config.flatten())
Expand Down
21 changes: 21 additions & 0 deletions beetsplug/bandcamp/http.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from functools import lru_cache
from html import unescape
from urllib.parse import urlsplit

from beets import __version__
import httpx

# Alias of httpx's error class, so callers can catch HTTP failures by
# importing HTTPError from this module instead of importing httpx themselves.
HTTPError = httpx.HTTPError

# User-Agent header value identifying beets and its version.
USER_AGENT = f"beets/{__version__} +https://beets.io/"

# Module-level client created once at import time; its default headers carry
# USER_AGENT, so requests made through it send that User-Agent.
_client = httpx.Client(headers={"User-Agent": USER_AGENT})

@lru_cache(maxsize=None)
def http_get_text(url: str) -> str:
    """Return the HTML-unescaped text body of the response for ``url``.

    Successful results are memoised per URL for the lifetime of the process,
    so a given URL is fetched at most once; failed calls are not cached
    because they raise.

    Raises:
        HTTPError: if the request fails at the transport level or the final
            response does not have a success status.
    """
    # Unlike requests, httpx does not follow redirects unless asked to, and
    # ``raise_for_status`` treats an unfollowed 3xx response as an error.
    # Follow redirects explicitly to keep the behaviour the requests-based
    # implementation had.
    response = _client.get(url, follow_redirects=True)
    response.raise_for_status()

    return unescape(response.text)
10 changes: 2 additions & 8 deletions beetsplug/bandcamp/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from typing import Any, Callable, Dict, List
from urllib.parse import quote_plus

import requests
from .http import http_get_text

JSONDict = Dict[str, Any]
SEARCH_URL = "https://bandcamp.com/search?page={}&q={}"
Expand Down Expand Up @@ -95,17 +95,11 @@ def parse_and_sort_results(html: str, **kwargs: str) -> List[JSONDict]:
return [{"index": i + 1, **r} for i, r in enumerate(results)]


def get_url(url: str) -> str:
response = requests.get(url)
response.raise_for_status()
return unescape(response.text)


def search_bandcamp(
query: str = "",
search_type: str = "",
page: int = 1,
get: Callable[[str], str] = get_url,
get: Callable[[str], str] = http_get_text,
**kwargs: Any,
) -> List[JSONDict]:
"""Return a list with item JSONs of type search_type matching the query."""
Expand Down
Loading

0 comments on commit ea78c1f

Please sign in to comment.