Skip to content

Commit

Permalink
enable pyupgrade rules
Browse files Browse the repository at this point in the history
  • Loading branch information
freddyheppell committed Jul 10, 2024
1 parent 325f167 commit b9fe3ee
Show file tree
Hide file tree
Showing 17 changed files with 60 additions and 52 deletions.
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ select = [
"PD",
"I",
"PT",
"B"
"B",
"UP",
]

ignore = [
Expand Down
5 changes: 2 additions & 3 deletions src/wpextract/dl/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import html
import json
import os
from typing import List
from urllib import parse as urlparse

from tqdm.auto import tqdm
Expand All @@ -18,7 +17,7 @@ class Exporter:

@staticmethod
def download_media(
session: RequestSession, media: List[str], output_folder: str
session: RequestSession, media: list[str], output_folder: str
) -> int:
"""Downloads the media files based on the given URLs.
Expand Down Expand Up @@ -139,7 +138,7 @@ def write_file(filename, data):

@staticmethod
def export_posts(
posts: List[dict],
posts: list[dict],
filename: str,
):
"""Exports posts to the specified file.
Expand Down
24 changes: 19 additions & 5 deletions src/wpextract/dl/requestsession.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import random
import time
from http.cookies import SimpleCookie
from typing import Tuple, Union
from typing import Union

import requests
from requests.adapters import HTTPAdapter
Expand All @@ -14,21 +14,25 @@

class ConnectionCouldNotResolve(Exception):
"""The remote host could not be resolved."""

pass


class ConnectionReset(Exception):
"""The connection was reset during the request."""

pass


class ConnectionRefused(Exception):
"""The connection was refused by the server."""

pass


class ConnectionTimeout(Exception):
"""A connection timeout occurred."""

pass


Expand All @@ -38,6 +42,7 @@ class HTTPError400(Exception):
See Also:
HTTPErrorInvalidPage for a special case of this error.
"""

pass


Expand All @@ -46,41 +51,49 @@ class HTTPErrorInvalidPage(Exception):
This indicates the last page has been passed and all items have been retrieved.
"""

pass


class HTTPError401(Exception):
"""HTTP Unauthorized."""

pass


class HTTPError403(Exception):
"""HTTP Forbidden."""

pass


class HTTPError404(Exception):
"""HTTP Not Found."""

pass


class HTTPError500(Exception):
"""HTTP Internal Server Error."""

pass


class HTTPError502(Exception):
"""HTTP Bad Gateway."""

pass


class HTTPError(Exception):
"""A generic HTTP error with an unexpected code."""

pass


class HTTPTooManyRedirects(Exception):
"""Raised if the number of redirects followed exceeds the configured maximum value."""

pass


Expand Down Expand Up @@ -172,11 +185,12 @@ def wait(self):
time.sleep(self.wait_s * wait_factor)


AuthorizationType = Union[Tuple[str, str], HTTPBasicAuth, HTTPDigestAuth]
AuthorizationType = Union[tuple[str, str], HTTPBasicAuth, HTTPDigestAuth]


class RequestSession:
"""Manages HTTP requests and their behaviour."""

def __init__(
self,
proxy: str = None,
Expand Down Expand Up @@ -253,13 +267,13 @@ def do_request(self, method, url, data=None, stream=False):
)
except requests.ConnectionError as e:
if "Errno -5" in str(e) or "Errno -2" in str(e) or "Errno -3" in str(e):
logging.error("Could not resolve host %s" % url)
logging.error(f"Could not resolve host {url}")
raise ConnectionCouldNotResolve from e
elif "Errno 111" in str(e):
logging.error("Connection refused by %s" % url)
logging.error(f"Connection refused by {url}")
raise ConnectionRefused from e
elif "RemoteDisconnected" in str(e):
logging.error("Connection reset by %s" % url)
logging.error(f"Connection reset by {url}")
raise ConnectionReset from e
else:
print(e)
Expand Down
6 changes: 3 additions & 3 deletions src/wpextract/downloader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from pathlib import Path
from typing import List, Optional
from typing import Optional

from wpextract.dl.exceptions import WordPressApiNotV2
from wpextract.dl.exporter import Exporter
Expand All @@ -15,7 +15,7 @@ def __init__(
self,
target: str,
out_path: Path,
data_types: List[str],
data_types: list[str],
session: Optional[RequestSession] = None,
json_prefix: Optional[str] = None,
):
Expand Down Expand Up @@ -136,7 +136,7 @@ def _list_obj(self, obj_type, start=None, limit=None, cache=True):
)
except WordPressApiNotV2:
logging.error("The API does not support WP V2")
except IOError as e:
except OSError as e:
logging.error(f"Could not open {e.filename} for writing")
print()

Expand Down
4 changes: 2 additions & 2 deletions src/wpextract/extract.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from pathlib import Path
from typing import Dict, Optional
from typing import Optional

from pandas import DataFrame

Expand Down Expand Up @@ -46,7 +46,7 @@ class WPExtractor:
pages: Optional[DataFrame]
"""DataFrame of extracted pages."""

scrape_url_mapping: Dict[str, Path]
scrape_url_mapping: dict[str, Path]

def __init__(
self,
Expand Down
6 changes: 3 additions & 3 deletions src/wpextract/extractors/data/images.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
import re
from dataclasses import dataclass
from typing import List, Optional
from typing import Optional

from wpextract.extractors.data.links import Linkable, LinkRegistry

Expand Down Expand Up @@ -56,8 +56,8 @@ def resolve_image(


def resolve_images(
registry: LinkRegistry, images: List[ResolvableMediaUse]
) -> List[ResolvableMediaUse]:
registry: LinkRegistry, images: list[ResolvableMediaUse]
) -> list[ResolvableMediaUse]:
"""Resolve the internal links of a list of media uses.
Args:
Expand Down
6 changes: 3 additions & 3 deletions src/wpextract/extractors/data/link_resolver.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging
from typing import List, Optional
from typing import Optional
from urllib.parse import urlparse, urlunparse

from wpextract.extractors.data.links import LinkRegistry, ResolvableLink
Expand Down Expand Up @@ -66,8 +66,8 @@ def resolve_link(


def resolve_links(
registry: LinkRegistry, links: List[ResolvableLink]
) -> List[ResolvableLink]:
registry: LinkRegistry, links: list[ResolvableLink]
) -> list[ResolvableLink]:
"""Resolve a list of links against the link registry.
Args:
Expand Down
8 changes: 4 additions & 4 deletions src/wpextract/extractors/data/links.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from dataclasses import dataclass
from typing import Dict, List, Optional
from typing import Optional


@dataclass
Expand Down Expand Up @@ -29,8 +29,8 @@ class ResolvableLink(Link):
class LinkRegistry:
"""A collection of all known links on the site."""

links: List[Linkable]
url_index_cache: Dict[str, int]
links: list[Linkable]
url_index_cache: dict[str, int]

def __init__(self):
"""Init a new registry."""
Expand Down Expand Up @@ -59,7 +59,7 @@ def add_linkable(
if _refresh_cache:
self._refresh_cache()

def add_linkables(self, data_type: str, links: List[str], idxes: List[str]) -> None:
def add_linkables(self, data_type: str, links: list[str], idxes: list[str]) -> None:
"""Add multiple linkable items at once.
Args:
Expand Down
6 changes: 3 additions & 3 deletions src/wpextract/extractors/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import json
import logging
from pathlib import Path
from typing import Any, List, Optional
from typing import Any, Optional

import numpy as np
import pandas as pd
Expand All @@ -23,7 +23,7 @@ def load_from_path(path: Path) -> Optional[dict]:
if not path.is_file():
return None

with open(path, "r") as f:
with open(path) as f:
return json.load(f)


Expand Down Expand Up @@ -53,7 +53,7 @@ def load_df(path: Path, index_col: str = "id") -> Optional[pd.DataFrame]:
return pd.json_normalize(data_raw).set_index(index_col)


def _set_nested_keys(row_dict: dict, split_key: List[str], val: Any):
def _set_nested_keys(row_dict: dict, split_key: list[str], val: Any):
"""Set a value in the dictionary with nested keys.
Args:
Expand Down
4 changes: 2 additions & 2 deletions src/wpextract/extractors/posts.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from pathlib import Path
from typing import Dict, Optional
from typing import Optional

import pandas as pd
from pandas import DataFrame
Expand Down Expand Up @@ -57,7 +57,7 @@
def load_posts(
path: Path,
link_registry: LinkRegistry,
scrape_urls_files: Dict[str, Path],
scrape_urls_files: dict[str, Path],
translation_pickers: Optional[PickerListType] = None,
) -> Optional[pd.DataFrame]:
"""Load the posts from a JSON file.
Expand Down
11 changes: 5 additions & 6 deletions src/wpextract/parse/content.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import copy
import logging
from typing import List, Tuple
from urllib.parse import urljoin, urlparse, urlunparse

import pandas as pd
Expand All @@ -15,13 +14,13 @@
NEWLINE_TAGS = {"br", "p"}


InternalLinks = List[ResolvableLink]
ExternalLinks = List[Link]
InternalLinks = list[ResolvableLink]
ExternalLinks = list[Link]


def extract_links(
doc: BeautifulSoup, self_link: str
) -> Tuple[InternalLinks, ExternalLinks]:
) -> tuple[InternalLinks, ExternalLinks]:
"""Get the internal and external links of the document.
Args:
Expand Down Expand Up @@ -60,7 +59,7 @@ def extract_links(
return internal_links, external_links


Embeds = List[str]
Embeds = list[str]


def extract_embeds(doc: BeautifulSoup) -> Embeds:
Expand All @@ -75,7 +74,7 @@ def extract_embeds(doc: BeautifulSoup) -> Embeds:
return [iframe["src"] for iframe in doc.find_all("iframe")]


Images = List[MediaUse]
Images = list[MediaUse]


def extract_images(doc: BeautifulSoup, self_link: str) -> Images:
Expand Down
4 changes: 2 additions & 2 deletions src/wpextract/parse/translations/_extractor.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import logging
from typing import List, Optional, Type
from typing import Optional

import pandas as pd
from bs4 import BeautifulSoup

import wpextract.parse.translations._pickers as pickers

PICKERS = [pickers.Polylang, pickers.GenericLangSwitcher]
PickerListType = List[Type[pickers.LangPicker]]
PickerListType = list[type[pickers.LangPicker]]

PageTranslationData = pd.Series

Expand Down
3 changes: 1 addition & 2 deletions src/wpextract/parse/translations/_pickers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from abc import ABC, abstractmethod
from typing import List

from bs4 import BeautifulSoup, PageElement, Tag
from langcodes import Language
Expand All @@ -18,7 +17,7 @@ class LangPicker(ABC):
"""The document to extract the language picker from."""
root_el: Tag
"""The root element of the language picker, populated if [`LangPicker.matches`][wpextract.parse.translations.LangPicker.matches] is successful."""
translations: List[TranslationLink]
translations: list[TranslationLink]
"""A list of translation links, populated by calling [`LangPicker.add_translation`][wpextract.parse.translations.LangPicker.add_translation] within [`LangPicker.extract`][wpextract.parse.translations.LangPicker.extract]."""
current_language: Language
"""The current language of the page, populated by calling [`LangPicker.set_current_lang`][wpextract.parse.translations.LangPicker.set_current_lang] within [`LangPicker.extract`][wpextract.parse.translations.LangPicker.extract]."""
Expand Down
Loading

0 comments on commit b9fe3ee

Please sign in to comment.