Skip to content

Commit

Permalink
type hinting: add remaining types and integrate into CI (#748)
Browse files: browse the repository at this point in the history
* type hinting: add remaining types and integrate into CI

* change workflow order

* fix errors and update setup

* fix build

* try Python 3.14

* fix CI workflow

* add py.typed

* fix remaining strict warnings
  • Loading branch information
adbar authored Nov 22, 2024
1 parent 0f05e0b commit 0ad8c3d
Show file tree
Hide file tree
Showing 12 changed files with 65 additions and 67 deletions.
33 changes: 15 additions & 18 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
matrix:
os: [ubuntu-latest]
# https://github.com/actions/python-versions/blob/main/versions-manifest.json
python-version: ["3.9", "3.11"] # "3.13", "3.14-dev"
python-version: ["3.9", "3.11", "3.13"] # "3.14-dev"
env:
- MINIMAL: "true"
PROXY_TEST: "false"
Expand Down Expand Up @@ -57,7 +57,7 @@ jobs:
python-version: ${{ matrix.python-version }}

- name: Upgrade pip
run: python -m pip install --upgrade pip setuptools wheel
run: python -m pip install --upgrade pip

- name: Get pip cache dir
id: pip-cache
Expand All @@ -75,35 +75,32 @@ jobs:
# package setup
- uses: actions/checkout@v4

# only where prebuilt wheels do not exist
# - name: Install LXML dependencies
# if: ${{ matrix.python-version == '3.13-dev' }}
# run: |
# sudo apt-get update
# sudo apt-get install libxml2-dev libxslt-dev

- name: Install dependencies
run: python -m pip install -e ".[dev]"

- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
# pycurl installation fix
- name: Install packages required by pycurl
if: ${{ matrix.env.MINIMAL == 'false'}}
if: ${{ matrix.env.MINIMAL == 'false' }}
run: |
sudo apt-get update
sudo apt-get install libcurl4-gnutls-dev libgnutls28-dev
# alternatively: sudo apt-get install libcurl4-openssl-dev libssl-dev

- name: Install full dependencies
if: ${{ matrix.env.MINIMAL == 'false'}}
if: ${{ matrix.env.MINIMAL == 'false' }}
run: python -m pip install -e ".[all]"

# tests
- name: Lint with flake8
- name: Type checking
if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }}
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
mypy -p trafilatura
- name: Test with pytest
run: |
Expand All @@ -113,7 +110,7 @@ jobs:

# coverage
- name: Upload coverage to Codecov
if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.11' }}
if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }}
uses: codecov/codecov-action@v4
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
Expand Down
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
include CITATION.cff CONTRIBUTING.md HISTORY.md README.rst LICENSE
graft trafilatura/data/
include trafilatura/settings.cfg
include trafilatura/py.typed

include tests/__init__.py
include tests/*test*.py
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,11 @@ trafilatura = "trafilatura.cli:main"
[project.optional-dependencies]
dev = [
"flake8",
"mypy",
"pytest",
"pytest-cov",
"types-lxml",
"types-urllib3",
]
all = [
"brotli",
Expand Down
11 changes: 5 additions & 6 deletions trafilatura/downloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from time import sleep
from typing import (
Any,
ByteString,
Callable,
Dict,
Generator,
Expand Down Expand Up @@ -73,7 +72,7 @@ def create_pool(**args: Any) -> Union[urllib3.PoolManager, Any]:
return manager_class(**manager_args, **args) # type: ignore[arg-type]


DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True) # type: ignore[no-untyped-call]
USER_AGENT = (
"trafilatura/" + version("trafilatura") + " (+https://github.com/adbar/trafilatura)"
)
Expand Down Expand Up @@ -106,7 +105,7 @@ class Response:
"Store information gathered in a HTTP response object."
__slots__ = ["data", "headers", "html", "status", "url"]

def __init__(self, data: ByteString, status: int, url: str) -> None:
def __init__(self, data: bytes, status: int, url: str) -> None:
self.data = data
self.headers: Optional[Dict[str, str]] = None
self.html: Optional[str] = None
Expand Down Expand Up @@ -332,14 +331,14 @@ def _pycurl_is_live_page(url: str) -> bool:
curl.setopt(pycurl.SSL_VERIFYPEER, 0)
curl.setopt(pycurl.SSL_VERIFYHOST, 0)
# Set option to avoid getting the response body
curl.setopt(curl.NOBODY, True) # type: ignore[attr-defined]
curl.setopt(curl.NOBODY, True)
if PROXY_URL:
curl.setopt(pycurl.PRE_PROXY, PROXY_URL)
# Perform the request
try:
curl.perform()
# Get the response code
page_exists = curl.getinfo(curl.RESPONSE_CODE) < 400 # type: ignore[attr-defined]
page_exists = curl.getinfo(curl.RESPONSE_CODE) < 400
except pycurl.error as err:
LOGGER.debug("pycurl HEAD error: %s %s", url, err)
page_exists = False
Expand Down Expand Up @@ -503,7 +502,7 @@ def _send_pycurl_request(
# ip_info = curl.getinfo(curl.PRIMARY_IP)

resp = Response(
bufferbytes, curl.getinfo(curl.RESPONSE_CODE), curl.getinfo(curl.EFFECTIVE_URL) # type: ignore[attr-defined]
bufferbytes, curl.getinfo(curl.RESPONSE_CODE), curl.getinfo(curl.EFFECTIVE_URL)
)
curl.close()

Expand Down
2 changes: 1 addition & 1 deletion trafilatura/external.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def compare_extraction(tree: HtmlElement, backup_tree: HtmlElement, body: _Eleme

# post-processing: remove unwanted sections
if use_readability and not jt_result:
body, text, len_text = sanitize_tree(body, options)
body, text, len_text = sanitize_tree(body, options) # type: ignore[arg-type]

return body, text, len_text

Expand Down
15 changes: 8 additions & 7 deletions trafilatura/htmlprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ def convert_lists(elem: _Element) -> None:
for subelem in elem.iter("dd", "dt", "li"):
# keep track of dd/dt items
if subelem.tag in ("dd", "dt"):
subelem.set("rend", f"{subelem.tag}-{i}")
subelem.set("rend", f"{str(subelem.tag)}-{i}")
# increment counter after <dd> in description list
if subelem.tag == "dd":
i += 1
Expand Down Expand Up @@ -397,15 +397,15 @@ def convert_tags(
convert_link(elem, base_url)

if options.formatting:
for elem in tree.iter(REND_TAG_MAPPING.keys()): # type: ignore[call-overload]
for elem in tree.iter(REND_TAG_MAPPING.keys()):
elem.attrib.clear()
elem.set("rend", REND_TAG_MAPPING[elem.tag])
elem.tag = "hi"
else:
strip_tags(tree, *REND_TAG_MAPPING.keys())

# iterate over all concerned elements
for elem in tree.iter(CONVERSIONS.keys()): # type: ignore[call-overload]
for elem in tree.iter(CONVERSIONS.keys()):
CONVERSIONS[elem.tag](elem)
# images
if options.images:
Expand All @@ -430,12 +430,13 @@ def convert_tags(

def convert_to_html(tree: _Element) -> _Element:
"Convert XML to simplified HTML."
for elem in tree.iter(HTML_CONVERSIONS.keys()): # type: ignore[call-overload]
for elem in tree.iter(HTML_CONVERSIONS.keys()):
conversion = HTML_CONVERSIONS[str(elem.tag)]
# apply function or straight conversion
if callable(HTML_CONVERSIONS[elem.tag]):
elem.tag = HTML_CONVERSIONS[elem.tag](elem) # type: ignore[operator]
if callable(conversion):
elem.tag = conversion(elem)
else:
elem.tag = HTML_CONVERSIONS[elem.tag]
elem.tag = conversion # type: ignore[assignment]
# handle attributes
if elem.tag == "a":
elem.set("href", elem.attrib.pop("target", ""))
Expand Down
Loading

0 comments on commit 0ad8c3d

Please sign in to comment.