UrlStore compression: make bz2 & zlib optional
adbar committed Aug 29, 2024
1 parent a48713a commit 81b87f2
Showing 1 changed file with 57 additions and 10 deletions.
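In short: UrlStore(compressed=True) no longer hard-requires the bz2 and zlib modules, which some minimal Python builds omit. A new Compressor helper prefers zlib, falls back to bz2, and degrades to plain pickling when neither is available. A minimal sketch of the resulting behavior, assuming the public UrlStore API (add_urls, get_url) is otherwise unchanged:

# Hypothetical usage after this commit: compression is now best-effort,
# so this works even on interpreters built without zlib or bz2.
from courlan import UrlStore

store = UrlStore(compressed=True)
store.add_urls(["https://example.org/page1", "https://example.org/page2"])
print(store.get_url("https://example.org"))  # next URL to visit for this host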
67 changes: 57 additions & 10 deletions courlan/urlstore.py
@@ -2,13 +2,26 @@
Defines a URL store which holds URLs along with relevant information and entails crawling helpers.
"""

-import bz2
import gc
import logging
import pickle
import signal
import sys
-import zlib
+
+try:
+    import bz2
+
+    HAS_BZ2 = True
+except ImportError:
+    HAS_BZ2 = False
+
+try:
+    import zlib
+
+    HAS_ZLIB = True
+except ImportError:
+    HAS_ZLIB = False
+

from collections import defaultdict, deque
from datetime import datetime, timedelta
@@ -17,6 +30,7 @@
from threading import Lock
from typing import (
    Any,
+    Callable,
    DefaultDict,
    Deque,
    Dict,
@@ -38,6 +52,43 @@
LOGGER = logging.getLogger(__name__)


+class Compressor:
+    "Use system information on available compression modules and define corresponding methods."
+    __slots__ = ("compressor", "decompressor")
+
+    def __init__(self) -> None:
+        self.compressor: Callable[[Any], bytes] = self._dump
+        self.decompressor: Callable[[bytes], Any] = self._load
+
+        if HAS_ZLIB:
+            self.compressor = zlib.compress
+            self.decompressor = zlib.decompress
+        elif HAS_BZ2:
+            self.compressor = bz2.compress
+            self.decompressor = bz2.decompress
+
+    @staticmethod
+    def _dump(data: Any) -> bytes:
+        "Pickle the data or object."
+        return pickle.dumps(data, protocol=5)
+
+    @staticmethod
+    def _load(data: bytes) -> Any:
+        "Load Python object from pickle."
+        return pickle.loads(data)
+
+    def compress(self, data: Any) -> bytes:
+        "Pickle the data and compress it if a method is available."
+        return self.compressor(self._dump(data))
+
+    def decompress(self, data: bytes) -> Any:
+        "Decompress the data if a method is available and load the object."
+        return self._load(self.decompressor(data))
+
+
+COMPRESSOR = Compressor()


class State(Enum):
    "Record state information about a domain or host."
    OPEN = 1
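Since COMPRESSOR is a module-level singleton, the methods below can stay backend-agnostic. A quick round-trip sketch, assuming the module is importable as committed (the payload value is just an illustration):

from courlan.urlstore import COMPRESSOR

payload = {"https://example.org": ["/page1", "/page2"]}
blob = COMPRESSOR.compress(payload)  # pickle, then zlib or bz2 when available
assert COMPRESSOR.decompress(blob) == payload  # original object restored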
@@ -149,7 +200,7 @@ def _buffer_urls(
    def _load_urls(self, domain: str) -> Deque[UrlPathTuple]:
        if domain in self.urldict:
            if self.compressed:
-                return pickle.loads(bz2.decompress(self.urldict[domain].tuples)) # type: ignore
+                return COMPRESSOR.decompress(self.urldict[domain].tuples) # type: ignore
            return self.urldict[domain].tuples
        return deque()

@@ -197,9 +248,7 @@ def _store_urls(

        with self._lock:
            if self.compressed:
-                self.urldict[domain].tuples = bz2.compress( # type: ignore[assignment]
-                    pickle.dumps(urls, protocol=4)
-                )
+                self.urldict[domain].tuples = COMPRESSOR.compress(urls) # type: ignore[assignment]
            else:
                self.urldict[domain].tuples = urls
            self.urldict[domain].total = len(urls)
@@ -453,16 +502,14 @@ def establish_download_schedule(
    def store_rules(self, website: str, rules: Optional[RobotFileParser]) -> None:
        "Store crawling rules for a given website."
        if self.compressed:
-            rules = zlib.compress( # type: ignore[assignment]
-                pickle.dumps(rules, protocol=4)
-            )
+            rules = COMPRESSOR.compress(rules) # type: ignore[assignment]
        self.urldict[website].rules = rules

    def get_rules(self, website: str) -> Optional[RobotFileParser]:
        "Return the stored crawling rules for the given website."
        if website in self.urldict:
            if self.compressed:
-                return pickle.loads(zlib.decompress(self.urldict[website].rules)) # type: ignore
+                return COMPRESSOR.decompress(self.urldict[website].rules) # type: ignore
            return self.urldict[website].rules
        return None

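One side note visible in the diff: the pickle protocol moves from 4 to 5. Protocol 5 (PEP 574) was introduced in Python 3.8, so older interpreters cannot load these pickles; a quick guard can verify the requirement:

import pickle

# Protocol 5 needs Python 3.8+; this fails on older interpreters.
assert pickle.HIGHEST_PROTOCOL >= 5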
