UrlStore compression: make bz2 & zlib optional
adbar committed Aug 29, 2024
1 parent a48713a commit 81b87f2
Showing 1 changed file with 57 additions and 10 deletions.
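In short: UrlStore(compressed=True) no longer hard-requires the bz2 and zlib modules, which some minimal Python builds omit. A new Compressor helper prefers zlib, falls back to bz2, and degrades to plain pickling when neither is available. A minimal sketch of the resulting behavior, assuming the public UrlStore API (add_urls, get_url) is otherwise unchanged:

# Hypothetical usage after this commit: compression is now best-effort,
# so this works even on interpreters built without zlib or bz2.
from courlan import UrlStore

store = UrlStore(compressed=True)
store.add_urls(["https://example.org/page1", "https://example.org/page2"])
print(store.get_url("https://example.org"))  # next URL to visit for this host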
67 changes: 57 additions & 10 deletions courlan/urlstore.py
@@ -2,13 +2,26 @@
Defines a URL store which holds URLs along with relevant information and entails crawling helpers.
"""

-import bz2
import gc
import logging
import pickle
import signal
import sys
-import zlib
+
+try:
+    import bz2
+
+    HAS_BZ2 = True
+except ImportError:
+    HAS_BZ2 = False
+
+try:
+    import zlib
+
+    HAS_ZLIB = True
+except ImportError:
+    HAS_ZLIB = False
+

from collections import defaultdict, deque
from datetime import datetime, timedelta
@@ -17,6 +30,7 @@
from threading import Lock
from typing import (
    Any,
+    Callable,
    DefaultDict,
    Deque,
    Dict,
@@ -38,6 +52,43 @@
LOGGER = logging.getLogger(__name__)


+class Compressor:
+    "Use system information on available compression modules and define corresponding methods."
+    __slots__ = ("compressor", "decompressor")
+
+    def __init__(self) -> None:
+        self.compressor: Callable[[Any], bytes] = self._dump
+        self.decompressor: Callable[[bytes], Any] = self._load
+
+        if HAS_ZLIB:
+            self.compressor = zlib.compress
+            self.decompressor = zlib.decompress
+        elif HAS_BZ2:
+            self.compressor = bz2.compress
+            self.decompressor = bz2.decompress
+
+    @staticmethod
+    def _dump(data: Any) -> bytes:
+        "Pickle the data or object."
+        return pickle.dumps(data, protocol=5)
+
+    @staticmethod
+    def _load(data: bytes) -> Any:
+        "Load Python object from pickle."
+        return pickle.loads(data)
+
+    def compress(self, data: Any) -> bytes:
+        "Pickle the data and compress it if a method is available."
+        return self.compressor(self._dump(data))
+
+    def decompress(self, data: bytes) -> Any:
+        "Decompress the data if a method is available and load the object."
+        return self._load(self.decompressor(data))
+
+
+COMPRESSOR = Compressor()


class State(Enum):
    "Record state information about a domain or host."
    OPEN = 1
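Since COMPRESSOR is a module-level singleton, the methods below can stay backend-agnostic. A quick round-trip sketch, assuming the module is importable as committed (the payload value is just an illustration):

from courlan.urlstore import COMPRESSOR

payload = {"https://example.org": ["/page1", "/page2"]}
blob = COMPRESSOR.compress(payload)  # pickle, then zlib or bz2 when available
assert COMPRESSOR.decompress(blob) == payload  # original object restored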
@@ -149,7 +200,7 @@ def _buffer_urls(
    def _load_urls(self, domain: str) -> Deque[UrlPathTuple]:
        if domain in self.urldict:
            if self.compressed:
-                return pickle.loads(bz2.decompress(self.urldict[domain].tuples)) # type: ignore
+                return COMPRESSOR.decompress(self.urldict[domain].tuples) # type: ignore
            return self.urldict[domain].tuples
        return deque()

@@ -197,9 +248,7 @@ def _store_urls(

        with self._lock:
            if self.compressed:
-                self.urldict[domain].tuples = bz2.compress( # type: ignore[assignment]
-                    pickle.dumps(urls, protocol=4)
-                )
+                self.urldict[domain].tuples = COMPRESSOR.compress(urls) # type: ignore[assignment]
            else:
                self.urldict[domain].tuples = urls
            self.urldict[domain].total = len(urls)
@@ -453,16 +502,14 @@ def establish_download_schedule(
    def store_rules(self, website: str, rules: Optional[RobotFileParser]) -> None:
        "Store crawling rules for a given website."
        if self.compressed:
-            rules = zlib.compress( # type: ignore[assignment]
-                pickle.dumps(rules, protocol=4)
-            )
+            rules = COMPRESSOR.compress(rules) # type: ignore[assignment]
        self.urldict[website].rules = rules

    def get_rules(self, website: str) -> Optional[RobotFileParser]:
        "Return the stored crawling rules for the given website."
        if website in self.urldict:
            if self.compressed:
-                return pickle.loads(zlib.decompress(self.urldict[website].rules)) # type: ignore
+                return COMPRESSOR.decompress(self.urldict[website].rules) # type: ignore
            return self.urldict[website].rules
        return None

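One side note visible in the diff: the pickle protocol moves from 4 to 5. Protocol 5 (PEP 574) was introduced in Python 3.8, so older interpreters cannot load these pickles; a quick guard can verify the requirement:

import pickle

# Protocol 5 needs Python 3.8+; this fails on older interpreters.
assert pickle.HIGHEST_PROTOCOL >= 5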
