From 2ff22ac793cd484f0645a2f2b21cbd8fe63c4d63 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Sun, 18 Aug 2024 00:41:25 +0100 Subject: [PATCH 1/8] Add cache management features. We add a cache management layer on top of Pystow. This takes the form of two classes (both in `oaklib.utilities.caching`): * one representing the cache management policy, i.e. the logic dictating whether a cached file (if present) should be refreshed or not; * one representing the file cache itself. The policy is set once by the main entry point method, using either a default policy of refreshing cached data after 7 days, or another policy explicitly selected by the user with the new `--caching` option. The class that represents the file cache is the one that the rest of OAK should interact with whenever access to cached data is needed. Ultimately, all calls to the Pystow module should be replaced with calls to FileCache, the use of Pystow becoming an implementation detail entirely encapsulated in FileCache. 
--- src/oaklib/cli.py | 11 + src/oaklib/constants.py | 4 + .../implementations/llm_implementation.py | 4 +- .../sqldb/sql_implementation.py | 4 +- src/oaklib/utilities/caching.py | 278 ++++++++++++++++++ 5 files changed, 297 insertions(+), 4 deletions(-) create mode 100644 src/oaklib/utilities/caching.py diff --git a/src/oaklib/cli.py b/src/oaklib/cli.py index 810211c7d..d0bb93b38 100644 --- a/src/oaklib/cli.py +++ b/src/oaklib/cli.py @@ -42,6 +42,7 @@ import oaklib.datamodels.taxon_constraints as tcdm from oaklib import datamodels +from oaklib.constants import FILE_CACHE from oaklib.converters.logical_definition_flattener import LogicalDefinitionFlattener from oaklib.datamodels import synonymizer_datamodel from oaklib.datamodels.association import RollupGroup @@ -149,6 +150,7 @@ generate_disjoint_class_expressions_axioms, ) from oaklib.utilities.basic_utils import pairs_as_dict +from oaklib.utilities.caching import CachePolicy from oaklib.utilities.iterator_utils import chunk from oaklib.utilities.kgcl_utilities import ( generate_change_id, @@ -568,6 +570,13 @@ def _apply_changes(impl, changes: List[kgcl.Change]): show_default=True, help="If set, will profile the command", ) +@click.option( + "--caching", + type=CachePolicy.ClickType, + default="1w", + show_default=True, + help="Set the cache management policy", +) def main( verbose: int, quiet: bool, @@ -587,6 +596,7 @@ def main( prefix, profile: bool, import_depth: Optional[int], + caching: CachePolicy, **kwargs, ): """ @@ -635,6 +645,7 @@ def exit(): import requests_cache requests_cache.install_cache(requests_cache_db) + FILE_CACHE.policy = caching resource = OntologyResource() resource.slug = input settings.autosave = autosave diff --git a/src/oaklib/constants.py b/src/oaklib/constants.py index 348adb0c7..837d9d441 100644 --- a/src/oaklib/constants.py +++ b/src/oaklib/constants.py @@ -2,9 +2,13 @@ import pystow +from oaklib.utilities.caching import FileCache + __all__ = [ "OAKLIB_MODULE", + "FILE_CACHE", ] 
OAKLIB_MODULE = pystow.module("oaklib") +FILE_CACHE = FileCache(OAKLIB_MODULE, '1w') TIMEOUT_SECONDS = 30 diff --git a/src/oaklib/implementations/llm_implementation.py b/src/oaklib/implementations/llm_implementation.py index 6faa01006..43b7e3649 100644 --- a/src/oaklib/implementations/llm_implementation.py +++ b/src/oaklib/implementations/llm_implementation.py @@ -8,7 +8,6 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, Iterable, Iterator, List, Optional, Tuple -import pystow from linkml_runtime.dumpers import yaml_dumper from sssom_schema import Mapping from tenacity import ( @@ -19,6 +18,7 @@ ) from oaklib import BasicOntologyInterface +from oaklib.constants import FILE_CACHE from oaklib.datamodels.class_enrichment import ClassEnrichmentResult from oaklib.datamodels.item_list import ItemList from oaklib.datamodels.obograph import DefinitionPropertyValue @@ -148,7 +148,7 @@ def config_to_prompt(configuration: Optional[ValidationConfiguration]) -> Option for obj in configuration.documentation_objects: if obj.startswith("http:") or obj.startswith("https:"): - path = pystow.ensure("oaklib", "documents", url=obj) + path = FILE_CACHE.ensure("documents", url=obj) else: path = obj with open(path) as f: diff --git a/src/oaklib/implementations/sqldb/sql_implementation.py b/src/oaklib/implementations/sqldb/sql_implementation.py index d05be4986..7a0ee46e3 100644 --- a/src/oaklib/implementations/sqldb/sql_implementation.py +++ b/src/oaklib/implementations/sqldb/sql_implementation.py @@ -63,7 +63,7 @@ import oaklib.datamodels.ontology_metadata as om import oaklib.datamodels.validation_datamodel as vdm -from oaklib.constants import OAKLIB_MODULE +from oaklib.constants import FILE_CACHE from oaklib.datamodels import obograph, ontology_metadata from oaklib.datamodels.association import Association from oaklib.datamodels.obograph import ( @@ -342,7 +342,7 @@ def __post_init__(self): # Option 1 uses direct URL construction: url = 
f"https://s3.amazonaws.com/bbop-sqlite/{prefix}.db.gz" logging.info(f"Ensuring gunzipped for {url}") - db_path = OAKLIB_MODULE.ensure_gunzip(url=url, autoclean=False) + db_path = FILE_CACHE.ensure_gunzip(url=url, autoclean=False) # Option 2 uses botocore to interface with the S3 API directly: # db_path = OAKLIB_MODULE.ensure_from_s3(s3_bucket="bbop-sqlite", s3_key=f"{prefix}.db") locator = f"sqlite:///{db_path}" diff --git a/src/oaklib/utilities/caching.py b/src/oaklib/utilities/caching.py new file mode 100644 index 000000000..ae6fc9020 --- /dev/null +++ b/src/oaklib/utilities/caching.py @@ -0,0 +1,278 @@ +import os.path +import re +import time +from datetime import timedelta + +from pystow.utils import base_from_gzip_name, name_from_url + +_durations = {'d': 1, 'w': 7, 'm': 30, 'y': 365} + + +class CachePolicy(object): + """Represents the behaviour of a cache. + + Once a CachePolicy object has been created (typically using the static + constructor from_string, or one of the static properties for special + policies), use the refresh_file() method to determine whether a given file + should be refreshed: + + >>> if my_policy.refresh_file(my_cache_file): + >>> # refresh the cache file + >>> else: + >>> # no need to refresh + + Use the refresh() method to check an arbitrary timestamp against the policy + (e.g. if the cached data is not in a file): + + >>> if my_policy.refresh(timestamp_of_last_refresh): + >>> # refresh the data + """ + + def __init__(self, max_age): + """Creates a new instance. + + If positive, the max_age parameter is the number of seconds after which + cached data should be refreshed. This parameter can also accept some + special values: + + - 0 indicates refresh should always occur, regardless of the age of the + cached data; + - -1 indicates the cache should be cleared. 
+ + It is recommended to obtain such special policies using either the + from_string static constructor or the static properties REFRESH, RESET, + rather than calling this constructor directly. This allows comparing a + policy against those pre-established policies as follows: + + >>> if my_policy == CachePolicy.RESET: + >>> # force reset + """ + + self._max_age = max_age + + def refresh(self, then): + """Indicates whether a refresh should occur for data last refreshed at + the indicated time. + + :param then: the time the data were last cached or refreshed, in + seconds since the Unix epoch + :return: True if the data should be refreshed, otherwise False + """ + + return time.time() - then > self._max_age + + def refresh_file(self, pathname): + """Indicates whether the specified file should be refreshed. + + This uses the last modification time of the file to determine the age + of the cached data. If the file does not exist, a refresh will + necessarily be mandated. + + :param pathname: the path to the file that maybe should be refreshed + :return: True if the file should be refreshed, otherwise False + """ + + if not os.path.exists(pathname): + return True + return self.refresh(os.path.getmtime(pathname)) + + @property + def always_refresh(self): + """Indicates whether this policy mandates a systematic refresh of the + cache.""" + + return self._max_age == 0 + + @property + def never_refresh(self): + """Indicates whether this policy mandates never refreshing the + cache.""" + + return self._max_age == timedelta.max.total_seconds() + + @property + def reset(self): + """Indicates whether this policy mandates a reset of the cache.""" + + return self._max_age == -1 + + _refresh_policy = None + _no_refresh_policy = None + _reset_policy = None + _click_type = None + + @classmethod + def from_string(cls, value): + """Creates a new instance from a string representation. + + This is the recommended way of getting a CachePolicy object. 
The value + can be either: + + - a number of seconds, followed by 's'; + - a number of days, optionally followed by 'd'; + - a number of weeks, followed by 'w'; + - a number of months, followed by 'm'; + - a number of years, followed by 'y'. + + Such a value will result in a policy mandating that cached data are + refreshed after the elapsed number of seconds, days, weeks, months, or + years since they were last cached. Note that in this context, a 'month' + is always 30 days and a 'year' is always 365 days. That is, '3m' is + merely a shortcut for '90d' (or simply '90') and '2y' is merely a + shortcut for '730d'. + + The value can also be: + + - 'refresh', to get the REFRESH policy; + - 'no-refresh', to get the NO_REFRESH policy; + - 'reset' or 'clear', to get the RESET policy. + + Any other value will cause None to be returned. + """ + + value = value.lower() + if value == 'refresh': + return cls.REFRESH + elif value == 'no-refresh': + return cls.NO_REFRESH + elif value in ['reset', 'clear']: + return cls.RESET + else: + if m := re.match('^([0-9]+)([sdwmy])?', value): + num, qual = m.groups() + if not qual: + qual = 'd' + if qual == 's': + return cls(int(num)) + else: + return cls(timedelta(days=int(num) * _durations[qual]).total_seconds()) + return None + + @classmethod + @property + def REFRESH(cls): + """A policy that cached data should always be refreshed.""" + + if cls._refresh_policy is None: + cls._refresh_policy = cls(max_age=0) + return cls._refresh_policy + + @classmethod + @property + def NO_REFRESH(cls): + """A policy that cached data should never be refreshed.""" + + if cls._no_refresh_policy is None: + cls._no_refresh_policy = cls(max_age=timedelta.max.total_seconds()) + return cls._no_refresh_policy + + @classmethod + @property + def RESET(cls): + """A policy that cached data should be cleared and refreshed.""" + + if cls._reset_policy is None: + cls._reset_policy = cls(max_age=-1) + return cls._reset_policy + + @classmethod + @property + def 
ClickType(cls): + """Helper method to parse a CachePolicy with Click. + + Use that method as the 'type' of a Click option to let Click + automatically convert the value of the option into a CachePolicy + instance. + + Example: + + >>> @click.option("--caching", type=CachePolicy.ClickType, + default="1w") + """ + + if cls._click_type is None: + from click import ParamType + + class CachePolicyParamType(ParamType): + name = 'cache-policy' + + def convert(self, value, param, ctx): + if isinstance(value, cls): + return value + + if p := cls.from_string(value): + return p + else: + self.fail(f"Cannot convert '{value}' to a cache policy", param, ctx) + + cls._click_type = CachePolicyParamType() + + return cls._click_type + + +class FileCache(object): + """Represents a file-based cache. + + This is intended as a layer built on top of Pystow, to add cache management + features that are lacking in Pystow. + """ + + def __init__(self, module, policy): + """Creates a new instance. + + :param module: a Pystow module representing the location where cached + data will be stored; all methods in this class will defer to this + object whenever a file needs to be actually refreshed + :param policy: a CachePolicy object that dictates when cached data + should be refreshed; may also be the string representation of such + a policy, which will then be passed to the CachePolicy.from_string + static constructor + """ + + self._module = module + if isinstance(policy, str): + self._policy = CachePolicy.from_string(policy) + else: + self._policy = policy + + @property + def policy(self): + """Gets the current caching policy used by this instance.""" + + return self._policy + + @policy.setter + def policy(self, policy): + """Sets the caching policy to be used by this instance.""" + + self._policy = policy + + def ensure_gunzip(self, url, name=None, autoclean=True): + """Looks up and maybe downloads and gunzips a file. + + This is a wrapper around Pystow's method of the same name. 
It behaves + similarly but, if the file is already present in the cache, it will + additionally check whether it needs to be downloaded again, according + to the current caching policy. + """ + + if not name: + name = name_from_url(url) + + db_path = self._module.join(name=base_from_gzip_name(name)) + + if self._policy.refresh_file(db_path): + self._module.ensure_gunzip(url=url, name=name, autoclean=autoclean, force=True) + + return db_path + + def ensure(self, *subkeys, url, name=None): + """Looks up and maybe downloads a file.""" + + if not name: + name = name_from_url(url) + + path = self._module.join(*subkeys, name=name) + + if self._policy.refresh_file(path): + self._module.ensure(*subkeys, url=url, name=name, force=True) From 86c9a9d83248300f01aa0570808a94690e371e27 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Sun, 18 Aug 2024 03:24:18 +0100 Subject: [PATCH 2/8] Re-implement cache-ls and cache-clear. Add new methods to the FileCache class to (1) get the list of files present in the cache and (2) delete files in the cache. Replace the implementations of the cache-ls and cache-clear commands to use the new methods, so that the details of cache listing and clearing remain encapsulated in FileCache. As a side-effect, this automatically fixes the issue that cache listing was only working on Unix-like systems, since the FileCache implementation is pure Python and does not rely on the ls(1) Unix command. 
--- src/oaklib/cli.py | 29 ++++++--------- src/oaklib/utilities/caching.py | 62 ++++++++++++++++++++++++++++++++- 2 files changed, 71 insertions(+), 20 deletions(-) diff --git a/src/oaklib/cli.py b/src/oaklib/cli.py index d0bb93b38..1991d86eb 100644 --- a/src/oaklib/cli.py +++ b/src/oaklib/cli.py @@ -9,14 +9,12 @@ # See https://stackoverflow.com/questions/47972638/how-can-i-define-the-order-of-click-sub-commands-in-help import json import logging -import os import statistics as stats import sys from collections import defaultdict from enum import Enum, unique from itertools import chain from pathlib import Path -from time import time from types import ModuleType from typing import ( Any, @@ -28,7 +26,6 @@ import click import kgcl_schema.grammar.parser as kgcl_parser -import pystow import sssom.writers as sssom_writers import sssom_schema import yaml @@ -5465,12 +5462,14 @@ def cache_ls(): """ List the contents of the pystow oaklib cache. - TODO: this currently only works on unix-based systems. """ - directory = pystow.api.join("oaklib") - command = f"ls -al {directory}" - click.secho(f"[pystow] {command}", fg="cyan", bold=True) - os.system(command) # noqa:S605 + units = ["B", "KB", "MB", "GB", "TB"] + for path, size, mtime in FILE_CACHE.get_contents(subdirs=True): + i = 0 + while size > 1024 and i < len(units) - 1: + size /= 1024 + i += 1 + click.echo(f"{path} ({size:.2f} {units[i]}, {mtime:%Y-%m-%d})") @main.command() @@ -5486,17 +5485,9 @@ def cache_clear(days_old: int): Clear the contents of the pystow oaklib cache. 
""" - directory = pystow.api.join("oaklib") - now = time() - for item in Path(directory).glob("*"): - if ".db" not in str(item): - continue - mtime = item.stat().st_mtime - curr_days_old = (int(now) - int(mtime)) / 86400 - logging.info(f"{item} is {curr_days_old}") - if curr_days_old > days_old: - click.echo(f"Deleting {item} which is {curr_days_old}") - item.unlink() + + for name, _, age in FILE_CACHE.clear(subdirs=False, older_than=days_old, pattern="*.db*"): + click.echo(f"Deleted {name} which was {age.days} days old") @main.command() diff --git a/src/oaklib/utilities/caching.py b/src/oaklib/utilities/caching.py index ae6fc9020..5489aad32 100644 --- a/src/oaklib/utilities/caching.py +++ b/src/oaklib/utilities/caching.py @@ -1,7 +1,8 @@ import os.path import re import time -from datetime import timedelta +from datetime import datetime, timedelta +from pathlib import Path from pystow.utils import base_from_gzip_name, name_from_url @@ -276,3 +277,62 @@ def ensure(self, *subkeys, url, name=None): if self._policy.refresh_file(path): self._module.ensure(*subkeys, url=url, name=name, force=True) + + def get_contents(self, subdirs=False): + """Gets a list of files present in the cache. + + This returns a list of (name, size, mtime) tuples, where: + + - name is the filename (relative to the cache directory); + - size is its size in bytes; + - mtime is its modification time, as a datetime object. + + If subdirs is True, the list includes files present in any subdirectory + within the cache. The default is to list only the files immediately + under the cache directory, excluding any subdirectory. + """ + + contents = [] + for path, name in self._iter_files(subdirs=subdirs): + stat = path.stat() + contents.append((name, stat.st_size, datetime.fromtimestamp(stat.st_mtime))) + return contents + + def clear(self, subdirs=False, older_than=None, pattern="*"): + """Deletes files present in the cache. 
+ + :param subdirs: if True, deletes files in subdirectories + :param older_than: if set, only deletes files that were last modified + longer ago than the specified number of days + :param pattern: only deletes files matching the specified pattern + :return: a list of tuples describing the files that were deleted; the + tuples are similar to the ones returned by get_contents, except + that the third item is the age of the deleted file (as a timedelta + object relative to current time) + """ + + now = time.time() + cleared = [] + for path, name in self._iter_files(subdirs=subdirs, pattern=pattern): + stat = path.stat() + age = now - stat.st_mtime + if older_than is not None and age <= older_than * 86400: + continue + cleared.append((name, stat.st_size, timedelta(seconds=age))) + path.unlink() + return cleared + + def _iter_files(self, subdirs=False, pattern="*"): + """Helper method to get the files present in the cache. + + :param subdirs: if True, get files in subdirectories + :param pattern: get files matching the pattern + :return: a list of (path, name) tuples where path is a Path object + pointing to a file in the cache, and name is its name relative to + the cache directory + """ + + base = self._module.join() + if subdirs: + pattern = "**/" + pattern + return [(c, str(c.relative_to(base))) for c in Path(base).glob(pattern) if c.is_file()] From ac3c29d9b47afe8b0575c23114046fc2761704c9 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Sun, 18 Aug 2024 12:30:15 +0100 Subject: [PATCH 3/8] Implement the cache reset policy. The intended difference between the REFRESH and RESET caching policies is that, when a cache lookup is attempted, REFRESH should cause the file that was looked up -- and only that file -- to be refreshed, leaving any other file that may be present in the cache untouched. 
RESET, on the other hand, should entirely clear the cache, so that not only the file that was looked up should be refreshed, but any other file that may be looked up in a subsequent call should be refreshed as well. This commit implements the intended behaviour for the RESET policy. --- src/oaklib/utilities/caching.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/oaklib/utilities/caching.py b/src/oaklib/utilities/caching.py index 5489aad32..546020492 100644 --- a/src/oaklib/utilities/caching.py +++ b/src/oaklib/utilities/caching.py @@ -257,6 +257,9 @@ def ensure_gunzip(self, url, name=None, autoclean=True): to the current caching policy. """ + if self._policy == CachePolicy.RESET: + self.clear(pattern="*.db*") + if not name: name = name_from_url(url) @@ -270,6 +273,9 @@ def ensure_gunzip(self, url, name=None, autoclean=True): def ensure(self, *subkeys, url, name=None): """Looks up and maybe downloads a file.""" + if self._policy == CachePolicy.RESET: + self.clear(pattern="*.db*") + if not name: name = name_from_url(url) From b1db7b95288f65e559a4fd2ed5a7aab97177be56 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Sun, 18 Aug 2024 14:22:05 +0100 Subject: [PATCH 4/8] Fix forced refresh for future timestamps and add tests. In principle, we should never have to compare a timestamp representing a future date when we check whether a cached file should be refreshed. However, files with bogus mtime values and/or computers configured with a bogus system time are certainly not uncommon, so encountering a timestamp higher than the current time can (and will) definitely happen. Under an "always refresh" policy, a refresh must be triggered even if the cached file appears to be "newer than now", so we explicitly implement that behaviour here. We also add a complete test fixture for the CachePolicy class. 
--- src/oaklib/utilities/caching.py | 3 + tests/test_utilities/test_caching.py | 95 ++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 tests/test_utilities/test_caching.py diff --git a/src/oaklib/utilities/caching.py b/src/oaklib/utilities/caching.py index 546020492..7c59e8d26 100644 --- a/src/oaklib/utilities/caching.py +++ b/src/oaklib/utilities/caching.py @@ -60,6 +60,9 @@ def refresh(self, then): :return: True if the data should be refreshed, otherwise False """ + if self._max_age <= 0: + # Forceful refresh/reset, even if "then" is somehow in the future + return True return time.time() - then > self._max_age def refresh_file(self, pathname): diff --git a/tests/test_utilities/test_caching.py b/tests/test_utilities/test_caching.py new file mode 100644 index 000000000..7d40dcdf3 --- /dev/null +++ b/tests/test_utilities/test_caching.py @@ -0,0 +1,95 @@ +import os +import time +import unittest + +from oaklib.utilities.caching import CachePolicy + + +class TestCachePolicy(unittest.TestCase): + + def test_refresh_policy(self): + policy = CachePolicy.from_string("refresh") + + self.assertTrue(policy.always_refresh) + self.assertFalse(policy.never_refresh) + self.assertFalse(policy.reset) + + self.assertEqual(CachePolicy.REFRESH, policy) + + now = time.time() + self.assertTrue(policy.refresh(now)) + self.assertTrue(policy.refresh(now + 86400)) # 1 day in the future + self.assertTrue(policy.refresh(now - 86400)) # 1 day in the past + + def test_never_refresh_policy(self): + policy = CachePolicy.from_string("no-refresh") + + self.assertTrue(policy.never_refresh) + self.assertFalse(policy.always_refresh) + self.assertFalse(policy.reset) + + self.assertEqual(CachePolicy.NO_REFRESH, policy) + + now = time.time() + self.assertFalse(policy.refresh(now)) + self.assertFalse(policy.refresh(now + 86400)) + self.assertFalse(policy.refresh(now - 86400)) + + # inexistent file is always refreshed even under "no-refresh" + 
self.assertTrue(policy.refresh_file("inexistent-file")) + + def test_reset_policy(self): + policy = CachePolicy.from_string("reset") + self.assertEqual(policy, CachePolicy.from_string("clear")) + + self.assertTrue(policy.reset) + self.assertFalse(policy.always_refresh) + self.assertFalse(policy.never_refresh) + + self.assertEqual(CachePolicy.RESET, policy) + + now = time.time() + self.assertTrue(policy.refresh(now)) + self.assertTrue(policy.refresh(now + 86400)) + self.assertTrue(policy.refresh(now - 86400)) + + def test_refresh_after_1day_policy(self): + policy = CachePolicy.from_string('1d') + + self.assertFalse(policy.always_refresh) + self.assertFalse(policy.never_refresh) + self.assertFalse(policy.reset) + + now = time.time() + self.assertTrue(policy.refresh(now - 90000)) # 25 hours in the past + self.assertFalse(policy.refresh(now - 82800)) # 23 hours in the past + + def test_refresh_file(self): + now = time.time() + + # Create dummy file with known mtime 3 days in the past + path = "tests/output/dummy-cache" + with open(path, "w"): + pass + os.utime(path, (now - 259200, now - 259200)) + + self.assertTrue(CachePolicy.REFRESH.refresh_file(path)) + self.assertTrue(CachePolicy.RESET.refresh_file(path)) + self.assertFalse(CachePolicy.NO_REFRESH.refresh_file(path)) + self.assertTrue(CachePolicy.from_string('2d').refresh_file(path)) + self.assertFalse(CachePolicy.from_string('4d').refresh_file(path)) + + os.unlink(path) + + # Inexistent file gets refreshed even under no-refresh + self.assertTrue(CachePolicy.NO_REFRESH.refresh_file(path)) + + def test_parsing_durations(self): + self.assertEqual(CachePolicy.from_string("1")._max_age, 86400) + self.assertEqual(CachePolicy.from_string("1d")._max_age, 86400) + self.assertEqual(CachePolicy.from_string("86400s")._max_age, 86400) + self.assertEqual(CachePolicy.from_string("1w")._max_age, 86400 * 7) + self.assertEqual(CachePolicy.from_string("1m")._max_age, 86400 * 30) + 
self.assertEqual(CachePolicy.from_string("1y")._max_age, 86400 * 365) + + self.assertIsNone(CachePolicy.from_string("bogus")) From e1f08a3c1bc1261bd2eaf01c44b0ca6c4659038c Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Sun, 18 Aug 2024 14:47:16 +0100 Subject: [PATCH 5/8] Add some documentation for --caching. In the SQLite tutorial, in the section that briefly mentions that automatically downloaded SQLite files are cached in ``.data/oaklib``, we describe in more details how the cache works and how it can be controlled using the `--caching` option. --- docs/intro/tutorial07.rst | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/intro/tutorial07.rst b/docs/intro/tutorial07.rst index 3ebdfb6c6..2d0532ae4 100644 --- a/docs/intro/tutorial07.rst +++ b/docs/intro/tutorial07.rst @@ -64,6 +64,25 @@ This will download the pato.db sqlite file once, and cache it. PyStow is used to cache the file, and the default location is ``~/.data/oaklib``. +By default, a cached SQLite file will be automatically refreshed (downloaded +again) if it is older than 7 days. That behavior can be controlled with the +global ``--caching`` option. For example, to force OAK to always download the +file regardless of its age: + +.. code-block:: + + runoak --caching=refresh -i sqlite:obo:pato search t~shape + +Other possible values for the ``--caching`` option include: + +- ``no-refresh`` to prevent OAK from re-downloading the file even it is older + than 7 days; +- ``Xd`` to refresh a cached file older than _X_ days; +- ``Xw`` to refresh a cached file older than _X_ weeks. + +You may also use the ``cache-clear`` command to force clearing any cached +SQLite file at anytime. + Building your own SQLite files ------------------- From 64a83e30c54fd8af68faea8632159d15c961ee9e Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Sun, 18 Aug 2024 22:50:39 +0100 Subject: [PATCH 6/8] Add complete documentation for the `--caching` option. 
Add a new section in the CLI reference documentation to explain how the cache works and how it can be controlled using the `--caching` option. Replace the previous, shorter documentation in the SQLite tutorial by a simple mention of the cache with a link to the newly added reference section. --- docs/cli.rst | 49 +++++++++++++++++++++++++++++++++++++++ docs/intro/tutorial07.rst | 19 ++------------- 2 files changed, 51 insertions(+), 17 deletions(-) diff --git a/docs/cli.rst b/docs/cli.rst index 1adf0422d..808a629b1 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -91,6 +91,55 @@ and tracing upwards through is_a and part_of relationships: uberon viz -p i,p hand foot +Cache Control +------------- + +OAK may download data from remote sources as part of its normal operations. For +example, using the :code:`sqlite:obo:...` input selector will cause OAK to +fetch the requested Semantic-SQL database from a centralised repository. +Whenever that happens, the downloaded data will be cached in a local directory +so that subsequent commands using the same input selector do not have to +download the file again. + +By default, OAK will refresh (download again) a previously downloaded file if +it was last downloaded more than 7 days ago. + +The global option :code:`--caching` gives the user some control on how the +cache works. + +To change the default cache expiry lifetime of 7 days, the :code:`--caching` +option accepts a value of the form :code:`ND`, where *N* is a positive integer +and *D* can be either :code:`s`, :code:`d`, :code:`w`, :code:`m`, or :code:`y` +to indicate that *N* is a number of seconds, days, weeks, months, or years, +respectively. If the *D* part is omitted, it defaults to :code:`d`. + +For example, :code:`--caching=3w` instructs OAK to refresh a cached file if it +was last refreshed 21 days ago. 
+ +The :code:`--caching` option also accepts the following special values: + +- :code:`refresh` to force OAK to always refresh a file regardless of its age; +- :code:`no-refresh` to do the opposite, that is, preventing OAK from + refreshing a file regardless of its age; +- :code:`clear` to forcefully clear the cache (which will trigger a refresh as + a consequence); +- :code:`reset` is a synonym of :code:`clear`. + +Note the difference between :code:`refresh` and :code:`clear`. The former will +only cause the requested file to be refreshed, leaving any other file that may +exist in the cache untouched. The latter will delete all cached files, so that +not only the requested file will be downloaded again, but any other +previously cached file will also have to be downloaded again the next time they +are requested. + +In both cases, refreshing and clearing will only happen if the OAK command in +which the :code:`--caching` option is used attempts to look up a cached file. +Otherwise the option will have no effect. + +To forcefully clear the cache independently of any command, the +:ref:`cache-clear` command may be used. The contents of the cache may be +explored at any time with the :ref:`cache-ls` command. + Commands ----------- diff --git a/docs/intro/tutorial07.rst b/docs/intro/tutorial07.rst index 2d0532ae4..e94f184c9 100644 --- a/docs/intro/tutorial07.rst +++ b/docs/intro/tutorial07.rst @@ -65,23 +65,8 @@ This will download the pato.db sqlite file once, and cache it. PyStow is used to cache the file, and the default location is ``~/.data/oaklib``. By default, a cached SQLite file will be automatically refreshed (downloaded -again) if it is older than 7 days. That behavior can be controlled with the -global ``--caching`` option. For example, to force OAK to always download the -file regardless of its age: - -.. 
code-block:: - - runoak --caching=refresh -i sqlite:obo:pato search t~shape - -Other possible values for the ``--caching`` option include: - -- ``no-refresh`` to prevent OAK from re-downloading the file even it is older - than 7 days; -- ``Xd`` to refresh a cached file older than _X_ days; -- ``Xw`` to refresh a cached file older than _X_ weeks. - -You may also use the ``cache-clear`` command to force clearing any cached -SQLite file at anytime. +again) if it is older than 7 days. For details on how to alter the behavior of +the cache, see the :ref:`Cache Control` section in the CLI documentation. Building your own SQLite files ------------------- From 83d2a595bf3ad2a8c1564aaeea50b0a6299b535a Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Tue, 20 Aug 2024 00:01:16 +0100 Subject: [PATCH 7/8] Allow controlling the cache through a configuration file. This commit adds the possibility to configure the file cache to apply pattern-specific caching policies. This is controlled by a configuration file ($XDG_CONFIG_HOME/ontology-access-kit/cache.conf, under GNU/Linux) containing "pattern=policy" pairs, where pattern is a shell-type globbing pattern and policy is a string of the same type as expected by the newly introduced --caching option. --- docs/cli.rst | 55 ++++++++++++++++ src/oaklib/cli.py | 5 +- src/oaklib/constants.py | 2 +- src/oaklib/utilities/caching.py | 99 +++++++++++++++++++++------- tests/input/cache.conf | 16 +++++ tests/test_utilities/test_caching.py | 43 +++++++++++- 6 files changed, 190 insertions(+), 30 deletions(-) create mode 100644 tests/input/cache.conf diff --git a/docs/cli.rst b/docs/cli.rst index 808a629b1..83ceb2784 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -104,6 +104,12 @@ download the file again. By default, OAK will refresh (download again) a previously downloaded file if it was last downloaded more than 7 days ago. 
+The behavior of the cache can be controlled in two ways: with an option on the +command line and with a configuration. + +Controlling the cache on the command line +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + The global option :code:`--caching` gives the user some control on how the cache works. @@ -140,6 +146,55 @@ To forcefully clear the cache independently of any command, the :ref:`cache-clear` command may be used. The contents of the cache may be explored at any time with the :ref:`cache-ls` command. +Controlling the cache with a configuration file +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Finer control of how the cache works is possible through a configuration file +that OAK will look up for at the following locations: + +- under GNU/Linux: in ``$XDG_CONFIG_HOME/ontology-access-kit/cache.conf``; +- under macOS: in ``$HOME/Library/Preferences/ontology-access-kit/cache.conf``; +- under Windows: in ``%LOCALAPPDATA%\ontology-access-kit\ontology-access-kit\cache.conf``, + or ``%APPDATA%\ontology-access-kit\ontology-access-kit\cache.conf`` if the + user is using a roaming profile. + +The file should contain lines of the form :code:`pattern = policy`, where: + +- *pattern* is a shell-type globbing pattern indicating the files that will be + concerned by the policy set forth on the line; +- *policy* is the same type of value as expected by the :code:`--caching` + option as explained in the previous section. + +Blank lines and lines starting with :code:`#` are ignored. + +If the *pattern* is :code:`default` (or :code:`*`), the corresponding policy +will be used for any cached file that does not have a matching policy. + +Here is a sample configuration file: + +.. 
code-block:: + + # Uberon will be refreshed if older than 1 month + uberon.db = 1m + # FBbt will be refreshed if older than 2 weeks + fbbt.db = 2w + # Other FlyBase ontologies will be refreshed if older than 2 months + fb*.db = 2m + # All other files will be refreshed if older than 3 weeks + default = 3w + +Note that when looking up the policy to apply to a given file, patterns are +tried in the order they appear in the file. This is why the :code:`fbbt.db` +pattern in the example above must be listed *before* the less specific +:code:`fb*.db` pattern, otherwise it would be ignored. (This does not apply to +the default pattern -- whether it is specified as :code:`default` or as +:code:`*` -- which is always tried after all the other patterns.) + +The :code:`--caching` option described in the previous section always takes +precedence over the configuration file. That is, all rules set forth in the +configuration will be ignored if the :code:`--caching` option is specified on +the command line. 
+ Commands ----------- diff --git a/src/oaklib/cli.py b/src/oaklib/cli.py index 1991d86eb..ab9541e3a 100644 --- a/src/oaklib/cli.py +++ b/src/oaklib/cli.py @@ -570,7 +570,6 @@ def _apply_changes(impl, changes: List[kgcl.Change]): @click.option( "--caching", type=CachePolicy.ClickType, - default="1w", show_default=True, help="Set the cache management policy", ) @@ -593,7 +592,7 @@ def main( prefix, profile: bool, import_depth: Optional[int], - caching: CachePolicy, + caching: Optional[CachePolicy], **kwargs, ): """ @@ -642,7 +641,7 @@ def exit(): import requests_cache requests_cache.install_cache(requests_cache_db) - FILE_CACHE.policy = caching + FILE_CACHE.force_policy(caching) resource = OntologyResource() resource.slug = input settings.autosave = autosave diff --git a/src/oaklib/constants.py b/src/oaklib/constants.py index 837d9d441..7160ae825 100644 --- a/src/oaklib/constants.py +++ b/src/oaklib/constants.py @@ -10,5 +10,5 @@ ] OAKLIB_MODULE = pystow.module("oaklib") -FILE_CACHE = FileCache(OAKLIB_MODULE, '1w') +FILE_CACHE = FileCache(OAKLIB_MODULE) TIMEOUT_SECONDS = 30 diff --git a/src/oaklib/utilities/caching.py b/src/oaklib/utilities/caching.py index 7c59e8d26..c8010b30e 100644 --- a/src/oaklib/utilities/caching.py +++ b/src/oaklib/utilities/caching.py @@ -1,12 +1,18 @@ +import fnmatch +import logging import os.path import re import time from datetime import datetime, timedelta from pathlib import Path +from appdirs import user_config_dir from pystow.utils import base_from_gzip_name, name_from_url +from oaklib.datamodels.vocabulary import APP_NAME + _durations = {'d': 1, 'w': 7, 'm': 30, 'y': 365} +_logger = logging.getLogger(__name__) class CachePolicy(object): @@ -221,35 +227,30 @@ class FileCache(object): features that are lacking in Pystow. """ - def __init__(self, module, policy): + def __init__(self, module): """Creates a new instance. 
:param module: a Pystow module representing the location where cached data will be stored; all methods in this class will defer to this object whenever a file needs to be actually refreshed - :param policy: a CachePolicy object that dictates when cached data - should be refreshed; may also be the string representation of such - a policy, which will then be passed to the CachePolicy.from_string - static constructor """ self._module = module - if isinstance(policy, str): - self._policy = CachePolicy.from_string(policy) - else: - self._policy = policy - - @property - def policy(self): - """Gets the current caching policy used by this instance.""" - - return self._policy - - @policy.setter - def policy(self, policy): - """Sets the caching policy to be used by this instance.""" + self._default_policy = CachePolicy.from_string('1w') + self._forced_policy = None + self._policies = [] + self._config_file = os.path.join(user_config_dir(APP_NAME), "cache.conf") + self._config_read = False + + def force_policy(self, policy): + """Forces the cache to use the specified policy, regardless of any + otherwise configured policies. + + :param policy: the policy to use; may be None to allow the use of + configured policies + """ - self._policy = policy + self._forced_policy = policy def ensure_gunzip(self, url, name=None, autoclean=True): """Looks up and maybe downloads and gunzips a file. @@ -260,15 +261,16 @@ def ensure_gunzip(self, url, name=None, autoclean=True): to the current caching policy. 
""" - if self._policy == CachePolicy.RESET: + if self._forced_policy == CachePolicy.RESET: self.clear(pattern="*.db*") if not name: name = name_from_url(url) - db_path = self._module.join(name=base_from_gzip_name(name)) + ungz_name = base_from_gzip_name(name) + db_path = self._module.join(name=ungz_name) - if self._policy.refresh_file(db_path): + if self._get_policy(ungz_name).refresh_file(db_path): self._module.ensure_gunzip(url=url, name=name, autoclean=autoclean, force=True) return db_path @@ -276,7 +278,7 @@ def ensure_gunzip(self, url, name=None, autoclean=True): def ensure(self, *subkeys, url, name=None): """Looks up and maybe downloads a file.""" - if self._policy == CachePolicy.RESET: + if self._forced_policy == CachePolicy.RESET: self.clear(pattern="*.db*") if not name: @@ -284,9 +286,11 @@ def ensure(self, *subkeys, url, name=None): path = self._module.join(*subkeys, name=name) - if self._policy.refresh_file(path): + if self._get_policy(name).refresh_file(path): self._module.ensure(*subkeys, url=url, name=name, force=True) + return path + def get_contents(self, subdirs=False): """Gets a list of files present in the cache. 
@@ -345,3 +349,48 @@ def _iter_files(self, subdirs=False, pattern="*"): if subdirs: pattern = "**/" + pattern return [(c, str(c.relative_to(base))) for c in Path(base).glob(pattern) if c.is_file()] + + def _get_policy(self, name): + """Gets the caching policy to use for the specified name.""" + + if self._forced_policy is not None: + return self._forced_policy + + if not self._config_read: + self._get_configuration(self._config_file) + + for pattern, policy in self._policies: + if fnmatch.fnmatch(name, pattern): + return policy + + return self._default_policy + + def _get_configuration(self, pathname): + """Gets cache policies from a configuration file.""" + + if not os.path.exists(pathname): + return + + filename = os.path.basename(pathname) + with open(pathname, "r") as f: + for n, line in enumerate(f): + if line.startswith("#") or line.isspace(): + continue + + items = line.split("=", maxsplit=1) + pattern = items[0].strip() + if len(items) != 2: + _logger.warning(f"{filename}({n}): Ignoring missing caching policy for {pattern}") + continue + + policy = CachePolicy.from_string(items[1].strip()) + if policy is None: + _logger.warning(f"{filename}({n}): Ignoring invalid caching policy for {pattern}") + continue + + if pattern in ["default", "*"]: + self._default_policy = policy + else: + self._policies.append((pattern, policy)) + + self._config_read = True diff --git a/tests/input/cache.conf b/tests/input/cache.conf new file mode 100644 index 000000000..408e6757d --- /dev/null +++ b/tests/input/cache.conf @@ -0,0 +1,16 @@ +# Test file for the file cache configuration + +# Default policy: refresh after 1 week +default = 1w + +# Refresh Uberon after 2 weeks +uberon.db = 2w + +# Refresh FlyBase ontologies after 1 month +fb*.db = 1m + +# Warning: pattern without associated policy +missing_policy.db + +# Warning: invalid policy +invalid_policy.db = invalid diff --git a/tests/test_utilities/test_caching.py b/tests/test_utilities/test_caching.py index 
7d40dcdf3..e8dff2467 100644 --- a/tests/test_utilities/test_caching.py +++ b/tests/test_utilities/test_caching.py @@ -2,7 +2,7 @@ import time import unittest -from oaklib.utilities.caching import CachePolicy +from oaklib.utilities.caching import CachePolicy, FileCache class TestCachePolicy(unittest.TestCase): @@ -93,3 +93,44 @@ def test_parsing_durations(self): self.assertEqual(CachePolicy.from_string("1y")._max_age, 86400 * 365) self.assertIsNone(CachePolicy.from_string("bogus")) + +class TestFileCache(unittest.TestCase): + + def test_parse_cache_configuration(self): + cache = FileCache(None) # we don't need a Pystow module here + + with self.assertLogs() as log: + cache._get_configuration("tests/input/cache.conf") + self.assertTrue("missing caching policy" in log.output[0]) + self.assertTrue("invalid caching policy" in log.output[1]) + + self.assertEqual(cache._default_policy._max_age, 86400 * 7) + self.assertEqual(cache._policies[0][0], "uberon.db") + self.assertEqual(cache._policies[0][1]._max_age, 86400 * 7 * 2) + self.assertEqual(cache._policies[1][0], "fb*.db") + self.assertEqual(cache._policies[1][1]._max_age, 86400 * 30) + + def test_policy_selector(self): + cache = FileCache(None) + cache._policies.append(("uberon.db", CachePolicy.from_string("2w"))) + cache._policies.append(("fbbt.db", CachePolicy.from_string("3w"))) + cache._policies.append(("fb*.db", CachePolicy.from_string("1m"))) + cache._policies.append(("fbcv.db", CachePolicy.from_string("1y"))) + + # Prevent a configuration file from messing with the test + cache._config_read = True + + # Check the right policy is selected + self.assertEqual(cache._get_policy("uberon.db")._max_age, 86400 * 7 * 2) + self.assertEqual(cache._get_policy("fbbt.db")._max_age, 86400 * 7 * 3) + self.assertEqual(cache._get_policy("fbdv.db")._max_age, 86400 * 30) + self.assertEqual(cache._get_policy("fbcv.db")._max_age, 86400 * 30) + self.assertEqual(cache._get_policy("other.db")._max_age, 86400 * 7) + + # Check that
"forced policy" takes precedence + cache.force_policy(CachePolicy.from_string("2d")) + self.assertEqual(cache._get_policy("uberon.db")._max_age, 86400 * 2) + self.assertEqual(cache._get_policy("fbbt.db")._max_age, 86400 * 2) + self.assertEqual(cache._get_policy("fbdv.db")._max_age, 86400 * 2) + self.assertEqual(cache._get_policy("fbcv.db")._max_age, 86400 * 2) + self.assertEqual(cache._get_policy("other.db")._max_age, 86400 * 2) From f041562ddf206ea68d2be6bcbe9d42ee9efb8f88 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Tue, 20 Aug 2024 18:36:32 +0100 Subject: [PATCH 8/8] Misc documentation fix. The "user_config_dir" returned by the Appdirs package under macOS is not in "~/Library/Preferences" but under "~/Library/Application Support" (Appdirs documentation is not up to date). Also, there is no need to mention the roaming directory under Windows, as Appdirs will never use that directory unless we explicitly ask it to do so (which we don't). There is also no need for a show_default=True parameter with the --caching option, since that option has _no_ default. --- docs/cli.rst | 8 +++----- src/oaklib/cli.py | 1 - 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/docs/cli.rst b/docs/cli.rst index 83ceb2784..253e654cc 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -105,7 +105,7 @@ By default, OAK will refresh (download again) a previously downloaded file if it was last downloaded more than 7 days ago. The behavior of the cache can be controlled in two ways: with an option on the -command line and with a configuration. +command line and with a configuration file.
Controlling the cache on the command line ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -153,10 +153,8 @@ Finer control of how the cache works is possible through a configuration file that OAK will look up for at the following locations: - under GNU/Linux: in ``$XDG_CONFIG_HOME/ontology-access-kit/cache.conf``; -- under macOS: in ``$HOME/Library/Preferences/ontology-access-kit/cache.conf``; -- under Windows: in ``%LOCALAPPDATA%\ontology-access-kit\ontology-access-kit\cache.conf``, - or ``%APPDATA%\ontology-access-kit\ontology-access-kit\cache.conf`` if the - user is using a roaming profile. +- under macOS: in ``$HOME/Library/Application Support/ontology-access-kit/cache.conf``; +- under Windows: in ``%LOCALAPPDATA%\ontology-access-kit\ontology-access-kit\cache.conf``. The file should contain lines of the form :code:`pattern = policy`, where: diff --git a/src/oaklib/cli.py b/src/oaklib/cli.py index ab9541e3a..8fd51aa5e 100644 --- a/src/oaklib/cli.py +++ b/src/oaklib/cli.py @@ -570,7 +570,6 @@ def _apply_changes(impl, changes: List[kgcl.Change]): @click.option( "--caching", type=CachePolicy.ClickType, - show_default=True, help="Set the cache management policy", ) def main(