From dafe5ac599e125fa50748aff6d24bd2b9f4978b8 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 20 Apr 2021 09:48:58 +0200
Subject: [PATCH 01/16] Configuration ci (#1049)

* Add a non-functional entry point

* Allow setting of API key through CLI

- Add function to set any field in the configuration file
- Add function to read out the configuration file
- Towards full configurability from CLI

* Remove autocomplete promise, use _defaults

Autocomplete seems to be incompatible with `choices`, so I'll ignore that
for now. We also use `config._defaults` instead of an explicit list to
avoid duplication.

* Add server configuration

* Allow fields to be set directly non-interactively

With the `openml configure FIELD VALUE` command.

* Combine error and check functionalities

Otherwise you have to duplicate all checks in the error message function.

* Share logic about setting/collecting the value

* Complete CLI for other fields.

Max_retries is excluded because it should not be user configurable, and
will most likely be removed. Verbosity is configurable but is currently
not actually used.

* Bring back sanitizing user input

And extend it to the bool inputs.

* Add small bit of info about the command line tool

* Add API key configuration note in the introduction

* Add to progress log

* Refactor flow of wait_until_valid_input
---
 .flake8                                    |   1 +
 doc/progress.rst                           |   1 +
 doc/usage.rst                              |   4 +
 examples/20_basic/introduction_tutorial.py |   8 +-
 openml/cli.py                              | 331 +++++++++++++++++++++
 openml/config.py                           |  42 ++-
 setup.py                                   |   1 +
 7 files changed, 378 insertions(+), 10 deletions(-)
 create mode 100644 openml/cli.py

diff --git a/.flake8 b/.flake8
index 08bb8ea10..211234f22 100644
--- a/.flake8
+++ b/.flake8
@@ -5,6 +5,7 @@ select = C,E,F,W,B,T
 ignore = E203, E402, W503
 per-file-ignores =
     *__init__.py:F401
+    *cli.py:T001
 exclude =
     venv
     examples
diff --git a/doc/progress.rst b/doc/progress.rst
index f27dd1137..2fbf95b31 100644
--- a/doc/progress.rst
+++ b/doc/progress.rst
@@ -15,6 +15,7 @@ Changelog
 * DOC #1051: Document existing extensions to OpenML-Python besides the shipped scikit-learn
   extension.
 * FIX #1035: Render class attributes and methods again.
+* ADD #1049: Add a command line tool for configuring openml-python.
 * FIX #1042: Fixes a rare concurrency issue with OpenML-Python and joblib which caused the joblib
   worker pool to fail.
 * FIX #1053: Fixes a bug which could prevent importing the package in a docker container.
diff --git a/doc/usage.rst b/doc/usage.rst
index 23ef4ec84..e106e6d60 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -59,6 +59,10 @@ which are separated by newlines. The following keys are defined:
         * 1: info output
         * 2: debug output
 
+This file can easily be configured with the ``openml`` command line interface.
+To see where the file is stored and what its values are, use ``openml configure none``.
+Set any field with ``openml configure FIELD``, or even all fields with just ``openml configure``.
+
 ~~~~~~~~~~~~
 Key concepts
 ~~~~~~~~~~~~
diff --git a/examples/20_basic/introduction_tutorial.py b/examples/20_basic/introduction_tutorial.py
index 151692fdc..737362e49 100644
--- a/examples/20_basic/introduction_tutorial.py
+++ b/examples/20_basic/introduction_tutorial.py
@@ -42,13 +42,17 @@
 # * After logging in, open your account page (avatar on the top right)
 # * Open 'Account Settings', then 'API authentication' to find your API key.
 #
-# There are two ways to authenticate:
+# There are two ways to permanently authenticate:
 #
+# * Use the ``openml`` CLI tool with ``openml configure apikey MYKEY``,
+#   replacing **MYKEY** with your API key.
 # * Create a plain text file **~/.openml/config** with the line
 #   **'apikey=MYKEY'**, replacing **MYKEY** with your API key. The config
 #   file must be in the directory ~/.openml/config and exist prior to
 #   importing the openml module.
-# * Run the code below, replacing 'YOURKEY' with your API key.
+#
+# Alternatively, by running the code below and replacing 'YOURKEY' with your API key,
+# you authenticate for the duration of the Python process.
 #
 # .. warning:: This example uploads data. For that reason, this example
 #   connects to the test server instead. This prevents the live server from
diff --git a/openml/cli.py b/openml/cli.py
new file mode 100644
index 000000000..b26e67d2e
--- /dev/null
+++ b/openml/cli.py
@@ -0,0 +1,331 @@
+""" Command Line Interface for `openml` to configure its settings. """
+
+import argparse
+import os
+import pathlib
+import string
+from typing import Union, Callable
+from urllib.parse import urlparse
+
+
+from openml import config
+
+
+def is_hex(string_: str) -> bool:
+    return all(c in string.hexdigits for c in string_)
+
+
+def looks_like_url(url: str) -> bool:
+    # There's no thorough url parser, but we only seem to use netloc.
+    try:
+        return bool(urlparse(url).netloc)
+    except Exception:
+        return False
+
+
+def wait_until_valid_input(
+    prompt: str, check: Callable[[str], str], sanitize: Union[Callable[[str], str], None]
+) -> str:
+    """ Asks `prompt` until an input is received for which `check` reports no error.
+
+    Parameters
+    ----------
+    prompt: str
+        message to display
+    check: Callable[[str], str]
+        function to call with the given input; it returns an error message if the input
+        is not valid, and a False-like value otherwise.
+    sanitize: Callable[[str], str], optional
+        A function which attempts to sanitize the user input (e.g. auto-complete).
+
+    Returns
+    -------
+    valid input
+
+    """
+
+    while True:
+        response = input(prompt)
+        if sanitize:
+            response = sanitize(response)
+        error_message = check(response)
+        if error_message:
+            print(error_message, end="\n\n")
+        else:
+            return response
+
+
+def print_configuration():
+    file = config.determine_config_file_path()
+    header = f"File '{file}' contains (or defaults to):"
+    print(header)
+
+    max_key_length = max(map(len, config.get_config_as_dict()))
+    for field, value in config.get_config_as_dict().items():
+        print(f"{field.ljust(max_key_length)}: {value}")
+
+
+def verbose_set(field, value):
+    config.set_field_in_config_file(field, value)
+    print(f"{field} set to '{value}'.")
+
+
+def configure_apikey(value: str) -> None:
+    def check_apikey(apikey: str) -> str:
+        if len(apikey) != 32:
+            return f"The key should contain 32 characters but contains {len(apikey)}."
+        if not is_hex(apikey):
+            return "Some characters are not hexadecimal."
+        return ""
+
+    instructions = (
+        f"Your current API key is set to: '{config.apikey}'. "
+        "You can get an API key at https://new.openml.org. "
+        "You must create an account if you don't have one yet:\n"
+        "  1. Log in with the account.\n"
+        "  2. Navigate to the profile page (top right circle > Your Profile).\n"
+        "  3. Click the API Key button to reach the page with your API key.\n"
+        "If you have any difficulty following these instructions, let us know on GitHub."
+    )
+
+    configure_field(
+        field="apikey",
+        value=value,
+        check_with_message=check_apikey,
+        intro_message=instructions,
+        input_message="Please enter your API key:",
+    )
+
+
+def configure_server(value: str) -> None:
+    def check_server(server: str) -> str:
+        is_shorthand = server in ["test", "production"]
+        if is_shorthand or looks_like_url(server):
+            return ""
+        return "Must be 'test', 'production' or a url."
+
+    def replace_shorthand(server: str) -> str:
+        if server == "test":
+            return "https://test.openml.org/api/v1/xml"
+        if server == "production":
+            return "https://www.openml.org/api/v1/xml"
+        return server
+
+    configure_field(
+        field="server",
+        value=value,
+        check_with_message=check_server,
+        intro_message="Specify which server you wish to connect to.",
+        input_message="Specify a url or use 'test' or 'production' as a shorthand: ",
+        sanitize=replace_shorthand,
+    )
+
+
+def configure_cachedir(value: str) -> None:
+    def check_cache_dir(path: str) -> str:
+        p = pathlib.Path(path)
+        if p.is_file():
+            return f"'{path}' is a file, not a directory."
+        expanded = p.expanduser()
+        if not expanded.is_absolute():
+            return f"'{path}' is not absolute (even after expanding '~')."
+        if not expanded.exists():
+            try:
+                os.mkdir(expanded)
+            except PermissionError:
+                return f"'{path}' does not exist and there are not enough permissions to create it."
+        return ""
+
+    configure_field(
+        field="cachedir",
+        value=value,
+        check_with_message=check_cache_dir,
+        intro_message="Configuring the cache directory. It cannot be a relative path.",
+        input_message="Specify the directory to use (or create) as cache directory: ",
+    )
+    print("NOTE: Data from your old cache directory is not moved over.")
+
+
+def configure_connection_n_retries(value: str) -> None:
+    def valid_connection_retries(n: str) -> str:
+        if not n.isdigit():
+            return f"Must be an integer number (at most {config.max_retries})."
+        if int(n) > config.max_retries:
+            return f"connection_n_retries may not exceed {config.max_retries}."
+        if int(n) == 0:
+            return "connection_n_retries must be non-zero."
+        return ""
+
+    configure_field(
+        field="connection_n_retries",
+        value=value,
+        check_with_message=valid_connection_retries,
+        intro_message="Configuring the number of times to attempt to connect to the OpenML Server.",
+        input_message=f"Enter an integer between 1 and {config.max_retries}: ",
+    )
+
+
+def configure_avoid_duplicate_runs(value: str) -> None:
+    def is_python_bool(bool_: str) -> str:
+        if bool_ in ["True", "False"]:
+            return ""
+        return "Must be 'True' or 'False' (mind the capital)."
+
+    def autocomplete_bool(bool_: str) -> str:
+        if bool_.lower() in ["n", "no", "f", "false", "0"]:
+            return "False"
+        if bool_.lower() in ["y", "yes", "t", "true", "1"]:
+            return "True"
+        return bool_
+
+    intro_message = (
+        "If set to True, when `run_flow_on_task` or similar methods are called a lookup is "
+        "performed to see if there already exists such a run on the server. "
+        "If so, those results are downloaded instead. "
+        "If set to False, runs will always be executed."
+    )
+
+    configure_field(
+        field="avoid_duplicate_runs",
+        value=value,
+        check_with_message=is_python_bool,
+        intro_message=intro_message,
+        input_message="Enter 'True' or 'False': ",
+        sanitize=autocomplete_bool,
+    )
+
+
+def configure_verbosity(value: str) -> None:
+    def is_zero_through_two(verbosity: str) -> str:
+        if verbosity in ["0", "1", "2"]:
+            return ""
+        return "Must be '0', '1' or '2'."
+
+    intro_message = (
+        "Set the verbosity of log messages which should be shown by openml-python."
+        "\n 0: normal output (warnings and errors)"
+        "\n 1: info output (some high-level progress output)"
+        "\n 2: debug output (detailed information (for developers))"
+    )
+
+    configure_field(
+        field="verbosity",
+        value=value,
+        check_with_message=is_zero_through_two,
+        intro_message=intro_message,
+        input_message="Enter '0', '1' or '2': ",
+    )
+
+
+def configure_field(
+    field: str,
+    value: Union[None, str],
+    check_with_message: Callable[[str], str],
+    intro_message: str,
+    input_message: str,
+    sanitize: Union[Callable[[str], str], None] = None,
+) -> None:
+    """ Configure `field` with `value`. If `value` is None ask the user for input.
+
+    `value` and user input are first corrected/auto-completed with `sanitize` if provided,
+    then validated with the `check_with_message` function.
+    If the user inputs an invalid value in interactive mode, they get to input a new value.
+    The new valid value is saved in the openml configuration file.
+    In case an invalid `value` is supplied directly (non-interactive), no changes are made.
+
+    Parameters
+    ----------
+    field: str
+        Field to set.
+    value: str, None
+        Value to set the field to. If `None` will ask user for input.
+    check_with_message: Callable[[str], str]
+        Function which validates `value` or user input, and returns either an error message if it
+        is invalid, or a False-like value if `value` is valid.
+    intro_message: str
+        Message that is printed once if user input is requested (e.g. instructions).
+    input_message: str
+        Message that comes with the input prompt.
+    sanitize: Union[Callable[[str], str], None]
+        A function to convert user input to 'more acceptable' input, e.g. for auto-complete.
+        If no correction of user input is possible, it should return the original value.
+        If no function is provided, no attempt is made to correct/auto-complete input.
+    """
+    if value is not None:
+        if sanitize:
+            value = sanitize(value)
+        malformed_input = check_with_message(value)
+        if malformed_input:
+            print(malformed_input)
+            quit()
+    else:
+        print(intro_message)
+        value = wait_until_valid_input(
+            prompt=input_message, check=check_with_message, sanitize=sanitize,
+        )
+    verbose_set(field, value)
+
+
+def configure(args: argparse.Namespace):
+    """ Calls the right submenu(s) to edit `args.field` in the configuration file. """
+    set_functions = {
+        "apikey": configure_apikey,
+        "server": configure_server,
+        "cachedir": configure_cachedir,
+        "connection_n_retries": configure_connection_n_retries,
+        "avoid_duplicate_runs": configure_avoid_duplicate_runs,
+        "verbosity": configure_verbosity,
+    }
+
+    def not_supported_yet(_):
+        print(f"Setting '{args.field}' is not supported yet.")
+
+    if args.field not in ["all", "none"]:
+        set_functions.get(args.field, not_supported_yet)(args.value)
+    else:
+        if args.value is not None:
+            print(f"Cannot set value ('{args.value}') when field is specified as '{args.field}'.")
+            quit()
+        print_configuration()
+
+    if args.field == "all":
+        for set_field_function in set_functions.values():
+            print()  # Visually separating the output by field.
+            set_field_function(args.value)
+
+
+def main() -> None:
+    subroutines = {"configure": configure}
+
+    parser = argparse.ArgumentParser()
+    subparsers = parser.add_subparsers(dest="subroutine")
+
+    parser_configure = subparsers.add_parser(
+        "configure",
+        description="Set or read variables in your configuration file. 
For more help also see "
+        "'https://openml.github.io/openml-python/master/usage.html#configuration'.",
+    )
+
+    configurable_fields = [f for f in config._defaults if f not in ["max_retries"]]
+
+    parser_configure.add_argument(
+        "field",
+        type=str,
+        choices=[*configurable_fields, "all", "none"],
+        default="all",
+        nargs="?",
+        help="The field you wish to edit. "
+        "Choosing 'all' lets you configure all fields one by one. "
+        "Choosing 'none' will print out the current configuration.",
+    )
+
+    parser_configure.add_argument(
+        "value", type=str, default=None, nargs="?", help="The value to set the FIELD to.",
+    )
+
+    args = parser.parse_args()
+    subroutines.get(args.subroutine, lambda _: parser.print_help())(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/openml/config.py b/openml/config.py
index 4516e96e1..7295ea82e 100644
--- a/openml/config.py
+++ b/openml/config.py
@@ -9,7 +9,7 @@
 import os
 from pathlib import Path
 import platform
-from typing import Tuple, cast
+from typing import Tuple, cast, Any
 from io import StringIO
 import configparser
@@ -177,6 +177,16 @@ def stop_using_configuration_for_example(cls):
         cls._start_last_called = False
 
 
+def determine_config_file_path() -> Path:
+    if platform.system() == "Linux":
+        config_dir = Path(os.environ.get("XDG_CONFIG_HOME", Path("~") / ".config" / "openml"))
+    else:
+        config_dir = Path("~") / ".openml"
+    # Still use os.path.expanduser to trigger the mock in the unit test
+    config_dir = Path(os.path.expanduser(config_dir))
+    return config_dir / "config"
+
+
 def _setup(config=None):
     """Setup openml package. Called on first import.
 
@@ -193,13 +203,8 @@
     global connection_n_retries
     global max_retries
 
-    if platform.system() == "Linux":
-        config_dir = Path(os.environ.get("XDG_CONFIG_HOME", Path("~") / ".config" / "openml"))
-    else:
-        config_dir = Path("~") / ".openml"
-    # Still use os.path.expanduser to trigger the mock in the unit test
-    config_dir = Path(os.path.expanduser(config_dir))
-    config_file = config_dir / "config"
+    config_file = determine_config_file_path()
+    config_dir = config_file.parent
 
     # read config file, create directory for config file
     if not os.path.exists(config_dir):
@@ -258,6 +263,27 @@ def _get(config, key):
     )
 
 
+def set_field_in_config_file(field: str, value: Any):
+    """ Overwrites the `field` in the configuration file with the new `value`. """
+    if field not in _defaults:
+        raise ValueError(f"Field '{field}' is not valid and must be one of '{_defaults.keys()}'.")
+
+    globals()[field] = value
+    config_file = determine_config_file_path()
+    config = _parse_config(str(config_file))
+    with open(config_file, "w") as fh:
+        for f in _defaults.keys():
+            # We can't blindly set all values based on globals() because when the user
+            # sets it through config.FIELD it should not be stored to file.
+            # There doesn't seem to be a way to avoid writing defaults to file with configparser,
+            # because it is impossible to distinguish an explicitly set value that happens to
+            # match the default from one that was set to the default because it was omitted.
+            value = config.get("FAKE_SECTION", f)
+            if f == field:
+                value = globals()[f]
+            fh.write(f"{f} = {value}\n")
+
+
 def _parse_config(config_file: str):
     """ Parse the config file, set up defaults.
""" config = configparser.RawConfigParser(defaults=_defaults) diff --git a/setup.py b/setup.py index 2d2a638b5..bad7da2b4 100644 --- a/setup.py +++ b/setup.py @@ -102,4 +102,5 @@ "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", ], + entry_points={"console_scripts": ["openml=openml.cli:main"]}, ) From 6b719819f8614b3f72c9c8af131b782086b64d0e Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 21 Apr 2021 11:35:18 +0200 Subject: [PATCH 02/16] Speed up dataset unit tests (#1056) * Speed up dataset unit tests by only loading necessary datasets * Revert "Speed up dataset unit tests" This reverts commit 861b52df109a126d6ffaeb29c3c1010254dbc30c. * address suggestions from Pieter --- tests/test_datasets/test_dataset.py | 40 +++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 416fce534..1aeffdbb4 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -24,13 +24,43 @@ def setUp(self): # Load dataset id 2 - dataset 2 is interesting because it contains # missing values, categorical features etc. - self.dataset = openml.datasets.get_dataset(2, download_data=False) + self._dataset = None # titanic as missing values, categories, and string - self.titanic = openml.datasets.get_dataset(40945, download_data=False) + self._titanic = None # these datasets have some boolean features - self.pc4 = openml.datasets.get_dataset(1049, download_data=False) - self.jm1 = openml.datasets.get_dataset(1053, download_data=False) - self.iris = openml.datasets.get_dataset(61, download_data=False) + self._pc4 = None + self._jm1 = None + self._iris = None + + @property + def dataset(self): + if self._dataset is None: + self._dataset = openml.datasets.get_dataset(2, download_data=False) + return self._dataset + + @property + def titanic(self): + if self._titanic is None: + self._titanic = openml.datasets.get_dataset(40945, download_data=False) + return self._titanic + + @property + def pc4(self): + if self._pc4 is None: + self._pc4 = openml.datasets.get_dataset(1049, download_data=False) + return self._pc4 + + @property + def jm1(self): + if self._jm1 is None: + self._jm1 = openml.datasets.get_dataset(1053, download_data=False) + return self._jm1 + + @property + def iris(self): + if self._iris is None: + self._iris = openml.datasets.get_dataset(61, download_data=False) + return self._iris def test_repr(self): # create a bare-bones dataset as would be returned by From 10c9dc527c3ace65e1f761d05269d60ee3656ddd Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 21 Apr 2021 11:39:02 +0200 Subject: [PATCH 03/16] Fix documentation links (#1048) * fix warnings, make sphinx fail on warnings * fix a few links * fix a bunch of links * fix more links * fix all remaining links * and finally add the link checker * debug workflow * more debug * undo debug * Add to changelog * fix new warning * clean up more errors * Fix link after rebase * Apply suggestions from code review Co-authored-by: PGijsbers Co-authored-by: PGijsbers --- .github/workflows/docs.yaml | 4 + doc/_templates/class.rst | 2 + doc/api.rst | 225 ++++++++++++++---- doc/conf.py | 6 + doc/contributing.rst | 14 +- doc/index.rst | 6 +- doc/progress.rst | 8 + doc/usage.rst | 23 +- examples/20_basic/introduction_tutorial.py | 10 +- .../simple_flows_and_runs_tutorial.py | 4 +- examples/20_basic/simple_suites_tutorial.py | 7 +- examples/30_extended/configure_logging.py | 4 +- 
.../30_extended/create_upload_tutorial.py | 14 +- examples/30_extended/custom_flow_.py | 1 + examples/30_extended/flow_id_tutorial.py | 2 +- .../30_extended/flows_and_runs_tutorial.py | 28 +-- examples/30_extended/study_tutorial.py | 1 + examples/30_extended/suites_tutorial.py | 3 +- .../task_manual_iteration_tutorial.py | 2 +- examples/30_extended/tasks_tutorial.py | 6 +- .../40_paper/2015_neurips_feurer_example.py | 2 +- examples/40_paper/2018_kdd_rijn_example.py | 2 +- .../40_paper/2018_neurips_perrone_example.py | 2 +- examples/README.txt | 6 +- openml/__init__.py | 2 +- openml/extensions/sklearn/extension.py | 32 +-- openml/flows/flow.py | 5 +- openml/flows/functions.py | 6 +- openml/runs/functions.py | 8 +- openml/study/study.py | 4 - openml/tasks/task.py | 10 - setup.py | 2 +- 32 files changed, 293 insertions(+), 158 deletions(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 2219c7fac..ab83aef5c 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -17,6 +17,10 @@ jobs: run: | cd doc make html + - name: Check links + run: | + cd doc + make linkcheck - name: Pull latest gh-pages if: (contains(github.ref, 'develop') || contains(github.ref, 'master')) && github.event_name == 'push' run: | diff --git a/doc/_templates/class.rst b/doc/_templates/class.rst index 307b0199c..72405badb 100644 --- a/doc/_templates/class.rst +++ b/doc/_templates/class.rst @@ -1,3 +1,5 @@ +:orphan: + :mod:`{{module}}`.{{objname}} {{ underline }}============== diff --git a/doc/api.rst b/doc/api.rst index 8a72e6b69..86bfd121e 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -2,64 +2,33 @@ .. _api: -APIs -**** +API +*** -Top-level Classes ------------------ -.. currentmodule:: openml - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - OpenMLBenchmarkSuite - OpenMLClassificationTask - OpenMLClusteringTask - OpenMLDataFeature - OpenMLDataset - OpenMLEvaluation - OpenMLFlow - OpenMLLearningCurveTask - OpenMLParameter - OpenMLRegressionTask - OpenMLRun - OpenMLSetup - OpenMLSplit - OpenMLStudy - OpenMLSupervisedTask - OpenMLTask +Modules +======= -.. _api_extensions: +:mod:`openml.datasets` +---------------------- +.. automodule:: openml.datasets + :no-members: + :no-inherited-members: -Extensions ----------- +Dataset Classes +~~~~~~~~~~~~~~~ -.. currentmodule:: openml.extensions +.. currentmodule:: openml.datasets .. autosummary:: :toctree: generated/ :template: class.rst - Extension - sklearn.SklearnExtension - -.. currentmodule:: openml.extensions - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - get_extension_by_flow - get_extension_by_model - register_extension - + OpenMLDataFeature + OpenMLDataset -Modules -------- +Dataset Functions +~~~~~~~~~~~~~~~~~ -:mod:`openml.datasets`: Dataset Functions ------------------------------------------ .. currentmodule:: openml.datasets .. autosummary:: @@ -77,20 +46,56 @@ Modules edit_dataset fork_dataset -:mod:`openml.evaluations`: Evaluation Functions ------------------------------------------------ +:mod:`openml.evaluations` +------------------------- +.. automodule:: openml.evaluations + :no-members: + :no-inherited-members: + +Evaluations Classes +~~~~~~~~~~~~~~~~~~~ + +.. currentmodule:: openml.evaluations + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + OpenMLEvaluation + +Evaluations Functions +~~~~~~~~~~~~~~~~~~~~~ + .. currentmodule:: openml.evaluations .. 
autosummary:: :toctree: generated/ :template: function.rst - list_evaluations - list_evaluation_measures - list_evaluations_setups + list_evaluations + list_evaluation_measures + list_evaluations_setups :mod:`openml.flows`: Flow Functions ----------------------------------- +.. automodule:: openml.flows + :no-members: + :no-inherited-members: + +Flow Classes +~~~~~~~~~~~~ + +.. currentmodule:: openml.flows + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + OpenMLFlow + +Flow Functions +~~~~~~~~~~~~~~ + .. currentmodule:: openml.flows .. autosummary:: @@ -104,6 +109,24 @@ Modules :mod:`openml.runs`: Run Functions ---------------------------------- +.. automodule:: openml.runs + :no-members: + :no-inherited-members: + +Run Classes +~~~~~~~~~~~ + +.. currentmodule:: openml.runs + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + OpenMLRun + +Run Functions +~~~~~~~~~~~~~ + .. currentmodule:: openml.runs .. autosummary:: @@ -122,6 +145,25 @@ Modules :mod:`openml.setups`: Setup Functions ------------------------------------- +.. automodule:: openml.setups + :no-members: + :no-inherited-members: + +Setup Classes +~~~~~~~~~~~~~ + +.. currentmodule:: openml.setups + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + OpenMLParameter + OpenMLSetup + +Setup Functions +~~~~~~~~~~~~~~~ + .. currentmodule:: openml.setups .. autosummary:: @@ -135,6 +177,25 @@ Modules :mod:`openml.study`: Study Functions ------------------------------------ +.. automodule:: openml.study + :no-members: + :no-inherited-members: + +Study Classes +~~~~~~~~~~~~~ + +.. currentmodule:: openml.study + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + OpenMLBenchmarkSuite + OpenMLStudy + +Study Functions +~~~~~~~~~~~~~~~ + .. currentmodule:: openml.study .. autosummary:: @@ -158,6 +219,31 @@ Modules :mod:`openml.tasks`: Task Functions ----------------------------------- +.. automodule:: openml.tasks + :no-members: + :no-inherited-members: + +Task Classes +~~~~~~~~~~~~ + +.. currentmodule:: openml.tasks + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + OpenMLClassificationTask + OpenMLClusteringTask + OpenMLLearningCurveTask + OpenMLRegressionTask + OpenMLSplit + OpenMLSupervisedTask + OpenMLTask + TaskType + +Task Functions +~~~~~~~~~~~~~~ + .. currentmodule:: openml.tasks .. autosummary:: @@ -168,3 +254,38 @@ Modules get_task get_tasks list_tasks + +.. _api_extensions: + +Extensions +========== + +.. automodule:: openml.extensions + :no-members: + :no-inherited-members: + +Extension Classes +----------------- + +.. currentmodule:: openml.extensions + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + Extension + sklearn.SklearnExtension + +Extension Functions +------------------- + +.. currentmodule:: openml.extensions + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + get_extension_by_flow + get_extension_by_model + register_extension + diff --git a/doc/conf.py b/doc/conf.py index f0f26318c..1f016561b 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -114,6 +114,11 @@ # If true, keep warnings as "system message" paragraphs in the built documents. 
# keep_warnings = False +# Complain about all broken internal links - broken external links can be +# found with `make linkcheck` +# +# currently disabled because without intersphinx we cannot link to numpy.ndarray +# nitpicky = True # -- Options for HTML output ---------------------------------------------- @@ -344,3 +349,4 @@ def setup(app): app.add_css_file("codehighlightstyle.css") + app.warningiserror = True diff --git a/doc/contributing.rst b/doc/contributing.rst index 354a91d1c..927c21034 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -19,7 +19,7 @@ In particular, a few ways to contribute to openml-python are: For more information, see the :ref:`extensions` below. * Bug reports. If something doesn't work for you or is cumbersome, please open a new issue to let - us know about the problem. See `this section `_. + us know about the problem. See `this section `_. * `Cite OpenML `_ if you use it in a scientific publication. @@ -38,10 +38,10 @@ Content of the Library To leverage support from the community and to tap in the potential of OpenML, interfacing with popular machine learning libraries is essential. However, the OpenML-Python team does not have the capacity to develop and maintain such interfaces on its own. For this, we -have built an extension interface to allows others to contribute back. Building a suitable +have built an extension interface to allows others to contribute back. Building a suitable extension for therefore requires an understanding of the current OpenML-Python support. -`This example `_ +The :ref:`sphx_glr_examples_20_basic_simple_flows_and_runs_tutorial.py` tutorial shows how scikit-learn currently works with OpenML-Python as an extension. The *sklearn* extension packaged with the `openml-python `_ repository can be used as a template/benchmark to build the new extension. @@ -50,7 +50,7 @@ repository can be used as a template/benchmark to build the new extension. API +++ * The extension scripts must import the `openml` package and be able to interface with - any function from the OpenML-Python `API `_. + any function from the OpenML-Python :ref:`api`. * The extension has to be defined as a Python class and must inherit from :class:`openml.extensions.Extension`. * This class needs to have all the functions from `class Extension` overloaded as required. @@ -61,7 +61,7 @@ API Interfacing with OpenML-Python ++++++++++++++++++++++++++++++ -Once the new extension class has been defined, the openml-python module to +Once the new extension class has been defined, the openml-python module to :meth:`openml.extensions.register_extension` must be called to allow OpenML-Python to interface the new extension. @@ -73,8 +73,8 @@ Each extension created should be a stand-alone repository, compatible with the `OpenML-Python repository `_. The extension repository should work off-the-shelf with *OpenML-Python* installed. -Create a `public Github repo `_ with -the following directory structure: +Create a `public Github repo `_ +with the following directory structure: :: diff --git a/doc/index.rst b/doc/index.rst index b78b7c009..c4164dc82 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -40,7 +40,7 @@ Example run.publish() print(f'View the run online: {run.openml_url}') -You can find more examples in our `examples gallery `_. +You can find more examples in our :ref:`sphx_glr_examples`. 
---------------------------- How to get OpenML for python @@ -60,7 +60,7 @@ Content * :ref:`usage` * :ref:`api` -* `Examples `_ +* :ref:`sphx_glr_examples` * :ref:`contributing` * :ref:`progress` @@ -70,7 +70,7 @@ Further information * `OpenML documentation `_ * `OpenML client APIs `_ -* `OpenML developer guide `_ +* `OpenML developer guide `_ * `Contact information `_ * `Citation request `_ * `OpenML blog `_ diff --git a/doc/progress.rst b/doc/progress.rst index 2fbf95b31..8d3f4ec1d 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -6,6 +6,14 @@ Changelog ========= +0.12.2 +~~~~~~ + +* DOC: Fixes a few broken links in the documentation. +* MAINT/DOC: Automatically check for broken external links when building the documentation. +* MAINT/DOC: Fail documentation building on warnings. This will make the documentation building + fail if a reference cannot be found (i.e. an internal link is broken). + 0.12.1 ~~~~~~ diff --git a/doc/usage.rst b/doc/usage.rst index e106e6d60..7bf247f4d 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -14,11 +14,13 @@ User Guide This document will guide you through the most important use cases, functions and classes in the OpenML Python API. Throughout this document, we will use -`pandas `_ to format and filter tables. +`pandas `_ to format and filter tables. -~~~~~~~~~~~~~~~~~~~~~~ +.. _installation: + +~~~~~~~~~~~~~~~~~~~~~ Installation & Set up -~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~ The OpenML Python package is a connector to `OpenML `_. It allows you to use and share datasets and tasks, run @@ -27,7 +29,7 @@ machine learning algorithms on them and then share the results online. The following tutorial gives a short introduction on how to install and set up the OpenML Python connector, followed up by a simple example. -* `Introduction `_ +* `:ref:`sphx_glr_examples_20_basic_introduction_tutorial.py` ~~~~~~~~~~~~~ Configuration @@ -97,7 +99,7 @@ for which a flow should be optimized. Below you can find our tutorial regarding tasks and if you want to know more you can read the `OpenML guide `_: -* `Tasks `_ +* :ref:`sphx_glr_examples_30_extended_tasks_tutorial.py` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Running machine learning algorithms and uploading results @@ -120,14 +122,14 @@ automatically calculates several metrics which can be used to compare the performance of different flows to each other. So far, the OpenML Python connector works only with estimator objects following -the `scikit-learn estimator API `_. +the `scikit-learn estimator API `_. Those can be directly run on a task, and a flow will automatically be created or downloaded from the server if it already exists. The next tutorial covers how to train different machine learning models, how to run machine learning models on OpenML data and how to share the results: -* `Flows and Runs `_ +* :ref:`sphx_glr_examples_20_basic_simple_flows_and_runs_tutorial.py` ~~~~~~~~ Datasets @@ -142,12 +144,12 @@ available metadata. The tutorial which follows explains how to get a list of datasets, how to filter the list to find the dataset that suits your requirements and how to download a dataset: -* `Filter and explore datasets `_ +* :ref:`sphx_glr_examples_30_extended_datasets_tutorial.py` OpenML is about sharing machine learning results and the datasets they were obtained on. 
Learn how to share your datasets in the following tutorial: -* `Upload a dataset `_ +* :ref:`sphx_glr_examples_30_extended_create_upload_tutorial.py` *********************** Extending OpenML-Python @@ -159,7 +161,8 @@ scikit-learn extension in :class:`openml.extensions.sklearn.SklearnExtension` as Runtime measurement is incorporated in the OpenML sklearn-extension. Example usage and potential usage for Hyperparameter Optimisation can be found in the example tutorial: -`HPO using OpenML `_ + +* :ref:`sphx_glr_examples_30_extended_fetch_runtimes_tutorial.py` Here is a list of currently maintained OpenML extensions: diff --git a/examples/20_basic/introduction_tutorial.py b/examples/20_basic/introduction_tutorial.py index 737362e49..765fada12 100644 --- a/examples/20_basic/introduction_tutorial.py +++ b/examples/20_basic/introduction_tutorial.py @@ -1,6 +1,6 @@ """ -Setup -===== +Introduction tutorial & Setup +============================= An example how to set up OpenML-Python followed up by a simple example. """ @@ -26,7 +26,7 @@ # pip install openml # # For further information, please check out the installation guide at -# https://openml.github.io/openml-python/master/contributing.html#installation +# :ref:`installation`. # ############################################################################ @@ -38,7 +38,7 @@ # You will receive an API key, which will authenticate you to the server # and allow you to download and upload datasets, tasks, runs and flows. # -# * Create an OpenML account (free) on http://www.openml.org. +# * Create an OpenML account (free) on https://www.openml.org. # * After logging in, open your account page (avatar on the top right) # * Open 'Account Settings', then 'API authentication' to find your API key. # @@ -103,7 +103,7 @@ # For this tutorial, our configuration publishes to the test server # as to not crowd the main server with runs created by examples. 
myrun = run.publish() -print(f"kNN on {data.name}: http://test.openml.org/r/{myrun.run_id}") +print(f"kNN on {data.name}: {myrun.openml_url}") ############################################################################ openml.config.stop_using_configuration_for_example() diff --git a/examples/20_basic/simple_flows_and_runs_tutorial.py b/examples/20_basic/simple_flows_and_runs_tutorial.py index e88add911..48740e800 100644 --- a/examples/20_basic/simple_flows_and_runs_tutorial.py +++ b/examples/20_basic/simple_flows_and_runs_tutorial.py @@ -42,8 +42,8 @@ # ================== myrun = run.publish() -print("Run was uploaded to http://test.openml.org/r/" + str(myrun.run_id)) -print("The flow can be found at http://test.openml.org/f/" + str(myrun.flow_id)) +print(f"Run was uploaded to {myrun.openml_url}") +print(f"The flow can be found at {myrun.flow.openml_url}") ############################################################################ openml.config.stop_using_configuration_for_example() diff --git a/examples/20_basic/simple_suites_tutorial.py b/examples/20_basic/simple_suites_tutorial.py index 37f1eeffb..92dfb3c04 100644 --- a/examples/20_basic/simple_suites_tutorial.py +++ b/examples/20_basic/simple_suites_tutorial.py @@ -62,7 +62,6 @@ # Further examples # ================ # -# * `Advanced benchmarking suites tutorial <../30_extended/suites_tutorial.html>`_ -# * `Benchmarking studies tutorial <../30_extended/study_tutorial.html>`_ -# * `Using studies to compare linear and non-linear classifiers -# <../40_paper/2018_ida_strang_example.html>`_ +# * :ref:`sphx_glr_examples_30_extended_suites_tutorial.py` +# * :ref:`sphx_glr_examples_30_extended_study_tutorial.py` +# * :ref:`sphx_glr_examples_40_paper_2018_ida_strang_example.py` diff --git a/examples/30_extended/configure_logging.py b/examples/30_extended/configure_logging.py index a600b0632..2dae4047f 100644 --- a/examples/30_extended/configure_logging.py +++ b/examples/30_extended/configure_logging.py @@ -6,8 +6,6 @@ Explains openml-python logging, and shows how to configure it. """ ################################################################################## -# Logging -# ^^^^^^^ # Openml-python uses the `Python logging module `_ # to provide users with log messages. Each log message is assigned a level of importance, see # the table in Python's logging tutorial @@ -16,7 +14,7 @@ # By default, openml-python will print log messages of level `WARNING` and above to console. # All log messages (including `DEBUG` and `INFO`) are also saved in a file, which can be # found in your cache directory (see also the -# `introduction tutorial <../20_basic/introduction_tutorial.html>`_). +# :ref:`sphx_glr_examples_20_basic_introduction_tutorial.py`). # These file logs are automatically deleted if needed, and use at most 2MB of space. # # It is possible to configure what log levels to send to console and file. 
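
The configure_logging.py example above notes that the log levels sent to console
and file are configurable. As a rough illustration (not part of this patch
series), here is a minimal sketch using only Python's standard logging module;
it assumes the package logger follows the usual convention of being named
"openml"::

    import logging

    # Grab the package logger; "openml" is the conventional name for loggers
    # created via logging.getLogger(__name__) inside the openml package.
    openml_logger = logging.getLogger("openml")

    # Show INFO messages on the console instead of the default WARNING and above.
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    openml_logger.addHandler(console)
    openml_logger.setLevel(logging.INFO)
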
diff --git a/examples/30_extended/create_upload_tutorial.py b/examples/30_extended/create_upload_tutorial.py index a4e1d9655..f80726396 100644 --- a/examples/30_extended/create_upload_tutorial.py +++ b/examples/30_extended/create_upload_tutorial.py @@ -67,7 +67,7 @@ "Robert Tibshirani (2004) (Least Angle Regression) " "Annals of Statistics (with discussion), 407-499" ) -paper_url = "http://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf" +paper_url = "https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf" ############################################################################ # Create the dataset object @@ -110,7 +110,7 @@ data=data, # A version label which is provided by the user. version_label="test", - original_data_url="http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html", + original_data_url="https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html", paper_url=paper_url, ) @@ -126,7 +126,7 @@ # OrderedDicts in the case of sparse data. # # Weather dataset: -# http://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html +# https://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html data = [ ["sunny", 85, 85, "FALSE", "no"], @@ -200,8 +200,8 @@ # storing the type of data for each column as well as the attribute names. # Therefore, when providing a Pandas DataFrame, OpenML can infer this # information without needing to explicitly provide it when calling the -# function :func:`create_dataset`. In this regard, you only need to pass -# ``'auto'`` to the ``attributes`` parameter. +# function :func:`openml.datasets.create_dataset`. In this regard, you only +# need to pass ``'auto'`` to the ``attributes`` parameter. df = pd.DataFrame(data, columns=[col_name for col_name, _ in attribute_names]) # enforce the categorical column to have a categorical dtype @@ -214,8 +214,8 @@ # We enforce the column 'outlook' and 'play' to be a categorical # dtype while the column 'windy' is kept as a boolean column. 'temperature' # and 'humidity' are kept as numeric columns. Then, we can -# call :func:`create_dataset` by passing the dataframe and fixing the parameter -# ``attributes`` to ``'auto'``. +# call :func:`openml.datasets.create_dataset` by passing the dataframe and +# fixing the parameter ``attributes`` to ``'auto'``. weather_dataset = create_dataset( name="Weather", diff --git a/examples/30_extended/custom_flow_.py b/examples/30_extended/custom_flow_.py index 02aef9c5c..1dde40233 100644 --- a/examples/30_extended/custom_flow_.py +++ b/examples/30_extended/custom_flow_.py @@ -130,6 +130,7 @@ # The exact format of the predictions will depend on the task. # # The predictions should always be a list of lists, each list should contain: +# # - the repeat number: for repeated evaluation strategies. (e.g. repeated cross-validation) # - the fold number: for cross-validation. (what should this be for holdout?) # - 0: this field is for backward compatibility. diff --git a/examples/30_extended/flow_id_tutorial.py b/examples/30_extended/flow_id_tutorial.py index e77df8d1a..d9465575e 100644 --- a/examples/30_extended/flow_id_tutorial.py +++ b/examples/30_extended/flow_id_tutorial.py @@ -35,7 +35,7 @@ # This piece of code is rather involved. First, it retrieves a # :class:`~openml.extensions.Extension` which is registered and can handle the given model, # in our case it is :class:`openml.extensions.sklearn.SklearnExtension`. Second, the extension -# converts the classifier into an instance of :class:`openml.flow.OpenMLFlow`. 
Third and finally, +# converts the classifier into an instance of :class:`openml.OpenMLFlow`. Third and finally, # the publish method checks whether the current flow is already present on OpenML. If not, # it uploads the flow, otherwise, it updates the current instance with all information computed # by the server (which is obviously also done when uploading/publishing a flow). diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py index 9f8c89375..bbf255e17 100644 --- a/examples/30_extended/flows_and_runs_tutorial.py +++ b/examples/30_extended/flows_and_runs_tutorial.py @@ -69,7 +69,7 @@ myrun = run.publish() # For this tutorial, our configuration publishes to the test server # as to not pollute the main server. -print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id)) +print(f"Uploaded to {myrun.openml_url}") ############################################################################ # We can now also inspect the flow object which was automatically created: @@ -115,7 +115,7 @@ run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False) myrun = run.publish() -print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id)) +print(f"Uploaded to {myrun.openml_url}") # The above pipeline works with the helper functions that internally deal with pandas DataFrame. @@ -159,7 +159,7 @@ run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format="array") myrun = run.publish() -print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id)) +print(f"Uploaded to {myrun.openml_url}") ############################################################################### # Running flows on tasks offline for later upload @@ -210,16 +210,16 @@ # compare your results with the rest of the class and learn from # them. Some tasks you could try (or browse openml.org): # -# * EEG eye state: data_id:`1471 `_, -# task_id:`14951 `_ -# * Volcanoes on Venus: data_id:`1527 `_, -# task_id:`10103 `_ -# * Walking activity: data_id:`1509 `_, -# task_id:`9945 `_, 150k instances. -# * Covertype (Satellite): data_id:`150 `_, -# task_id:`218 `_, 500k instances. -# * Higgs (Physics): data_id:`23512 `_, -# task_id:`52950 `_, 100k instances, missing values. +# * EEG eye state: data_id:`1471 `_, +# task_id:`14951 `_ +# * Volcanoes on Venus: data_id:`1527 `_, +# task_id:`10103 `_ +# * Walking activity: data_id:`1509 `_, +# task_id:`9945 `_, 150k instances. +# * Covertype (Satellite): data_id:`150 `_, +# task_id:`218 `_, 500k instances. +# * Higgs (Physics): data_id:`23512 `_, +# task_id:`52950 `_, 100k instances, missing values. # Easy benchmarking: for task_id in [115]: # Add further tasks. Disclaimer: they might take some time @@ -229,7 +229,7 @@ run = openml.runs.run_model_on_task(clf, task, avoid_duplicate_runs=False) myrun = run.publish() - print(f"kNN on {data.name}: http://test.openml.org/r/{myrun.run_id}") + print(f"kNN on {data.name}: {myrun.openml_url}") ############################################################################ diff --git a/examples/30_extended/study_tutorial.py b/examples/30_extended/study_tutorial.py index 3c93a7e81..76cca4840 100644 --- a/examples/30_extended/study_tutorial.py +++ b/examples/30_extended/study_tutorial.py @@ -25,6 +25,7 @@ # connects to the test server at test.openml.org before doing so. # This prevents the crowding of the main server with example datasets, # tasks, runs, and so on. 
+# ############################################################################ diff --git a/examples/30_extended/suites_tutorial.py b/examples/30_extended/suites_tutorial.py index f583b6957..cc26b78db 100644 --- a/examples/30_extended/suites_tutorial.py +++ b/examples/30_extended/suites_tutorial.py @@ -6,7 +6,7 @@ How to list, download and upload benchmark suites. If you want to learn more about benchmark suites, check out our -`brief introductory tutorial <../20_basic/simple_suites_tutorial.html>`_ or the +brief introductory tutorial :ref:`sphx_glr_examples_20_basic_simple_suites_tutorial.py` or the `OpenML benchmark docs `_. """ ############################################################################ @@ -24,6 +24,7 @@ # connects to the test server at test.openml.org before doing so. # This prevents the main server from crowding with example datasets, # tasks, runs, and so on. +# ############################################################################ diff --git a/examples/30_extended/task_manual_iteration_tutorial.py b/examples/30_extended/task_manual_iteration_tutorial.py index 533f645b2..c30ff66a3 100644 --- a/examples/30_extended/task_manual_iteration_tutorial.py +++ b/examples/30_extended/task_manual_iteration_tutorial.py @@ -6,7 +6,7 @@ ``openml.runs.run_model_on_task`` which automatically runs the model on all splits of the task. However, sometimes it is necessary to manually split a dataset to perform experiments outside of the functions provided by OpenML. One such example is in the benchmark library -`HPOlib2 `_ which extensively uses data from OpenML, +`HPOBench `_ which extensively uses data from OpenML, but not OpenML's functionality to conduct runs. """ diff --git a/examples/30_extended/tasks_tutorial.py b/examples/30_extended/tasks_tutorial.py index c755d265e..2166d5a03 100644 --- a/examples/30_extended/tasks_tutorial.py +++ b/examples/30_extended/tasks_tutorial.py @@ -36,7 +36,7 @@ ############################################################################ # **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, which we convert # into a -# `pandas dataframe `_ +# `pandas dataframe `_ # to have better visualization capabilities and easier access: tasks = pd.DataFrame.from_dict(tasks, orient="index") @@ -76,7 +76,7 @@ ############################################################################ # Resampling strategies can be found on the -# `OpenML Website `_. +# `OpenML Website `_. # # Similar to listing tasks by task type, we can list tasks by tags: @@ -105,7 +105,7 @@ # instances per task. To make things easier, the tasks do not contain highly # unbalanced data and sparse data. However, the tasks include missing values and # categorical features. You can find out more about the *OpenML 100* on -# `the OpenML benchmarking page `_. +# `the OpenML benchmarking page `_. 
# # Finally, it is also possible to list all tasks on OpenML with: diff --git a/examples/40_paper/2015_neurips_feurer_example.py b/examples/40_paper/2015_neurips_feurer_example.py index 733a436ad..721186016 100644 --- a/examples/40_paper/2015_neurips_feurer_example.py +++ b/examples/40_paper/2015_neurips_feurer_example.py @@ -12,7 +12,7 @@ | Efficient and Robust Automated Machine Learning | Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter | In *Advances in Neural Information Processing Systems 28*, 2015 -| Available at http://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf +| Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf """ # noqa F401 # License: BSD 3-Clause diff --git a/examples/40_paper/2018_kdd_rijn_example.py b/examples/40_paper/2018_kdd_rijn_example.py index 752419ea3..d3ce59f35 100644 --- a/examples/40_paper/2018_kdd_rijn_example.py +++ b/examples/40_paper/2018_kdd_rijn_example.py @@ -13,7 +13,7 @@ | Hyperparameter importance across datasets | Jan N. van Rijn and Frank Hutter | In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 -| Available at https://dl.acm.org/citation.cfm?id=3220058 +| Available at https://dl.acm.org/doi/10.1145/3219819.3220058 """ # License: BSD 3-Clause diff --git a/examples/40_paper/2018_neurips_perrone_example.py b/examples/40_paper/2018_neurips_perrone_example.py index 5ae339ae2..0d72846ac 100644 --- a/examples/40_paper/2018_neurips_perrone_example.py +++ b/examples/40_paper/2018_neurips_perrone_example.py @@ -11,7 +11,7 @@ | Scalable Hyperparameter Transfer Learning | Valerio Perrone and Rodolphe Jenatton and Matthias Seeger and Cedric Archambeau | In *Advances in Neural Information Processing Systems 31*, 2018 -| Available at http://papers.nips.cc/paper/7917-scalable-hyperparameter-transfer-learning.pdf +| Available at https://papers.nips.cc/paper/7917-scalable-hyperparameter-transfer-learning.pdf This example demonstrates how OpenML runs can be used to construct a surrogate model. diff --git a/examples/README.txt b/examples/README.txt index b90c0e1cb..332a5b990 100644 --- a/examples/README.txt +++ b/examples/README.txt @@ -1,3 +1,3 @@ -======== -Examples -======== +================ +Examples Gallery +================ diff --git a/openml/__init__.py b/openml/__init__.py index 0bab3b1d5..abb83ac0c 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -12,7 +12,7 @@ In particular, this module implements a python interface for the `OpenML REST API `_ (`REST on wikipedia -`_). +`_). """ # License: BSD 3-Clause diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index a0c551e83..5991a7044 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -104,25 +104,29 @@ def can_handle_model(cls, model: Any) -> bool: def trim_flow_name( cls, long_name: str, extra_trim_length: int = 100, _outer: bool = True ) -> str: - """ Shorten generated sklearn flow name to at most `max_length` characters. + """ Shorten generated sklearn flow name to at most ``max_length`` characters. Flows are assumed to have the following naming structure: - (model_selection)? (pipeline)? (steps)+ + ``(model_selection)? (pipeline)? (steps)+`` and will be shortened to: - sklearn.(selection.)?(pipeline.)?(steps)+ + ``sklearn.(selection.)?(pipeline.)?(steps)+`` e.g. 
(white spaces and newlines added for readability) - sklearn.pipeline.Pipeline( - columntransformer=sklearn.compose._column_transformer.ColumnTransformer( - numeric=sklearn.pipeline.Pipeline( - imputer=sklearn.preprocessing.imputation.Imputer, - standardscaler=sklearn.preprocessing.data.StandardScaler), - nominal=sklearn.pipeline.Pipeline( - simpleimputer=sklearn.impute.SimpleImputer, - onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)), - variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, - svc=sklearn.svm.classes.SVC) + + .. code :: + + sklearn.pipeline.Pipeline( + columntransformer=sklearn.compose._column_transformer.ColumnTransformer( + numeric=sklearn.pipeline.Pipeline( + imputer=sklearn.preprocessing.imputation.Imputer, + standardscaler=sklearn.preprocessing.data.StandardScaler), + nominal=sklearn.pipeline.Pipeline( + simpleimputer=sklearn.impute.SimpleImputer, + onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)), + variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, + svc=sklearn.svm.classes.SVC) + -> - sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC) + ``sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC)`` Parameters ---------- diff --git a/openml/flows/flow.py b/openml/flows/flow.py index 2acbcb0d1..2a340e625 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -19,8 +19,9 @@ class OpenMLFlow(OpenMLBase): :meth:`openml.flows.create_flow_from_model`. Using this helper function ensures that all relevant fields are filled in. - Implements https://github.com/openml/website/blob/master/openml_OS/ \ - views/pages/api_new/v1/xsd/openml.implementation.upload.xsd. + Implements `openml.implementation.upload.xsd + `_. Parameters ---------- diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 5e8e9dc93..048fa92a4 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -245,7 +245,7 @@ def flow_exists(name: str, external_version: str) -> Union[int, bool]: Notes ----- - see http://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version + see https://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version """ if not (isinstance(name, str) and len(name) > 0): raise ValueError("Argument 'name' should be a non-empty string") @@ -288,14 +288,14 @@ def get_flow_id( name : str Name of the flow. Must provide either ``model`` or ``name``. exact_version : bool - Whether to return the ``flow_id`` of the exact version or all ``flow_id``s where the name + Whether to return the flow id of the exact version or all flow ids where the name of the flow matches. This is only taken into account for a model where a version number is available. 
Returns ------- int or bool, List - flow id iff exists, ``False`` otherwise, List if exact_version is ``False`` + flow id iff exists, ``False`` otherwise, List if ``exact_version is False`` """ if model is None and name is None: raise ValueError( diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 92044a1b4..8bbe3b956 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -63,8 +63,8 @@ def run_model_on_task( ---------- model : sklearn model A model which has a function fit(X,Y) and predict(X), - all supervised estimators of scikit learn follow this definition of a model [1] - [1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html) + all supervised estimators of scikit learn follow this definition of a model + (https://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html) task : OpenMLTask or int or str Task to perform or Task id. This may be a model instead if the first argument is an OpenMLTask. @@ -166,8 +166,8 @@ def run_flow_on_task( flow : OpenMLFlow A flow wraps a machine learning model together with relevant information. The model has a function fit(X,Y) and predict(X), - all supervised estimators of scikit learn follow this definition of a model [1] - [1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html) + all supervised estimators of scikit learn follow this definition of a model + (https://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html) task : OpenMLTask Task to perform. This may be an OpenMLFlow instead if the first argument is an OpenMLTask. avoid_duplicate_runs : bool, optional (default=True) diff --git a/openml/study/study.py b/openml/study/study.py index 2b00bb05c..dbbef6e89 100644 --- a/openml/study/study.py +++ b/openml/study/study.py @@ -186,8 +186,6 @@ class OpenMLStudy(BaseStudy): According to this list of run ids, the study object receives a list of OpenML object ids (datasets, flows, tasks and setups). - Inherits from :class:`openml.BaseStudy` - Parameters ---------- study_id : int @@ -268,8 +266,6 @@ class OpenMLBenchmarkSuite(BaseStudy): According to this list of task ids, the suite object receives a list of OpenML object ids (datasets). - Inherits from :class:`openml.BaseStudy` - Parameters ---------- suite_id : int diff --git a/openml/tasks/task.py b/openml/tasks/task.py index ab54db780..6a1f2a4c5 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -199,8 +199,6 @@ def _parse_publish_response(self, xml_response: Dict): class OpenMLSupervisedTask(OpenMLTask, ABC): """OpenML Supervised Classification object. - Inherited from :class:`openml.OpenMLTask` - Parameters ---------- target_name : str @@ -293,8 +291,6 @@ def estimation_parameters(self, est_parameters): class OpenMLClassificationTask(OpenMLSupervisedTask): """OpenML Classification object. - Inherited from :class:`openml.OpenMLSupervisedTask` - Parameters ---------- class_labels : List of str (optional) @@ -338,8 +334,6 @@ def __init__( class OpenMLRegressionTask(OpenMLSupervisedTask): """OpenML Regression object. - - Inherited from :class:`openml.OpenMLSupervisedTask` """ def __init__( @@ -372,8 +366,6 @@ def __init__( class OpenMLClusteringTask(OpenMLTask): """OpenML Clustering object. 
- Inherited from :class:`openml.OpenMLTask` - Parameters ---------- target_name : str (optional) @@ -451,8 +443,6 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": class OpenMLLearningCurveTask(OpenMLClassificationTask): """OpenML Learning Curve object. - - Inherited from :class:`openml.OpenMLClassificationTask` """ def __init__( diff --git a/setup.py b/setup.py index bad7da2b4..f5e70abb5 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ long_description=README, long_description_content_type="text/markdown", license="BSD 3-clause", - url="http://openml.org/", + url="https://openml.org/", project_urls={ "Documentation": "https://openml.github.io/openml-python/", "Source Code": "https://github.com/openml/openml-python", From 62014cdb80fe7a19d105abd70d999edc8e84c817 Mon Sep 17 00:00:00 2001 From: Neeratyoy Mallik Date: Mon, 26 Apr 2021 22:35:05 +0200 Subject: [PATCH 04/16] Convert sparse labels to pandas series (#1059) * Convert sparse labels to pandas series * Handling sparse labels as Series * Handling sparse targets when dataset as arrays * Revamping sparse dataset tests * Removing redundant unit test * Cleaning target column formatting * Minor comment edit --- openml/datasets/dataset.py | 17 ++++++++++++----- tests/test_datasets/test_dataset.py | 21 ++++++++++++++++++--- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 0c065b855..122e2e697 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -628,7 +628,7 @@ def _encode_if_category(column): ) elif array_format == "dataframe": if scipy.sparse.issparse(data): - return pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names) + data = pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names) else: data_type = "sparse-data" if scipy.sparse.issparse(data) else "non-sparse data" logger.warning( @@ -732,6 +732,7 @@ def get_data( else: target = [target] targets = np.array([True if column in target else False for column in attribute_names]) + target_names = np.array([column for column in attribute_names if column in target]) if np.sum(targets) > 1: raise NotImplementedError( "Number of requested targets %d is not implemented." 
% np.sum(targets) @@ -752,11 +753,17 @@ def get_data( attribute_names = [att for att, k in zip(attribute_names, targets) if not k] x = self._convert_array_format(x, dataset_format, attribute_names) - if scipy.sparse.issparse(y): - y = np.asarray(y.todense()).astype(target_dtype).flatten() - y = y.squeeze() - y = self._convert_array_format(y, dataset_format, attribute_names) + if dataset_format == "array" and scipy.sparse.issparse(y): + # scikit-learn requires dense representation of targets + y = np.asarray(y.todense()).astype(target_dtype) + # dense representation of single column sparse arrays become a 2-d array + # need to flatten it to a 1-d array for _convert_array_format() + y = y.squeeze() + y = self._convert_array_format(y, dataset_format, target_names) y = y.astype(target_dtype) if dataset_format == "array" else y + if len(y.shape) > 1 and y.shape[1] == 1: + # single column targets should be 1-d for both `array` and `dataframe` formats + y = y.squeeze() data, targets = x, y return data, targets, categorical, attribute_names diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 1aeffdbb4..e9cb86c50 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -287,7 +287,7 @@ def setUp(self): self.sparse_dataset = openml.datasets.get_dataset(4136, download_data=False) - def test_get_sparse_dataset_with_target(self): + def test_get_sparse_dataset_array_with_target(self): X, y, _, attribute_names = self.sparse_dataset.get_data( dataset_format="array", target="class" ) @@ -303,7 +303,22 @@ def test_get_sparse_dataset_with_target(self): self.assertEqual(len(attribute_names), 20000) self.assertNotIn("class", attribute_names) - def test_get_sparse_dataset(self): + def test_get_sparse_dataset_dataframe_with_target(self): + X, y, _, attribute_names = self.sparse_dataset.get_data( + dataset_format="dataframe", target="class" + ) + self.assertIsInstance(X, pd.DataFrame) + self.assertIsInstance(X.dtypes[0], pd.SparseDtype) + self.assertEqual(X.shape, (600, 20000)) + + self.assertIsInstance(y, pd.Series) + self.assertIsInstance(y.dtypes, pd.SparseDtype) + self.assertEqual(y.shape, (600,)) + + self.assertEqual(len(attribute_names), 20000) + self.assertNotIn("class", attribute_names) + + def test_get_sparse_dataset_array(self): rval, _, categorical, attribute_names = self.sparse_dataset.get_data(dataset_format="array") self.assertTrue(sparse.issparse(rval)) self.assertEqual(rval.dtype, np.float32) @@ -315,7 +330,7 @@ def test_get_sparse_dataset(self): self.assertEqual(len(attribute_names), 20001) self.assertTrue(all([isinstance(att, str) for att in attribute_names])) - def test_get_sparse_dataframe(self): + def test_get_sparse_dataset_dataframe(self): rval, *_ = self.sparse_dataset.get_data() self.assertIsInstance(rval, pd.DataFrame) np.testing.assert_array_equal( From 6e8a9db03fd1af9d3eb0623970101413b531cc69 Mon Sep 17 00:00:00 2001 From: Neeratyoy Mallik Date: Thu, 29 Apr 2021 09:09:23 +0200 Subject: [PATCH 05/16] Adding warnings to all examples switching to a test server (#1061) * Adding warnings to all examples switching to a test server * Creating warnings in new text cells * Fixing a bug * Debugging doc build failures * Update openml/config.py Co-authored-by: Matthias Feurer * Fixing GUI commit bug * Using a common warning message for docs * Renaming warning message file * Editing the non-edited file Co-authored-by: Matthias Feurer --- doc/test_server_usage_warning.txt | 3 +++ examples/20_basic/introduction_tutorial.py 
| 8 ++++---- .../simple_flows_and_runs_tutorial.py | 12 ++++++------ .../30_extended/create_upload_tutorial.py | 5 ++--- examples/30_extended/custom_flow_.py | 9 ++++----- examples/30_extended/datasets_tutorial.py | 3 +++ examples/30_extended/flow_id_tutorial.py | 6 +++++- .../30_extended/flows_and_runs_tutorial.py | 13 ++++++++----- examples/30_extended/run_setup_tutorial.py | 8 +++----- examples/30_extended/study_tutorial.py | 19 +++++++------------ examples/30_extended/suites_tutorial.py | 17 +++++++---------- examples/30_extended/tasks_tutorial.py | 9 ++++++--- openml/config.py | 5 +++++ 13 files changed, 63 insertions(+), 54 deletions(-) create mode 100644 doc/test_server_usage_warning.txt diff --git a/doc/test_server_usage_warning.txt b/doc/test_server_usage_warning.txt new file mode 100644 index 000000000..2b7eb696b --- /dev/null +++ b/doc/test_server_usage_warning.txt @@ -0,0 +1,3 @@ +This example uploads data. For that reason, this example connects to the test server at test.openml.org. +This prevents the main server from crowding with example datasets, tasks, runs, and so on. +The use of this test server can affect behaviour and performance of the OpenML-Python API. \ No newline at end of file diff --git a/examples/20_basic/introduction_tutorial.py b/examples/20_basic/introduction_tutorial.py index 765fada12..26d3143dd 100644 --- a/examples/20_basic/introduction_tutorial.py +++ b/examples/20_basic/introduction_tutorial.py @@ -53,10 +53,7 @@ # # Alternatively, by running the code below and replacing 'YOURKEY' with your API key, # you authenticate for the duration of the python process. -# -# .. warning:: This example uploads data. For that reason, this example -# connects to the test server instead. This prevents the live server from -# crowding with example datasets, tasks, studies, and so on. + ############################################################################ @@ -65,6 +62,9 @@ import openml from sklearn import neighbors +############################################################################ +# .. warning:: +# .. include:: ../../test_server_usage_warning.txt openml.config.start_using_configuration_for_example() ############################################################################ diff --git a/examples/20_basic/simple_flows_and_runs_tutorial.py b/examples/20_basic/simple_flows_and_runs_tutorial.py index 48740e800..1d3bb5d6f 100644 --- a/examples/20_basic/simple_flows_and_runs_tutorial.py +++ b/examples/20_basic/simple_flows_and_runs_tutorial.py @@ -10,15 +10,15 @@ import openml from sklearn import ensemble, neighbors + +############################################################################ +# .. warning:: +# .. include:: ../../test_server_usage_warning.txt +openml.config.start_using_configuration_for_example() + ############################################################################ # Train a machine learning model # ============================== -# -# .. warning:: This example uploads data. For that reason, this example -# connects to the test server at test.openml.org. This prevents the main -# server from crowding with example datasets, tasks, runs, and so on. 
- -openml.config.start_using_configuration_for_example() # NOTE: We are using dataset 20 from the test server: https://test.openml.org/d/20 dataset = openml.datasets.get_dataset(20) diff --git a/examples/30_extended/create_upload_tutorial.py b/examples/30_extended/create_upload_tutorial.py index f80726396..7825d8cf7 100644 --- a/examples/30_extended/create_upload_tutorial.py +++ b/examples/30_extended/create_upload_tutorial.py @@ -16,9 +16,8 @@ from openml.datasets.functions import create_dataset ############################################################################ -# .. warning:: This example uploads data. For that reason, this example -# connects to the test server at test.openml.org. This prevents the main -# server from crowding with example datasets, tasks, runs, and so on. +# .. warning:: +# .. include:: ../../test_server_usage_warning.txt openml.config.start_using_configuration_for_example() ############################################################################ diff --git a/examples/30_extended/custom_flow_.py b/examples/30_extended/custom_flow_.py index 1dde40233..1259acf57 100644 --- a/examples/30_extended/custom_flow_.py +++ b/examples/30_extended/custom_flow_.py @@ -13,12 +13,8 @@ and also show how to link runs to the custom flow. """ -#################################################################################################### - # License: BSD 3-Clause -# .. warning:: This example uploads data. For that reason, this example -# connects to the test server at test.openml.org. This prevents the main -# server from crowding with example datasets, tasks, runs, and so on. + from collections import OrderedDict import numpy as np @@ -26,6 +22,9 @@ from openml import OpenMLClassificationTask from openml.runs.functions import format_prediction +#################################################################################################### +# .. warning:: +# .. include:: ../../test_server_usage_warning.txt openml.config.start_using_configuration_for_example() #################################################################################################### diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py index 7a51cce70..e8aa94f2b 100644 --- a/examples/30_extended/datasets_tutorial.py +++ b/examples/30_extended/datasets_tutorial.py @@ -114,6 +114,9 @@ # Edit a created dataset # ====================== # This example uses the test server, to avoid editing a dataset on the main server. +# +# .. warning:: +# .. include:: ../../test_server_usage_warning.txt openml.config.start_using_configuration_for_example() ############################################################################ # Edit non-critical fields, allowed for all authorized users: diff --git a/examples/30_extended/flow_id_tutorial.py b/examples/30_extended/flow_id_tutorial.py index d9465575e..137f8d14e 100644 --- a/examples/30_extended/flow_id_tutorial.py +++ b/examples/30_extended/flow_id_tutorial.py @@ -16,10 +16,14 @@ import openml -# Activating test server +############################################################################ +# .. warning:: +# .. 
include:: ../../test_server_usage_warning.txt openml.config.start_using_configuration_for_example() +############################################################################ +# Defining a classifier clf = sklearn.tree.DecisionTreeClassifier() #################################################################################################### diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py index bbf255e17..714ce7b55 100644 --- a/examples/30_extended/flows_and_runs_tutorial.py +++ b/examples/30_extended/flows_and_runs_tutorial.py @@ -10,17 +10,20 @@ import openml from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree + +############################################################################ +# We'll use the test server for the rest of this tutorial. +# +# .. warning:: +# .. include:: ../../test_server_usage_warning.txt +openml.config.start_using_configuration_for_example() + ############################################################################ # Train machine learning models # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # # Train a scikit-learn model on the data manually. -# -# .. warning:: This example uploads data. For that reason, this example -# connects to the test server at test.openml.org. This prevents the main -# server from crowding with example datasets, tasks, runs, and so on. -openml.config.start_using_configuration_for_example() # NOTE: We are using dataset 68 from the test server: https://test.openml.org/d/68 dataset = openml.datasets.get_dataset(68) X, y, categorical_indicator, attribute_names = dataset.get_data( diff --git a/examples/30_extended/run_setup_tutorial.py b/examples/30_extended/run_setup_tutorial.py index 8579d1d38..1bb123aad 100644 --- a/examples/30_extended/run_setup_tutorial.py +++ b/examples/30_extended/run_setup_tutorial.py @@ -24,10 +24,6 @@ 2) Download the flow, reinstantiate the model with same hyperparameters, and solve the same task again; 3) We will verify that the obtained results are exactly the same. - -.. warning:: This example uploads data. For that reason, this example - connects to the test server at test.openml.org. This prevents the main - server from crowding with example datasets, tasks, runs, and so on. """ # License: BSD 3-Clause @@ -43,7 +39,9 @@ from sklearn.ensemble import RandomForestClassifier from sklearn.decomposition import TruncatedSVD - +############################################################################ +# .. warning:: +# .. include:: ../../test_server_usage_warning.txt openml.config.start_using_configuration_for_example() ############################################################################### diff --git a/examples/30_extended/study_tutorial.py b/examples/30_extended/study_tutorial.py index 76cca4840..b66c49096 100644 --- a/examples/30_extended/study_tutorial.py +++ b/examples/30_extended/study_tutorial.py @@ -2,9 +2,7 @@ ================= Benchmark studies ================= - How to list, download and upload benchmark studies. - In contrast to `benchmark suites `_ which hold a list of tasks, studies hold a list of runs. As runs contain all information on flows and tasks, all required information about a study can be retrieved. @@ -20,15 +18,6 @@ import openml -############################################################################ -# .. warning:: This example uploads data. For that reason, this example -# connects to the test server at test.openml.org before doing so. 
-# This prevents the crowding of the main server with example datasets, -# tasks, runs, and so on. -# -############################################################################ - - ############################################################################ # Listing studies # *************** @@ -66,6 +55,13 @@ ) print(evaluations.head()) +############################################################################ +# We'll use the test server for the rest of this tutorial. +# +# .. warning:: +# .. include:: ../../test_server_usage_warning.txt +openml.config.start_using_configuration_for_example() + ############################################################################ # Uploading studies # ================= @@ -73,7 +69,6 @@ # Creating a study is as simple as creating any kind of other OpenML entity. # In this examples we'll create a few runs for the OpenML-100 benchmark # suite which is available on the OpenML test server. -openml.config.start_using_configuration_for_example() # Model to be used clf = RandomForestClassifier() diff --git a/examples/30_extended/suites_tutorial.py b/examples/30_extended/suites_tutorial.py index cc26b78db..9b8c1d73d 100644 --- a/examples/30_extended/suites_tutorial.py +++ b/examples/30_extended/suites_tutorial.py @@ -19,14 +19,6 @@ import openml -############################################################################ -# .. warning:: This example uploads data. For that reason, this example -# connects to the test server at test.openml.org before doing so. -# This prevents the main server from crowding with example datasets, -# tasks, runs, and so on. -# -############################################################################ - ############################################################################ # Listing suites @@ -66,6 +58,13 @@ tasks = tasks.query("tid in @suite.tasks") print(tasks.describe().transpose()) +############################################################################ +# We'll use the test server for the rest of this tutorial. +# +# .. warning:: +# .. include:: ../../test_server_usage_warning.txt +openml.config.start_using_configuration_for_example() + ############################################################################ # Uploading suites # ================ @@ -74,8 +73,6 @@ # entity - the only reason why we need so much code in this example is # because we upload some random data. -openml.config.start_using_configuration_for_example() - # We'll take a random subset of at least ten tasks of all available tasks on # the test server: all_tasks = list(openml.tasks.list_tasks().keys()) diff --git a/examples/30_extended/tasks_tutorial.py b/examples/30_extended/tasks_tutorial.py index 2166d5a03..3f70d64fe 100644 --- a/examples/30_extended/tasks_tutorial.py +++ b/examples/30_extended/tasks_tutorial.py @@ -172,6 +172,12 @@ # necessary (e.g. when other measure make no sense), since it will create a new task, which # scatters results across tasks. +############################################################################ +# We'll use the test server for the rest of this tutorial. +# +# .. warning:: +# .. include:: ../../test_server_usage_warning.txt +openml.config.start_using_configuration_for_example() ############################################################################ # Example @@ -185,9 +191,6 @@ # will be returned. 
-# using test server for example uploads -openml.config.start_using_configuration_for_example() - try: my_task = openml.tasks.create_task( task_type=TaskType.SUPERVISED_CLASSIFICATION, diff --git a/openml/config.py b/openml/config.py index 7295ea82e..f2264dc2a 100644 --- a/openml/config.py +++ b/openml/config.py @@ -10,6 +10,7 @@ from pathlib import Path import platform from typing import Tuple, cast, Any +import warnings from io import StringIO import configparser @@ -157,6 +158,10 @@ def start_using_configuration_for_example(cls): # Test server key for examples server = cls._test_server apikey = cls._test_apikey + warnings.warn( + "Switching to the test server {} to not upload results to the live server. " + "Using the test server may result in reduced performance of the API!".format(server) + ) @classmethod def stop_using_configuration_for_example(cls): From 968e2510df7086d3a31b015c33259e15e10aa855 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 4 May 2021 16:23:49 +0200 Subject: [PATCH 06/16] Create dedicated extensions page (#1068) --- doc/conf.py | 1 + doc/contributing.rst | 66 --------------------------------- doc/extensions.rst | 87 ++++++++++++++++++++++++++++++++++++++++++++ doc/index.rst | 1 + doc/usage.rst | 18 ++------- 5 files changed, 92 insertions(+), 81 deletions(-) create mode 100644 doc/extensions.rst diff --git a/doc/conf.py b/doc/conf.py index 1f016561b..a10187486 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -138,6 +138,7 @@ ("User Guide", "usage"), ("API", "api"), ("Examples", "examples/index"), + ("Extensions", "extensions"), ("Contributing", "contributing"), ("Changelog", "progress"), ], diff --git a/doc/contributing.rst b/doc/contributing.rst index 927c21034..e87a02dfb 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -29,70 +29,4 @@ In particular, a few ways to contribute to openml-python are: .. _extensions: -Connecting new machine learning libraries -========================================= -Content of the Library -~~~~~~~~~~~~~~~~~~~~~~ - -To leverage support from the community and to tap in the potential of OpenML, interfacing -with popular machine learning libraries is essential. However, the OpenML-Python team does -not have the capacity to develop and maintain such interfaces on its own. For this, we -have built an extension interface to allows others to contribute back. Building a suitable -extension for therefore requires an understanding of the current OpenML-Python support. - -The :ref:`sphx_glr_examples_20_basic_simple_flows_and_runs_tutorial.py` tutorial -shows how scikit-learn currently works with OpenML-Python as an extension. The *sklearn* -extension packaged with the `openml-python `_ -repository can be used as a template/benchmark to build the new extension. - - -API -+++ -* The extension scripts must import the `openml` package and be able to interface with - any function from the OpenML-Python :ref:`api`. -* The extension has to be defined as a Python class and must inherit from - :class:`openml.extensions.Extension`. -* This class needs to have all the functions from `class Extension` overloaded as required. -* The redefined functions should have adequate and appropriate docstrings. The - `Sklearn Extension API :class:`openml.extensions.sklearn.SklearnExtension.html` - is a good benchmark to follow. 
-
-
-Interfacing with OpenML-Python
-++++++++++++++++++++++++++++++
-Once the new extension class has been defined, the openml-python module to
-:meth:`openml.extensions.register_extension` must be called to allow OpenML-Python to
-interface the new extension.
-
-
-Hosting the library
-~~~~~~~~~~~~~~~~~~~
-
-Each extension created should be a stand-alone repository, compatible with the
-`OpenML-Python repository `_.
-The extension repository should work off-the-shelf with *OpenML-Python* installed.
-
-Create a `public Github repo `_
-with the following directory structure:
-
-::
-
-| [repo name]
-| |-- [extension name]
-| | |-- __init__.py
-| | |-- extension.py
-| | |-- config.py (optionally)
-
-
-
-Recommended
-~~~~~~~~~~~
-* Test cases to keep the extension up to date with the `openml-python` upstream changes.
-* Documentation of the extension API, especially if any new functionality added to OpenML-Python's
-  extension design.
-* Examples to show how the new extension interfaces and works with OpenML-Python.
-* Create a PR to add the new extension to the OpenML-Python API documentation.
-
-
-Happy contributing!
diff --git a/doc/extensions.rst b/doc/extensions.rst
new file mode 100644
index 000000000..ea12dda6a
--- /dev/null
+++ b/doc/extensions.rst
@@ -0,0 +1,87 @@
+:orphan:
+
+.. _extensions:
+
+==========
+Extensions
+==========
+
+OpenML-Python provides an extension interface to connect other machine learning libraries than
+scikit-learn to OpenML. Please check the :ref:`api_extensions` and use the
+scikit-learn extension in :class:`openml.extensions.sklearn.SklearnExtension` as a starting point.
+
+List of extensions
+==================
+
+Here is a list of currently maintained OpenML extensions:
+
+* :class:`openml.extensions.sklearn.SklearnExtension`
+* `openml-keras `_
+* `openml-pytorch `_
+* `openml-tensorflow (for tensorflow 2+) `_
+
+
+Connecting new machine learning libraries
+=========================================
+
+Content of the Library
+~~~~~~~~~~~~~~~~~~~~~~
+
+To leverage support from the community and to tap into the potential of OpenML, interfacing
+with popular machine learning libraries is essential. However, the OpenML-Python team does
+not have the capacity to develop and maintain such interfaces on its own. For this, we
+have built an extension interface to allow others to contribute back. Building a suitable
+extension therefore requires an understanding of the current OpenML-Python support.
+
+The :ref:`sphx_glr_examples_20_basic_simple_flows_and_runs_tutorial.py` tutorial
+shows how scikit-learn currently works with OpenML-Python as an extension. The *sklearn*
+extension packaged with the `openml-python `_
+repository can be used as a template/benchmark to build the new extension.
+
+
+API
++++
+* The extension scripts must import the `openml` package and be able to interface with
+  any function from the OpenML-Python :ref:`api`.
+* The extension has to be defined as a Python class and must inherit from
+  :class:`openml.extensions.Extension`.
+* This class needs to have all the functions from `class Extension` overloaded as required.
+* The redefined functions should have adequate and appropriate docstrings. The
+  :class:`openml.extensions.sklearn.SklearnExtension` API
+  is a good benchmark to follow.
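+
+As a minimal illustration, a skeleton might look as follows (an editor's sketch, not an
+official template; the exact set of abstract methods to overload is defined by
+:class:`openml.extensions.Extension`):
+
+.. code:: python
+
+    from openml.extensions import Extension, register_extension
+
+    class MyLibraryExtension(Extension):
+        ...  # overload the abstract methods of ``Extension`` here
+
+    register_extension(MyLibraryExtension)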
+
+
+Interfacing with OpenML-Python
+++++++++++++++++++++++++++++++
+Once the new extension class has been defined, the openml-python function
+:meth:`openml.extensions.register_extension` must be called to allow OpenML-Python to
+interface with the new extension.
+
+
+Hosting the library
+~~~~~~~~~~~~~~~~~~~
+
+Each extension created should be a stand-alone repository, compatible with the
+`OpenML-Python repository `_.
+The extension repository should work off-the-shelf with *OpenML-Python* installed.
+
+Create a `public Github repo `_
+with the following directory structure:
+
+::
+
+| [repo name]
+| |-- [extension name]
+| | |-- __init__.py
+| | |-- extension.py
+| | |-- config.py (optionally)
+
+Recommended
+~~~~~~~~~~~
+* Test cases to keep the extension up to date with the `openml-python` upstream changes.
+* Documentation of the extension API, especially if any new functionality added to OpenML-Python's
+  extension design.
+* Examples to show how the new extension interfaces and works with OpenML-Python.
+* Create a PR to add the new extension to the OpenML-Python API documentation.
+
+Happy contributing!
diff --git a/doc/index.rst b/doc/index.rst
index c4164dc82..b0140c1d0 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -61,6 +61,7 @@ Content
 * :ref:`usage`
 * :ref:`api`
 * :ref:`sphx_glr_examples`
+* :ref:`extensions`
 * :ref:`contributing`
 * :ref:`progress`
 
diff --git a/doc/usage.rst b/doc/usage.rst
index 7bf247f4d..0d51f232a 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -155,19 +155,7 @@ obtained on. Learn how to share your datasets in the following tutorial:
 Extending OpenML-Python
 ***********************
 
-OpenML-Python provides an extension interface to connect other machine learning libraries than
-scikit-learn to OpenML. Please check the :ref:`api_extensions` and use the
-scikit-learn extension in :class:`openml.extensions.sklearn.SklearnExtension` as a starting point.
-
-Runtime measurement is incorporated in the OpenML sklearn-extension. Example usage and potential
-usage for Hyperparameter Optimisation can be found in the example tutorial:
-
-* :ref:`sphx_glr_examples_30_extended_fetch_runtimes_tutorial.py`
-
-
-Here is a list of currently maintained OpenML extensions:
-
-* `openml-keras `_
-* `openml-pytorch `_
-* `openml-tensorflow(for tensorflow 2+) `_
+OpenML-Python provides an extension interface to connect machine learning libraries directly to
+the API and ships a ``scikit-learn`` extension. You can find more information in the section
+:ref:`extensions`.

From b0e944d4a3d24acc6837ddfd4dd4c7255dfc5a71 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 4 May 2021 20:18:56 +0200
Subject: [PATCH 07/16] Remove E500 from list of exception to raise (#1071)

OpenML code 500 indicates no results for a flow query, and was likely
confused with the HTTP code 500 for internal server error.
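For illustration, the retry condition after this change is effectively the following
(an editor's sketch of the logic, simplified from the diff below):

    def is_retryable(e) -> bool:
        # `e` is an OpenMLServerException; its `code` is an OpenML error code,
        # not an HTTP status code. Only 107 (database connection error) is
        # retried; everything else, including the OpenML code 500 ("no results
        # for a flow query"), is raised immediately.
        return e.code == 107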
---
 openml/_api_calls.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/openml/_api_calls.py b/openml/_api_calls.py
index aee67d8c6..624b0da45 100644
--- a/openml/_api_calls.py
+++ b/openml/_api_calls.py
@@ -247,9 +247,8 @@ def _send_request(request_method, url, data, files=None, md5_checksum=None):
                 OpenMLHashException,
             ) as e:
                 if isinstance(e, OpenMLServerException):
-                    if e.code not in [107, 500]:
+                    if e.code not in [107]:
                         # 107: database connection error
-                        # 500: internal server error
                         raise
                 elif isinstance(e, xml.parsers.expat.ExpatError):
                     if request_method != "get" or retry_counter >= n_retries:

From 97d67e7e2ca8e236e9af314e2e15d8916d73b4ee Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Fri, 7 May 2021 17:08:23 +0200
Subject: [PATCH 08/16] Add a Docker Image for testing and doc building in an
 isolated environment (#1075)

* Initial structure

* Add doc and test functionality for mounted repo

* Add branch support and safeguards

* Update docker usage and name, add structure

* Improved formatting

* Add reference to docker image from main docs

* Add Workflow to build and push docker image

* Use environment variable directly

* Try other formatting for SHA tag

* Try format as string

* Only push latest

* Explicitly make context relative

* Checkout repository

* Install wheel and setuptools before other packages

* Rename master to main

* Add information about Docker PR

* Make 'note' italics instead of content

Co-authored-by: Matthias Feurer
Co-authored-by: Matthias Feurer
---
 .github/workflows/release_docker.yaml | 31 ++++++++++
 CONTRIBUTING.md                       |  4 ++
 doc/progress.rst                      |  2 +-
 doc/usage.rst                         | 13 ++++
 docker/Dockerfile                     | 19 ++++++
 docker/readme.md                      | 86 +++++++++++++++++++++++++++
 docker/startup.sh                     | 75 +++++++++++++++++++++++
 7 files changed, 229 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/release_docker.yaml
 create mode 100644 docker/Dockerfile
 create mode 100644 docker/readme.md
 create mode 100644 docker/startup.sh

diff --git a/.github/workflows/release_docker.yaml b/.github/workflows/release_docker.yaml
new file mode 100644
index 000000000..c4522c0be
--- /dev/null
+++ b/.github/workflows/release_docker.yaml
@@ -0,0 +1,31 @@
+name: release-docker
+
+on:
+  push:
+    branches:
+      - 'develop'
+      - 'docker'
+
+jobs:
+  docker:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v1
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+      - name: Login to DockerHub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+      - uses: actions/checkout@v2
+      - name: Build and push
+        id: docker_build
+        uses: docker/build-push-action@v2
+        with:
+          context: ./docker/
+          push: true
+          tags: openml/openml-python:latest
+      - name: Image digest
+        run: echo ${{ steps.docker_build.outputs.digest }}
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 6fe4fd605..3351bc36d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -178,6 +178,10 @@ following rules before you submit a pull request:
 -  If any source file is being added to the repository, please add the
    BSD 3-Clause license to it.
 
+*Note*: We recommend following the instructions below to install all requirements locally.
+However, it is also possible to use the [openml-python docker image](https://github.com/openml/openml-python/blob/main/docker/readme.md) for testing and building documentation.
+This can be useful for one-off contributions or when you are experiencing installation issues.
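+For example, to run the test suite against a local clone without installing anything on the
+host (a sketch following the docker readme added in this patch; `PATH_TO_YOUR_CLONE` is a
+placeholder for the absolute path to your local repository):
+
+```bash
+$ docker run --mount type=bind,source=PATH_TO_YOUR_CLONE,destination=/code -t openml/openml-python test
+```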
+
 First install openml with its test dependencies by running
 ```bash
 $ pip install -e .[test]
diff --git a/doc/progress.rst b/doc/progress.rst
index 8d3f4ec1d..5b3aae784 100644
--- a/doc/progress.rst
+++ b/doc/progress.rst
@@ -8,7 +8,7 @@ Changelog
 
 0.12.2
 ~~~~~~
-
+* ADD #1075: A docker image is now automatically built on a push to develop. It can be used to build docs or run tests in an isolated environment.
 * DOC: Fixes a few broken links in the documentation.
 * MAINT/DOC: Automatically check for broken external links when building the documentation.
 * MAINT/DOC: Fail documentation building on warnings. This will make the documentation building
diff --git a/doc/usage.rst b/doc/usage.rst
index 0d51f232a..fd7d5fbec 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -65,6 +65,19 @@ This file is easily configurable by the ``openml`` command line interface.
 To see where the file is stored, and what its values are, use `openml configure none`.
 Set any field with ``openml configure FIELD`` or even all fields with just ``openml configure``.
 
+~~~~~~
+Docker
+~~~~~~
+
+It is also possible to try out the latest development version of ``openml-python`` with docker:
+
+```
+    docker run -it openml/openml-python
+```
+
+
+See the `openml-python docker documentation `_ for more information.
+
 ~~~~~~~~~~~~
 Key concepts
 ~~~~~~~~~~~~
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 000000000..5fcc16e34
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,19 @@
+# Dockerfile to build an image with preinstalled dependencies
+# Useful for building docs or running unix tests from a Windows host.
+FROM python:3
+
+RUN git clone https://github.com/openml/openml-python.git omlp
+WORKDIR omlp
+RUN python -m venv venv
+RUN venv/bin/pip install wheel setuptools
+RUN venv/bin/pip install -e .[test,examples,docs,examples_unix]
+
+WORKDIR /
+RUN mkdir scripts
+ADD startup.sh scripts/
+# Due to the nature of the Docker container it might often be built from Windows.
+# It is typical for the files to have \r\n line endings, which we want to remove for the unix image.
+RUN sed -i 's/\r//g' scripts/startup.sh
+
+# overwrite the default `python` entrypoint
+ENTRYPOINT ["/bin/bash", "/scripts/startup.sh"]
diff --git a/docker/readme.md b/docker/readme.md
new file mode 100644
index 000000000..47ad6d23b
--- /dev/null
+++ b/docker/readme.md
@@ -0,0 +1,86 @@
+# OpenML Python Container
+
+This docker container has the latest development version of openml-python downloaded and pre-installed.
+It can be used to run the unit tests or build the docs in a fresh and/or isolated unix environment.
+The instructions have only been tested on a Windows host machine.
+
+First pull the docker image:
+
+    docker pull openml/openml-python
+
+## Usage
+
+
+    docker run -it openml/openml-python [DOC,TEST] [BRANCH]
+
+The image is designed to work with two specified directories which may be mounted ([`docker --mount documentation`](https://docs.docker.com/storage/bind-mounts/#start-a-container-with-a-bind-mount)).
+You can mount your openml-python folder to the `/code` directory to run tests or build docs on your local files.
+You can mount an `/output` directory to which the container will write output (currently only used for docs).
+Each can be mounted by adding a `--mount type=bind,source=SOURCE,destination=/DESTINATION` where `SOURCE` is the absolute path to your code or output directory, and `DESTINATION` is either `code` or `output`.
+
+E.g. mounting a code directory:
+
+    docker run -i --mount type=bind,source="E:\\repositories/openml-python",destination="/code" -t openml/openml-python
+
+E.g. mounting an output directory:
+
+    docker run -i --mount type=bind,source="E:\\files/output",destination="/output" -t openml/openml-python
+
+You can mount both at the same time.
+
+### Bash (default)
+By default bash is invoked; you should use the `-i` flag when starting the container so that it processes input:
+
+    docker run -it openml/openml-python
+
+### Building Documentation
+There are two ways to build documentation, either directly from the `HEAD` of a branch on Github or from your local directory.
+
+#### Building from a local repository
+Building from a local directory requires you to mount it to the ``/code`` directory:
+
+    docker run --mount type=bind,source=PATH_TO_REPOSITORY,destination=/code -t openml/openml-python doc
+
+The produced documentation will be in your repository's ``doc/build`` folder.
+If an `/output` folder is mounted, the documentation will *also* be copied there.
+
+#### Building from an online repository
+Building from a remote repository requires you to specify a branch.
+The branch may be specified by name directly if it exists on the original repository (https://github.com/openml/openml-python/):
+
+    docker run --mount type=bind,source=PATH_TO_OUTPUT,destination=/output -t openml/openml-python doc BRANCH
+
+Where `BRANCH` is the name of the branch for which to generate the documentation.
+It is also possible to build the documentation from a branch on a fork; in this case, the `BRANCH` should be specified as `GITHUB_NAME#BRANCH` (e.g. `PGijsbers#my_feature`) and the name of the forked repository should be `openml-python`.
+
+### Running tests
+There are two ways to run tests, either directly from the `HEAD` of a branch on Github or from your local directory.
+It works similarly to building docs, but you should specify `test` as the mode.
+For example, to run tests on your local repository:
+
+    docker run --mount type=bind,source=PATH_TO_REPOSITORY,destination=/code -t openml/openml-python test
+
+Running tests from the state of an online repository is supported similarly to building documentation (i.e. specify `BRANCH` instead of mounting `/code`).
+
+## Troubleshooting
+
+When you are mounting a directory you can check that it is mounted correctly by running the image in bash mode.
+Navigate to the `/code` and `/output` directories and see if the expected files are there.
+If e.g. there is no code in your mounted `/code`, you should double-check the provided path to your host directory.
+
+## Notes for developers
+This section contains some notes about the structure of the image, intended for those who want to work on it.
+
+### Added Directories
+The `openml/openml-python` image is built on a vanilla `python:3` image.
+Additionally, it contains the following files and directories (see the example after this list):
+
+ - `/omlp`: contains the openml-python repository in the state with which the image was built by default.
+   If working with a `BRANCH`, this repository will be set to the `HEAD` of `BRANCH`.
+ - `/omlp/venv/`: contains the used virtual environment for `doc` and `test`. It has `openml-python` dependencies pre-installed.
+   When invoked with `doc` or `test`, the dependencies will be updated based on the `setup.py` of the `BRANCH` or mounted `/code`.
+ - `/scripts/startup.sh`: the entrypoint of the image. Takes care of the automated features (e.g. `doc` and `test`).
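+
+For example, you can verify these locations by starting the container in bash mode
+(an editor's illustration; any shell command works here):
+
+    docker run -it openml/openml-python
+    # inside the container:
+    ls /omlp /scripts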
+ +## Building the image +To build the image yourself, execute `docker build -f Dockerfile .` from this directory. +It will use the `startup.sh` as is, so any local changes will be present in the image. diff --git a/docker/startup.sh b/docker/startup.sh new file mode 100644 index 000000000..1946a69cc --- /dev/null +++ b/docker/startup.sh @@ -0,0 +1,75 @@ +# Entry script to allow docker to be ran for bash, tests and docs. +# The script assumes a code repository can be mounted to ``/code`` and an output directory to ``/output``. +# Executes ``mode`` on ``branch`` or the provided ``code`` directory. +# $1: Mode, optional. Options: +# - test: execute unit tests +# - doc: build documentation, requires a mounted ``output`` directory if built from a branch. +# - if not provided: execute bash. +# $2: Branch, optional. +# Mutually exclusive with mounting a ``code`` directory. +# Can be a branch on a Github fork, specified with the USERNAME#BRANCH format. +# The test or doc build is executed on this branch. + +if [ -z "$1" ]; then + echo "Executing in BASH mode." + bash + exit +fi + +# doc and test modes require mounted directories and/or specified branches +if ! [ -d "/code" ] && [ -z "$2" ]; then + echo "To perform $1 a code repository must be mounted to '/code' or a branch must be specified." >> /dev/stderr + exit 1 +fi +if [ -d "/code" ] && [ -n "$2" ]; then + # We want to avoid switching the git environment from within the docker container + echo "You can not specify a branch for a mounted code repository." >> /dev/stderr + exit 1 +fi +if [ "$1" == "doc" ] && [ -n "$2" ] && ! [ -d "/output" ]; then + echo "To build docs from an online repository, you need to mount an output directory." >> /dev/stderr + exit 1 +fi + +if [ -n "$2" ]; then + # if a branch is provided, we will pull it into the `omlp` local repository that was created with the image. + cd omlp + if [[ $2 == *#* ]]; then + # If a branch is specified on a fork (with NAME#BRANCH format), we have to construct the url before pulling + # We add a trailing '#' delimiter so the second element doesn't get the trailing newline from <<< + readarray -d '#' -t fork_name_and_branch<<<"$2#" + fork_url="https://github.com/${fork_name_and_branch[0]}/openml-python.git" + fork_branch="${fork_name_and_branch[1]}" + echo git fetch "$fork_url" "$fork_branch":branch_from_fork + git fetch "$fork_url" "$fork_branch":branch_from_fork + branch=branch_from_fork + else + branch=$2 + fi + if ! git checkout "$branch" ; then + echo "Could not checkout $branch. If the branch lives on a fork, specify it as USER#BRANCH. Make sure to push the branch." >> /dev/stderr + exit 1 + fi + git pull + code_dir="/omlp" +else + code_dir="/code" +fi + +source /omlp/venv/bin/activate +cd $code_dir +# The most recent ``master`` is already installed, but we want to update any outdated dependencies +pip install -e .[test,examples,docs,examples_unix] + +if [ "$1" == "test" ]; then + pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread --dist load -sv +fi + +if [ "$1" == "doc" ]; then + cd doc + make html + make linkcheck + if [ -d "/output" ]; then + cp -r /omlp/doc/build /output + fi +fi From a505162b974133f48e3082216127802e1341bdef Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 10 May 2021 09:33:44 +0200 Subject: [PATCH 09/16] Create a non-linear retry policy. (#1065) Create a second configurable retry policy. The configuration now allows for a `human` and `robot` retry policy, intended for interactive use and scripts, respectively. 
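For intuition, the two policies yield wait times roughly as sketched below (an editor's
illustration mirroring the delay functions added in this patch; the random jitter of the
robot policy is omitted for clarity):

    import math

    def human_delay(n: int) -> float:
        # linear and short: about n seconds between attempts, at least 1s
        return max(1.0, n)

    def robot_delay(n: int) -> float:
        # sigmoid ramp towards roughly 60s between attempts
        return max(1.0, (1 / (1 + math.exp(-(n * 0.5 - 4)))) * 60)

    for n in (1, 5, 10):
        print(n, human_delay(n), round(robot_delay(n), 1))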
---
 doc/progress.rst     |  1 +
 doc/usage.rst        |  9 +++-
 openml/_api_calls.py | 16 ++++++-
 openml/cli.py        | 40 +++++++++++++---
 openml/config.py     | 46 ++++++++++++-------
 openml/testing.py    |  4 +-
 tests/test_datasets/test_dataset_functions.py |  5 ++
 tests/test_openml/test_api_calls.py           |  2 +-
 tests/test_openml/test_config.py              |  6 +--
 9 files changed, 97 insertions(+), 32 deletions(-)

diff --git a/doc/progress.rst b/doc/progress.rst
index 5b3aae784..05b4b64c4 100644
--- a/doc/progress.rst
+++ b/doc/progress.rst
@@ -8,6 +8,7 @@ Changelog
 
 0.12.2
 ~~~~~~
+* ADD #1065: Add a ``retry_policy`` configuration option that determines the frequency and number of times to attempt to retry server requests.
 * ADD #1075: A docker image is now automatically built on a push to develop. It can be used to build docs or run tests in an isolated environment.
 * DOC: Fixes a few broken links in the documentation.
 * MAINT/DOC: Automatically check for broken external links when building the documentation.
diff --git a/doc/usage.rst b/doc/usage.rst
index fd7d5fbec..4b40decc8 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -52,9 +52,14 @@ which are separated by newlines. The following keys are defined:
   * if set to ``True``, when ``run_flow_on_task`` or similar methods are called a lookup is performed to see if there already exists such a run on the server. If so, download those results instead.
   * if not given, will default to ``True``.
 
+* retry_policy:
+  * Defines how to react when the server is unavailable or experiencing high load. It determines both how often to attempt to reconnect and how quickly to do so. Please don't use ``human`` in an automated script that you run more than one instance of, as it might increase the time to complete your jobs and those of others.
+  * human (default): For people running openml in an interactive fashion. Try only a few times, but in quick succession.
+  * robot: For people using openml in an automated fashion. Keep trying to reconnect for a longer time, quickly increasing the time between retries.
+
 * connection_n_retries:
-  * number of connection retries.
-  * default: 2. Maximum number of retries: 20.
+  * number of connection retries
+  * default depends on retry_policy (5 for ``human``, 50 for ``robot``)
 
 * verbosity:
   * 0: normal output
diff --git a/openml/_api_calls.py b/openml/_api_calls.py
index 624b0da45..b5ed976bc 100644
--- a/openml/_api_calls.py
+++ b/openml/_api_calls.py
@@ -3,7 +3,9 @@
 import time
 import hashlib
 import logging
+import math
 import pathlib
+import random
 import requests
 import urllib.parse
 import xml
@@ -217,7 +219,7 @@ def __is_checksum_equal(downloaded_file, md5_checksum=None):
 
 
 def _send_request(request_method, url, data, files=None, md5_checksum=None):
-    n_retries = max(1, min(config.connection_n_retries, config.max_retries))
+    n_retries = max(1, config.connection_n_retries)
 
     response = None
     with requests.Session() as session:
@@ -261,7 +263,17 @@ def _send_request(request_method, url, data, files=None, md5_checksum=None):
                 if retry_counter >= n_retries:
                     raise
                 else:
-                    time.sleep(retry_counter)
+
+                    def robot(n: int) -> float:
+                        wait = (1 / (1 + math.exp(-(n * 0.5 - 4)))) * 60
+                        variation = random.gauss(0, wait / 10)
+                        return max(1.0, wait + variation)
+
+                    def human(n: int) -> float:
+                        return max(1.0, n)
+
+                    delay = {"human": human, "robot": robot}[config.retry_policy](retry_counter)
+                    time.sleep(delay)
     if response is None:
         raise ValueError("This should never happen!")
     return response
diff --git a/openml/cli.py b/openml/cli.py
index b26e67d2e..15654cfc6 100644
--- a/openml/cli.py
+++ b/openml/cli.py
@@ -149,11 +149,9 @@ def check_cache_dir(path: str) -> str:
 def configure_connection_n_retries(value: str) -> None:
     def valid_connection_retries(n: str) -> str:
         if not n.isdigit():
-            return f"Must be an integer number (smaller than {config.max_retries})."
-        if int(n) > config.max_retries:
-            return f"connection_n_retries may not exceed {config.max_retries}."
-        if int(n) == 0:
-            return "connection_n_retries must be non-zero."
+            return f"'{n}' is not a valid positive integer."
+        if int(n) <= 0:
+            return "connection_n_retries must be positive."
         return ""
 
     configure_field(
@@ -161,7 +159,7 @@ def valid_connection_retries(n: str) -> str:
         value=value,
         check_with_message=valid_connection_retries,
         intro_message="Configuring the number of times to attempt to connect to the OpenML Server",
-        input_message=f"Enter an integer between 0 and {config.max_retries}: ",
+        input_message="Enter a positive integer: ",
     )
 
 
@@ -217,6 +215,35 @@ def is_zero_through_two(verbosity: str) -> str:
     )
 
 
+def configure_retry_policy(value: str) -> None:
+    def is_known_policy(policy: str) -> str:
+        if policy in ["human", "robot"]:
+            return ""
+        return "Must be 'human' or 'robot'."
+
+    def autocomplete_policy(policy: str) -> str:
+        for option in ["human", "robot"]:
+            if option.startswith(policy.lower()):
+                return option
+        return policy
+
+    intro_message = (
+        "Set the retry policy which determines how to react if the server is unresponsive. "
+        "We recommend 'human' for interactive usage and 'robot' for scripts. "
+        "'human': try a few times in quick succession, less reliable but quicker response. "
+        "'robot': try many times with increasing intervals, more reliable but slower response."
+ ) + + configure_field( + field="retry_policy", + value=value, + check_with_message=is_known_policy, + intro_message=intro_message, + input_message="Enter 'human' or 'robot': ", + sanitize=autocomplete_policy, + ) + + def configure_field( field: str, value: Union[None, str], @@ -272,6 +299,7 @@ def configure(args: argparse.Namespace): "apikey": configure_apikey, "server": configure_server, "cachedir": configure_cachedir, + "retry_policy": configure_retry_policy, "connection_n_retries": configure_connection_n_retries, "avoid_duplicate_runs": configure_avoid_duplicate_runs, "verbosity": configure_verbosity, diff --git a/openml/config.py b/openml/config.py index f2264dc2a..8593ad484 100644 --- a/openml/config.py +++ b/openml/config.py @@ -9,7 +9,7 @@ import os from pathlib import Path import platform -from typing import Tuple, cast, Any +from typing import Tuple, cast, Any, Optional import warnings from io import StringIO @@ -95,11 +95,10 @@ def set_file_log_level(file_output_level: int): else os.path.join("~", ".openml") ), "avoid_duplicate_runs": "True", - "connection_n_retries": "10", - "max_retries": "20", + "retry_policy": "human", + "connection_n_retries": "5", } - # Default values are actually added here in the _setup() function which is # called at the end of this module server = str(_defaults["server"]) # so mypy knows it is a string @@ -122,9 +121,26 @@ def get_server_base_url() -> str: cache_directory = str(_defaults["cachedir"]) # so mypy knows it is a string avoid_duplicate_runs = True if _defaults["avoid_duplicate_runs"] == "True" else False -# Number of retries if the connection breaks +retry_policy = _defaults["retry_policy"] connection_n_retries = int(_defaults["connection_n_retries"]) -max_retries = int(_defaults["max_retries"]) + + +def set_retry_policy(value: str, n_retries: Optional[int] = None) -> None: + global retry_policy + global connection_n_retries + default_retries_by_policy = dict(human=5, robot=50) + + if value not in default_retries_by_policy: + raise ValueError( + f"Detected retry_policy '{value}' but must be one of {default_retries_by_policy}" + ) + if n_retries is not None and not isinstance(n_retries, int): + raise TypeError(f"`n_retries` must be of type `int` or `None` but is `{type(n_retries)}`.") + if isinstance(n_retries, int) and n_retries < 1: + raise ValueError(f"`n_retries` is '{n_retries}' but must be positive.") + + retry_policy = value + connection_n_retries = default_retries_by_policy[value] if n_retries is None else n_retries class ConfigurationForExamples: @@ -205,8 +221,6 @@ def _setup(config=None): global server global cache_directory global avoid_duplicate_runs - global connection_n_retries - global max_retries config_file = determine_config_file_path() config_dir = config_file.parent @@ -238,8 +252,12 @@ def _get(config, key): apikey = _get(config, "apikey") server = _get(config, "server") short_cache_dir = _get(config, "cachedir") - connection_n_retries = int(_get(config, "connection_n_retries")) - max_retries = int(_get(config, "max_retries")) + + n_retries = _get(config, "connection_n_retries") + if n_retries is not None: + n_retries = int(n_retries) + + set_retry_policy(_get(config, "retry_policy"), n_retries) cache_directory = os.path.expanduser(short_cache_dir) # create the cache subdirectory @@ -261,12 +279,6 @@ def _get(config, key): "not working properly." 
% config_dir ) - if connection_n_retries > max_retries: - raise ValueError( - "A higher number of retries than {} is not allowed to keep the " - "server load reasonable".format(max_retries) - ) - def set_field_in_config_file(field: str, value: Any): """ Overwrites the `field` in the configuration file with the new `value`. """ @@ -317,7 +329,7 @@ def get_config_as_dict(): config["cachedir"] = cache_directory config["avoid_duplicate_runs"] = avoid_duplicate_runs config["connection_n_retries"] = connection_n_retries - config["max_retries"] = max_retries + config["retry_policy"] = retry_policy return config diff --git a/openml/testing.py b/openml/testing.py index f8e22bb4c..922d373b2 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -94,8 +94,9 @@ def setUp(self, n_levels: int = 1): openml.config.cache_directory = self.workdir # Increase the number of retries to avoid spurious server failures + self.retry_policy = openml.config.retry_policy self.connection_n_retries = openml.config.connection_n_retries - openml.config.connection_n_retries = 10 + openml.config.set_retry_policy("robot", n_retries=20) def tearDown(self): os.chdir(self.cwd) @@ -109,6 +110,7 @@ def tearDown(self): raise openml.config.server = self.production_server openml.config.connection_n_retries = self.connection_n_retries + openml.config.retry_policy = self.retry_policy @classmethod def _mark_entity_for_removal(self, entity_type, entity_id): diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index ec9dd6c53..9d67ee177 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -506,6 +506,9 @@ def test__getarff_md5_issue(self): "oml:md5_checksum": "abc", "oml:url": "https://www.openml.org/data/download/61", } + n = openml.config.connection_n_retries + openml.config.connection_n_retries = 1 + self.assertRaisesRegex( OpenMLHashException, "Checksum of downloaded file is unequal to the expected checksum abc when downloading " @@ -514,6 +517,8 @@ def test__getarff_md5_issue(self): description, ) + openml.config.connection_n_retries = n + def test__get_dataset_features(self): features_file = _get_dataset_features_file(self.workdir, 2) self.assertIsInstance(features_file, str) diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py index 459a0cdf5..16bdbc7df 100644 --- a/tests/test_openml/test_api_calls.py +++ b/tests/test_openml/test_api_calls.py @@ -29,4 +29,4 @@ def test_retry_on_database_error(self, Session_class_mock, _): ): openml._api_calls._send_request("get", "/abc", {}) - self.assertEqual(Session_class_mock.return_value.__enter__.return_value.get.call_count, 10) + self.assertEqual(Session_class_mock.return_value.__enter__.return_value.get.call_count, 20) diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py index 2e2c609db..638f02420 100644 --- a/tests/test_openml/test_config.py +++ b/tests/test_openml/test_config.py @@ -44,8 +44,8 @@ def test_get_config_as_dict(self): _config["server"] = "https://test.openml.org/api/v1/xml" _config["cachedir"] = self.workdir _config["avoid_duplicate_runs"] = False - _config["connection_n_retries"] = 10 - _config["max_retries"] = 20 + _config["connection_n_retries"] = 20 + _config["retry_policy"] = "robot" self.assertIsInstance(config, dict) self.assertEqual(len(config), 6) self.assertDictEqual(config, _config) @@ -57,8 +57,8 @@ def test_setup_with_config(self): _config["server"] = "https://www.openml.org/api/v1/xml" 
_config["cachedir"] = self.workdir
         _config["avoid_duplicate_runs"] = True
+        _config["retry_policy"] = "human"
         _config["connection_n_retries"] = 100
-        _config["max_retries"] = 1000
         orig_config = openml.config.get_config_as_dict()
         openml.config._setup(_config)
         updated_config = openml.config.get_config_as_dict()

From 3aee2e05186f2151e45c9ddc5bdd0709459bfce3 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 12 May 2021 16:21:35 +0200
Subject: [PATCH 10/16] Fetch before checkout (#1079)

Because the repository at the time of building the docker image is not
aware of branches that are created afterwards, which means otherwise
those are only accessible through the openml#branch syntax.
---
 docker/startup.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker/startup.sh b/docker/startup.sh
index 1946a69cc..4c4a87776 100644
--- a/docker/startup.sh
+++ b/docker/startup.sh
@@ -44,6 +44,7 @@ if [ -n "$2" ]; then
     git fetch "$fork_url" "$fork_branch":branch_from_fork
     branch=branch_from_fork
   else
+    git fetch origin "$2"
     branch=$2
   fi
   if ! git checkout "$branch" ; then

From c8cfc907c386c8075d97bc95a1381741066201f7 Mon Sep 17 00:00:00 2001
From: Sahithya Ravi <44670788+sahithyaravi1493@users.noreply.github.com>
Date: Fri, 14 May 2021 16:04:52 +0200
Subject: [PATCH 11/16] doc update (#1077)

* doc update

* fixes
---
 doc/usage.rst                          | 6 +++---
 openml/extensions/sklearn/extension.py | 5 ++++-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/doc/usage.rst b/doc/usage.rst
index 4b40decc8..b69e3530a 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -76,9 +76,9 @@ Docker
 
 It is also possible to try out the latest development version of ``openml-python`` with docker:
 
-```
-    docker run -it openml/openml-python
-```
+
+    ``docker run -it openml/openml-python``
+
 
 See the `openml-python docker documentation `_ for more information.
 
diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py
index 5991a7044..d49a9a9c5 100644
--- a/openml/extensions/sklearn/extension.py
+++ b/openml/extensions/sklearn/extension.py
@@ -65,7 +65,10 @@
 
 
 class SklearnExtension(Extension):
-    """Connect scikit-learn to OpenML-Python."""
+    """Connect scikit-learn to OpenML-Python.
+    The estimators which use this extension must be scikit-learn compatible,
+    i.e., they need to be a subclass of ``sklearn.base.BaseEstimator``.
From bb17e72d1866e1d23dcf2eace2ca4bdd73af9d39 Mon Sep 17 00:00:00 2001
From: Matthias Feurer
Date: Mon, 17 May 2021 10:30:35 +0200
Subject: [PATCH 12/16] Rename master to main (#1076)

* rename master to main

* update changelog

* fix documentation building script

* rename master to main in all remaining docs

* drop badges

Co-authored-by: PGijsbers
---
 .github/workflows/docs.yaml                      |  6 +++---
 CONTRIBUTING.md                                  |  6 +++---
 PULL_REQUEST_TEMPLATE.md                         |  2 +-
 README.md                                        | 14 +-------------
 doc/contributing.rst                             |  6 +++---
 doc/progress.rst                                 |  2 ++
 doc/usage.rst                                    |  6 ++----
 docker/startup.sh                                |  2 +-
 examples/30_extended/custom_flow_.py             |  4 ++--
 examples/40_paper/2015_neurips_feurer_example.py |  2 +-
 openml/cli.py                                    |  2 +-
 11 files changed, 20 insertions(+), 32 deletions(-)

diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
index ab83aef5c..c14bd07d0 100644
--- a/.github/workflows/docs.yaml
+++ b/.github/workflows/docs.yaml
@@ -22,19 +22,19 @@ jobs:
          cd doc
          make linkcheck
      - name: Pull latest gh-pages
-        if: (contains(github.ref, 'develop') || contains(github.ref, 'master')) && github.event_name == 'push'
+        if: (contains(github.ref, 'develop') || contains(github.ref, 'main')) && github.event_name == 'push'
        run: |
          cd ..
          git clone https://github.com/openml/openml-python.git --branch gh-pages --single-branch gh-pages
      - name: Copy new doc into gh-pages
-        if: (contains(github.ref, 'develop') || contains(github.ref, 'master')) && github.event_name == 'push'
+        if: (contains(github.ref, 'develop') || contains(github.ref, 'main')) && github.event_name == 'push'
        run: |
          branch_name=${GITHUB_REF##*/}
          cd ../gh-pages
          rm -rf $branch_name
          cp -r ../openml-python/doc/build/html $branch_name
      - name: Push to gh-pages
-        if: (contains(github.ref, 'develop') || contains(github.ref, 'master')) && github.event_name == 'push'
+        if: (contains(github.ref, 'develop') || contains(github.ref, 'main')) && github.event_name == 'push'
        run: |
          last_commit=$(git log --pretty=format:"%an: %s")
          cd ../gh-pages

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3351bc36d..688dbd7a9 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,6 +1,6 @@
This document describes the workflow on how to contribute to the openml-python package.
If you are interested in connecting a machine learning package with OpenML (i.e.
-write an openml-python extension) or want to find other ways to contribute, see [this page](https://openml.github.io/openml-python/master/contributing.html#contributing).
+write an openml-python extension) or want to find other ways to contribute, see [this page](https://openml.github.io/openml-python/main/contributing.html#contributing).

Scope of the package
--------------------
@@ -20,7 +20,7 @@
keep the number of potential installation dependencies as low as possible.
Therefore, the connection to other machine learning libraries such as *pytorch*, *keras* or
*tensorflow* should not be done directly inside this package, but in a separate package using
the OpenML Python connector.
-More information on OpenML Python connectors can be found [here](https://openml.github.io/openml-python/master/contributing.html#contributing).
+More information on OpenML Python connectors can be found [here](https://openml.github.io/openml-python/main/contributing.html#contributing).

Reporting bugs
--------------
@@ -100,7 +100,7 @@ local disk:
   $ git checkout -b feature/my-feature
   ```

-   Always use a ``feature`` branch. It's good practice to never work on the ``master`` or ``develop`` branch!
+   Always use a ``feature`` branch. It's good practice to never work on the ``main`` or ``develop`` branch!
   To make the nature of your pull request easily visible, please prepend the name of the branch with
   the type of changes you want to merge, such as ``feature`` if it contains a new feature, ``fix`` for a
   bugfix, ``doc`` for documentation and ``maint`` for other maintenance on the package.

4. Develop the feature on your feature branch. Add changed files using ``git add`` and then ``git commit`` files:

diff --git a/PULL_REQUEST_TEMPLATE.md b/PULL_REQUEST_TEMPLATE.md
index 47a5741e6..f0bee81e0 100644
--- a/PULL_REQUEST_TEMPLATE.md
+++ b/PULL_REQUEST_TEMPLATE.md
@@ -1,6 +1,6 @@
-This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
\ No newline at end of file
+This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!

diff --git a/doc/contributing.rst b/doc/contributing.rst
index e87a02dfb..c8fd5347a 100644
--- a/doc/contributing.rst
+++ b/doc/contributing.rst
@@ -10,7 +10,7 @@
Contribution to the OpenML package is highly appreciated in all forms.
In particular, a few ways to contribute to openml-python are:

 * A direct contribution to the package, by means of improving the
-   code, documentation or examples. To get started, see `this file `_
+   code, documentation or examples. To get started, see `this file `_
   with details on how to set up your environment to develop for openml-python.

 * A contribution to an openml-python extension. An extension package allows OpenML to interface
   For more information, see the :ref:`extensions` below.

 * Bug reports. If something doesn't work for you or is cumbersome, please open a new issue to let
-   us know about the problem. See `this section `_.
+   us know about the problem. See `this section `_.

 * `Cite OpenML `_ if you use it in a scientific publication.

 * Visit one of our `hackathons `_.

- * Contribute to another OpenML project, such as `the main OpenML project `_.
+ * Contribute to another OpenML project, such as `the main OpenML project `_.

diff --git a/doc/progress.rst b/doc/progress.rst
index 05b4b64c4..1ed7d4d2f 100644
--- a/doc/progress.rst
+++ b/doc/progress.rst
@@ -8,9 +8,11 @@ Changelog

0.12.2
~~~~~~
+
+* ADD #1065: Add a ``retry_policy`` configuration option that determines the frequency and number
+  of times to attempt to retry server requests.
* ADD #1075: A docker image is now automatically built on a push to develop. It can be used to
  build docs or run tests in an isolated environment.
* DOC: Fixes a few broken links in the documentation.
+* MAINT: Rename the ``master`` branch to the ``main`` branch.
* MAINT/DOC: Automatically check for broken external links when building the documentation.
* MAINT/DOC: Fail documentation building on warnings. This will make the documentation building
  fail if a reference cannot be found (i.e. an internal link is broken).

diff --git a/doc/usage.rst b/doc/usage.rst
index b69e3530a..7abaacb10 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -40,7 +40,8 @@ directory of the user and is called config. It consists of ``key = value`` pairs
which are separated by newlines. The following keys are defined:

* apikey:
-    * required to access the server. The `OpenML setup `_ describes how to obtain an API key.
+    * required to access the server. The :ref:`sphx_glr_examples_20_basic_introduction_tutorial.py`
+      describes how to obtain an API key.

* server:
    * default: ``http://www.openml.org``. Alternatively, use ``test.openml.org`` for the test server.
@@ -76,11 +77,8 @@ Docker

It is also possible to try out the latest development version of ``openml-python`` with docker:

-    ``docker run -it openml/openml-python``
-
-
See the `openml-python docker documentation `_ for more information.

~~~~~~~~~~~~

diff --git a/docker/startup.sh b/docker/startup.sh
index 4c4a87776..2a75a621c 100644
--- a/docker/startup.sh
+++ b/docker/startup.sh
@@ -59,7 +59,7 @@ fi
source /omlp/venv/bin/activate
cd $code_dir

-# The most recent ``master`` is already installed, but we want to update any outdated dependencies
+# The most recent ``main`` is already installed, but we want to update any outdated dependencies
pip install -e .[test,examples,docs,examples_unix]

if [ "$1" == "test" ]; then

diff --git a/examples/30_extended/custom_flow_.py b/examples/30_extended/custom_flow_.py
index 1259acf57..ae5f37631 100644
--- a/examples/30_extended/custom_flow_.py
+++ b/examples/30_extended/custom_flow_.py
@@ -4,7 +4,7 @@
================================

The most convenient way to create a flow for your machine learning workflow is to generate it
-automatically as described in the `Obtain Flow IDs `_ tutorial.  # noqa E501
+automatically as described in the :ref:`sphx_glr_examples_30_extended_flow_id_tutorial.py` tutorial.
However, there are scenarios where this is not possible, such
as when the flow uses a framework without an extension or when the flow is described by a script.
@@ -31,7 +31,7 @@
# 1. Defining the flow
# ====================
# The first step is to define all the hyperparameters of your flow.
-# The API pages feature a descriptions of each variable of the `OpenMLFlow `_.  # noqa E501
+# The API pages feature a description of each variable of the :class:`openml.flows.OpenMLFlow`.
# Note that `external version` and `name` together uniquely identify a flow.
#
# The AutoML Benchmark runs AutoML systems across a range of tasks.

diff --git a/examples/40_paper/2015_neurips_feurer_example.py b/examples/40_paper/2015_neurips_feurer_example.py
index 721186016..3960c3852 100644
--- a/examples/40_paper/2015_neurips_feurer_example.py
+++ b/examples/40_paper/2015_neurips_feurer_example.py
@@ -4,7 +4,7 @@
A tutorial on how to get the datasets used in the paper introducing *Auto-sklearn* by Feurer et al..

-Auto-sklearn website: https://automl.github.io/auto-sklearn/master/
+Auto-sklearn website: https://automl.github.io/auto-sklearn/

Publication
~~~~~~~~~~~

diff --git a/openml/cli.py b/openml/cli.py
index 15654cfc6..cfd453e9f 100644
--- a/openml/cli.py
+++ b/openml/cli.py
@@ -331,7 +331,7 @@ def main() -> None:
    parser_configure = subparsers.add_parser(
        "configure",
        description="Set or read variables in your configuration file. For more help also see "
-        "'https://openml.github.io/openml-python/master/usage.html#configuration'.",
+        "'https://openml.github.io/openml-python/main/usage.html#configuration'.",
    )

    configurable_fields = [f for f in config._defaults if f not in ["max_retries"]]
From 79e647df81e98e41ab4e65a27f928e3e328db4ed Mon Sep 17 00:00:00 2001
From: janvanrijn
Date: Tue, 18 May 2021 19:42:26 +0200
Subject: [PATCH 13/16] Extend extensions page (#1080)

* started working on additional information for extension

* extended documentation

* final pass over extensions

* Update doc/extensions.rst

Co-authored-by: Matthias Feurer

* Update doc/extensions.rst

Co-authored-by: Matthias Feurer

* changes suggested by MF

* Update doc/extensions.rst

Co-authored-by: PGijsbers

* Update doc/extensions.rst

Co-authored-by: PGijsbers

* Update doc/extensions.rst

Co-authored-by: PGijsbers

* added info to optional method

* fix documentation building

* updated doc

Co-authored-by: Matthias Feurer
Co-authored-by: PGijsbers
---
 doc/contributing.rst |  6 +---
 doc/extensions.rst   | 86 +++++++++++++++++++++++++++++++++++++++++---
 doc/usage.rst        |  4 ++-
 3 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/doc/contributing.rst b/doc/contributing.rst
index c8fd5347a..f710f8a71 100644
--- a/doc/contributing.rst
+++ b/doc/contributing.rst
@@ -25,8 +25,4 @@ In particular, a few ways to contribute to openml-python are:

 * Visit one of our `hackathons `_.

- * Contribute to another OpenML project, such as `the main OpenML project `_.
-
-.. _extensions:
-
-
+ * Contribute to another OpenML project, such as `the main OpenML project `_.

diff --git a/doc/extensions.rst b/doc/extensions.rst
index ea12dda6a..0e3d7989e 100644
--- a/doc/extensions.rst
+++ b/doc/extensions.rst
@@ -27,9 +27,14 @@ Connecting new machine learning libraries

Content of the Library
~~~~~~~~~~~~~~~~~~~~~~

-To leverage support from the community and to tap in the potential of OpenML, interfacing
-with popular machine learning libraries is essential. However, the OpenML-Python team does
-not have the capacity to develop and maintain such interfaces on its own. For this, we
+To leverage support from the community and to tap into the potential of OpenML,
+interfacing with popular machine learning libraries is essential.
+The OpenML-Python package is capable of downloading meta-data and results (data,
+flows, runs), regardless of the library that was used to upload it.
+However, in order to simplify the process of uploading flows and runs from a
+specific library, an additional interface can be built.
+The OpenML-Python team does not have the capacity to develop and maintain such
+interfaces on its own. For this reason, we
have built an extension interface to allow others to contribute back.
Building a suitable extension therefore requires an understanding of the
current OpenML-Python support.
@@ -48,7 +53,7 @@ API
* This class needs to have all the functions from `class Extension` overloaded as required.
* The redefined functions should have adequate and appropriate docstrings. The
  :class:`openml.extensions.sklearn.SklearnExtension` API
-  is a good benchmark to follow.
+  is a good example to follow.


Interfacing with OpenML-Python
@@ -57,6 +62,79 @@ Once the new extension class has been defined, the openml-python module to
:meth:`openml.extensions.register_extension` must be called to allow OpenML-Python to interface
the new extension.
+
+The following methods should be implemented. Although the documentation in
+the `Extension` interface should always be considered leading, here we list some
+additional information and best practices.
+The :class:`openml.extensions.sklearn.SklearnExtension` API
+is a good example to follow. Note that most methods are relatively simple and can be
+implemented in several lines of code.
+
+* General setup (required)
+
+  * :meth:`can_handle_flow`: Takes as argument an OpenML flow, and checks
+    whether it can be handled by the current extension. The OpenML database
+    consists of many flows, from various workbenches (e.g., scikit-learn, Weka,
+    mlr). This method is called before a model is deserialized.
+    Typically, the flow-dependency field is used to check whether the specific
+    library is present, and that no unknown libraries are present there.
+  * :meth:`can_handle_model`: Similar to :meth:`can_handle_flow`, except that
+    in this case a Python object is given. As such, in many cases, this method
+    can be implemented by checking whether the model adheres to a certain base class.
+
+* Serialization and de-serialization (required)
+
+  * :meth:`flow_to_model`: Deserializes the OpenML flow into a model (if the
+    library can indeed handle the flow). This method has an important interplay
+    with :meth:`model_to_flow`.
+    Running these two methods in succession should result in exactly the same
+    model (or flow). This property can be used for unit testing (e.g., build a
+    model with hyperparameters, make predictions on a task, serialize it to a flow,
+    deserialize it back, make it predict on the same task, and check whether the
+    predictions are exactly the same).
+    The example in the scikit-learn interface might seem daunting, but note that
+    some complicated design choices were made there that allow for all sorts of
+    interesting research questions. It is probably good practice to start easy.
+  * :meth:`model_to_flow`: The inverse of :meth:`flow_to_model`. Serializes a
+    model into an OpenML flow. The flow should preserve the class, the library
+    version, and the tunable hyperparameters.
+  * :meth:`get_version_information`: Returns a tuple with the version information
+    of the important libraries.
+  * :meth:`create_setup_string`: No longer used, and will be deprecated soon.
+
+* Performing runs (required)
+
+  * :meth:`is_estimator`: Gets a class as input, and checks whether it has the
+    status of an estimator in the library (typically, whether it has a train
+    method and a predict method).
+  * :meth:`seed_model`: Sets a random seed on the model.
+  * :meth:`_run_model_on_fold`: One of the main requirements for a library to
+    generate run objects for the OpenML server. Obtains a train split (with
+    labels) and a test split (without labels), and the goal is to train a model
+    on the train split and return the predictions on the test split.
+    On top of the actual predictions, the class probabilities should also be
+    determined.
+    For classifiers that do not return class probabilities, this can just be the
+    one-hot-encoded predicted label.
+    The predictions will be evaluated on the OpenML server.
+    Additional information can also be returned, for example, user-defined
+    measures (such as runtime information, as this cannot be inferred on the
+    server).
+    Additionally, information about a hyperparameter optimization trace can be
+    provided.
+  * :meth:`obtain_parameter_values`: Obtains the hyperparameters of a given
+    model and their current values. Please note that in the case of a hyperparameter
+    optimization procedure (e.g., random search), you should only return the
+    hyperparameters of this procedure (e.g., the hyperparameter grid, budget,
+    etc.), and the chosen model will be inferred from the optimization trace.
+  * :meth:`check_if_model_fitted`: Checks whether the train method of the model
+    has been called (and as such, whether the predict method can be used).
+
+* Hyperparameter optimization (optional)
+
+  * :meth:`instantiate_model_from_hpo_class`: If a given run has recorded the
+    hyperparameter optimization trace, then this method can be used to
+    reinstantiate the model with the hyperparameters of a given hyperparameter
+    optimization iteration. It has some similarities with :meth:`flow_to_model` (as
+    this method also sets the hyperparameters of a model).
+    Note that although this method is required, it is not necessary to implement
+    any logic if hyperparameter optimization is not implemented. Simply raise
+    a `NotImplementedError` in that case.

Hosting the library
~~~~~~~~~~~~~~~~~~~

diff --git a/doc/usage.rst b/doc/usage.rst
index 7abaacb10..dd85d989c 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -77,7 +77,9 @@ Docker

It is also possible to try out the latest development version of ``openml-python`` with docker:

-    ``docker run -it openml/openml-python``
+.. code:: bash
+
+    docker run -it openml/openml-python

See the `openml-python docker documentation `_ for more information.
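The method list added above maps naturally onto a subclass of the `Extension` interface. A bare-bones skeleton might start as follows; this is a hypothetical sketch with abbreviated signatures, where `mylibrary` and `MyLibraryExtension` are illustrative names, and only the method names and `register_extension` come from the interface documented in this patch:

from typing import Any

from openml.extensions import Extension, register_extension
from openml.flows import OpenMLFlow


class MyLibraryExtension(Extension):
    """Sketch of an extension for a hypothetical 'mylibrary' package."""

    @classmethod
    def can_handle_flow(cls, flow: OpenMLFlow) -> bool:
        # Typically: inspect the flow's dependencies field for the library name.
        return "mylibrary" in (flow.dependencies or "")

    @classmethod
    def can_handle_model(cls, model: Any) -> bool:
        # Often just a base-class check, as described above.
        import mylibrary  # hypothetical third-party package

        return isinstance(model, mylibrary.BaseModel)

    def instantiate_model_from_hpo_class(self, model: Any, trace_iteration: Any) -> Any:
        # Acceptable if hyperparameter optimization is not supported:
        raise NotImplementedError

    # flow_to_model, model_to_flow, _run_model_on_fold, and the remaining
    # required methods are omitted here for brevity.


register_extension(MyLibraryExtension)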
From 0b786e405ec74e6ea9724b8a79c924a15c17b375 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 19 May 2021 17:46:33 +0200
Subject: [PATCH 14/16] Don't fail when Parquet server can't be reached (#1085)

The Parquet file is optional, and failing to reach it (and download it)
should not prevent the usage of the other cached/downloaded files.
---
 openml/datasets/functions.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
index 746285650..1b5c40e12 100644
--- a/openml/datasets/functions.py
+++ b/openml/datasets/functions.py
@@ -8,6 +8,7 @@
import numpy as np
import arff
import pandas as pd
+import urllib3
import xmltodict

from scipy.sparse import coo_matrix
@@ -425,7 +426,10 @@ def get_dataset(
    arff_file = _get_dataset_arff(description) if download_data else None

    if "oml:minio_url" in description and download_data:
-        parquet_file = _get_dataset_parquet(description)
+        try:
+            parquet_file = _get_dataset_parquet(description)
+        except urllib3.exceptions.MaxRetryError:
+            parquet_file = None
    else:
        parquet_file = None
    remove_dataset_cache = False
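The change above makes the Parquet download best-effort: a connection failure degrades to the existing ARFF-based code path instead of aborting `get_dataset`. The same guard can be written as a small reusable helper; this is a hypothetical sketch of the pattern, not part of the patch:

import urllib3


def best_effort(download, *args, **kwargs):
    """Run a download callable, returning None if its host is unreachable.

    Mirrors the try/except urllib3.exceptions.MaxRetryError guard used
    around _get_dataset_parquet in the diff above.
    """
    try:
        return download(*args, **kwargs)
    except urllib3.exceptions.MaxRetryError:
        return None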
From 68f51a93f9adbff713bd8b638c6895c6b992b2a0 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Thu, 20 May 2021 11:12:20 +0200
Subject: [PATCH 15/16] Allow tasks to be downloaded without dataqualities (#1086)

* Allow tasks to be downloaded without dataqualities

Previously ``download_qualities`` would be left at the default of True
with no way to overwrite it.

* Deprecate the use of strings for identifying tasks
---
 doc/progress.rst             |  1 +
 openml/datasets/functions.py |  8 +++++---
 openml/tasks/functions.py    | 37 +++++++++++++++++++++------------
 3 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/doc/progress.rst b/doc/progress.rst
index 1ed7d4d2f..32259928a 100644
--- a/doc/progress.rst
+++ b/doc/progress.rst
@@ -11,6 +11,7 @@ Changelog
* ADD #1065: Add a ``retry_policy`` configuration option that determines the frequency and number
  of times to attempt to retry server requests.
* ADD #1075: A docker image is now automatically built on a push to develop. It can be used to
  build docs or run tests in an isolated environment.
+* ADD: You can now avoid downloading 'qualities' meta-data when downloading a task with the
+  ``download_qualities`` parameter of ``openml.tasks.get_task[s]`` functions.
* DOC: Fixes a few broken links in the documentation.
* MAINT: Rename the ``master`` branch to the ``main`` branch.
* MAINT/DOC: Automatically check for broken external links when building the documentation.

diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
index 1b5c40e12..34156eff7 100644
--- a/openml/datasets/functions.py
+++ b/openml/datasets/functions.py
@@ -370,7 +370,7 @@ def get_dataset(
    ----------
    dataset_id : int or str
        Dataset ID of the dataset to download
-    download_data : bool, optional (default=True)
+    download_data : bool (default=True)
        If True, also download the data file. Beware that some datasets are large and it might
        make the operation noticeably slower. Metadata is also still retrieved.
        If False, create the OpenMLDataset and only populate it with the metadata.
@@ -378,12 +378,14 @@ def get_dataset(
    version : int, optional (default=None)
        Specifies the version if `dataset_id` is specified by name.
        If no version is specified, retrieve the least recent still active version.
-    error_if_multiple : bool, optional (default=False)
+    error_if_multiple : bool (default=False)
        If ``True`` raise an error if multiple datasets are found with matching criteria.
-    cache_format : str, optional (default='pickle')
+    cache_format : str (default='pickle')
        Format for caching the dataset - may be feather or pickle.
        Note that the default 'pickle' option may load slower than feather when
        the number of rows is very high.
+    download_qualities : bool (default=True)
+        Option to download 'qualities' meta-data in addition to the minimal dataset description.

    Returns
    -------
    dataset : :class:`openml.OpenMLDataset`

diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
index f775f5e10..2c5a56ad7 100644
--- a/openml/tasks/functions.py
+++ b/openml/tasks/functions.py
@@ -1,10 +1,10 @@
# License: BSD 3-Clause
-
+import warnings
from collections import OrderedDict
import io
import re
import os
-from typing import Union, Dict, Optional
+from typing import Union, Dict, Optional, List

import pandas as pd
import xmltodict
@@ -297,17 +297,21 @@ def __list_tasks(api_call, output_format="dict"):
    return tasks


-def get_tasks(task_ids, download_data=True):
+def get_tasks(
+    task_ids: List[int], download_data: bool = True, download_qualities: bool = True
+) -> List[OpenMLTask]:
    """Download tasks.

    This function iterates :meth:`openml.tasks.get_task`.

    Parameters
    ----------
-    task_ids : iterable
-        Integers/Strings representing task ids.
-    download_data : bool
+    task_ids : List[int]
+        A list of task ids to download.
+    download_data : bool (default = True)
        Option to trigger download of data along with the meta data.
+    download_qualities : bool (default=True)
+        Option to download 'qualities' meta-data in addition to the minimal dataset description.

    Returns
    -------
    list
    """
    tasks = []
    for task_id in task_ids:
-        tasks.append(get_task(task_id, download_data))
+        tasks.append(get_task(task_id, download_data, download_qualities))
    return tasks


@openml.utils.thread_safe_if_oslo_installed
-def get_task(task_id: int, download_data: bool = True) -> OpenMLTask:
+def get_task(
+    task_id: int, download_data: bool = True, download_qualities: bool = True
+) -> OpenMLTask:
    """Download OpenML task for a given task ID.

    Downloads the task representation, while the data splits can be

    Parameters
    ----------
-    task_id : int or str
-        The OpenML task id.
-    download_data : bool
+    task_id : int
+        The OpenML task id of the task to download.
+    download_data : bool (default=True)
        Option to trigger download of data along with the meta data.
+    download_qualities : bool (default=True)
+        Option to download 'qualities' meta-data in addition to the minimal dataset description.

    Returns
    -------
    task
    """
+    if not isinstance(task_id, int):
+        warnings.warn("Task id must be specified as `int` from 0.14.0 onwards.", DeprecationWarning)
+
    try:
        task_id = int(task_id)
    except (ValueError, TypeError):
-        raise ValueError("Dataset ID is neither an Integer nor can be " "cast to an Integer.")
+        raise ValueError("Dataset ID is neither an Integer nor can be cast to an Integer.")

    tid_cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id,)

    try:
        task = _get_task_description(task_id)
-        dataset = get_dataset(task.dataset_id, download_data)
+        dataset = get_dataset(task.dataset_id, download_data, download_qualities=download_qualities)
        # List of class labels available in dataset description
        # Including class labels as part of task meta data handles
        # the case where data download was initially disabled
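With the new signatures in place, both functions can skip the optional downloads. A short usage sketch, taken directly from the signatures above (the task ids are arbitrary examples):

import openml

# Only the task description: skip the data files and the 'qualities' meta-data.
task = openml.tasks.get_task(31, download_data=False, download_qualities=False)

# get_tasks forwards both flags to get_task for every id in the list.
tasks = openml.tasks.get_tasks([31, 53], download_data=False, download_qualities=False)

# Passing a string id still works here, but now emits a DeprecationWarning;
# ints are required from 0.14.0 onwards.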
From b0765a59471b780d655143f2566785a2776f90ba Mon Sep 17 00:00:00 2001
From: Matthias Feurer
Date: Thu, 20 May 2021 14:53:10 +0200
Subject: [PATCH 16/16] prepare release 0.12.2 (#1082)

---
 doc/progress.rst      | 4 ++++
 openml/__version__.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/doc/progress.rst b/doc/progress.rst
index 32259928a..b0c182e05 100644
--- a/doc/progress.rst
+++ b/doc/progress.rst
@@ -13,6 +13,10 @@ Changelog
* ADD #1075: A docker image is now automatically built on a push to develop. It can be used to
  build docs or run tests in an isolated environment.
* ADD: You can now avoid downloading 'qualities' meta-data when downloading a task with the
  ``download_qualities`` parameter of ``openml.tasks.get_task[s]`` functions.
* DOC: Fixes a few broken links in the documentation.
+* DOC #1061: Improve examples to always show a warning when they switch to the test server.
+* DOC #1067: Improve documentation on the scikit-learn extension interface.
+* DOC #1068: Create dedicated extensions page.
+* FIX #1075: Correctly convert `y` to a pandas series when downloading sparse data.
* MAINT: Rename the ``master`` branch to the ``main`` branch.
* MAINT/DOC: Automatically check for broken external links when building the documentation.
* MAINT/DOC: Fail documentation building on warnings. This will make the documentation building

diff --git a/openml/__version__.py b/openml/__version__.py
index 700e61f6a..0f368c426 100644
--- a/openml/__version__.py
+++ b/openml/__version__.py
@@ -3,4 +3,4 @@
# License: BSD 3-Clause

# The following line *must* be the last in the module, exactly as formatted:
-__version__ = "0.12.1"
+__version__ = "0.12.2"