diff --git a/.dockerignore b/.dockerignore
index d6268db1b..71a7d2156 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -18,6 +18,11 @@ env
 package-lock.json
 node_modules
 
+## Celery
+celeryconfig*
+celery-config*
+celerybeat-schedule.*
+
 ## Python / Extensions etc.
 *~
 *.mo
diff --git a/.gitignore b/.gitignore
index 44f8811fc..b92754eec 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,11 @@ node_modules
 ## Docker
 #Dockerfile
 
+## Celery
+celeryconfig*
+celery-config*
+celerybeat-schedule.*
+
 ## Python / Extensions etc.
 *~
 *.mo
diff --git a/CHANGES.rst b/CHANGES.rst
index 013752c58..40d9da52a 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -10,8 +10,41 @@ Changes
 `Unreleased `_ (latest)
 ========================================================================
 
+Important:
+----------
+- In order to support *synchronous* execution, setting ``RESULT_BACKEND`` **MUST** be specified in
+  the ``weaver.ini`` configuration file.
+  See `Weaver INI Configuration Example `_
+  in section ``[celery]`` for more details.
+- With resolution and added support of ``transmissionMode`` handling according to `OGC API - Processes` specification,
+  requests that were submitted with ``reference`` outputs will produce results in a different format than previously,
+  since this parameter was ignored and the ``value`` representation was always returned.
+
 Changes:
 --------
+- Support ``Prefer`` header with ``wait`` or ``respond-async`` directives to select ``Job`` execution mode either
+  as *synchronous* or *asynchronous* task, according to supported ``jobControlOptions`` of the relevant ``Process``
+  being executed (resolves `#247 `_).
+- Increase minor version of all ``builtin`` processes that will now be executable in either (a)synchronous mode.
+- Add ``weaver.exec_sync_max_wait`` and ``weaver.quote_sync_max_wait`` settings allowing custom definition of the
+  maximum duration that can be specified to wait for a `synchronous` response from task workers.
+- Add ``-B`` (``celery beat``) option to Docker command of ``weaver-worker`` to run scheduled tasks in parallel
+  to ``celery worker`` in order to periodically clean up task results introduced by *synchronous* execution.
+- Add support of ``transmissionMode`` handling as ``reference`` to generate HTTP ``Link`` references for results
+  requested this way (resolves `#377 `_).
+- Update every ``Process`` to report that it supports ``outputTransmission`` both as ``reference`` and ``value``,
+  since handling of results is accomplished by `Weaver` itself, regardless of the application being executed.
+- Add partial support of ``response=raw`` parameter for execution request submission in order to handle results to
+  be returned according to the specified ``outputTransmission`` by ``reference`` or ``value``.
+  Multipart contents for multi-output results are not yet supported
+  (relates to `#376 `_).
+- Add `CLI` option ``-R/--ref/--reference`` for ``execute`` operation allowing to request corresponding ``outputs``
+  by ID to be returned using the ``transmissionMode: reference`` method, producing HTTP ``Link`` headers for those
+  entries rather than inserting values in the response content body.
+- Add requested ``outputs`` into response of ``GET /jobs/{jobId}/inputs`` to obtain submitted ``Job`` definitions.
+- Add query parameter ``schema`` for ``GET /jobs/{jobId}/inputs`` (and corresponding endpoints under ``/processes``
+  and ``/providers``) allowing to retrieve submitted input values and requested outputs in either ``OGC`` or ``OLD``
+  format.
 - Improve conformance for returned status codes and error messages when requesting results for an unfinished, failed,
   or dismissed ``Job``.
 - Adjust conformance item references to correspond with `OGC API - Processes: Part 2` renamed from `Transactions` to
@@ -22,6 +55,13 @@ Changes:
 
 Fixes:
 ------
+- Fix ``outputs`` permitted to be completely omitted from the execution request
+  (resolves `#375 `_).
+- Fix ``outputs`` permitted as explicit empty mapping or list as equivalent to omitting them, defining by default
+  that all ``outputs`` should be returned with ``transmissionMode: value`` for ``Job`` execution.
+- Fix all instances of ``outputTransmission`` reported as ``reference`` in ``Process`` descriptions, although `Weaver`
+  behaved with the ``value`` method, which is to return values and file references in the content body, instead of
+  HTTP ``Link`` header references.
 - Fix `WPS 1/2` endpoint not reporting the appropriate instance URL
   (fixes `#83 `_).
 - Fix `CLI` ``deploy`` operation headers incorrectly passed down to the deployment request.
@@ -325,7 +365,7 @@ Fixes:
 - Fix parsing of inputs for `OpenSearch` parameters lookup that was assuming inputs were always provided as listing
   definition, not considering possible mapping definition.
 - Fix incorrect documentation section ``Package as External Execution Unit Reference`` where content was omitted
-  and incorrectly anchored as following ``process-esgf-cwt`` section.
+  and incorrectly anchored as following ``ESGF-CWT`` section.
 
 .. _changes_4.4.0:
diff --git a/config/weaver.ini.example b/config/weaver.ini.example
index ee216224b..625e668f6 100644
--- a/config/weaver.ini.example
+++ b/config/weaver.ini.example
@@ -67,6 +67,13 @@ weaver.ssl_verify = true
 # see 'requests_options.yml.example'
 weaver.request_options =
 
+# --- Weaver Execution settings ---
+
+# maximum wait time allowed for Prefer header to run Job/Quote synchronously
+# over this limit, they will automatically fall back to asynchronous execution/estimation
+weaver.exec_sync_max_wait = 20
+weaver.quote_sync_max_wait = 20
+
 # --- Weaver CWL settings ---
 # NOTE: [experimental]
 # enforce provided effective user/group identifiers for Application Package execution
@@ -147,6 +154,15 @@ weaver.vault_dir = /tmp/vault
 [celery]
 #USE_CELERYCONFIG = True
 BROKER_URL = mongodb://mongodb:27017/celery
+# Result backend is required for SYNC execution.
+# Using only the backend type matching the broker URL will automatically resolve to use its database location.
+# For an alternative result location, provide the full backend directly rather than using "mongodb_backend_settings".
+# This setting is not correctly parsed (dict) by "pyramid_celery", and a separate [celery:mongodb_backend_settings] is
+# also not found. If more configuration is required, consider using a "celeryconfig" (and "USE_CELERYCONFIG = True").
+# That configuration should be placed at the root of weaver since "pyramid_celery" doesn't support custom locations.
+# https://github.com/sontek/pyramid_celery/pull/89
+RESULT_BACKEND = mongodb
+# RESULT_BACKEND = mongodb://mongodb:27017/celery
 
 ###
 # wsgi server configuration
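As a complement to the ``[celery]`` comments above, the ``celeryconfig`` alternative they mention can be sketched as
follows. This is only an illustrative example (module contents and values are assumptions mirroring the MongoDB
location used above); the lowercase names are standard `Celery` settings, and the module must sit at the root of the
`Weaver` installation with ``USE_CELERYCONFIG = True`` enabled for ``pyramid_celery`` to pick it up.

.. code-block:: python
    :caption: Hypothetical ``celeryconfig.py`` sketch

    # celeryconfig.py -- picked up only when "USE_CELERYCONFIG = True" is set in weaver.ini
    # (placed at the root of the Weaver application, since "pyramid_celery" does not support custom locations)

    broker_url = "mongodb://mongodb:27017/celery"      # same broker as "BROKER_URL" in weaver.ini
    result_backend = "mongodb://mongodb:27017/celery"  # result backend required for synchronous execution

    # optional: let results expire so the "celery beat" cleanup has less to purge (value is an assumption)
    result_expires = 3600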
diff --git a/docker/Dockerfile-worker b/docker/Dockerfile-worker
index 16f0c4c4e..bb32aa319 100644
--- a/docker/Dockerfile-worker
+++ b/docker/Dockerfile-worker
@@ -13,9 +13,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     && apt update \
     # NOTE:
     #   Only install CLI package, 'docker-ce' and 'containerd.io' not required as they should be provided by host.
-    #   Docker sibliing execution is expected. See 'docker/docker-compose.yml.example' for details.
+    #   Docker sibling execution is expected. See 'docker/docker-compose.yml.example' for details.
     && apt install --no-install-recommends docker-ce-cli \
     && rm -rf /var/lib/apt/lists/*
 
 # run app
-CMD celery worker -E -A pyramid_celery.celery_app --ini "${APP_CONFIG_DIR}/weaver.ini"
+CMD celery worker -B -E -A pyramid_celery.celery_app --ini "${APP_CONFIG_DIR}/weaver.ini"
diff --git a/docs/_static/custom.css b/docs/_static/custom.css
index 6cd44e584..f649bbc24 100644
--- a/docs/_static/custom.css
+++ b/docs/_static/custom.css
@@ -1,4 +1,3 @@
-
 /* override readthedocs theme to enforce using full-screen width for content */
 .wy-nav-content {
     max-width: none;
diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst
index 868da7742..5b7449e08 100644
--- a/docs/source/configuration.rst
+++ b/docs/source/configuration.rst
@@ -213,14 +213,36 @@ they are optional and which default value or operation is applied in each situat
   completion if an email was provided in the :ref:`Execute ` request body
   (see also: :ref:`Email Notification`).
 
+.. versionadded:: 4.15.0
+
+- | ``weaver.exec_sync_max_wait``
+  | (default: ``20``, :class:`int`, seconds)
+  |
+  | Defines the maximum duration allowed for running a :term:`Job` execution in `synchronous` mode.
+  |
+  | See :ref:`proc_exec_mode` for more details on the feature and how to employ it.
+  | Ensure the `Celery`_ worker is configured as specified below.
+
+.. versionadded:: 4.15.0
+
+- | ``weaver.quote_sync_max_wait``
+  | (default: ``20``, :class:`int`, seconds)
+  |
+  | Defines the maximum duration allowed for running a :term:`Quote` estimation in `synchronous` mode.
+  |
+  | See :ref:`proc_exec_mode` for more details on the feature and how to employ it.
+  | Ensure the `Celery`_ worker is configured as specified below.
 
 .. note::
     Since `Weaver` employs `Celery`_ as task queue manager and `MongoDB`_ as backend, relevant settings for the
-    |celery-config|_ and the |celery-mongo|_ should be referred to. Processing of task jobs and results reporting
+    |celery-config|_ and the |celery-mongo|_ should be employed. Processing of task jobs and results reporting
     is accomplished according to the specific implementation of these services. Therefore, all applicable settings
     and extensions should be available for custom server configuration and scaling as needed.
 
+.. warning::
+    In order to support `synchronous` execution, the ``RESULT_BACKEND`` setting **MUST** be defined.
+
 .. |celery-config| replace:: configuration of Celery
 .. _celery-config: https://docs.celeryproject.org/en/latest/userguide/configuration.html#configuration
 .. |celery-mongo| replace:: configuration of MongoDB Backend
@@ -344,16 +366,42 @@ simply set setting ``weaver.wps_processes_file`` as *undefined* (i.e.: nothing a
 Configuration of Request Options
 =======================================
 
-.. todo:: complete docs
+.. versionadded:: 1.8.0
 
-:term:`Request Options`
+It is possible to define :term:`Request Options` that consist of additional arguments that will be passed down to
+:func:`weaver.utils.request_extra`, which essentially calls a traditional request using the :mod:`requests` module, but
+with extended handling capabilities such as caching, retrying, and file reference support. The specific parameters
+that are passed down for individual requests depend on whether a match based on URL (optionally with regex rules) and
+method definitions can be found in the :term:`Request Options` file.
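For illustration only, the per-request options that such a match can inject correspond to the kind of keyword
arguments one would otherwise pass manually to the :mod:`requests` module. The sketch below uses a hypothetical
secured server and standard :mod:`requests` parameters; refer to `request_options.yml.example`_ for the actual
file schema applied by `Weaver`.

.. code-block:: python
    :caption: Kind of per-request options resolved from the Request Options file (illustrative sketch)

    import requests

    # hypothetical request that would match an entry of the Request Options file,
    # which would then inject options similar to the keyword arguments below
    resp = requests.request(
        "GET",
        "https://secured-file-server.example.com/data.nc",
        timeout=120,                                   # extended timeout for a known slow server
        verify="/etc/ssl/certs/custom-ca.pem",         # per-request SSL verification override
        headers={"Authorization": "Bearer <token>"},   # authentication argument
    )
    resp.raise_for_status()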
This file should be provided using +the ``weaver.request_options`` configuration setting. Using this definition, it is possible to provide specific +requests handling options, such as extended timeout, authentication arguments, SSL certification verification setting, +etc. on a per-request basis, leave other requests unaffected and generally more secure. -``weaver.ssl_verify`` +.. seealso:: + File `request_options.yml.example`_ provides more details and sample :term:`YAML` format of the expected contents + for :term:`Request Options` feature. +.. seealso:: + Please refer to :func:`weaver.utils.request_extra` documentation directly for supported parameters and capabilities. -.. versionadded:: 1.8.0 -`request_options.yml.example`_ +- | ``weaver.request_options = `` + | (default: ``None``) + | + | Path of the :term:`Request Options` definitions to employ. + + +- | ``weaver.ssl_verify = true|false`` + | (default: ``true``) + | + | Toggle the SSL certificate verification across all requests. + +.. warning:: + It is **NOT** recommended to disable SSL verification across all requests for security reasons + (avoid man-in-the-middle attacks). This is crucial for requests that involve any form of authentication, secured + access or personal user data references. This should be employed only for quickly resolving issues during + development. Consider fixing SSL certificates on problematic servers, or disable the verification on a per-request + basis using :term:`Request Options` for acceptable cases. Starting the Application diff --git a/docs/source/faq.rst b/docs/source/faq.rst index ff3208cf7..02c1fed96 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -46,7 +46,7 @@ Please refer to below references for more details. .. seealso:: - - Supported :term:`Application Package` definitions in :ref:`process-wps-rest` deployment. + - Supported :term:`Application Package` definitions in :ref:`proc_wps_rest` deployment. - :ref:`Deploy ` request. diff --git a/docs/source/processes.rst b/docs/source/processes.rst index d93c5f3ff..8de5bd82c 100644 --- a/docs/source/processes.rst +++ b/docs/source/processes.rst @@ -30,6 +30,8 @@ Each one of them are accessible through the same API interface, but they have di and :ref:`Execute ` request payloads for diverse set of applications. +.. _proc_builtin: + Builtin ------- @@ -55,6 +57,8 @@ As of the latest release, following `builtin` processes are available: All `builtin` processes are marked with :py:data:`weaver.processes.constants.CWL_REQUIREMENT_APP_BUILTIN` in the :term:`CWL` ``hints`` section and are all defined in :py:mod:`weaver.processes.builtin`. +.. _proc_wps_12: + WPS-1/2 ------- @@ -99,7 +103,7 @@ Please refer to :ref:`Configuration of WPS Processes` section for more details o .. seealso:: - `Remote Provider`_ -.. _process-wps-rest: +.. _proc_wps_rest: WPS-REST -------- @@ -189,7 +193,7 @@ Where the referenced file hosted at ``"https://remote-file-server.com/my-package "<...>": "<...>" -.. _process-esgf-cwt: +.. _proc_esgf_cwt: ESGF-CWT ---------- @@ -263,7 +267,7 @@ be indicated in the logs with the appropriate step and message where the error o :ref:`proc_workflow_ops` provides more details on each of the internal operations accomplished by individual step :term:`Process` chained in a :term:`Workflow`. -.. _process-remote-provider: +.. 
_proc_remote_provider: Remote Provider -------------------- @@ -317,7 +321,7 @@ An example body of the `register provider`_ request could be as follows: } -Then, processes of this registered :ref:`process-remote-provider` will be accessible. For example, if the referenced +Then, processes of this registered :ref:`proc_remote_provider` will be accessible. For example, if the referenced service by the above URL add a WPS process identified by ``my-process``, its JSON description would be obtained with following request (`DescribeProviderProcess`_): @@ -424,79 +428,185 @@ This section will first describe the basics of this request format, and after go and parametrization of various input/output combinations. Let's employ the following example of JSON body sent to the :term:`Job` execution to better illustrate the requirements. -.. code-block:: json +.. table:: + :class: code-table + :align: center + + +-----------------------------------------------+-----------------------------------------------+ + | .. code-block:: json | .. code-block:: json | + | :caption: Job Execution Payload as Listing | :caption: Job Execution Payload as Mapping | + | | | + | { | { | + | "mode": "async", | "mode": "async", | + | "response": "document", | "response": "document", | + | "inputs": [ | "inputs": { | + | { | "input-file": { | + | "id": "input-file", | "href": "`_ + for more general details on ``transmissionMode`` parameter. + - `OGC API - Processes, Execution Mode `_ + for more general details on the execution negotiation (formerly with ``mode`` parameter) and more recently + with ``Prefer`` header. + - |ogc-exec-sync-responses|_ and |ogc-exec-async-responses|_ + for a complete listing of available ``response`` formats considering all other parameters. -The first field is ``mode``, it basically tells whether to run the :term:`Process` in a blocking (``sync``) or -non-blocking (``async``) manner. Note that support is currently limited for mode ``sync`` as this use case is often more -cumbersome than ``async`` execution. Effectively, ``sync`` mode requires to have a task worker executor available -to run the :term:`Job` (otherwise it fails immediately due to lack of processing resource), and the requester must wait -for the *whole* execution to complete to obtain the result. Given that :term:`Process` could take a very long time to -complete, it is not practical to execute them in this manner and potentially have to wait hours to retrieve outputs. -Instead, the preferred and default approach is to request an ``async`` :term:`Job` execution. When doing so, `Weaver` -will add this to a task queue for processing, and will immediately return a :term:`Job` identifier and location where -the user can probe for its status, using :ref:`Monitoring ` request. As soon as any task worker becomes -available, it will pick any leftover queued :term:`Job` to execute it. +.. |exec-api| replace:: OpenAPI Execute +.. _exec-api: `exec-req`_ + +.. _proc_exec_body: -The second field is ``response``. At the time being, `Weaver` only supports ``document`` value. This parameter is -present only for compatibility with other :term:`ADES` implementation, but does not actually affects `Weaver`'s -response. +Execution Body +~~~~~~~~~~~~~~~~~~ -Following are the ``inputs`` definition. This is the most important section of the request body. It defines which -parameters to forward to the referenced :term:`Process` to be executed. 
All ``id`` elements in this :term:`Job` request
+The ``inputs`` definition is the most important section of the request body. It is also the only one that is completely
+required when submitting the execution request, even for a no-input process (an empty mapping is needed in such a case).
+It defines which parameters
+to forward to the referenced :term:`Process` to be executed. All ``id`` elements in this :term:`Job` request
 body must correspond to valid ``inputs`` from the definition returned by :ref:`DescribeProcess ` response.
 Obviously, all formatting requirements (i.e.: proper file :term:`MIME-types`), data types (e.g.: ``int``, ``string``,
 etc.) and validations rules (e.g.: ``minOccurs``, ``AllowedValues``, etc.) must also be fulfilled. When providing
 files as input, multiple protocols are supported. See later section :ref:`File Reference Types` for details.
 
-Finally, the ``outputs`` section defines, for each ``id`` corresponding to the :term:`Process` definition, how to
-report the produced outputs from a successful :term:`Job` completion. Again, `Weaver` only implement the
-``reference`` result for the time being as this is the most common variation. In this case, the produced file is
-stored locally and exposed externally with returned reference URL. The other (unimplemented) mode ``value`` would
-return the contents directly in the response instead of the URL.
+The ``outputs`` section defines, for each ``id`` corresponding to the :term:`Process` definition, how to
+report the produced outputs from a successful :term:`Job` completion. For the time being, `Weaver` only implements the
+``reference`` result as this is the most common variation. In this case, the produced file is
+stored locally and exposed externally with the returned reference URL. The other mode ``value`` returns the contents
+directly in the response instead of the URL.
 
-.. note::
-    Other parameters can be added to the request to provide further functionalities. Above fields are the minimum
-    requirements to request a :term:`Job`. Please refer to the |exec-api|_ definition for all applicable features.
+When the ``outputs`` section is omitted, it simply means that the :term:`Process` to be executed should return all
+outputs it offers in the created :ref:`Job Results `. In such a case, because no representation mode
+is specified for individual outputs, `Weaver` automatically selects ``reference`` as it makes all outputs more easily
+accessible with distinct URLs afterwards. If the ``outputs`` section is specified, but one of the outputs defined
+in the :ref:`Process Description ` is not specified, that output should be omitted from the produced
+results. For the time being, because only ``reference`` representation is offered for produced output files, this
+filtering is not implemented as it offers no additional advantage for files accessed directly with their distinct URLs.
+This could be added later if ``Multipart`` raw data representation is required.
+Please |submit-issue|_ to request this feature if it is relevant for your use-cases.
 
-.. note::
-    Since most of the time, returned files are not human readable or are simply too large to be displayed, the
-    ``transmissionMode: value`` is rarely employed. Also, it is to be noted that outputs representing ``LiteralData``
-    (which is even more uncommon) would automatically be represented as ``value`` without explicitly requesting it,
-    as there would not be any file to return.
If this poses problem or you encounter a valid use-case where ``value`` - would be useful for your needs, please |submit-issue|_ to request the feature. +.. fixme:: + Filtering of ``outputs`` not implemented (everything always available). + https://github.com/crim-ca/weaver/issues/380 -.. |exec-api| replace:: OpenAPI Execute -.. _exec-api: `exec-req`_ +Other parameters presented in the above examples, namely ``mode`` and ``response`` are further detailed in +the following :ref:`proc_exec_mode` section. + +.. _proc_exec_mode: + +Execution Mode +~~~~~~~~~~~~~~~~~~~~~ + +In order to select how to execute a :term:`Process`, either `synchronously` or `asynchronously`, the ``Prefer`` header +should be specified. If omitted, `Weaver` defaults to `asynchronous` execution. To execute `asynchronously` explicitly, +``Prefer: respond-async`` should be used. Otherwise, the `synchronous` execution can be requested +with ``Prefer: wait=X`` where ``X`` is the duration in seconds to wait for a response. If no worker becomes available +within that time, or if this value is greater than ``weaver.exec_sync_max_wait``, the :term:`Job` will resume +`asynchronously` and the response will be returned. Furthermore, `synchronous` and `asynchronous` execution of +a :term:`Process` can only be requested for corresponding ``jobControlOptions`` it reports as supported in +its :ref:`Process Description `. It is important to provide the ``jobControlOptions`` parameter with +applicable modes when :ref:`Deploying a Process ` to allow it to run as desired. By default, `Weaver` +will assume that deployed processes are only `asynchronous` to handle longer operations. + +.. versionchanged:: + By default, every :ref:`proc_builtin` :term:`Process` can accept both modes. + All previously deployed processes will only allow `asynchronous` execution, as only this one was supported. + This should be reported in their ``jobControlOptions``. +.. warning:: + It is important to remember that the ``Prefer`` header is indeed a *preference*. If `Weaver` deems it cannot + allocate a worker to execute the task `synchronously` within a reasonable delay, it can enforce the `asynchronous` + execution. The `asynchronous` mode is also *prioritized* for running longer :term:`Job` submitted over the task + queue, as this allows `Weaver` to offer better availability for all requests submitted by its users. + The `synchronous` mode should be reserved only for very quick and relatively low computation intensive operations. + +The ``mode`` field displayed in the body is another method to tell whether to run the :term:`Process` in a blocking +(``sync``) or non-blocking (``async``) manner. Note that support is limited for mode ``sync`` as this use case is often +more cumbersome than ``async`` execution. Effectively, ``sync`` mode requires to have a task worker executor available +to run the :term:`Job` (otherwise it fails immediately due to lack of processing resource), and the requester must wait +for the *whole* execution to complete to obtain the result. Given that :term:`Process` could take a very long time to +complete, it is not practical to execute them in this manner and potentially have to wait hours to retrieve outputs. +Instead, the preferred and default approach is to request an ``async`` :term:`Job` execution. When doing so, `Weaver` +will add this to a task queue for processing, and will immediately return a :term:`Job` identifier and ``Location`` +where the user can probe for its status, using :ref:`Monitoring ` request. 
As soon as any task worker
+becomes available, it will pick any leftover queued :term:`Job` to execute it.
+
+.. note::
+    The ``mode`` field is an older methodology that precedes the official :term:`OGC API - Processes` method using
+    the ``Prefer`` header. It is recommended to employ the ``Prefer`` header, which ensures higher interoperability
+    with other services using the same standard. The ``mode`` field is deprecated and preserved only for backward
+    compatibility purposes.
+
+When requesting a `synchronous` execution, and provided a worker was available to pick and complete the task before
+the maximum ``wait`` time was reached, the final status will be directly returned. Therefore, the contents obtained this
+way will be identical to any following :ref:`Job Status ` request. If no worker is available, or if
+the worker that picked the :term:`Job` cannot complete it in time (either because it takes too long to execute or had
+to wait on resources for too long), the :term:`Job` execution will automatically switch to `asynchronous` mode.
+
+The distinction between an `asynchronous` or `synchronous` response when executing a :term:`Job` can be
+observed in multiple ways. The easiest is with the HTTP status code of the response, 200 being for
+a :term:`Job` *entirely completed* synchronously, and 201 for a created :term:`Job` that should be
+:ref:`monitored ` asynchronously. Another method is to observe the ``"status"`` value.
+Effectively, a :term:`Job` that is executed `asynchronously` will return status information contents, while
+a `synchronous` :term:`Job` will return the results directly, along with a ``Location`` header referring to the
+equivalent contents returned by :ref:`GetStatus ` as in the case of an `asynchronous` :term:`Job`.
+It is also possible to extract the ``Preference-Applied`` response header which will clearly indicate if the
+submitted ``Prefer`` header was respected (i.e.: whether it could be honored with available worker resources) or not.
+In general, this means that if the :term:`Job` submission request was provided with ``Prefer: wait=X`` but **NOT**
+replied with the same ``Preference-Applied`` value, it is safe to assume `Weaver` decided to queue the :term:`Job`
+for `asynchronous` execution. That :term:`Job` could be executed immediately, or at a later time, according to
+worker availability (a minimal client sketch illustrating this negotiation is provided below).
+
+It is also possible that a ``failed`` :term:`Job`, even when `synchronous`, will respond with equivalent contents
+to the status location instead of results. This is because it is impossible for `Weaver` to return
+the result(s) as outputs would not be generated by the incomplete :term:`Job`.
+
+Finally, the ``response`` parameter defines how to return the results produced by the :term:`Process`.
+When ``response=document``, regardless of ``mode=async`` or ``mode=sync``, and regardless of requested
+outputs ``transmissionMode=value`` or ``transmissionMode=reference``, the results will be returned in
+a :term:`JSON` format containing either literal values or URL references to produced files. If ``mode=async``,
+this results *document* is obtained with the |results-req|_ request, while ``mode=sync`` returns it directly.
+When ``response=raw``, the specific contents (type and quantity), HTTP ``Link`` headers or a mix of those components
+depend both on the number of available :term:`Process` outputs, which ones were requested, and how they were
+requested (i.e.: ``transmissionMode``).
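The negotiation described above can be illustrated with a minimal client sketch. The instance URL, process and input
below are placeholders, and the snippet only relies on the generic :mod:`requests` package rather than the `Weaver`
:term:`CLI`; it is not an official client implementation.

.. code-block:: python
    :caption: Hypothetical client negotiating (a)synchronous execution with the ``Prefer`` header

    import requests

    WEAVER_URL = "https://example.com/weaver"  # placeholder instance URL
    EXEC_URL = f"{WEAVER_URL}/processes/jsonarray2netcdf/jobs"

    body = {
        "response": "document",
        "inputs": {"input": {"href": "https://example.com/data/netcdf-list.json"}},  # placeholder input
    }
    # prefer synchronous execution, but accept an asynchronous fallback
    resp = requests.post(EXEC_URL, json=body, headers={"Prefer": "wait=10"}, timeout=30)

    if resp.status_code == 200 and resp.headers.get("Preference-Applied") == "wait=10":
        results = resp.json()                  # synchronous: results document returned directly
        status_url = resp.headers["Location"]  # equivalent job status remains available
    elif resp.status_code == 201:
        status_url = resp.headers["Location"]  # asynchronous: monitor this job status location
        # poll 'status_url' until "status" becomes "succeeded", then request f"{status_url}/results"
    else:
        resp.raise_for_status()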
It is also possible that further content negotiation gets involved +accordingly to the ``Accept`` header and available ``Content-Type`` of the outputs if multiple formats are supported +by the :term:`Process`. For more details regarding those combination, the official +|ogc-exec-sync-responses|_ and |ogc-exec-async-responses|_ should be employed as reference. + +For any of the previous combinations, it is always possible to obtain :term:`Job` outputs, along with logs, exceptions +and other details using the :ref:`proc_op_result` endpoints. + + +.. _proc_exec_steps: Execution Steps ~~~~~~~~~~~~~~~~~~~~~ @@ -508,7 +618,7 @@ parametrization details, etc.), followed by ``running`` when effectively reachin :term:`Application Package` operation. This status will remain as such until the operation completes, either with ``succeeded`` or ``failed`` status. -At any moment during ``async`` execution, the :term:`Job` status can be requested using |status-req|_. Note that +At any moment during `asynchronous` execution, the :term:`Job` status can be requested using |status-req|_. Note that depending on the timing at which the user executes this request and the availability of task workers, it could be possible that the :term:`Job` be already in ``running`` state, or even ``failed`` in case of early problem detected. @@ -671,7 +781,7 @@ combinations. | |ADES| | - `WPS-1/2`_ | |file_scheme| | Convert to |http_scheme| [#file2http]_ | | | - `ESGF-CWT`_ +---------------+-------------------------------------------+ | | - `WPS-REST`_ (remote) [#wps3]_ | |http_scheme| | Nothing (unmodified) | -| | - :ref:`process-remote-provider` +---------------+-------------------------------------------+ +| | - :ref:`proc_remote_provider` +---------------+-------------------------------------------+ | | | |s3_scheme| | Fetch and convert to |http_scheme| [#s3]_ | | | +---------------+-------------------------------------------+ | | | |vault_ref| | Convert to |http_scheme| [#vault2http]_ | @@ -695,7 +805,7 @@ combinations. | |HYBRID| | - `WPS-1/2`_ | |file_scheme| | Convert to |http_scheme| [#file2http]_ | | | - `ESGF-CWT`_ +---------------+-------------------------------------------+ | | - `WPS-REST`_ (remote) [#wps3]_ | |http_scheme| | Nothing (unmodified) | -| | - :ref:`process-remote-provider` +---------------+-------------------------------------------+ +| | - :ref:`proc_remote_provider` +---------------+-------------------------------------------+ | | | |s3_scheme| | Fetch and convert to |http_scheme| [#s3]_ | | | *Note*: |HYBRID| assumes |ADES| role +---------------+-------------------------------------------+ | | (remote processes) | |vault_ref| | Convert to |http_scheme| [#vault2http]_ | @@ -1133,10 +1243,10 @@ format is employed according to the chosen location. .. _proc_op_result: -Obtaining output results, logs or errors +Obtaining results, outputs, logs or errors --------------------------------------------------------------------- -In the case of successful :term:`Job` execution, the outputs can be retrieved with |result-req|_ request to list +In the case of successful :term:`Job` execution, the *outputs* can be retrieved with |outputs-req|_ request to list each corresponding output ``id`` with the generated file reference URL. Keep in mind that the purpose of those URLs are only to fetch the results (not persistent storage), and could therefore be purged after some reasonable amount of time. 
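For illustration, those endpoints could be queried as sketched below (the job URL is a placeholder and the generic
:mod:`requests` package is assumed); the *results* and *inputs* requests shown here are described further down in
this section.

.. code-block:: python
    :caption: Sketch of requests against the job outputs, results and inputs endpoints

    import requests

    job_url = "https://example.com/weaver/jobs/00000000-0000-0000-0000-000000000000"  # placeholder job location

    # outputs listing, always provided as JSON regardless of the original "response" format
    outputs = requests.get(f"{job_url}/outputs", timeout=30).json()

    # OGC-compliant results document (for a job submitted with "response=document")
    results = requests.get(f"{job_url}/results", timeout=30).json()

    # submitted inputs and requested outputs, optionally with "?schema=OGC" or "?schema=OLD"
    inputs = requests.get(f"{job_url}/inputs", params={"schema": "OGC"}, timeout=30).json()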
The format should be similar to the following example, with minor variations according to :ref:`Configuration`
 parameters for the base :term:`WPS` output location:
@@ -1153,6 +1263,19 @@ parameters for the base :term:`WPS` output location:
     ]
 }
 
+For the :term:`OGC`-compliant endpoint, the |results-req|_ request can be employed instead.
+In the event of a :term:`Job` executed with ``response=document``, the contents will be very similar.
+On the other hand, a :term:`Job` submitted with ``response=raw`` can produce many alternative variations according
+to :term:`OGC` requirements. For this reason, the *outputs* endpoint will always provide all data and file references
+in the response body as :term:`JSON`, no matter the original ``response`` format. The *outputs* endpoint can also
+receive additional query parameters, such as ``schema``, to return contents formatted similarly to *results*, but
+enforcing a :term:`JSON` body as if ``response=document`` was specified during submission of the :term:`Process`
+execution.
+
+In order to better understand the parameters that were submitted during :term:`Job` creation, the |inputs-req|_
+request can be employed. This will return both the data and reference inputs that were submitted, as well as
+the *requested outputs* to retrieve any relevant ``transmissionMode`` definition.
+
 In situations where the :term:`Job` resulted into ``failed`` status, the |except-req|_ can be use to retrieve
 the potential cause of failure, by capturing any raised exception. Below is an example of such exception details.
 
@@ -1188,6 +1311,14 @@ Note again that the more the :term:`Process` is verbose, the more tracking will
 .. literalinclude:: ../../weaver/wps_restapi/examples/job_logs.json
    :language: json
 
+
+.. note::
+    All endpoints to retrieve any of the above information about a :term:`Job` can either be requested directly
+    (i.e.: ``/jobs/{jobID}/...``) or with equivalent :term:`Provider` and/or :term:`Process` prefixed endpoints,
+    if the requested :term:`Job` did refer to those :term:`Provider` and/or :term:`Process`.
+    A *local* :term:`Process` would have its :term:`Job` references as ``/processes/{processId}/jobs/{jobID}/...``
+    while a :ref:`proc_remote_provider` will use ``/providers/{providerName}/processes/{processId}/jobs/{jobID}/...``.
+
 .. _vault:
 
 Uploading File to the Vault
diff --git a/docs/source/references.rst b/docs/source/references.rst
index 418d0557c..60341070c 100644
--- a/docs/source/references.rst
+++ b/docs/source/references.rst
@@ -56,6 +56,10 @@ .. _ogc-home: `ogc`_
 .. |ogc-proc-api| replace:: OGC API - Processes
 .. _ogc-proc-api: https://github.com/opengeospatial/ogcapi-processes
+.. |ogc-exec-sync-responses| replace:: OGC API - Processes, Responses (sync)
+.. _ogc-exec-sync-responses: https://docs.ogc.org/is/18-062r2/18-062r2.html#sc_execute_response
+.. |ogc-exec-async-responses| replace:: OGC API - Processes, Responses (async)
+.. _ogc-exec-async-responses: https://docs.ogc.org/is/18-062r2/18-062r2.html#_response_7
 .. |pywps| replace:: PyWPS
 .. _pywps: https://github.com/geopython/pywps/
 .. |pywps-status| replace:: Progress and Status Report
@@ -135,8 +139,12 @@ .. _status-req-name: `status-req`_
 .. |status-req| replace:: ``GET {WEAVER_URL}/processes/{id}/jobs/{id}`` (GetStatus)
 .. _status-req: https://pavics-weaver.readthedocs.io/en/latest/api.html#tag/Status%2Fpaths%2F~1processes~1{process_id}~1jobs~1{job_id}%2Fget
-.. |result-req| replace:: ``GET {WEAVER_URL}/processes/{id}/jobs/{id}/result`` (GetResult)
-..
_result-req: https://pavics-weaver.readthedocs.io/en/latest/api.html#tag/Results%2Fpaths%2F~1processes~1%7Bprocess_id%7D~1jobs~1%7Bjob_id%7D~1result%2Fget +.. |inputs-req| replace:: ``GET {WEAVER_URL}/jobs/{id}/inputs`` (Inputs) +.. _inputs-req: https://pavics-weaver.readthedocs.io/en/latest/api.html#tag/inputs/paths/~1jobs~1{job_id}~1inputs/get +.. |outputs-req| replace:: ``GET {WEAVER_URL}/jobs/{id}/outputs`` (Outputs) +.. _outputs-req: https://pavics-weaver.readthedocs.io/en/latest/api.html#tag/outputs/paths/~1jobs~1{job_id}~1outputs/get +.. |results-req| replace:: ``GET {WEAVER_URL}/jobs/{id}/results`` (Results) +.. _results-req: https://pavics-weaver.readthedocs.io/en/latest/api.html#tag/Results/paths/~1jobs~1{job_id}~1results/get .. |update-token-req| replace:: Update Token .. _update-token-req: https://pavics-weaver.readthedocs.io/en/latest/api.html#tag/UpdateToken/paths/~1processes~1{process_id}/put .. |vault-upload-req| replace:: Vault File Upload (POST) diff --git a/tests/functional/application-packages/AggregateESGF/execute.json b/tests/functional/application-packages/AggregateESGF/execute.json index 925a870a8..35037c126 100644 --- a/tests/functional/application-packages/AggregateESGF/execute.json +++ b/tests/functional/application-packages/AggregateESGF/execute.json @@ -22,7 +22,7 @@ "outputs": [ { "id": "output", - "transmissionMode": "reference" + "transmissionMode": "value" } ] } diff --git a/tests/functional/application-packages/CatalogSearch/execute.json b/tests/functional/application-packages/CatalogSearch/execute.json index 0a0949de0..48322f9c9 100644 --- a/tests/functional/application-packages/CatalogSearch/execute.json +++ b/tests/functional/application-packages/CatalogSearch/execute.json @@ -10,7 +10,7 @@ "outputs": [ { "id": "output", - "transmissionMode": "reference" + "transmissionMode": "value" } ] } diff --git a/tests/functional/application-packages/CatalogSearch/package.cwl b/tests/functional/application-packages/CatalogSearch/package.cwl index b0b995099..c6ae0f45c 100644 --- a/tests/functional/application-packages/CatalogSearch/package.cwl +++ b/tests/functional/application-packages/CatalogSearch/package.cwl @@ -39,7 +39,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ] }, "executionUnit": [ diff --git a/tests/functional/application-packages/ColibriFlyingpigeon_SubsetBbox/execute.json b/tests/functional/application-packages/ColibriFlyingpigeon_SubsetBbox/execute.json index 0247ea813..4f8d82642 100644 --- a/tests/functional/application-packages/ColibriFlyingpigeon_SubsetBbox/execute.json +++ b/tests/functional/application-packages/ColibriFlyingpigeon_SubsetBbox/execute.json @@ -26,7 +26,7 @@ "outputs": [ { "id": "output", - "transmissionMode": "reference" + "transmissionMode": "value" } ] } diff --git a/tests/functional/application-packages/DockerCopyImages/execute.json b/tests/functional/application-packages/DockerCopyImages/execute.json index 6d0c771e2..5a2f3884e 100644 --- a/tests/functional/application-packages/DockerCopyImages/execute.json +++ b/tests/functional/application-packages/DockerCopyImages/execute.json @@ -10,7 +10,7 @@ "outputs": [ { "id": "output_files", - "transmissionMode": "reference" + "transmissionMode": "value" } ] } diff --git a/tests/functional/application-packages/DockerCopyNestedOutDir/execute.json b/tests/functional/application-packages/DockerCopyNestedOutDir/execute.json index 94e0479b9..61c6e2e5d 100644 --- a/tests/functional/application-packages/DockerCopyNestedOutDir/execute.json +++ 
b/tests/functional/application-packages/DockerCopyNestedOutDir/execute.json @@ -10,7 +10,7 @@ "outputs": [ { "id": "output_files", - "transmissionMode": "reference" + "transmissionMode": "value" } ] } diff --git a/tests/functional/application-packages/DockerStageImages/execute.json b/tests/functional/application-packages/DockerStageImages/execute.json index fb3d6d8bb..350c5705a 100644 --- a/tests/functional/application-packages/DockerStageImages/execute.json +++ b/tests/functional/application-packages/DockerStageImages/execute.json @@ -10,7 +10,7 @@ "outputs": [ { "id": "staging_output", - "transmissionMode": "reference" + "transmissionMode": "value" } ] } diff --git a/tests/functional/application-packages/Finch_IceDays/execute.json b/tests/functional/application-packages/Finch_IceDays/execute.json index 0870dd611..dda3863eb 100644 --- a/tests/functional/application-packages/Finch_IceDays/execute.json +++ b/tests/functional/application-packages/Finch_IceDays/execute.json @@ -14,7 +14,7 @@ "outputs": [ { "id": "output_netcdf", - "transmissionMode": "reference" + "transmissionMode": "value" } ] } diff --git a/tests/functional/application-packages/SubsetESGF/execute.json b/tests/functional/application-packages/SubsetESGF/execute.json index 66353dfb9..6d0db78f9 100644 --- a/tests/functional/application-packages/SubsetESGF/execute.json +++ b/tests/functional/application-packages/SubsetESGF/execute.json @@ -54,7 +54,7 @@ "outputs": [ { "id": "output", - "transmissionMode": "reference" + "transmissionMode": "value" } ] } diff --git a/tests/functional/application-packages/SubsetNASAESGF/execute.json b/tests/functional/application-packages/SubsetNASAESGF/execute.json index d57906574..e07a28830 100644 --- a/tests/functional/application-packages/SubsetNASAESGF/execute.json +++ b/tests/functional/application-packages/SubsetNASAESGF/execute.json @@ -50,7 +50,7 @@ "outputs": [ { "id": "output", - "transmissionMode": "reference" + "transmissionMode": "value" } ] } diff --git a/tests/functional/application-packages/WorkflowChainCopy/execute.json b/tests/functional/application-packages/WorkflowChainCopy/execute.json index 26428dd91..963ea522b 100644 --- a/tests/functional/application-packages/WorkflowChainCopy/execute.json +++ b/tests/functional/application-packages/WorkflowChainCopy/execute.json @@ -10,7 +10,7 @@ "outputs": [ { "id": "output", - "transmissionMode": "reference" + "transmissionMode": "value" } ] } diff --git a/tests/functional/application-packages/WorkflowChainStrings/execute.json b/tests/functional/application-packages/WorkflowChainStrings/execute.json index 69426ca4d..0a5e91f78 100644 --- a/tests/functional/application-packages/WorkflowChainStrings/execute.json +++ b/tests/functional/application-packages/WorkflowChainStrings/execute.json @@ -4,7 +4,7 @@ }, "outputs": { "output": { - "transmissionMode": "reference" + "transmissionMode": "value" } } } diff --git a/tests/functional/application-packages/WorkflowESGF/execute.json b/tests/functional/application-packages/WorkflowESGF/execute.json index 31c52ee18..6253c1dbf 100644 --- a/tests/functional/application-packages/WorkflowESGF/execute.json +++ b/tests/functional/application-packages/WorkflowESGF/execute.json @@ -18,7 +18,7 @@ "outputs": [ { "id": "output_netcdf", - "transmissionMode": "reference" + "transmissionMode": "value" } ] } diff --git a/tests/functional/application-packages/WorkflowFileToSubsetCRIM/execute.json b/tests/functional/application-packages/WorkflowFileToSubsetCRIM/execute.json index 46ce4760a..10e76a0f1 100644 --- 
a/tests/functional/application-packages/WorkflowFileToSubsetCRIM/execute.json +++ b/tests/functional/application-packages/WorkflowFileToSubsetCRIM/execute.json @@ -26,7 +26,7 @@ "outputs": [ { "id": "output", - "transmissionMode": "reference" + "transmissionMode": "value" } ] } diff --git a/tests/functional/application-packages/WorkflowStageCopyImages/execute.json b/tests/functional/application-packages/WorkflowStageCopyImages/execute.json index b499bd81b..79aaea34f 100644 --- a/tests/functional/application-packages/WorkflowStageCopyImages/execute.json +++ b/tests/functional/application-packages/WorkflowStageCopyImages/execute.json @@ -10,7 +10,7 @@ "outputs": [ { "id": "output", - "transmissionMode": "reference" + "transmissionMode": "value" } ] } diff --git a/tests/functional/application-packages/WorkflowSubsetIceDays/execute.json b/tests/functional/application-packages/WorkflowSubsetIceDays/execute.json index d1a64db22..01778a692 100644 --- a/tests/functional/application-packages/WorkflowSubsetIceDays/execute.json +++ b/tests/functional/application-packages/WorkflowSubsetIceDays/execute.json @@ -30,7 +30,7 @@ "outputs": [ { "id": "output", - "transmissionMode": "reference" + "transmissionMode": "value" } ] } diff --git a/tests/functional/application-packages/WorkflowSubsetLLNL_SubsetCRIM/execute.json b/tests/functional/application-packages/WorkflowSubsetLLNL_SubsetCRIM/execute.json index ac83ebba4..e5c6fddc9 100644 --- a/tests/functional/application-packages/WorkflowSubsetLLNL_SubsetCRIM/execute.json +++ b/tests/functional/application-packages/WorkflowSubsetLLNL_SubsetCRIM/execute.json @@ -51,7 +51,7 @@ "outputs": [ { "id": "output", - "transmissionMode": "reference" + "transmissionMode": "value" } ] } diff --git a/tests/functional/application-packages/WorkflowSubsetNASAESGF_SubsetCRIM/execute.json b/tests/functional/application-packages/WorkflowSubsetNASAESGF_SubsetCRIM/execute.json index 7e2b3a2cf..4af0849c4 100644 --- a/tests/functional/application-packages/WorkflowSubsetNASAESGF_SubsetCRIM/execute.json +++ b/tests/functional/application-packages/WorkflowSubsetNASAESGF_SubsetCRIM/execute.json @@ -46,7 +46,7 @@ "outputs": [ { "id": "output", - "transmissionMode": "reference" + "transmissionMode": "value" } ] } diff --git a/tests/functional/application-packages/WorkflowSubsetPicker/execute.json b/tests/functional/application-packages/WorkflowSubsetPicker/execute.json index d98858ab8..b22e58274 100644 --- a/tests/functional/application-packages/WorkflowSubsetPicker/execute.json +++ b/tests/functional/application-packages/WorkflowSubsetPicker/execute.json @@ -30,7 +30,7 @@ "outputs": [ { "id": "output", - "transmissionMode": "reference" + "transmissionMode": "value" } ] } diff --git a/tests/functional/test_builtin.py b/tests/functional/test_builtin.py index be249f3f5..df0ecd9b1 100644 --- a/tests/functional/test_builtin.py +++ b/tests/functional/test_builtin.py @@ -11,6 +11,7 @@ from weaver.execute import ExecuteControlOption, ExecuteMode, ExecuteResponse, ExecuteTransmissionMode from weaver.formats import ContentType from weaver.processes.builtin import register_builtin_processes +from weaver.status import Status if TYPE_CHECKING: from weaver.typedefs import JSON @@ -58,8 +59,8 @@ def test_jsonarray2netcdf_describe_old_schema(self): assert isinstance(body["process"]["outputs"][0]["formats"], list) assert len(body["process"]["outputs"][0]["formats"]) == 1 assert body["process"]["outputs"][0]["formats"][0]["mediaType"] == ContentType.APP_NETCDF - assert body["jobControlOptions"] == 
[ExecuteControlOption.ASYNC] - assert body["outputTransmission"] == [ExecuteTransmissionMode.REFERENCE] + assert body["jobControlOptions"] == [ExecuteControlOption.ASYNC, ExecuteControlOption.SYNC] + assert body["outputTransmission"] == [ExecuteTransmissionMode.REFERENCE, ExecuteTransmissionMode.VALUE] def test_jsonarray2netcdf_describe_ogc_schema(self): resp = self.app.get("/processes/jsonarray2netcdf", headers=self.json_headers) @@ -82,67 +83,283 @@ def test_jsonarray2netcdf_describe_ogc_schema(self): assert isinstance(body["outputs"]["output"]["formats"], list) assert len(body["outputs"]["output"]["formats"]) == 1 assert body["outputs"]["output"]["formats"][0]["mediaType"] == ContentType.APP_NETCDF - assert body["jobControlOptions"] == [ExecuteControlOption.ASYNC] - assert body["outputTransmission"] == [ExecuteTransmissionMode.REFERENCE] + assert body["jobControlOptions"] == [ExecuteControlOption.ASYNC, ExecuteControlOption.SYNC] + assert body["outputTransmission"] == [ExecuteTransmissionMode.REFERENCE, ExecuteTransmissionMode.VALUE] - def test_jsonarray2netcdf_execute(self): + def setup_inputs(self, stack): dirname = tempfile.gettempdir() nc_data = "Hello NetCDF!" + tmp_ncdf = tempfile.NamedTemporaryFile(dir=dirname, mode="w", suffix=".nc") + tmp_json = tempfile.NamedTemporaryFile(dir=dirname, mode="w", suffix=".json") + tmp_ncdf = stack.enter_context(tmp_ncdf) # noqa + tmp_json = stack.enter_context(tmp_json) # noqa + tmp_ncdf.write(nc_data) + tmp_ncdf.seek(0) + tmp_json.write(json.dumps(["file://{}".format(os.path.join(dirname, tmp_ncdf.name))])) + tmp_json.seek(0) + body = {"inputs": [{"id": "input", "href": os.path.join(dirname, tmp_json.name)}]} + return body, nc_data + + def validate_results(self, results, outputs, data, links): + # first validate format of OGC-API results + if results is not None: + assert isinstance(results, dict) + assert "output" in results, "Expected result ID 'output' in response body" + assert isinstance(results["output"], dict), "Container of result ID 'output' should be a dict" + assert "href" in results["output"] + assert "format" in results["output"] + fmt = results["output"]["format"] # type: JSON + assert isinstance(fmt, dict), "Result format should be provided with content details" + assert "mediaType" in fmt + assert isinstance(fmt["mediaType"], str), "Result format Content-Type should be a single string definition" + assert fmt["mediaType"] == ContentType.APP_NETCDF, "Result 'output' format expected to be NetCDF file" + nc_href = results["output"]["href"] + assert isinstance(nc_href, str) and len(nc_href) + elif links: + assert isinstance(links, list) and len(links) == 1 and isinstance(links[0], tuple) + assert "rel=\"output\"" in links[0][1] + assert f"type={ContentType.APP_NETCDF}" in links[0][1] + nc_link = links[0][1].split(" ")[0] + assert nc_link.startswith("<") and nc_link.startswith(">") + nc_href = nc_link[1:-1] + else: + nc_href = None + + settings = get_settings_from_testapp(self.app) + wps_path = settings.get("weaver.wps_output_path") + wps_dir = settings.get("weaver.wps_output_dir") + wps_out = "{}{}".format(settings.get("weaver.url"), wps_path) + + # validate results if applicable + if nc_href is not None: + nc_real_path = nc_href.replace(wps_out, wps_dir) + assert nc_href.startswith(wps_out) + assert os.path.split(nc_real_path)[-1] == os.path.split(nc_href)[-1] + assert os.path.isfile(nc_real_path) + with open(nc_real_path, "r") as f: + assert f.read() == data + + # if everything was valid for results, validate equivalent but 
differently formatted outputs response + assert outputs["outputs"][0]["id"] == "output" + nc_href = outputs["outputs"][0]["href"] + assert isinstance(nc_href, str) and len(nc_href) + assert nc_href.startswith(wps_out) + nc_real_path = nc_href.replace(wps_out, wps_dir) + assert os.path.split(nc_real_path)[-1] == os.path.split(nc_href)[-1] + + def test_jsonarray2netcdf_execute_async(self): with contextlib.ExitStack() as stack_exec: - tmp_ncdf = tempfile.NamedTemporaryFile(dir=dirname, mode="w", suffix=".nc") - tmp_json = tempfile.NamedTemporaryFile(dir=dirname, mode="w", suffix=".json") - tmp_ncdf = stack_exec.enter_context(tmp_ncdf) # noqa - tmp_json = stack_exec.enter_context(tmp_json) # noqa - tmp_ncdf.write(nc_data) - tmp_ncdf.seek(0) - tmp_json.write(json.dumps(["file://{}".format(os.path.join(dirname, tmp_ncdf.name))])) - tmp_json.seek(0) - data = { + body, nc_data = self.setup_inputs(stack_exec) + body.update({ "mode": ExecuteMode.ASYNC, "response": ExecuteResponse.DOCUMENT, - "inputs": [{"id": "input", "href": os.path.join(dirname, tmp_json.name)}], + "outputs": [{"id": "output", "transmissionMode": ExecuteTransmissionMode.VALUE}], + }) + for mock_exec in mocked_execute_celery(): + stack_exec.enter_context(mock_exec) + path = "/processes/jsonarray2netcdf/jobs" + resp = mocked_sub_requests(self.app, "post_json", path, + data=body, headers=self.json_headers, only_local=True) + + assert resp.status_code == 201, "Error: {}".format(resp.json) + assert resp.content_type in ContentType.APP_JSON + # following details not available yet in async, but are in sync + assert "created" not in resp.json + assert "finished" not in resp.json + assert "duration" not in resp.json + assert "progress" not in resp.json + + job_url = resp.json["location"] + results = self.monitor_job(job_url) + + output_url = job_url + "/outputs" + resp = self.app.get(output_url, headers=self.json_headers) + assert resp.status_code == 200, "Error job outputs:\n{}".format(resp.json) + outputs = resp.json + + self.validate_results(results, outputs, nc_data, None) + + def test_jsonarray2netcdf_execute_async_output_by_reference_dontcare_response_document(self): + """ + Jobs submitted with ``response=document`` are not impacted by ``transmissionMode``. + + The results schema should always be returned when document is requested. + + .. 
seealso:: + https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_process-execute-sync-document + """ + with contextlib.ExitStack() as stack_exec: + body, nc_data = self.setup_inputs(stack_exec) + body.update({ + "response": ExecuteResponse.DOCUMENT, # by value/reference don't care because of this "outputs": [{"id": "output", "transmissionMode": ExecuteTransmissionMode.REFERENCE}], - } + }) + for mock_exec in mocked_execute_celery(): + stack_exec.enter_context(mock_exec) + path = "/processes/jsonarray2netcdf/jobs" + resp = mocked_sub_requests(self.app, "post_json", path, + data=body, headers=self.json_headers, only_local=True) + + assert resp.status_code == 201, "Error: {}".format(resp.json) + assert resp.content_type in ContentType.APP_JSON + job_url = resp.json["location"] + self.monitor_job(job_url, return_status=True) # don't fetch results automatically + resp = self.app.get("{}/results".format(job_url), headers=self.json_headers) + assert resp.status_code == 200, "Error: {}".format(resp.text) + assert resp.content_type == ContentType.APP_JSON + result_links = [hdr for hdr in resp.headers if hdr[0].lower() == "link"] + assert len(result_links) == 0 + results = resp.json + + # even though results are requested by Link reference, + # Weaver still offers them with document on outputs endpoint + output_url = job_url + "/outputs" + resp = self.app.get(output_url, headers=self.json_headers) + assert resp.status_code == 200, "Error job outputs:\n{}".format(resp.text) + outputs = resp.json + + self.validate_results(results, outputs, nc_data, result_links) + + def test_jsonarray2netcdf_execute_async_output_by_value_response_raw(self): + """ + Jobs submitted with ``response=raw`` and single output as ``transmissionMode=value`` must return its raw data. + + .. 
seealso:: + https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_process-execute-sync-raw-value-one + """ + with contextlib.ExitStack() as stack_exec: + body, nc_data = self.setup_inputs(stack_exec) + body.update({ + "response": ExecuteResponse.RAW, # by value/reference important here + # NOTE: quantity of outputs important as well + # since single output, content-type is directly that output (otherwise should be multipart) + "outputs": [{"id": "output", "transmissionMode": ExecuteTransmissionMode.VALUE}], # data dump + }) for mock_exec in mocked_execute_celery(): stack_exec.enter_context(mock_exec) path = "/processes/jsonarray2netcdf/jobs" resp = mocked_sub_requests(self.app, "post_json", path, - data=data, headers=self.json_headers, only_local=True) + data=body, headers=self.json_headers, only_local=True) + + assert resp.status_code == 201, "Error: {}".format(resp.text) + assert resp.content_type in ContentType.APP_JSON + job_url = resp.json["location"] + self.monitor_job(job_url, return_status=True) # don't fetch results automatically + + resp = self.app.get("{}/results".format(job_url), headers=self.json_headers) + assert resp.status_code < 400, "Error: {}".format(resp.text) + assert resp.status_code == 200, "Body should contain literal raw data dump" + assert resp.content_type in ContentType.APP_NETCDF, "raw result by value should be directly the content-type" + assert resp.text == nc_data, "raw result by value should be directly the data content" + assert resp.headers + result_links = [hdr for hdr in resp.headers if hdr[0].lower() == "link"] + assert len(result_links) == 0 + + # even though results are requested by raw data, + # Weaver still offers them with document on outputs endpoint + output_url = job_url + "/outputs" + resp = self.app.get(output_url, headers=self.json_headers) + assert resp.status_code == 200, "Error job outputs:\n{}".format(resp.text) + outputs = resp.json + + self.validate_results(None, outputs, nc_data, result_links) + + def test_jsonarray2netcdf_execute_async_output_by_reference_response_raw(self): + """ + Jobs submitted with ``response=raw`` and single output as ``transmissionMode=reference`` must a link. + + Contents should be empty, and the reference should be provided with HTTP ``Link`` header. + + .. 
seealso:: + https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_process-execute-sync-raw-ref + """ + with contextlib.ExitStack() as stack_exec: + body, nc_data = self.setup_inputs(stack_exec) + body.update({ + "response": ExecuteResponse.RAW, # by value/reference important here + "outputs": [{"id": "output", "transmissionMode": ExecuteTransmissionMode.REFERENCE}], # Link header + }) + for mock_exec in mocked_execute_celery(): + stack_exec.enter_context(mock_exec) + path = "/processes/jsonarray2netcdf/jobs" + resp = mocked_sub_requests(self.app, "post_json", path, + data=body, headers=self.json_headers, only_local=True) assert resp.status_code == 201, "Error: {}".format(resp.json) assert resp.content_type in ContentType.APP_JSON job_url = resp.json["location"] - results = self.monitor_job(job_url) + self.monitor_job(job_url, return_status=True) # don't fetch results automatically - # first validate format of OGC-API results - assert "output" in results, "Expected result ID 'output' in response body" - assert isinstance(results["output"], dict), "Container of result ID 'output' should be a dict" - assert "href" in results["output"] - assert "format" in results["output"] - fmt = results["output"]["format"] # type: JSON - assert isinstance(fmt, dict), "Result format should be provided with content details" - assert "mediaType" in fmt - assert isinstance(fmt["mediaType"], str), "Result format Content-Type should be a single string definition" - assert fmt["mediaType"] == ContentType.APP_NETCDF, "Result 'output' format expected to be NetCDF file" - nc_path = results["output"]["href"] - assert isinstance(nc_path, str) and len(nc_path) - settings = get_settings_from_testapp(self.app) - wps_out = "{}{}".format(settings.get("weaver.url"), settings.get("weaver.wps_output_path")) - nc_real_path = nc_path.replace(wps_out, settings.get("weaver.wps_output_dir")) - assert nc_path.startswith(wps_out) - assert os.path.split(nc_real_path)[-1] == os.path.split(nc_path)[-1] - assert os.path.isfile(nc_real_path) - with open(nc_real_path, "r") as f: - assert f.read() == nc_data + resp = self.app.get("{}/results".format(job_url), headers=self.json_headers) + assert resp.status_code < 400, "Error: {}".format(resp.json) + assert resp.status_code == 204, "Body should be empty since all outputs requested by reference (Link header)" + assert resp.content_type is None + assert resp.headers + result_links = [hdr for hdr in resp.headers if hdr[0] == "Link"] - # if everything was valid for results, validate equivalent but differently formatted outputs response + # even though results are requested by Link reference, + # Weaver still offers them with document on outputs endpoint output_url = job_url + "/outputs" resp = self.app.get(output_url, headers=self.json_headers) assert resp.status_code == 200, "Error job outputs:\n{}".format(resp.json) outputs = resp.json - assert outputs["outputs"][0]["id"] == "output" - nc_path = outputs["outputs"][0]["href"] - assert isinstance(nc_path, str) and len(nc_path) - assert nc_path.startswith(wps_out) - assert os.path.split(nc_real_path)[-1] == os.path.split(nc_path)[-1] + + self.validate_results(None, outputs, nc_data, result_links) + + def test_jsonarray2netcdf_execute_sync(self): + """ + Job submitted with ``mode=sync`` or ``Prefer`` header for sync should respond directly with the results schema. + + .. 
seealso:: + https://docs.ogc.org/is/18-062r2/18-062r2.html#sc_execute_response + """ + with contextlib.ExitStack() as stack_exec: + body, nc_data = self.setup_inputs(stack_exec) + body.update({ + "response": ExecuteResponse.DOCUMENT, + "outputs": [{"id": "output", "transmissionMode": ExecuteTransmissionMode.VALUE}] + }) + for mock_exec in mocked_execute_celery(): + stack_exec.enter_context(mock_exec) + headers = {"Prefer": "wait=10"} + headers.update(self.json_headers) + path = "/processes/jsonarray2netcdf/jobs" + resp = mocked_sub_requests(self.app, "post_json", path, + data=body, headers=headers, only_local=True) + + assert resp.status_code == 200, "Error: {}".format(resp.json) + assert resp.content_type in ContentType.APP_JSON + + # since sync, results are directly available instead of job status + # even if results are returned directly (instead of status), + # status location link is available for reference as needed + assert "Location" in resp.headers + # validate sync was indeed applied (in normal situation, not considering mock test that runs in sync) + assert resp.headers["Preference-Applied"] == headers["Prefer"] + # following details should not be available since results are returned in sync instead of async job status + for field in ["status", "created", "finished", "duration", "progress"]: + assert field not in resp.json + + # validate that job can still be found and its metadata are defined although executed in sync + job_url = resp.headers["Location"] + resp = self.app.get(job_url, headers=self.json_headers) + assert resp.status_code == 200 + assert resp.content_type == ContentType.APP_JSON + for field in ["status", "created", "finished", "duration", "progress"]: + assert field in resp.json + assert resp.json["status"] == Status.SUCCEEDED + assert resp.json["progress"] == 100 + + out_url = f"{job_url}/results" + resp = self.app.get(out_url, headers=self.json_headers) + assert resp.status_code == 200 + assert resp.content_type == ContentType.APP_JSON + results = resp.json + + output_url = job_url + "/outputs" + resp = self.app.get(output_url, headers=self.json_headers) + assert resp.status_code == 200, "Error job outputs:\n{}".format(resp.json) + outputs = resp.json + + self.validate_results(results, outputs, nc_data, None) diff --git a/tests/functional/test_cli.py b/tests/functional/test_cli.py index 2ee00ff1e..55ed61f10 100644 --- a/tests/functional/test_cli.py +++ b/tests/functional/test_cli.py @@ -744,6 +744,107 @@ def test_execute_auto_monitor(self): assert any(f"\"status\": \"{Status.SUCCEEDED}\"" in line for line in lines) assert any("\"rel\": \"http://www.opengis.net/def/rel/ogc/1.0/results\"" in line for line in lines) + def test_execute_result_by_reference(self): + """ + Validate option to obtain outputs by reference returned with ``Link`` header. + + Result obtained is validated both with API outputs and extended auto-download outputs. + """ + proc = self.test_process["Echo"] + with contextlib.ExitStack() as stack_exec: + out_tmp = stack_exec.enter_context(tempfile.TemporaryDirectory()) + stack_exec.enter_context(mocked_wps_output(self.settings)) + for mock_exec_proc in mocked_execute_celery(): + stack_exec.enter_context(mock_exec_proc) + + msg = "TEST MESSAGE!" 
+ lines = mocked_sub_requests( + self.app, run_command, + [ + # "weaver", + "execute", + "-u", self.url, + "-p", proc, + "-I", f"message='{msg}'", + "-R", "output", + "-M", + "-T", 10, + "-W", 1, + "-F", OutputFormat.YAML, + ], + trim=False, + entrypoint=weaver_cli, + only_local=True, + ) + assert "jobID: " in lines[0] # don't care value, self-handled + assert any(f"status: {Status.SUCCEEDED}" in line for line in lines) + + job_id = lines[0].split(":")[-1].strip() + lines = mocked_sub_requests( + self.app, run_command, + [ + # "weaver", + "results", + "-u", self.url, + "-j", job_id, + "-H", # must display header to get 'Link' + "-F", OutputFormat.YAML, + ], + trim=False, + entrypoint=weaver_cli, + only_local=True, + ) + sep = lines.index("---") + headers = lines[:sep] + content = lines[sep+1:-1] # ignore final newline + assert len(headers) and any("Link:" in hdr for hdr in headers) + assert content == ["null"], "When no download involved, body should be the original no-content results." + + lines = mocked_sub_requests( + self.app, run_command, + [ + # "weaver", + "results", + "-u", self.url, + "-j", job_id, + "-H", # must display header to get 'Link' + "-F", OutputFormat.YAML, + "-D", + "-O", out_tmp + ], + trim=False, + entrypoint=weaver_cli, + only_local=True, + ) + sep = lines.index("---") + headers = lines[:sep] + content = lines[sep+1:] + + assert len(content), "Content should have been populated from download to provide downloaded file paths." + link = None + for header in headers: + if "Link:" in header: + link = header.split(":", 1)[-1].strip() + break + assert link + link = link.split(";")[0].strip("<>") + path = map_wps_output_location(link, self.settings, url=False) + assert os.path.isfile(path), "Original file results should exist in job output dir." + + # path should be in contents as well, pre-resolved within download dir (not same as job output dir) + assert len([line for line in content if "path:" in line]) == 1 + path = None + for line in content: + if "path:" in line: + path = line.split(":", 1)[-1].strip() + break + assert path + assert path.startswith(out_tmp) + assert os.path.isfile(path) + with open(path, "r") as file: + data = file.read() + assert msg in data # technically, output is log of echoed input message, so not exactly equal + def test_execute_help_details(self): """ Verify that formatting of the execute operation help provides multiple paragraphs with more details. 
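For readers preferring the Python API over the CLI invocations exercised above, a rough equivalent could look like the sketch below. It is illustrative only: the server URL and the ``Echo``/``message``/``output`` identifiers mirror the test fixtures, while the ``monitor`` keyword, the constructor argument and the inline mapping form of ``inputs`` are assumptions based on the ``WeaverClient.execute`` signature and docstring appearing later in this diff.

from weaver.cli import WeaverClient

client = WeaverClient("https://weaver.example.com")  # hypothetical Weaver instance
result = client.execute(
    "Echo",
    inputs={"message": "TEST MESSAGE!"},
    output_refs=["output"],  # request this output as an HTTP 'Link' header instead of body content
    monitor=True,            # poll the job until completion before returning (assumed keyword)
)
print(result.links())  # 'Link'/'Location' headers exposed by OperationResult.links()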
diff --git a/tests/functional/test_docker_app.py b/tests/functional/test_docker_app.py index 9e3a066fa..1195f478d 100644 --- a/tests/functional/test_docker_app.py +++ b/tests/functional/test_docker_app.py @@ -148,7 +148,7 @@ def test_execute_wps_rest_resp_json(self): {"id": "file", "href": tmp_file.name}, ], "outputs": [ - {"id": self.out_key, "transmissionMode": ExecuteTransmissionMode.REFERENCE}, + {"id": self.out_key, "transmissionMode": ExecuteTransmissionMode.VALUE}, ] } for mock_exec in mocked_execute_celery(): @@ -335,7 +335,7 @@ def test_execute_docker_embedded_python_script(self): {"id": "cost", "value": cost} ], "outputs": [ - {"id": "quote", "transmissionMode": ExecuteTransmissionMode.REFERENCE}, + {"id": "quote", "transmissionMode": ExecuteTransmissionMode.VALUE}, ] } resp = mocked_sub_requests(self.app, "POST", path, json=body, headers=self.json_headers, only_local=True) diff --git a/tests/functional/test_quoting.py b/tests/functional/test_quoting.py index 280e19fef..1fecbf993 100644 --- a/tests/functional/test_quoting.py +++ b/tests/functional/test_quoting.py @@ -67,7 +67,8 @@ def deploy_test_processes(cls): def test_quote_bad_inputs(self): path = sd.process_quotes_service.path.format(process_id="Echo") - resp = mocked_sub_requests(self.app, "POST", path, json={}, headers=self.json_headers, only_local=True) + data = {"inputs": [1, 2, 3]} + resp = mocked_sub_requests(self.app, "POST", path, json=data, headers=self.json_headers, only_local=True) assert resp.status_code == 400 @mock.patch("weaver.quotation.estimation.estimate_process_quote", side_effect=mocked_estimate_process_quote) @@ -84,7 +85,7 @@ def test_quote_atomic_process(self, mocked_estimate): }, "outputs": { "output": { - "transmissionMode": ExecuteTransmissionMode.REFERENCE + "transmissionMode": ExecuteTransmissionMode.VALUE } } } diff --git a/tests/functional/test_wps_package.py b/tests/functional/test_wps_package.py index e95d980f1..326d17735 100644 --- a/tests/functional/test_wps_package.py +++ b/tests/functional/test_wps_package.py @@ -639,7 +639,7 @@ def test_execute_file_type_io_format_references(self): data.update({ "mode": ExecuteMode.ASYNC, "response": ExecuteResponse.DOCUMENT, - "outputs": {"output": {"transmissionMode": ExecuteTransmissionMode.REFERENCE}} + "outputs": {"output": {"transmissionMode": ExecuteTransmissionMode.VALUE}} }) with contextlib.ExitStack() as stack_exec: for mock_exec in mocked_execute_celery(): @@ -1414,7 +1414,7 @@ def test_execute_job_with_accept_languages(self): "mode": ExecuteMode.ASYNC, "response": ExecuteResponse.DOCUMENT, "inputs": [{"id": "message", "value": "test"}], - "outputs": [{"id": "output", "transmissionMode": ExecuteTransmissionMode.REFERENCE}] + "outputs": [{"id": "output", "transmissionMode": ExecuteTransmissionMode.VALUE}] } headers = deepcopy(self.json_headers) @@ -1770,7 +1770,7 @@ def test_execute_job_with_context_output_dir(self): "mode": ExecuteMode.ASYNC, "response": ExecuteResponse.DOCUMENT, "inputs": [{"id": "message", "value": "test"}], - "outputs": [{"id": "output", "transmissionMode": ExecuteTransmissionMode.REFERENCE}] + "outputs": [{"id": "output", "transmissionMode": ExecuteTransmissionMode.VALUE}] } headers = deepcopy(self.json_headers) @@ -1846,7 +1846,7 @@ def test_execute_job_with_custom_file_name(self): "mode": ExecuteMode.ASYNC, "response": ExecuteResponse.DOCUMENT, "inputs": [{"id": "input_file", "href": tmp_http}], - "outputs": [{"id": "output", "transmissionMode": ExecuteTransmissionMode.REFERENCE}] + "outputs": [{"id": "output", 
"transmissionMode": ExecuteTransmissionMode.VALUE}] } resp = mocked_sub_requests(self.app, "post_json", proc_url, timeout=5, data=exec_body, headers=headers, only_local=True) @@ -1892,7 +1892,7 @@ def test_dismiss_job(self): "mode": ExecuteMode.ASYNC, "response": ExecuteResponse.DOCUMENT, "inputs": [{"id": "delay", "value": 1}], - "outputs": [{"id": "output", "transmissionMode": ExecuteTransmissionMode.REFERENCE}] + "outputs": [{"id": "output", "transmissionMode": ExecuteTransmissionMode.VALUE}] } with contextlib.ExitStack() as stack_exec: @@ -2459,8 +2459,8 @@ def test_execute_application_package_process_with_bucket(self): {"id": "input_with_s3", "href": test_bucket_ref}, ], "outputs": [ - {"id": "output_from_http", "transmissionMode": ExecuteTransmissionMode.REFERENCE}, - {"id": "output_from_s3", "transmissionMode": ExecuteTransmissionMode.REFERENCE}, + {"id": "output_from_http", "transmissionMode": ExecuteTransmissionMode.VALUE}, + {"id": "output_from_s3", "transmissionMode": ExecuteTransmissionMode.VALUE}, ] } with contextlib.ExitStack() as stack_exec: diff --git a/tests/functional/test_wps_provider.py b/tests/functional/test_wps_provider.py index c03e1ca9f..ff72bb585 100644 --- a/tests/functional/test_wps_provider.py +++ b/tests/functional/test_wps_provider.py @@ -187,7 +187,7 @@ def test_register_describe_execute_ncdump(self, mock_responses): assert "GetCapabilities" in links["service-desc"] assert ExecuteControlOption.ASYNC in body["jobControlOptions"] - assert ExecuteTransmissionMode.REFERENCE in body["outputTransmission"] + assert ExecuteTransmissionMode.VALUE in body["outputTransmission"] # validate execution submission # (don't actually execute because server is mocked, only validate parsing of I/O and job creation) @@ -198,7 +198,7 @@ def test_register_describe_execute_ncdump(self, mock_responses): "mode": ExecuteMode.ASYNC, "response": ExecuteResponse.DOCUMENT, "inputs": [{"id": "dataset", "href": exec_file}], - "outputs": [{"id": "output", "transmissionMode": ExecuteTransmissionMode.REFERENCE}] + "outputs": [{"id": "output", "transmissionMode": ExecuteTransmissionMode.VALUE}] } status_url = resources.TEST_REMOTE_SERVER_URL + "/status.xml" output_url = resources.TEST_REMOTE_SERVER_URL + "/output.txt" diff --git a/tests/functional/utils.py b/tests/functional/utils.py index c807b69cb..6b459fde4 100644 --- a/tests/functional/utils.py +++ b/tests/functional/utils.py @@ -284,7 +284,7 @@ def check_job_status(_resp, running=False): if return_status or expect_failed: return resp.json resp = self.app.get("{}/results".format(status_url), headers=self.json_headers) - assert resp.status_code == 200, "Error job info:\n{}".format(resp.json) + assert resp.status_code == 200, "Error job info:\n{}".format(resp.text) return resp.json def get_outputs(self, status_url): diff --git a/tests/opensearch/json/opensearch_deploy.json b/tests/opensearch/json/opensearch_deploy.json index 34a902d5b..423f9698e 100644 --- a/tests/opensearch/json/opensearch_deploy.json +++ b/tests/opensearch/json/opensearch_deploy.json @@ -79,7 +79,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ] }, "executionUnit": [ diff --git a/tests/opensearch/json/opensearch_describe_process.json b/tests/opensearch/json/opensearch_describe_process.json index 20709b6cd..b1788da6a 100644 --- a/tests/opensearch/json/opensearch_describe_process.json +++ b/tests/opensearch/json/opensearch_describe_process.json @@ -135,10 +135,10 @@ ], "version": "1.0.0", "jobControlOptions": [ - "async" + "async-execute" ], 
"outputTransmission": [ - "reference" + "value" ] } } diff --git a/tests/opensearch/json/opensearch_execute.json b/tests/opensearch/json/opensearch_execute.json index 5033245a8..825b81b1f 100644 --- a/tests/opensearch/json/opensearch_execute.json +++ b/tests/opensearch/json/opensearch_execute.json @@ -22,7 +22,7 @@ "outputs": [ { "id": "output", - "transmissionMode": "reference" + "transmissionMode": "value" } ] -} \ No newline at end of file +} diff --git a/tests/opensearch/json/opensearch_process.json b/tests/opensearch/json/opensearch_process.json index a35af044d..2af09286d 100644 --- a/tests/opensearch/json/opensearch_process.json +++ b/tests/opensearch/json/opensearch_process.json @@ -87,7 +87,7 @@ ], "keywords": [], "outputTransmission": [ - "reference" + "value" ], "outputs": [ { @@ -236,7 +236,7 @@ ], "keywords": [], "outputTransmission": [ - "reference" + "value" ], "outputs": [ { diff --git a/tests/resources/test_describe_process_wps3.json b/tests/resources/test_describe_process_wps3.json index 695715821..778fd2d2f 100644 --- a/tests/resources/test_describe_process_wps3.json +++ b/tests/resources/test_describe_process_wps3.json @@ -31,7 +31,7 @@ "executeEndpoint": "https://remote-server.com/processes/test-remote-process-wps3/jobs" }, "outputTransmission": [ - "reference" + "value" ], "jobControlOptions": [ "async-execute" diff --git a/tests/test_datatype.py b/tests/test_datatype.py index 6ca56767e..ee6eb42de 100644 --- a/tests/test_datatype.py +++ b/tests/test_datatype.py @@ -47,18 +47,21 @@ def _replace_specials(value): def test_process_job_control_options_resolution(): - # invalid or matching default mode should be corrected to default async list - for test_process in [ + # invalid or matching default mode should be corrected to default modes list + for i, test_process in enumerate([ Process(id="test-{}".format(uuid.uuid4()), package={}, jobControlOptions=None), Process(id="test-{}".format(uuid.uuid4()), package={}, jobControlOptions=[None]), Process(id="test-{}".format(uuid.uuid4()), package={}, jobControlOptions=[]), - Process(id="test-{}".format(uuid.uuid4()), package={}, jobControlOptions=[ExecuteControlOption.ASYNC]), - ]: - assert test_process.jobControlOptions == [ExecuteControlOption.ASYNC] + ]): + assert test_process.jobControlOptions == [ExecuteControlOption.ASYNC], f"Test {i}" + # explicitly provided modes are used as is, especially if partial (allow disabling some modes) + proc = Process(id="test-{}".format(uuid.uuid4()), package={}, jobControlOptions=[ExecuteControlOption.ASYNC]) + assert proc.jobControlOptions == [ExecuteControlOption.ASYNC] # other valid definitions should be preserved as is proc = Process(id="test-{}".format(uuid.uuid4()), package={}, jobControlOptions=[ExecuteControlOption.SYNC]) assert proc.jobControlOptions == [ExecuteControlOption.SYNC] + # See ordering note in 'jobControlOptions' property proc = Process(id="test-{}".format(uuid.uuid4()), package={}, jobControlOptions=[ExecuteControlOption.SYNC, ExecuteControlOption.ASYNC]) assert proc.jobControlOptions == [ExecuteControlOption.SYNC, ExecuteControlOption.ASYNC] diff --git a/tests/utils.py b/tests/utils.py index 1ed286c0e..57d8216bb 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -26,6 +26,7 @@ import pkg_resources import pyramid_celery import responses +from celery.exceptions import TimeoutError as CeleryTaskTimeoutError from owslib.wps import Languages, WebProcessingService from pyramid import testing from pyramid.config import Configurator @@ -216,8 +217,10 @@ def 
setup_config_with_celery(config): settings = config.get_settings() # override celery loader to specify configuration directly instead of ini file + celery_mongodb_url = "mongodb://{}:{}/celery".format(settings.get("mongodb.host"), settings.get("mongodb.port")) celery_settings = { - "CELERY_BROKER_URL": "mongodb://{}:{}/celery".format(settings.get("mongodb.host"), settings.get("mongodb.port")) + "broker_url": celery_mongodb_url, + "result_backend": celery_mongodb_url # for sync exec } pyramid_celery.loaders.INILoader.read_configuration = mock.MagicMock(return_value=celery_settings) config.include("pyramid_celery") @@ -865,6 +868,15 @@ class MockTask(object): def id(self): return self._id + # since delay is mocked and blocks to execute, assume sync is complete at this point + # all following methods return what would be returned normally in sync mode + + def wait(self, *_, **__): + raise CeleryTaskTimeoutError + + def ready(self, *_, **__): + return True + task = MockTask() def mock_execute_task(*args, **kwargs): @@ -895,7 +907,7 @@ def mocked_dismiss_process(): mock_celery_app = mock.MagicMock() mock_celery_app.control = mock.MagicMock() mock_celery_app.control.revoke = mock.MagicMock() - mock_celery_revoke = mock.patch("weaver.wps_restapi.jobs.jobs.celery_app", return_value=mock_celery_app) + mock_celery_revoke = mock.patch("weaver.wps_restapi.jobs.utils.celery_app", return_value=mock_celery_app) try: with mock_celery_revoke: diff --git a/tests/wps_restapi/test_jobs.py b/tests/wps_restapi/test_jobs.py index 997eca80a..7edbf80c8 100644 --- a/tests/wps_restapi/test_jobs.py +++ b/tests/wps_restapi/test_jobs.py @@ -11,6 +11,7 @@ from distutils.version import LooseVersion from typing import TYPE_CHECKING +import colander import mock import pyramid.testing import pytest @@ -537,7 +538,7 @@ def test_get_jobs_by_encrypted_email(self): email = "some.test@crim.ca" body = { "inputs": [{"id": "test_input", "data": "test"}], - "outputs": [{"id": "test_output", "transmissionMode": ExecuteTransmissionMode.REFERENCE}], + "outputs": [{"id": "test_output", "transmissionMode": ExecuteTransmissionMode.VALUE}], "mode": ExecuteMode.ASYNC, "response": ExecuteResponse.DOCUMENT, "notification_email": email @@ -1366,3 +1367,88 @@ def test_job_results_errors(self): assert resp.json["cause"] == cause assert resp.json["type"].endswith(error_type) # ignore http full reference, not always there assert "links" in resp.json + + def test_jobs_inputs_outputs_validations(self): + """ + Ensure that inputs/outputs submitted or returned can be represented and validated across various formats. 
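For context, the "various formats" exercised below are the ``OGC`` mapping form and the ``OLD`` listing form of the submitted inputs/outputs. A hypothetical output definition expressed both ways (the conversion helpers appear later in this diff under ``weaver/processes/convert.py``):

# OGC schema: outputs keyed by their ID
outputs_ogc = {"output": {"transmissionMode": "value"}}
# OLD schema: listing with explicit 'id' entries, equivalent to the mapping above
outputs_old = [{"id": "output", "transmissionMode": "value"}]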
+ """ + default_trans_mode = {"transmissionMode": ExecuteTransmissionMode.VALUE} + + job_none = sd.Execute().deserialize({}) + assert job_none == { + "inputs": {}, + "outputs": {}, + "mode": ExecuteMode.AUTO, + "response": ExecuteResponse.DOCUMENT + } + + job_in_none = sd.Execute().deserialize({"outputs": {"random": default_trans_mode}}) + assert job_in_none == { + "inputs": {}, + "outputs": {"random": default_trans_mode}, + "mode": ExecuteMode.AUTO, + "response": ExecuteResponse.DOCUMENT + } + + job_in_empty_dict = sd.Execute().deserialize({"inputs": {}, "outputs": {"random": default_trans_mode}}) + assert job_in_empty_dict == { + "inputs": {}, + "outputs": {"random": default_trans_mode}, + "mode": ExecuteMode.AUTO, + "response": ExecuteResponse.DOCUMENT + } + + job_in_empty_list = sd.Execute().deserialize({"inputs": [], "outputs": {"random": default_trans_mode}}) + assert job_in_empty_list == { + "inputs": [], + "outputs": {"random": default_trans_mode}, + "mode": ExecuteMode.AUTO, + "response": ExecuteResponse.DOCUMENT + } + + job_out_none = sd.Execute().deserialize({"inputs": {"random": "ok"}}) + assert job_out_none == { + "inputs": {"random": "ok"}, + "outputs": {}, + "mode": ExecuteMode.AUTO, + "response": ExecuteResponse.DOCUMENT + } + + job_out_empty_dict = sd.Execute().deserialize({"inputs": {"random": "ok"}, "outputs": {}}) + assert job_out_empty_dict == { + "inputs": {"random": "ok"}, + "outputs": {}, + "mode": ExecuteMode.AUTO, + "response": ExecuteResponse.DOCUMENT + } + + job_out_empty_list = sd.Execute().deserialize({"inputs": {"random": "ok"}, "outputs": []}) + assert job_out_empty_list == { + "inputs": {"random": "ok"}, + "outputs": [], + "mode": ExecuteMode.AUTO, + "response": ExecuteResponse.DOCUMENT + } + + job_out_defined = sd.Execute().deserialize({ + "inputs": {"random": "ok"}, + "outputs": {"random": {"transmissionMode": ExecuteTransmissionMode.REFERENCE}} + }) + assert job_out_defined == { + "inputs": {"random": "ok"}, + "outputs": {"random": {"transmissionMode": ExecuteTransmissionMode.REFERENCE}}, + "mode": ExecuteMode.AUTO, + "response": ExecuteResponse.DOCUMENT + } + + with self.assertRaises(colander.Invalid): + sd.Execute().deserialize({"inputs": "value"}) + + with self.assertRaises(colander.Invalid): + sd.Execute().deserialize({"outputs": "value"}) + + with self.assertRaises(colander.Invalid): + sd.Execute().deserialize({"outputs": {"random": "value"}}) + + with self.assertRaises(colander.Invalid): + sd.Execute().deserialize({"outputs": {"random": {"transmissionMode": "bad"}}}) diff --git a/tests/wps_restapi/test_processes.py b/tests/wps_restapi/test_processes.py index 19aff4641..96c694d41 100644 --- a/tests/wps_restapi/test_processes.py +++ b/tests/wps_restapi/test_processes.py @@ -136,7 +136,7 @@ def get_process_execute_template(test_input="not-specified"): ], "outputs": [ {"id": "test_output", - "transmissionMode": ExecuteTransmissionMode.REFERENCE} + "transmissionMode": ExecuteTransmissionMode.VALUE} ], "mode": ExecuteMode.ASYNC, "response": ExecuteResponse.DOCUMENT, @@ -1004,28 +1004,52 @@ def test_execute_process_no_json_body(self): assert resp.content_type == ContentType.APP_JSON def test_execute_process_missing_required_params(self): + """ + Validate execution against missing parameters. + + .. versionchanged:: 4.15.0 + Multiple parameters are not **required** anymore because the alternative with ``Prefer`` header + for :term:`OGC API - Processes` compliance is permitted. 
When the values are specified through, + they should still be validated to provide relevant error details to the user. + """ execute_data = self.get_process_execute_template(fully_qualified_name(self)) # remove components for testing different cases - execute_data_tests = [deepcopy(execute_data) for _ in range(7)] - execute_data_tests[0].pop("outputs") - execute_data_tests[1].pop("mode") - execute_data_tests[2].pop("response") - execute_data_tests[3]["mode"] = "random" - execute_data_tests[4]["response"] = "random" - execute_data_tests[5]["inputs"] = [{"test_input": "test_value"}] # noqa # bad format on purpose - execute_data_tests[6]["outputs"] = [{"id": "test_output", "transmissionMode": "random"}] + execute_data_tests = [[True, deepcopy(execute_data)] for _ in range(7)] + execute_data_tests[0][0] = False + execute_data_tests[0][1].pop("outputs") + execute_data_tests[1][0] = False + execute_data_tests[1][1].pop("mode") + execute_data_tests[2][0] = False + execute_data_tests[2][1].pop("response") + execute_data_tests[3][1]["mode"] = "random" + execute_data_tests[4][1]["response"] = "random" + execute_data_tests[5][1]["inputs"] = [{"test_input": "test_value"}] # noqa # bad format on purpose + execute_data_tests[6][1]["outputs"] = [{"id": "test_output", "transmissionMode": "random"}] + + def no_op(*_, **__): + return Status.SUCCEEDED path = "/processes/{}/jobs".format(self.process_public.identifier) - for i, exec_data in enumerate(execute_data_tests): - data_json = json.dumps(exec_data, indent=2) - with stopit.ThreadingTimeout(3) as timeout: # timeout to kill execution if schema validation did not raise - resp = self.app.post_json(path, params=exec_data, headers=self.json_headers, expect_errors=True) - msg = "Failed with test variation '{}' with status '{}' using:\n{}" - assert resp.status_code in [400, 422], msg.format(i, resp.status_code, data_json) - assert resp.content_type == ContentType.APP_JSON, msg.format(i, resp.content_type) - msg = "Killed test '{}' request taking too long using:\n{}".format(i, data_json) - assert timeout.state == timeout.EXECUTED, msg + with contextlib.ExitStack() as stack_exec: + for mock_exec in mocked_execute_celery(func_execute_task=no_op): + stack_exec.enter_context(mock_exec) + for i, (is_invalid, exec_data) in enumerate(execute_data_tests): + data_json = json.dumps(exec_data, indent=2) + try: + # timeout to kill execution if schema validation did not raise + with stopit.ThreadingTimeout(3) as timeout: + resp = self.app.post_json(path, params=exec_data, headers=self.json_headers, expect_errors=True) + msg = "Failed with test variation '{}' with status '{}' using:\n{}" + code = [400, 422] if is_invalid else [201] + assert resp.status_code in code, msg.format(i, resp.status_code, data_json) + assert resp.content_type == ContentType.APP_JSON, msg.format(i, resp.content_type) + except stopit.TimeoutException: + # if required, not normal to have passed validation + # if optional, valid since omitting field does not raise missing field in schema + if is_invalid: + msg = "Killed test '{}' request taking too long using:\n{}".format(i, data_json) + assert timeout.state == timeout.EXECUTED, msg def test_execute_process_dont_cast_one_of(self): """ diff --git a/tests/wps_restapi/test_providers.py b/tests/wps_restapi/test_providers.py index 6a72f9fcc..a61fb2580 100644 --- a/tests/wps_restapi/test_providers.py +++ b/tests/wps_restapi/test_providers.py @@ -346,7 +346,7 @@ def test_get_provider_process_description_old_schema(self): assert len(body["jobControlOptions"]) == 
1 assert ExecuteControlOption.ASYNC in body["jobControlOptions"] assert len(body["outputTransmission"]) == 1 - assert ExecuteTransmissionMode.REFERENCE in body["outputTransmission"] + assert ExecuteTransmissionMode.VALUE in body["outputTransmission"] assert "inputs" in process and isinstance(process["inputs"], list) assert all(isinstance(p_io, dict) and "id" in p_io for p_io in process["inputs"]) assert "outputs" in process and isinstance(process["outputs"], list) @@ -377,7 +377,7 @@ def test_get_provider_process_description_ogc_schema(self): assert len(process["jobControlOptions"]) == 1 assert ExecuteControlOption.ASYNC in process["jobControlOptions"] assert len(process["outputTransmission"]) == 1 - assert ExecuteTransmissionMode.REFERENCE in process["outputTransmission"] + assert ExecuteTransmissionMode.VALUE in process["outputTransmission"] assert "inputs" in process and isinstance(process["inputs"], dict) assert all(isinstance(p_io, str) and isinstance(process["inputs"][p_io], dict) for p_io in process["inputs"]) assert all("id" not in process["inputs"][p_io] for p_io in process["inputs"]) diff --git a/weaver/cli.py b/weaver/cli.py index a8dc0d81e..4029a2336 100644 --- a/weaver/cli.py +++ b/weaver/cli.py @@ -11,6 +11,7 @@ from urllib.parse import urlparse import yaml +from webob.headers import ResponseHeaders from yaml.scanner import ScannerError from weaver import __meta__ @@ -35,6 +36,7 @@ get_file_headers, load_file, null, + parse_kvp, request_extra, setup_loggers ) @@ -42,16 +44,17 @@ from weaver.wps_restapi import swagger_definitions as sd if TYPE_CHECKING: - from typing import Any, Dict, Optional, Tuple, Union + from typing import Any, Dict, Iterable, List, Optional, Tuple, Union from requests import Response # avoid failing sphinx-argparse documentation # https://github.com/ashb/sphinx-argparse/issues/7 try: - from weaver.typedefs import CWL, JSON, ExecutionInputsMap, HeadersType + from weaver.typedefs import AnyHeadersContainer, CWL, JSON, ExecutionInputsMap, ExecutionResults, HeadersType except ImportError: - CWL = JSON = ExecutionInputsMap = HeadersType = Any # avoid linter issue + # avoid linter issue + AnyHeadersContainer = CWL = JSON = ExecutionInputsMap = ExecutionResults = HeadersType = Any try: from weaver.formats import AnyOutputFormat from weaver.processes.constants import ProcessSchemaType @@ -80,7 +83,7 @@ class OperationResult(AutoBase): """ success = False # type: Optional[bool] message = "" # type: Optional[str] - headers = {} # type: Optional[HeadersType] + headers = {} # type: Optional[AnyHeadersContainer] body = {} # type: Optional[Union[JSON, str]] code = None # type: Optional[int] @@ -88,7 +91,7 @@ def __init__(self, success=None, # type: Optional[bool] message=None, # type: Optional[str] body=None, # type: Optional[Union[str, JSON]] - headers=None, # type: Optional[HeadersType] + headers=None, # type: Optional[AnyHeadersContainer] text=None, # type: Optional[str] code=None, # type: Optional[int] **kwargs, # type: Any @@ -96,7 +99,7 @@ def __init__(self, super(OperationResult, self).__init__(**kwargs) self.success = success self.message = message - self.headers = headers + self.headers = ResponseHeaders(headers) if headers is not None else None self.body = body self.text = text self.code = code @@ -123,6 +126,28 @@ def text(self, text): # type: (str) -> None self["text"] = text + def links(self, header_names=None): + # type: (Optional[List[str]]) -> ResponseHeaders + """ + Obtain HTTP headers sorted in the result that corresponds to any link reference. 
+ + :param header_names: + Limit link names to be considered. + By default, considered headers are ``Link``, ``Content-Location`` and ``Location``. + """ + if not self.headers: + return ResponseHeaders([]) + if not isinstance(self.headers, ResponseHeaders): + self.headers = ResponseHeaders(self.headers) + if not header_names: + header_names = ["Link", "Content-Location", "Location"] + header_names = [hdr.lower() for hdr in header_names] + link_headers = ResponseHeaders() + for hdr_n, hdr_v in self.headers.items(): + if hdr_n.lower() in header_names: + link_headers.add(hdr_n, hdr_v) + return link_headers + class WeaverClient(object): """ @@ -164,7 +189,7 @@ def _parse_url(url): return parsed_url.rsplit("/", 1)[0] if parsed_url.endswith("/") else parsed_url @staticmethod - def _parse_result(response, # type: Response + def _parse_result(response, # type: Union[Response, OperationResult] body=None, # type: Optional[JSON] # override response body message=None, # type: Optional[str] # override message/description in contents success=None, # type: Optional[bool] # override resolved success @@ -173,33 +198,44 @@ def _parse_result(response, # type: Response nested_links=None, # type: Optional[str] output_format=None, # type: Optional[AnyOutputFormat] ): # type: (...) -> OperationResult - hdr = dict(response.headers) + # multi-header of same name, for example to support many Link + headers = ResponseHeaders(response.headers) + code = getattr(response, "status_code", None) or getattr(response, "code", None) _success = False try: - body = body or response.json() - if not show_links: - if nested_links: - nested = body.get(nested_links, []) - if isinstance(nested, list): - for item in nested: - item.pop("links", None) - body.pop("links", None) - msg = message or body.get("description", body.get("message", "undefined")) - if response.status_code >= 400: - if not msg: + msg = None + ctype = headers.get("Content-Type") + content = getattr(response, "content", None) or getattr(response, "body", None) + if not body and content and ctype and ContentType.APP_JSON in ctype and hasattr(response, "json"): + body = response.json() + if isinstance(body, dict): + if not show_links: + if nested_links: + nested = body.get(nested_links, []) + if isinstance(nested, list): + for item in nested: + item.pop("links", None) + body.pop("links", None) + msg = body.get("description", body.get("message", "undefined")) + if code >= 400: + if not msg and isinstance(body, dict): msg = body.get("error", body.get("exception", "unknown")) else: _success = True + msg = message or getattr(response, "message", None) or msg or "undefined" text = OutputFormat.convert(body, output_format or OutputFormat.JSON_STR, item_root="result") - except Exception: # noqa - text = body = response.text + except Exception as exc: # noqa msg = "Could not parse body." 
+ text = body = response.text
+ LOGGER.warning(msg, exc_info=exc)
if show_headers:
- s_hdr = OutputFormat.convert({"Headers": hdr}, OutputFormat.YAML)
- text = f"{s_hdr}---\n{text}"
+ # convert potential multi-equal-key headers into a JSON/YAML serializable format
+ hdr_l = [{hdr_name: hdr_val} for hdr_name, hdr_val in sorted(headers.items())]
+ hdr_s = OutputFormat.convert({"Headers": hdr_l}, OutputFormat.YAML)
+ text = f"{hdr_s}---\n{text}"
if success is not None:
_success = success
- return OperationResult(_success, msg, body, hdr, text=text, code=response.status_code)
+ return OperationResult(_success, msg, body, headers, text=text, code=code)
@staticmethod
def _parse_deploy_body(body, process_id):
@@ -548,10 +584,6 @@ def _update_files(self, inputs, url=None):
auth_headers = {sd.XAuthVaultFileHeader.name: multi_tokens}
return update_inputs, auth_headers
- # FIXME: support sync (https://github.com/crim-ca/weaver/issues/247)
- # :param execute_async:
- # Execute the process asynchronously (user must call :meth:`monitor` themselves,
- # or synchronously where monitoring is done automatically until completion before returning.
def execute(self,
process_id, # type: str
inputs=None, # type: Optional[Union[str, JSON]]
@@ -562,6 +594,7 @@ def execute(self,
show_links=True, # type: bool
show_headers=False, # type: bool
output_format=None, # type: Optional[AnyOutputFormat]
+ output_refs=None, # type: Optional[Iterable[str]]
): # type: (...) -> OperationResult
"""
Execute a :term:`Job` for the specified :term:`Process` with provided inputs.
@@ -577,6 +610,12 @@ def execute(self,
.. seealso::
:ref:`proc_op_execute`
+ .. note::
+ Execution requests are always accomplished asynchronously. To obtain the final :term:`Job` status as if
+ they were executed synchronously, provide the :paramref:`monitor` argument. This offers more flexibility
+ with servers that could decide to ignore sync/async preferences, and avoids closing/timeout connection
+ errors that could occur for long running processes, since the status is polled iteratively rather than waiting.
+
:param process_id: Identifier of the process to execute.
:param inputs:
Literal :term:`JSON` or :term:`YAML` contents of the inputs submitted and inserted into the execution body,
@@ -592,6 +631,12 @@ def execute(self,
:param show_links: Indicate if ``links`` section should be preserved in returned result body.
:param show_headers: Indicate if response headers should be returned in result output.
:param output_format: Select an alternate output representation of the result body contents.
+ :param output_refs:
+ Indicates which outputs, by ID, should be returned as HTTP ``Link`` header references instead of values
+ in the body content.
+ With reference transmission mode, outputs that contain literal data will be linked by ``text/plain`` file
+ containing the data. Outputs that refer to a file reference will simply contain that URL reference as a link.
+ With value transmission mode (default behavior when outputs are not specified in this list), outputs are
+ returned as direct values (literal or href) within the response content body.
:returns: Results of the operation.
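To make the transmission-mode resolution described above concrete, here is a small sketch (not taken from the code base) of the execution content assembled when a hypothetical output ``output-one`` is listed in ``output_refs`` while ``output-two`` is not; it mirrors the request-building logic that follows below.

from weaver.execute import ExecuteMode, ExecuteResponse, ExecuteTransmissionMode

execute_body = {
    "mode": ExecuteMode.ASYNC,        # kept for backward compatibility with servers that ignore 'Prefer'
    "response": ExecuteResponse.RAW,  # switched from 'document' because an output is requested by reference
    "inputs": {"message": "hello"},   # hypothetical input values
    "outputs": {
        "output-one": {"transmissionMode": ExecuteTransmissionMode.REFERENCE},  # returned as HTTP 'Link' header
        "output-two": {"transmissionMode": ExecuteTransmissionMode.VALUE},      # returned in the response body
    },
}
headers = {"Prefer": "respond-async"}  # async preference; a sync attempt would instead send {"Prefer": "wait=10"}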
""" if isinstance(inputs, list) and all(isinstance(item, list) for item in inputs): @@ -605,29 +650,35 @@ def execute(self, return result values, auth_headers = result data = { - # NOTE: since sync is not yet properly implemented in Weaver, simulate with monitoring after if requested - # FIXME: support 'sync' (https://github.com/crim-ca/weaver/issues/247) + # NOTE: Backward compatibility for servers that only know ``mode`` and don't handle ``Prefer`` header. "mode": ExecuteMode.ASYNC, "inputs": values, - # FIXME: support 'response: raw' (https://github.com/crim-ca/weaver/issues/376) "response": ExecuteResponse.DOCUMENT, - # FIXME: allow omitting 'outputs' (https://github.com/crim-ca/weaver/issues/375) - # FIXME: allow 'transmissionMode: value/reference' selection (https://github.com/crim-ca/weaver/issues/377) + # FIXME: allow filtering 'outputs' (https://github.com/crim-ca/weaver/issues/380) "outputs": {} } - # FIXME: since (https://github.com/crim-ca/weaver/issues/375) not implemented, auto-populate all the outputs result = self.describe(process_id, url=base) if not result.success: return OperationResult(False, "Could not obtain process description for execution.", body=result.body, headers=result.headers, code=result.code, text=result.text) outputs = result.body.get("outputs") + output_refs = set(output_refs or []) for output_id in outputs: - # use 'value' to have all outputs reported in body as 'value/href' rather than 'Link' headers - data["outputs"][output_id] = {"transmissionMode": ExecuteTransmissionMode.VALUE} + if output_id in output_refs: + # If any 'reference' is requested explicitly, must switch to 'response=raw' + # since 'response=document' ignores 'transmissionMode' definitions. + data["response"] = ExecuteResponse.RAW + # Use 'value' to have all outputs reported in body as 'value/href' rather than 'Link' headers. + out_mode = ExecuteTransmissionMode.REFERENCE + else: + # make sure to set value to outputs not requested as reference in case another one needs reference + # mode doesn't matter if no output by reference requested since 'response=document' would be used + out_mode = ExecuteTransmissionMode.VALUE + data["outputs"][output_id] = {"transmissionMode": out_mode} LOGGER.info("Executing [%s] with inputs:\n%s", process_id, OutputFormat.convert(values, OutputFormat.JSON_STR)) path = f"{base}/processes/{process_id}/execution" # use OGC-API compliant endpoint (not '/jobs') - headers = {} + headers = {"Prefer": "respond-async"} # for more recent servers, OGC-API compliant async request headers.update(self._headers) headers.update(auth_headers) resp = request_extra("POST", path, json=data, headers=headers, settings=self._settings) @@ -809,6 +860,65 @@ def monitor(self, once = False return OperationResult(False, f"Monitoring timeout reached ({timeout}s). Job did not complete in time.") + def _download_references(self, outputs, out_links, out_dir, job_id): + # type: (ExecutionResults, AnyHeadersContainer, str, str) -> ExecutionResults + """ + Download file references from results response contents and link headers. + + Downloaded files extend the results contents with ``path`` and ``source`` fields to indicate where the + retrieved files have been saved and where they came from. When files are found by HTTP header links, they + are added to the output contents to generate a combined representation in the operation result. 
+ """ + if not isinstance(outputs, dict): + # default if links-only needed later (insert as content for printed output) + outputs = {} # type: ExecutionResults + + # download file results + if not (any("href" in value for value in outputs.values()) or len(out_links)): + return OperationResult(False, "Outputs were found but none are downloadable (only raw values?).", outputs) + if not out_dir: + out_dir = os.path.join(os.path.realpath(os.path.curdir), job_id) + os.makedirs(out_dir, exist_ok=True) + LOGGER.info("Will store job [%s] output results in [%s]", job_id, out_dir) + + # download outputs from body content + LOGGER.debug("%s outputs in results content.", "Processing" if len(outputs) else "No") + for output, value in outputs.items(): + is_list = True + if not isinstance(value, list): + value = [value] + is_list = False + for i, item in enumerate(value): + if "href" in item: + file_path = fetch_file(item["href"], out_dir, link=False) + if is_list: + outputs[output][i]["path"] = file_path + outputs[output][i]["source"] = "body" + else: + outputs[output]["path"] = file_path + outputs[output]["source"] = "body" + + # download links from headers + LOGGER.debug("%s outputs in results link headers.", "Processing" if len(out_links) else "No") + for _, link_header in ResponseHeaders(out_links).items(): + link, params = link_header.split(";", 1) + href = link.strip("<>") + params = parse_kvp(params, multi_value_sep=None, accumulate_keys=False) + ctype = (params.get("type") or [None])[0] + rel = params["rel"][0].split(".") + output = rel[0] + is_array = len(rel) > 1 and str.isnumeric(rel[1]) + file_path = fetch_file(href, out_dir, link=False) + value = {"href": href, "type": ctype, "path": file_path, "source": "link"} + if output in outputs: + if isinstance(outputs[output], dict): # in case 'rel=". OperationResult @@ -1216,11 +1314,29 @@ def make_parser(): Example: ``-I message='Hello Weaver' -I value:int=1234`` """) ) - # FIXME: support sync (https://github.com/crim-ca/weaver/issues/247) + # FIXME: allow filtering 'outputs' (https://github.com/crim-ca/weaver/issues/380) + # Only specified ones are returned, if none specified, return all. # op_execute.add_argument( - # "-A", "--async", dest="execute_async", - # help="" - # ) + # "-O", "--output", + op_execute.add_argument( + "-R", "--ref", "--reference", metavar="REFERENCE", dest="output_refs", action="append", + help=inspect.cleandoc(""" + Indicates which outputs by ID to be returned as HTTP Link header reference instead of body content value. + This defines the output transmission mode when submitting the execution request. + + With reference transmission mode, + outputs that contain literal data will be linked by ``text/plain`` file containing the data. + Outputs that refer to a file reference will simply contain that URL reference as link. + + With value transmission mode (default behavior when outputs are not specified in this list), outputs are + returned as direct values (literal or href) within the response content body. + + When requesting any output to be returned by reference, option ``-H/--headers`` should be considered as + well to return the provided ``Link`` headers for these outputs on the command line. + + Example: ``-R output-one -R output-two`` + """) + ) op_execute.add_argument( "-M", "--monitor", dest="monitor", action="store_true", help="Automatically perform the monitoring operation following job submission to retrieve final results. 
" @@ -1293,7 +1409,7 @@ def make_parser(): op_results = WeaverArgumentParser( "results", description=( - "Obtain the output results description of a job. " + "Obtain the output results from a job successfully executed. " "This operation can also download them from the remote server if requested." ), formatter_class=ParagraphFormatter, diff --git a/weaver/datatype.py b/weaver/datatype.py index ea701700d..add6a9210 100644 --- a/weaver/datatype.py +++ b/weaver/datatype.py @@ -30,7 +30,7 @@ from weaver import xml_util from weaver.exceptions import ProcessInstanceError, ServiceParsingError -from weaver.execute import ExecuteControlOption, ExecuteMode, ExecuteTransmissionMode +from weaver.execute import ExecuteControlOption, ExecuteMode, ExecuteResponse, ExecuteTransmissionMode from weaver.formats import AcceptLanguage, ContentType, repr_json from weaver.processes.constants import ProcessSchema from weaver.processes.convert import get_field, null, ows2json, wps2json_io @@ -58,7 +58,7 @@ from owslib.wps import WebProcessingService - from weaver.execute import AnyExecuteControlOption, AnyExecuteTransmissionMode + from weaver.execute import AnyExecuteControlOption, AnyExecuteMode, AnyExecuteResponse, AnyExecuteTransmissionMode from weaver.processes.constants import ProcessSchemaType from weaver.processes.types import AnyProcessType from weaver.quotation.status import AnyQuoteStatus @@ -67,6 +67,8 @@ AnyProcess, AnySettingsContainer, AnyUUID, + ExecutionInputs, + ExecutionOutputs, Number, CWL, JSON, @@ -756,19 +758,29 @@ def type(self): return "provider" def _get_inputs(self): - # type: () -> List[Optional[Dict[str, JSON]]] + # type: () -> Optional[ExecutionInputs] if self.get("inputs") is None: - self["inputs"] = list() + return {} return dict.__getitem__(self, "inputs") def _set_inputs(self, inputs): - # type: (List[Optional[Dict[str, JSON]]]) -> None - if not isinstance(inputs, list): - raise TypeError(f"Type 'list' is required for '{self.__name__}.inputs'") + # type: (Optional[ExecutionInputs]) -> None self["inputs"] = inputs # allows to correctly update list by ref using 'job.inputs.extend()' - inputs = property(_get_inputs, _set_inputs) + inputs = property(_get_inputs, _set_inputs, doc="Input values and reference submitted for execution.") + + def _get_outputs(self): + # type: () -> Optional[ExecutionOutputs] + if self.get("outputs") is None: + return {} + return dict.__getitem__(self, "outputs") + + def _set_outputs(self, outputs): + # type: (Optional[ExecutionOutputs]) -> None + self["outputs"] = outputs + + outputs = property(_get_outputs, _set_outputs, doc="Output transmission modes submitted for execution.") @property def user_id(self): @@ -825,6 +837,21 @@ def status_location(self, location_url): raise TypeError(f"Type 'str' is required for '{self.__name__}.status_location'") self["status_location"] = location_url + def status_url(self, container=None): + # type: (Optional[AnySettingsContainer]) -> str + """ + Obtain the resolved endpoint where the :term:`Job` status information can be obtained. 
+ """ + settings = get_settings(container) + location_base = "/providers/{provider_id}".format(provider_id=self.service) if self.service else "" + location_url = "{base_url}{location_base}/processes/{process_id}/jobs/{job_id}".format( + base_url=get_wps_restapi_base_url(settings), + location_base=location_base, + process_id=self.process, + job_id=self.id + ) + return location_url + @property def notification_email(self): # type: () -> Optional[str] @@ -861,18 +888,39 @@ def execute_sync(self): @property def execution_mode(self): - # type: () -> ExecuteMode + # type: () -> AnyExecuteMode return ExecuteMode.get(self.get("execution_mode"), ExecuteMode.ASYNC) @execution_mode.setter def execution_mode(self, mode): - # type: (Union[ExecuteMode, str]) -> None + # type: (Union[AnyExecuteMode, str]) -> None exec_mode = ExecuteMode.get(mode) if exec_mode not in ExecuteMode: modes = list(ExecuteMode.values()) raise ValueError(f"Invalid value for '{self.__name__}.execution_mode'. Must be one of {modes}") self["execution_mode"] = mode + @property + def execution_response(self): + # type: () -> AnyExecuteResponse + out = self.setdefault("execution_response", ExecuteResponse.DOCUMENT) + if out not in ExecuteResponse.values(): + out = ExecuteResponse.DOCUMENT + self["execution_response"] = out + return out + + @execution_response.setter + def execution_response(self, response): + # type: (Optional[Union[AnyExecuteResponse, str]]) -> None + if response is None: + exec_resp = ExecuteResponse.DOCUMENT + else: + exec_resp = ExecuteResponse.get(response) + if exec_resp not in ExecuteResponse: + resp = list(ExecuteResponse.values()) + raise ValueError(f"Invalid value for '{self.__name__}.execution_response'. Must be one of {resp}") + self["execution_response"] = exec_resp + @property def is_local(self): # type: () -> bool @@ -969,7 +1017,7 @@ def _set_results(self, results): self["results"] = results # allows to correctly update list by ref using 'job.results.extend()' - results = property(_get_results, _set_results) + results = property(_get_results, _set_results, doc="Output values and references that resulted from execution.") def _get_exceptions(self): # type: () -> List[Union[str, Dict[str, str]]] @@ -1204,10 +1252,12 @@ def params(self): "service": self.service, "process": self.process, "inputs": self.inputs, + "outputs": self.outputs, "user_id": self.user_id, "status": self.status, "status_message": self.status_message, "status_location": self.status_location, + "execution_response": self.execution_response, "execution_mode": self.execution_mode, "is_workflow": self.is_workflow, "created": self.created, @@ -1798,27 +1848,49 @@ def outputs(self): @property def jobControlOptions(self): # noqa: N802 # type: () -> List[AnyExecuteControlOption] - jco = self.setdefault("jobControlOptions", [ExecuteControlOption.ASYNC]) + """ + Control options that indicate which :term:`Job` execution modes are supported by the :term:`Process`. + + .. note:: + + There are no official mentions about the ordering of ``jobControlOptions``. + Nevertheless, it is often expected that the first item can be considered the default mode when none is + requested explicitly (at execution time). With the definition of execution mode through the ``Prefer`` + header, `Weaver` has the option to decide if it wants to honor this header, according to available + resources and :term:`Job` duration. + + For this reason, ``async`` is placed first by default when nothing was defined during deployment, + since it is the preferred mode in `Weaver`. 
If deployment included items though, they are preserved as is.
+ This allows re-deploying a :term:`Process` to a remote non-`Weaver` :term:`ADES` while preserving the original
+ :term:`Process` definition.
+
+ .. seealso::
+ Discussion about expected ordering of ``jobControlOptions``:
+ https://github.com/opengeospatial/ogcapi-processes/issues/171#issuecomment-836819528
+ """
+ # Weaver's default is async only; must be overridden explicitly during deploy if sync is needed
+ jco_default = [ExecuteControlOption.ASYNC]
+ jco = self.setdefault("jobControlOptions", jco_default)
if not isinstance(jco, list): # eg: None, bw-compat
- jco = [ExecuteControlOption.ASYNC]
+ jco = jco_default
jco = [ExecuteControlOption.get(opt) for opt in jco]
jco = [opt for opt in jco if opt is not None]
if len(jco) == 0:
- jco.append(ExecuteControlOption.ASYNC)
- self["jobControlOptions"] = jco
+ jco = jco_default
+ self["jobControlOptions"] = jco # preserve order as-is, ordering is important (not alphabetical)
return dict.__getitem__(self, "jobControlOptions")
@property
def outputTransmission(self): # noqa: N802
# type: () -> List[AnyExecuteTransmissionMode]
- out = self.setdefault("outputTransmission", [ExecuteTransmissionMode.REFERENCE])
+ out = self.setdefault("outputTransmission", ExecuteTransmissionMode.values())
if not isinstance(out, list): # eg: None, bw-compat
- out = [ExecuteTransmissionMode.REFERENCE]
+ out = [ExecuteTransmissionMode.VALUE]
out = [ExecuteTransmissionMode.get(mode) for mode in out]
out = [mode for mode in out if mode is not None]
if len(out) == 0:
- out.append(ExecuteTransmissionMode.REFERENCE)
- self["outputTransmission"] = out
+ out.extend(ExecuteTransmissionMode.values())
+ self["outputTransmission"] = list(sorted(out))
return dict.__getitem__(self, "outputTransmission")
@property
diff --git a/weaver/execute.py b/weaver/execute.py
index 986640de8..a51787447 100644
--- a/weaver/execute.py
+++ b/weaver/execute.py
@@ -2,6 +2,9 @@
from weaver.base import Constants
+if TYPE_CHECKING:
+ from typing import List
+
class ExecuteMode(Constants):
AUTO = "auto"
@@ -13,6 +16,14 @@ class ExecuteControlOption(Constants):
ASYNC = "async-execute"
SYNC = "sync-execute"
+ @classmethod
+ def values(cls):
+ # type: () -> List[AnyExecuteControlOption]
+ """
+ Return the default control options in a specific order, according to the execution modes preferred by `Weaver`.
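A minimal sketch of the resolution behaviour documented above, mirroring the unit tests updated in ``tests/test_datatype.py``: missing or empty options fall back to `Weaver`'s preferred default, while explicitly provided options (including their order) are preserved.

from weaver.datatype import Process
from weaver.execute import ExecuteControlOption

# hypothetical process definitions used only for illustration
proc = Process(id="example-default", package={}, jobControlOptions=None)
assert proc.jobControlOptions == [ExecuteControlOption.ASYNC]  # Weaver default, async only

proc = Process(id="example-sync-first", package={},
               jobControlOptions=[ExecuteControlOption.SYNC, ExecuteControlOption.ASYNC])
assert proc.jobControlOptions == [ExecuteControlOption.SYNC, ExecuteControlOption.ASYNC]  # order preserved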
+ """ + return [ExecuteControlOption.ASYNC, ExecuteControlOption.SYNC] + class ExecuteResponse(Constants): RAW = "raw" diff --git a/weaver/formats.py b/weaver/formats.py index 969e511e8..e99bc0e7b 100644 --- a/weaver/formats.py +++ b/weaver/formats.py @@ -165,7 +165,10 @@ def convert(cls, data, to, item_root="item"): xml = xml.strip() return xml if fmt in [OutputFormat.YML, OutputFormat.YAML]: - return yaml.safe_dump(data, indent=2, sort_keys=False) + yml = yaml.safe_dump(data, indent=2, sort_keys=False, width=float("inf")) + if yml.endswith("\n...\n"): # added when data is single literal or None instead of list/object + yml = yml[:-4] + return yml return data diff --git a/weaver/processes/builtin/__init__.py b/weaver/processes/builtin/__init__.py index aadc737b6..7d1f5843c 100644 --- a/weaver/processes/builtin/__init__.py +++ b/weaver/processes/builtin/__init__.py @@ -14,6 +14,7 @@ from weaver.database import get_db from weaver.datatype import Process from weaver.exceptions import PackageExecutionError, PackageNotFound, ProcessNotAccessible, ProcessNotFound +from weaver.execute import ExecuteControlOption from weaver.processes.constants import CWL_REQUIREMENT_APP_BUILTIN from weaver.processes.types import ProcessType from weaver.processes.wps_package import PACKAGE_EXTENSIONS, get_process_definition @@ -145,6 +146,7 @@ def register_builtin_processes(container): processDescriptionURL=process_url, processEndpointWPS1=get_wps_url(container), executeEndpoint="/".join([process_url, "jobs"]), + jobControlOptions=ExecuteControlOption.values(), visibility=Visibility.PUBLIC, )) diff --git a/weaver/processes/builtin/file2string_array.py b/weaver/processes/builtin/file2string_array.py index 280c11451..9f9eca4ef 100644 --- a/weaver/processes/builtin/file2string_array.py +++ b/weaver/processes/builtin/file2string_array.py @@ -17,6 +17,11 @@ LOGGER.addHandler(logging.StreamHandler(sys.stdout)) LOGGER.setLevel(logging.INFO) +# process details +__version__ = "1.1" +__title__ = "File to String-Array" +__abstract__ = __doc__ # NOTE: '__doc__' is fetched directly, this is mostly to be informative + OUTPUT_CWL_JSON = "cwl.output.json" diff --git a/weaver/processes/builtin/file_index_selector.py b/weaver/processes/builtin/file_index_selector.py index 9f4f79865..7dac13540 100644 --- a/weaver/processes/builtin/file_index_selector.py +++ b/weaver/processes/builtin/file_index_selector.py @@ -26,7 +26,7 @@ LOGGER.setLevel(logging.INFO) # process details -__version__ = "1.0" +__version__ = "1.1" __title__ = "File Index Selector" __abstract__ = __doc__ # NOTE: '__doc__' is fetched directly, this is mostly to be informative diff --git a/weaver/processes/builtin/jsonarray2netcdf.py b/weaver/processes/builtin/jsonarray2netcdf.py index 0cdee5d49..d6a675349 100644 --- a/weaver/processes/builtin/jsonarray2netcdf.py +++ b/weaver/processes/builtin/jsonarray2netcdf.py @@ -28,7 +28,7 @@ LOGGER.setLevel(logging.INFO) # process details -__version__ = "1.1" +__version__ = "1.2" __title__ = "JSON array to NetCDF" __abstract__ = __doc__ # NOTE: '__doc__' is fetched directly, this is mostly to be informative diff --git a/weaver/processes/builtin/metalink2netcdf.py b/weaver/processes/builtin/metalink2netcdf.py index 45c85ca43..c5a8007e8 100644 --- a/weaver/processes/builtin/metalink2netcdf.py +++ b/weaver/processes/builtin/metalink2netcdf.py @@ -27,7 +27,7 @@ LOGGER.setLevel(logging.INFO) # process details -__version__ = "1.0" +__version__ = "1.1" __title__ = "Metalink to NetCDF" __abstract__ = __doc__ # NOTE: '__doc__' is fetched 
directly, this is mostly to be informative diff --git a/weaver/processes/convert.py b/weaver/processes/convert.py index d84314be5..6f2e6e6f5 100644 --- a/weaver/processes/convert.py +++ b/weaver/processes/convert.py @@ -82,9 +82,11 @@ CWL_Output_Type, ExecutionInputs, ExecutionInputsList, + ExecutionOutputs, JobValueFile, JSON ) + from weaver.wps_restapi.constants import JobInputsOutputsSchemaType # typing shortcuts # pylint: disable=C0103,invalid-name @@ -1018,7 +1020,7 @@ def _get_file_input(input_data): def convert_input_values_schema(inputs, schema): - # type: (ExecutionInputs, ProcessSchemaType) -> ExecutionInputs + # type: (ExecutionInputs, JobInputsOutputsSchemaType) -> ExecutionInputs """ Convert execution input values between equivalent formats. @@ -1026,6 +1028,8 @@ def convert_input_values_schema(inputs, schema): :param schema: Desired schema. :return: Converted inputs. """ + if isinstance(schema, str): + schema = schema.upper() if ( (schema == ProcessSchema.OGC and isinstance(inputs, dict)) or (schema == ProcessSchema.OLD and isinstance(inputs, list)) @@ -1078,6 +1082,46 @@ def convert_input_values_schema(inputs, schema): raise NotImplementedError(f"Unknown conversion format of input values for schema: [{schema}]") +def convert_output_params_schema(outputs, schema): + # type: (ExecutionOutputs, JobInputsOutputsSchemaType) -> ExecutionOutputs + """ + Convert execution output parameters between equivalent formats. + + .. warning:: + These outputs are not *values* (i.e.: *results*), but *submitted* :term:`Job` outputs for return definitions. + Contents are transferred as-is without any consideration of ``value`` or ``href`` fields. + + :param outputs: Outputs to convert. + :param schema: Desired schema. + :return: Converted outputs. + """ + if isinstance(schema, str): + schema = schema.upper() + if ( + (schema == ProcessSchema.OGC and isinstance(outputs, dict)) or + (schema == ProcessSchema.OLD and isinstance(outputs, list)) + ): + return outputs + if ( + (schema == ProcessSchema.OGC and not isinstance(outputs, list)) or + (schema == ProcessSchema.OLD and not isinstance(outputs, dict)) + ): + name = fully_qualified_name(outputs) + raise ValueError(f"Unknown conversion method to schema [{schema}] for outputs of type [{name}]: {outputs}") + if schema == ProcessSchema.OGC: + out_dict = {} + for out in outputs: + out_id = get_any_id(out, pop=True) + out_dict[out_id] = out + return out_dict + if schema == ProcessSchema.OLD: + out_list = [{"id": out} for out in outputs] + for out in out_list: + out.update(outputs[out["id"]]) + return out_list + raise NotImplementedError(f"Unknown conversion format of outputs definitions for schema: [{schema}]") + + def repr2json_input_values(inputs): # type: (List[str]) -> ExecutionInputsList """ @@ -1611,7 +1655,7 @@ def wps2json_job_payload(wps_request, wps_process): else: data_output = wps_request.outputs[oid] if as_ref: - data_output["transmissionMode"] = ExecuteTransmissionMode.REFERENCE + data_output["transmissionMode"] = ExecuteTransmissionMode.VALUE else: data_output["transmissionMode"] = ExecuteTransmissionMode.VALUE data_output["id"] = oid diff --git a/weaver/processes/execution.py b/weaver/processes/execution.py index 94c1909ed..ee4f877b3 100644 --- a/weaver/processes/execution.py +++ b/weaver/processes/execution.py @@ -4,15 +4,16 @@ from typing import TYPE_CHECKING import colander +from celery.exceptions import TimeoutError as CeleryTaskTimeoutError from celery.utils.log import get_task_logger from owslib.util import clean_ows_url from 
owslib.wps import ComplexDataInput -from pyramid.httpexceptions import HTTPBadRequest, HTTPNotAcceptable, HTTPNotImplemented +from pyramid.httpexceptions import HTTPBadRequest, HTTPNotAcceptable from pyramid_celery import celery_app as app from weaver.database import get_db from weaver.datatype import Process, Service -from weaver.execute import ExecuteMode, ExecuteResponse, ExecuteTransmissionMode +from weaver.execute import ExecuteControlOption, ExecuteMode from weaver.formats import AcceptLanguage, ContentType from weaver.notify import encrypt_email, notify_job_complete from weaver.owsexceptions import OWSNoApplicableCode @@ -21,8 +22,19 @@ from weaver.processes.convert import get_field, ows2json_output_data from weaver.processes.types import ProcessType from weaver.status import JOB_STATUS_CATEGORIES, Status, StatusCategory, map_status -from weaver.store.base import StoreJobs -from weaver.utils import get_any_id, get_any_value, get_registry, get_settings, now, raise_on_xml_exception, wait_secs +from weaver.store.base import StoreJobs, StoreProcesses +from weaver.utils import ( + as_int, + get_any_id, + get_any_value, + get_header, + get_registry, + get_settings, + now, + parse_prefer_header_execute_mode, + raise_on_xml_exception, + wait_secs +) from weaver.visibility import Visibility from weaver.wps.utils import ( check_wps_status, @@ -34,7 +46,7 @@ load_pywps_config ) from weaver.wps_restapi import swagger_definitions as sd -from weaver.wps_restapi.utils import get_wps_restapi_base_url +from weaver.wps_restapi.jobs.utils import get_job_results_response, get_job_submission_response LOGGER = logging.getLogger(__name__) if TYPE_CHECKING: @@ -48,7 +60,7 @@ from weaver.datatype import Job from weaver.processes.convert import OWS_Input_Type, ProcessOWS from weaver.status import StatusType - from weaver.typedefs import HeadersType, HeaderCookiesType, JSON, SettingsType + from weaver.typedefs import AnyResponseType, CeleryResult, HeadersType, HeaderCookiesType, JSON, SettingsType from weaver.visibility import AnyVisibility @@ -430,7 +442,7 @@ def map_locations(job, settings): def submit_job(request, reference, tags=None): - # type: (Request, Union[Service, Process], Optional[List[str]]) -> JSON + # type: (Request, Union[Service, Process], Optional[List[str]]) -> AnyResponseType """ Generates the job submission from details retrieved in the request. @@ -483,28 +495,7 @@ def submit_job(request, reference, tags=None): headers = dict(request.headers) settings = get_settings(request) return submit_job_handler(json_body, settings, service_url, provider_id, process_id, is_workflow, is_local, - visibility, language=lang, auth=headers, tags=tags, user=user, context=context) - - -# FIXME: this should not be necessary if schema validators correctly implement OneOf(values) -def _validate_job_parameters(json_body): - # type: (JSON) -> None - """ - Tests supported parameters not automatically validated by colander deserialize. 
- """ - if json_body["mode"] not in [ExecuteMode.ASYNC, ExecuteMode.AUTO]: - raise HTTPNotImplemented(detail="Execution mode '{}' not supported.".format(json_body["mode"])) - - if json_body["response"] != ExecuteResponse.DOCUMENT: - raise HTTPNotImplemented(detail="Execution response type '{}' not supported.".format(json_body["response"])) - - outputs = json_body.get("outputs", []) - if isinstance(outputs, dict): - outputs = [dict(id=out, **keys) for out, keys in outputs.items()] - for job_output in outputs: - mode = job_output["transmissionMode"] - if mode not in ExecuteTransmissionMode.values(): - raise HTTPNotImplemented(detail="Execute transmissionMode '{}' not supported.".format(mode)) + visibility, language=lang, headers=headers, tags=tags, user=user, context=context) def submit_job_handler(payload, # type: JSON @@ -516,11 +507,11 @@ def submit_job_handler(payload, # type: JSON is_local=True, # type: bool visibility=None, # type: Optional[AnyVisibility] language=None, # type: Optional[str] - auth=None, # type: Optional[HeaderCookiesType] + headers=None, # type: Optional[HeaderCookiesType] tags=None, # type: Optional[List[str]] user=None, # type: Optional[int] context=None, # type: Optional[str] - ): # type: (...) -> JSON + ): # type: (...) -> AnyResponseType """ Submits the job to the Celery worker with provided parameters. @@ -531,39 +522,76 @@ def submit_job_handler(payload, # type: JSON except colander.Invalid as ex: raise HTTPBadRequest("Invalid schema: [{}]".format(str(ex))) - # TODO: remove when all parameter variations are supported - # FIXME: - # - support 'sync' and 'Prefer' header variants (https://github.com/crim-ca/weaver/issues/247) - # - support 'response: raw' (https://github.com/crim-ca/weaver/issues/376) - # - allow omitting 'outputs' (https://github.com/crim-ca/weaver/issues/375) - _validate_job_parameters(json_body) + db = get_db(settings) + headers = headers or {} + if is_local: + proc_store = db.get_store(StoreProcesses) + process = proc_store.fetch_by_id(process_id) + job_ctl_opts = process.jobControlOptions + else: + job_ctl_opts = ExecuteControlOption.values() + max_wait = as_int(settings.get("weaver.exec_sync_max_wait"), default=20) + mode, wait, applied = parse_prefer_header_execute_mode(headers, job_ctl_opts, max_wait) + get_header("prefer", headers, pop=True) + if not applied: # whatever returned is a default, consider 'mode' in body as alternative + is_execute_async = ExecuteMode.get(json_body.get("mode")) != ExecuteMode.SYNC # convert auto to async + else: + # as per https://datatracker.ietf.org/doc/html/rfc7240#section-2 + # Prefer header not resolve as valid still proces + is_execute_async = mode != ExecuteMode.SYNC + exec_resp = json_body.get("response") - is_execute_async = ExecuteMode.get(json_body["mode"]) != ExecuteMode.SYNC # convert auto to async notification_email = json_body.get("notification_email") encrypted_email = encrypt_email(notification_email, settings) if notification_email else None - store = get_db(settings).get_store(StoreJobs) + store = db.get_store(StoreJobs) # type: StoreJobs job = store.save_job(task_id=Status.ACCEPTED, process=process_id, service=provider_id, - inputs=json_body.get("inputs"), is_local=is_local, is_workflow=is_workflow, - access=visibility, user_id=user, execute_async=is_execute_async, custom_tags=tags, - notification_email=encrypted_email, accept_language=language, context=context) + inputs=json_body.get("inputs"), outputs=json_body.get("outputs"), + is_local=is_local, is_workflow=is_workflow, 
access=visibility, user_id=user, context=context, + execute_async=is_execute_async, execute_response=exec_resp, + custom_tags=tags, notification_email=encrypted_email, accept_language=language) job.save_log(logger=LOGGER, message="Job task submitted for execution.", status=Status.ACCEPTED, progress=0) job = store.update_job(job) - result = execute_process.delay(job_id=job.id, wps_url=clean_ows_url(service_url), headers=auth) - LOGGER.debug("Celery pending task [%s] for job [%s].", result.id, job.id) + location_url = job.status_url(settings) + resp_headers = {"Location": location_url} + resp_headers.update(applied) - # local/provider process location - location_base = "/providers/{provider_id}".format(provider_id=provider_id) if provider_id else "" - location = "{base_url}{location_base}/processes/{process_id}/jobs/{job_id}".format( - base_url=get_wps_restapi_base_url(settings), - location_base=location_base, - process_id=process_id, - job_id=job.id) - body_data = { + wps_url = clean_ows_url(service_url) + result = execute_process.delay(job_id=job.id, wps_url=wps_url, headers=headers) # type: CeleryResult + LOGGER.debug("Celery pending task [%s] for job [%s].", result.id, job.id) + if not is_execute_async: + LOGGER.debug("Celery task requested as sync; waiting up to (wait=%ss) for it to complete.", wait) + try: + result.wait(timeout=wait) + except CeleryTaskTimeoutError: + pass + if result.ready(): + job = store.fetch_by_id(job.id) + # when sync is successful, it must return the results directly instead of the status info + # see: https://docs.ogc.org/is/18-062r2/18-062r2.html#sc_execute_response + if job.status == Status.SUCCEEDED: + return get_job_results_response(job, settings, headers=resp_headers) + # otherwise return the error status + body = job.json(container=settings, self_link="status") + body["location"] = location_url + resp = get_job_submission_response(body, resp_headers, error=True) + return resp + else: + LOGGER.debug("Celery task requested as sync took too long to complete (wait=%ss). Continuing in async.", wait) + # sync preference could not be respected, therefore drop it from the applied preferences + # since multiple preferences could be provided as alternatives, re-parse with only async to keep the applicable part + prefer = get_header("Preference-Applied", resp_headers, pop=True) + _, _, async_applied = parse_prefer_header_execute_mode({"Prefer": prefer}, [ExecuteMode.ASYNC]) + if async_applied: + resp_headers.update(async_applied) + + LOGGER.debug("Celery task submitted to run async.") + body = { "jobID": job.id, "processID": job.process, "providerID": provider_id, # dropped by validator if not applicable "status": map_status(Status.ACCEPTED), - "location": location + "location": location_url } - return body_data + resp = get_job_submission_response(body, resp_headers) + return resp diff --git a/weaver/processes/utils.py b/weaver/processes/utils.py index 900a33558..42194115b 100644 --- a/weaver/processes/utils.py +++ b/weaver/processes/utils.py @@ -102,19 +102,6 @@ def get_process(process_id=None, request=None, settings=None, store=None): raise HTTPBadRequest("Invalid schema:\n[{0!r}].".format(ex)) -def get_job_submission_response(body): - # type: (JSON) -> HTTPCreated - """ - Generates the successful response from contents returned by job submission process. - - ..
seealso:: - :func:`weaver.processes.execution.submit_job` - """ - body["description"] = sd.CreatedLaunchJobResponse.description - body = sd.CreatedJobStatusSchema().deserialize(body) - return HTTPCreated(location=body["location"], json=body) - - def map_progress(progress, range_min, range_max): # type: (Number, Number, Number) -> Number """ diff --git a/weaver/processes/wps3_process.py b/weaver/processes/wps3_process.py index 8383f0044..23e0deb1d 100644 --- a/weaver/processes/wps3_process.py +++ b/weaver/processes/wps3_process.py @@ -241,7 +241,7 @@ def prepare(self): def format_outputs(self, workflow_outputs): # type: (JobOutputs) -> JobOutputs for output in workflow_outputs: - output.update({"transmissionMode": ExecuteTransmissionMode.REFERENCE}) + output.update({"transmissionMode": ExecuteTransmissionMode.VALUE}) return workflow_outputs def dispatch(self, process_inputs, process_outputs): diff --git a/weaver/store/base.py b/weaver/store/base.py index 49f96a621..c2cc1f423 100644 --- a/weaver/store/base.py +++ b/weaver/store/base.py @@ -3,13 +3,21 @@ if TYPE_CHECKING: import datetime - from typing import Any, Dict, List, Optional, Tuple, Union + from typing import Dict, List, Optional, Tuple, Union from pyramid.request import Request from pywps import Process as ProcessWPS from weaver.datatype import Bill, Job, Process, Quote, Service, VaultFile - from weaver.typedefs import AnyUUID, DatetimeIntervalType, SettingsType, TypedDict + from weaver.execute import AnyExecuteResponse + from weaver.typedefs import ( + AnyUUID, + ExecutionInputs, + ExecutionOutputs, + DatetimeIntervalType, + SettingsType, + TypedDict + ) JobGroupCategory = TypedDict("JobGroupCategory", {"category": Dict[str, Optional[str]], "count": int, "jobs": List[Job]}) @@ -113,10 +121,12 @@ def save_job(self, task_id, # type: str process, # type: str service=None, # type: Optional[str] - inputs=None, # type: Optional[List[Any]] + inputs=None, # type: Optional[ExecutionInputs] + outputs=None, # type: Optional[ExecutionOutputs] is_workflow=False, # type: bool is_local=False, # type: bool execute_async=True, # type: bool + execute_response=None, # type: Optional[AnyExecuteResponse] custom_tags=None, # type: Optional[List[str]] user_id=None, # type: Optional[int] access=None, # type: Optional[str] diff --git a/weaver/store/mongodb.py b/weaver/store/mongodb.py index 903ad2c0a..448cd9cdb 100644 --- a/weaver/store/mongodb.py +++ b/weaver/store/mongodb.py @@ -52,9 +52,10 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union from pymongo.collection import Collection + from weaver.execute import AnyExecuteResponse from weaver.processes.types import AnyProcessType from weaver.store.base import DatetimeIntervalType, JobGroupCategory, JobSearchResult - from weaver.typedefs import AnyProcess, AnyProcessClass, AnyUUID, AnyValueType + from weaver.typedefs import AnyProcess, AnyProcessClass, AnyUUID, AnyValueType, ExecutionInputs, ExecutionOutputs from weaver.visibility import AnyVisibility MongodbValue = Union[AnyValueType, datetime.datetime] @@ -572,10 +573,12 @@ def save_job(self, task_id, # type: AnyUUID process, # type: str service=None, # type: Optional[str] - inputs=None, # type: Optional[List[Any]] + inputs=None, # type: Optional[ExecutionInputs] + outputs=None, # type: Optional[ExecutionOutputs] is_workflow=False, # type: bool is_local=False, # type: bool execute_async=True, # type: bool + execute_response=None, # type: Optional[AnyExecuteResponse] custom_tags=None, # type: Optional[List[str]] user_id=None, # type: 
Optional[int] access=None, # type: Optional[str] @@ -607,8 +610,10 @@ def save_job(self, "service": service, # provider identifier (WPS service) "process": process, # process identifier (WPS request) "inputs": inputs, + "outputs": outputs, "status": map_status(Status.ACCEPTED), "execute_async": execute_async, + "execution_response": execute_response, "is_workflow": is_workflow, "is_local": is_local, "created": created if created else now(), diff --git a/weaver/typedefs.py b/weaver/typedefs.py index b9d8fd00a..5f3cf3421 100644 --- a/weaver/typedefs.py +++ b/weaver/typedefs.py @@ -24,6 +24,7 @@ FileSystemPathType = str from celery.app import Celery + from celery.result import AsyncResult, EagerResult, GroupResult, ResultSet from owslib.wps import BoundingBoxDataInput, ComplexDataInput, Process as ProcessOWS, WPSExecution from pyramid.httpexceptions import HTTPSuccessful, HTTPRedirection from pyramid.registry import Registry @@ -290,6 +291,24 @@ def __call__(self, message: str, progress: Number, status: AnyStatusType, *args: ExecutionInputsList = List[JobValueItem] # when schema='weaver.processes.constants.ProcessSchema.OLD' ExecutionInputs = Union[ExecutionInputsList, ExecutionInputsMap] + ExecutionOutputObject = TypedDict("ExecutionOutputObject", { + "transmissionMode": str + }, total=False) + ExecutionOutputItem = TypedDict("ExecutionOutputItem", { + "id": str, + "transmissionMode": str + }, total=False) + ExecutionOutputsList = List[ExecutionOutputItem] + ExecutionOutputsMap = Dict[str, ExecutionOutputObject] + ExecutionOutputs = Union[ExecutionOutputsList, ExecutionOutputsMap] + ExecutionResultObject = TypedDict("ExecutionResultObject", { + "value": Optional[AnyValueType], + "href": Optional[str], + "type": Optional[str], + }, total=False) + ExecutionResultArray = List[ExecutionResultObject] + ExecutionResults = Dict[str, Union[ExecutionResultObject, ExecutionResultArray]] + # reference employed as 'JobMonitorReference' by 'WPS1Process' JobExecution = TypedDict("JobExecution", {"execution": WPSExecution}) @@ -298,3 +317,5 @@ def __call__(self, message: str, progress: Number, status: AnyStatusType, *args: "inputs": JobInputs, "outputs": JobOutputs, }) + + CeleryResult = Union[AsyncResult, EagerResult, GroupResult, ResultSet] diff --git a/weaver/utils.py b/weaver/utils.py index fee3624a0..176abda26 100644 --- a/weaver/utils.py +++ b/weaver/utils.py @@ -387,12 +387,17 @@ def parse_prefer_header_execute_mode( This defines all conditions how to handle ``Prefer`` against applicable :term:`Process` description. - :rfc:`7240#section-4.1` HTTP Prefer header ``respond-async`` + .. seealso:: + If ``Prefer`` format is valid, but server decides it cannot be respected, it can be transparently ignored + (:rfc:`7240#section-2`). The server must respond with ``Preference-Applied`` indicating preserved preferences + it decided to respect. + :param header_container: Request headers to retrieve preference, if any available. :param supported_modes: Execute modes that are permitted for the operation that received the ``Prefer`` header. Resolved mode will respect this constrain following specification requirements of :term:`OGC API - Processes`. :param wait_max: - Maximum wait time enforced by the server. If requested wait time is greater, 'wait' preference will not be + Maximum wait time enforced by the server. If requested wait time is greater, ``wait`` preference will not be applied and will fallback to asynchronous response. 
:return: Tuple of resolved execution mode, wait time if specified, and header of applied preferences if possible. @@ -498,6 +503,18 @@ def is_uuid(maybe_uuid): return re.match(UUID_PATTERN, str(maybe_uuid)) is not None +def as_int(value, default): + # type: (Any, int) -> int + """ + Ensures a value is converted to :class:`int`. + """ + try: + return int(value) + except Exception: # noqa: W0703 # nosec: B110 + pass + return default + + def parse_extra_options(option_str, sep=","): # type: (str, str) -> Dict[str, Optional[str]] """ diff --git a/weaver/wps/service.py b/weaver/wps/service.py index 5edb73f4e..4a7627f11 100644 --- a/weaver/wps/service.py +++ b/weaver/wps/service.py @@ -21,7 +21,7 @@ from weaver.processes.convert import wps2json_job_payload from weaver.processes.execution import submit_job_handler from weaver.processes.types import ProcessType -from weaver.processes.utils import get_job_submission_response, get_process +from weaver.processes.utils import get_process from weaver.store.base import StoreProcesses from weaver.utils import get_header, get_registry, get_settings, get_weaver_url from weaver.visibility import Visibility @@ -34,6 +34,7 @@ load_pywps_config ) from weaver.wps_restapi import swagger_definitions as sd +from weaver.wps_restapi.jobs.utils import get_job_submission_response LOGGER = logging.getLogger(__name__) if TYPE_CHECKING: @@ -236,9 +237,14 @@ def _submit_job(self, wps_request): is_workflow = proc.type == ProcessType.WORKFLOW tags = req.args.get("tags", "").split(",") + ["xml", "wps-{}".format(wps_request.version)] data = wps2json_job_payload(wps_request, wps_process) - body = submit_job_handler(data, self.settings, proc.processEndpointWPS1, - process_id=pid, is_local=True, is_workflow=is_workflow, visibility=Visibility.PUBLIC, - language=wps_request.language, tags=tags, auth=dict(req.headers), context=ctx) + resp = submit_job_handler( + data, self.settings, proc.processEndpointWPS1, + process_id=pid, is_local=True, is_workflow=is_workflow, visibility=Visibility.PUBLIC, + language=wps_request.language, tags=tags, headers=dict(req.headers), context=ctx + ) + # enforced JSON results with submitted data that includes 'response=document' + # use 'json_body' to work with any 'response' implementation + body = resp.json_body # if Accept was JSON, provide response content as is # if anything else (even */*), return as XML @@ -247,7 +253,7 @@ def _submit_job(self, wps_request): # way to provide explicitly Accept header. Even our Wps1Process as Workflow step depends on this behaviour. 
accept_type = get_header("Accept", req.headers) if accept_type == ContentType.APP_JSON: - resp = get_job_submission_response(body) + resp = get_job_submission_response(body, resp.headers) setattr(resp, "_update_status", lambda *_, **__: None) # patch to avoid pywps server raising return resp diff --git a/weaver/wps_restapi/api.py b/weaver/wps_restapi/api.py index 9c454dc01..f06dfa35c 100644 --- a/weaver/wps_restapi/api.py +++ b/weaver/wps_restapi/api.py @@ -321,23 +321,33 @@ def api_conformance(request): # noqa: F811 # ogcapi_proc_core + "/per/core/process-execute-input-inline-bbox", ogcapi_proc_core + "/per/core/process-execute-sync-job", ogcapi_proc_core + "/per/core/limit-response", + # ogcapi_proc_core + "/per/core/limit-default-minimum-maximum", ogcapi_proc_core + "/per/core/prev", ogcapi_proc_core + "/per/job-list/limit-response", ogcapi_proc_core + "/per/job-list/prev", # ogcapi_proc_core + "/rec/core/access-control-expose-headers", ogcapi_proc_core + "/rec/core/api-definition-oas", ogcapi_proc_core + "/rec/core/cross-origin", + ogcapi_proc_core + "/rec/core/content-length", # ogcapi_proc_core + "/rec/core/html", + # ogcapi_proc_core + "/rec/core/http-head", ogcapi_proc_core + "/rec/core/job-status", + ogcapi_proc_core + "/rec/core/job-results-async-many-json-prefer-none", + # FIXME: https://github.com/crim-ca/weaver/issues/414 + # ogcapi_proc_core + "/rec/core/job-results-async-many-json-prefer-minimal", + # ogcapi_proc_core + "/rec/core/job-results-async-many-json-prefer-representation", + # ogcapi_proc_core + "/per/core/job-results-async-many-other-formats", + ogcapi_proc_core + "/rec/core/process-execute-sync-many-json-prefer-none", + # ogcapi_proc_core + "/rec/core/process-execute-sync-many-json-prefer-minimal", + # ogcapi_proc_core + "/rec/core/process-execute-sync-many-json-prefer-representation", # ogcapi_proc_core + "/rec/core/link-header", ogcapi_proc_core + "/rec/core/ogc-process-description", # FIXME: error details (for all below: https://github.com/crim-ca/weaver/issues/320) # ogcapi_proc_core + "/rec/core/problem-details", - # FIXME: https://github.com/crim-ca/weaver/issues/247 (Prefer header) - # ogcapi_proc_core + "/rec/core/process-execute-handle-prefer", - # ogcapi_proc_core + "/rec/core/process-execute-honor-prefer", - # ogcapi_proc_core + "/rec/core/process-execute-mode-auto", - # ogcapi_proc_core + "/rec/core/process-execute-preference-applied", + ogcapi_proc_core + "/rec/core/process-execute-handle-prefer", + ogcapi_proc_core + "/rec/core/process-execute-honor-prefer", + ogcapi_proc_core + "/rec/core/process-execute-mode-auto", + ogcapi_proc_core + "/rec/core/process-execute-preference-applied", ogcapi_proc_core + "/rec/core/process-execute-sync-document-ref", ogcapi_proc_core + "/rec/core/next-1", ogcapi_proc_core + "/rec/core/next-2", @@ -370,11 +380,12 @@ def api_conformance(request): # noqa: F811 ogcapi_proc_core + "/req/core/job-results-failed", ogcapi_proc_core + "/req/core/job-results", ogcapi_proc_core + "/req/core/job-results-async-document", + # FIXME: support raw multipart (https://github.com/crim-ca/weaver/issues/376) # ogcapi_proc_core + "/req/core/job-results-async-raw-mixed-multi", - # ogcapi_proc_core + "/req/core/job-results-async-raw-ref", + ogcapi_proc_core + "/req/core/job-results-async-raw-ref", # ogcapi_proc_core + "/req/core/job-results-async-raw-value-multi", - # ogcapi_proc_core + "/req/core/job-results-async-raw-value-one", - # ogcapi_proc_core + "/req/core/job-results-success-sync", + ogcapi_proc_core + 
"/req/core/job-results-async-raw-value-one", + ogcapi_proc_core + "/req/core/job-results-success-sync", ogcapi_proc_core + "/req/core/job-success", ogcapi_proc_core + "/req/core/landingpage-op", ogcapi_proc_core + "/req/core/landingpage-success", @@ -382,8 +393,7 @@ def api_conformance(request): # noqa: F811 ogcapi_proc_core + "/req/core/process", ogcapi_proc_core + "/req/core/process-success", ogcapi_proc_core + "/req/core/process-exception/no-such-process", - # FIXME: https://github.com/crim-ca/weaver/issues/247 (Prefer header) - # ogcapi_proc_core + "/req/core/process-execute-auto-execution-mode", + ogcapi_proc_core + "/req/core/process-execute-auto-execution-mode", ogcapi_proc_core + "/req/core/process-execute-default-execution-mode", ogcapi_proc_core + "/req/core/process-execute-default-outputs", ogcapi_proc_core + "/req/core/process-execute-input-array", @@ -400,9 +410,10 @@ def api_conformance(request): # noqa: F811 ogcapi_proc_core + "/req/core/process-execute-success-async", ogcapi_proc_core + "/req/core/process-execute-sync-document", # ogcapi_proc_core + "/req/core/process-execute-sync-raw-mixed-multi", - # ogcapi_proc_core + "/req/core/process-execute-sync-raw-ref", + ogcapi_proc_core + "/req/core/process-execute-sync-raw-ref", + # FIXME: support raw multipart (https://github.com/crim-ca/weaver/issues/376) # ogcapi_proc_core + "/req/core/process-execute-sync-raw-value-multi", - # ogcapi_proc_core + "/req/core/process-execute-sync-raw-value-one", + ogcapi_proc_core + "/req/core/process-execute-sync-raw-value-one", ogcapi_proc_core + "/req/core/pl-limit-definition", ogcapi_proc_core + "/req/core/pl-limit-response", ogcapi_proc_core + "/req/core/process-list", diff --git a/weaver/wps_restapi/colander_extras.py b/weaver/wps_restapi/colander_extras.py index 7306e72fb..301f96760 100644 --- a/weaver/wps_restapi/colander_extras.py +++ b/weaver/wps_restapi/colander_extras.py @@ -1152,17 +1152,53 @@ def _validate_nodes(self): ExtendedSchemaBase._validate(node) +class StrictMappingSchema(ExtendedMappingSchema): + """ + Object schema that will ``raise`` any unknown field not represented by children schema. + + This is equivalent to `OpenAPI` object mapping with ``additionalProperties: false``. + This type is useful for defining a dictionary that matches *exactly* a specific set of values and children schema. + + ..note:: + When doing schema deserialization to validate it, unknown keys would normally be removed without this class + (default behaviour is to ``ignore`` them). With this schema, content under an unknown key is fails validation. + + .. seealso:: + :class:`PermissiveMappingSchema` + """ + def __init__(self, *args, **kwargs): + kwargs["unknown"] = "raise" + super(StrictMappingSchema, self).__init__(*args, **kwargs) + # sub-type mapping itself must also have 'raise' such that its own 'deserialize' copies the fields over + self.typ.unknown = "raise" + + +class EmptyMappingSchema(StrictMappingSchema): + """ + Mapping that guarantees it is completely empty for validation during deserialization. + + Any children added to this schema are removed automatically. + """ + def __init__(self, *args, **kwargs): + super(EmptyMappingSchema, self).__init__(*args, **kwargs) + self.children = [] + + class PermissiveMappingSchema(ExtendedMappingSchema): """ - Object schema that will allow *any unknown* field to remain present in the resulting deserialization. + Object schema that will ``preserve`` any unknown field to remain present in the resulting deserialization. 
This type is useful for defining a dictionary where some field names are not known in advance, or when more optional keys that don't need to all be exhaustively provided in the schema are acceptable. - When doing schema deserialization to validate it, unknown keys would normally be removed without this class - (default behaviour is to ``ignore`` them). With this schema, content under an unknown key is ``preserved`` - as it was received without any validation. Other fields that are explicitly specified with sub-schema nodes - will still be validated as per usual behaviour. + .. note:: + When doing schema deserialization to validate it, unknown keys would normally be removed without this class + (default behaviour is to ``ignore`` them). With this schema, content under an unknown key using ``preserve`` + is passed down without any validation. Other fields that are explicitly specified with sub-schema nodes + will still be validated as per usual behaviour. + + .. seealso:: + :class:`StrictMappingSchema` Example:: @@ -1785,9 +1821,6 @@ class NotKeywordSchema(KeywordMapper): Corresponds to the ``not`` specifier of `OpenAPI` specification. - This is equivalent to `OpenAPI` object mapping with ``additionalProperties: false``, but is more explicit in - the definition of invalid or conflicting field names with explicit definitions during deserialization. - Example:: class RequiredItem(ExtendedMappingSchema): diff --git a/weaver/wps_restapi/constants.py b/weaver/wps_restapi/constants.py index 7c99527f3..71089599a 100644 --- a/weaver/wps_restapi/constants.py +++ b/weaver/wps_restapi/constants.py @@ -3,7 +3,7 @@ from weaver.base import Constants -class JobOutputsSchema(Constants): +class JobInputsOutputsSchema(Constants): """ Schema selector to represent :term:`Job` inputs and outputs.
""" @@ -16,9 +16,9 @@ class JobOutputsSchema(Constants): if TYPE_CHECKING: from weaver.typedefs import Literal - JobOutputsSchemaType = Literal[ - JobOutputsSchema.OGC_STRICT, - JobOutputsSchema.OLD_STRICT, - JobOutputsSchema.OGC, - JobOutputsSchema.OLD + JobInputsOutputsSchemaType = Literal[ + JobInputsOutputsSchema.OGC_STRICT, + JobInputsOutputsSchema.OLD_STRICT, + JobInputsOutputsSchema.OGC, + JobInputsOutputsSchema.OLD ] diff --git a/weaver/wps_restapi/examples/providers_processes_details.json b/weaver/wps_restapi/examples/providers_processes_details.json index 6a12ce45d..e60b59483 100644 --- a/weaver/wps_restapi/examples/providers_processes_details.json +++ b/weaver/wps_restapi/examples/providers_processes_details.json @@ -11,7 +11,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/ColibriFlyingpigeon_SubsetBbox", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -28,7 +28,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/OutardeFlyingpigeon_SubsetBbox", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -47,7 +47,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/Staging_S2L1C", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -66,7 +66,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/Staging_S2L1C-mock-docker", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -85,7 +85,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/WaterExtent_S2-mock-docker", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -104,7 +104,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/WorkflowWaterExtent", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -123,7 +123,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/WorkflowWaterExtent-mock", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -159,7 +159,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/anti-spoofing", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -178,7 +178,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/docker-demo-cat", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -195,7 +195,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/docker-python-script", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -212,7 +212,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/file2string_array", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -239,7 +239,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/image-utils", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -258,7 +258,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], 
"processDescriptionURL": "http://localhost:4002/processes/jsonarray2netcdf", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -313,7 +313,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/las2tif", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -332,7 +332,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/metalink2netcdf", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -352,7 +352,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/python-script", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -370,7 +370,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/sleep", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -397,7 +397,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/test_blurring", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -424,7 +424,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/test_generation", "processEndpointWPS1": "http://localhost:4002/ows/wps", @@ -443,7 +443,7 @@ "async-execute" ], "outputTransmission": [ - "reference" + "value" ], "processDescriptionURL": "http://localhost:4002/processes/test_workflow", "processEndpointWPS1": "http://localhost:4002/ows/wps", diff --git a/weaver/wps_restapi/jobs/jobs.py b/weaver/wps_restapi/jobs/jobs.py index d9392c935..9111a0825 100644 --- a/weaver/wps_restapi/jobs/jobs.py +++ b/weaver/wps_restapi/jobs/jobs.py @@ -1,350 +1,39 @@ -import math -import os -import shutil -from copy import deepcopy from typing import TYPE_CHECKING from celery.utils.log import get_task_logger from colander import Invalid -from pyramid.httpexceptions import ( - HTTPBadRequest, - HTTPNotFound, - HTTPOk, - HTTPPermanentRedirect, - HTTPUnauthorized, - HTTPUnprocessableEntity -) -from pyramid.request import Request -from pyramid_celery import celery_app +from pyramid.httpexceptions import HTTPBadRequest, HTTPOk, HTTPPermanentRedirect, HTTPUnprocessableEntity from notify import encrypt_email from weaver.database import get_db from weaver.datatype import Job -from weaver.exceptions import ( - InvalidIdentifierValue, - JobGone, - JobInvalidParameter, - JobNotFound, - ProcessNotAccessible, - ProcessNotFound, - ServiceNotAccessible, - ServiceNotFound, - log_unhandled_exceptions -) -from weaver.formats import ContentType, OutputFormat, get_format, repr_json -from weaver.owsexceptions import OWSNoApplicableCode, OWSNotFound -from weaver.processes.convert import any2wps_literal_datatype -from weaver.status import JOB_STATUS_CATEGORIES, Status, StatusCategory, map_status -from weaver.store.base import StoreJobs, StoreProcesses, StoreServices -from weaver.utils import get_any_id, get_any_value, get_path_kvp, get_settings, get_weaver_url, is_uuid -from weaver.visibility import Visibility -from weaver.wps.utils import get_wps_output_dir, get_wps_output_url +from weaver.exceptions import JobNotFound, log_unhandled_exceptions +from weaver.formats import OutputFormat, repr_json +from weaver.processes.convert import convert_input_values_schema, convert_output_params_schema +from weaver.store.base import StoreJobs +from 
weaver.utils import get_settings from weaver.wps_restapi import swagger_definitions as sd -from weaver.wps_restapi.constants import JobOutputsSchema -from weaver.wps_restapi.providers.utils import forbid_local_only +from weaver.wps_restapi.jobs.utils import ( + dismiss_job_task, + get_job, + get_job_list_links, + get_job_results_response, + get_results, + get_schema_query, + raise_job_bad_status, + raise_job_dismissed, + validate_service_process +) from weaver.wps_restapi.swagger_definitions import datetime_interval_parser if TYPE_CHECKING: - from typing import Dict, Iterable, List, Optional, Tuple, Union - - from pyramid.httpexceptions import HTTPException + from typing import Iterable, List - from weaver.typedefs import AnySettingsContainer, AnyValueType, JSON - from weaver.wps_restapi.constants import JobOutputsSchemaType + from weaver.typedefs import JSON, AnyResponseType, PyramidRequest LOGGER = get_task_logger(__name__) -def get_job(request): - # type: (Request) -> Job - """ - Obtain a job from request parameters. - - :returns: Job information if found. - :raise HTTPNotFound: with JSON body details on missing/non-matching job, process, provider IDs. - """ - job_id = request.matchdict.get("job_id") - try: - if not is_uuid(job_id): - raise JobInvalidParameter - store = get_db(request).get_store(StoreJobs) - job = store.fetch_by_id(job_id) - except (JobInvalidParameter, JobNotFound) as exc: - exception = type(exc) - if exception is JobInvalidParameter: - desc = "Invalid job reference is not a valid UUID." - else: - desc = "Could not find job with specified reference." - title = "NoSuchJob" - raise exception( - # new format: https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_job-exception-no-such-job - json={ - "title": title, - "type": "http://www.opengis.net/def/exceptions/ogcapi-processes-1/1.0/no-such-job", - "detail": desc, - "status": exception.code, - "cause": str(job_id) - }, - code=title, locator="JobID", description=desc # old format - ) - - provider_id = request.matchdict.get("provider_id", job.service) - process_id = request.matchdict.get("process_id", job.process) - if provider_id: - forbid_local_only(request) - - if job.service != provider_id: - title = "NoSuchProvider" - desc = "Could not find job reference corresponding to specified provider reference." - raise OWSNotFound( - # new format: https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_job-exception-no-such-job - json={ - "title": title, - "type": "http://www.opengis.net/def/exceptions/ogcapi-processes-1/1.0/no-such-job", - "detail": desc, - "status": OWSNotFound.code, - "cause": str(process_id) - }, - code=title, locator="provider", description=desc # old format - ) - if job.process != process_id: - title = "NoSuchProcess" - desc = "Could not find job reference corresponding to specified process reference." 
- raise OWSNotFound( - # new format: https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_job-exception-no-such-job - # note: although 'no-such-process' error, return 'no-such-job' because process could exist, only mismatches - json={ - "title": title, - "type": "http://www.opengis.net/def/exceptions/ogcapi-processes-1/1.0/no-such-job", - "detail": desc, - "status": OWSNotFound.code, - "cause": str(process_id) - }, - code=title, locator="process", description=desc # old format - ) - return job - - -def get_job_list_links(job_total, filters, request): - # type: (int, Dict[str, AnyValueType], Request) -> List[JSON] - """ - Obtains a list of all relevant links for the corresponding job listing defined by query parameter filters. - - :raises IndexError: if the paging values are out of bounds compared to available total :term:`Job` matching search. - """ - base_url = get_weaver_url(request) - - # reapply queries that must be given to obtain the same result in case of subsequent requests (sort, limits, etc.) - kvp_params = {param: value for param, value in request.params.items() if param != "page"} - # patch datetime that have some extra character manipulation (reapply '+' auto-converted to ' ' by params parser) - if "datetime" in kvp_params: - kvp_params["datetime"] = kvp_params["datetime"].replace(" ", "+") - alt_kvp = deepcopy(kvp_params) - - # request job uses general endpoint, obtain the full path if any service/process was given as alternate location - if request.path.startswith(sd.jobs_service.path): - job_path = base_url + sd.jobs_service.path - alt_path = None - parent_url = None - # cannot generate full path apply for 'service' by itself - if filters["process"] and filters["service"]: - alt_path = base_url + sd.provider_jobs_service.path.format( - provider_id=filters["service"], process_id=filters["process"] - ) - parent_url = alt_path.rsplit("/", 1)[0] - elif filters["process"]: - alt_path = base_url + sd.process_jobs_service.path.format(process_id=filters["process"]) - parent_url = alt_path.rsplit("/", 1)[0] - for param in ["service", "provider", "process"]: - alt_kvp.pop(param, None) - # path is whichever specific service/process endpoint, jobs are pre-filtered by them - # transform sub-endpoints into matching query parameters and use generic path as alternate location - else: - job_path = base_url + request.path - alt_path = base_url + sd.jobs_service.path - alt_kvp["process"] = filters["process"] - if filters["service"]: - alt_kvp["provider"] = filters["service"] - parent_url = job_path.rsplit("/", 1)[0] - - cur_page = filters["page"] - per_page = filters["limit"] - max_page = max(math.ceil(job_total / per_page) - 1, 0) - if cur_page < 0 or cur_page > max_page: - raise IndexError(f"Page index {cur_page} is out of range from [0,{max_page}].") - - alt_links = [] - if alt_path: - alt_links = [{ - "href": get_path_kvp(alt_path, page=cur_page, **alt_kvp), "rel": "alternate", - "type": ContentType.APP_JSON, "title": "Alternate endpoint with equivalent set of filtered jobs." 
- }] - - links = alt_links + [ - {"href": job_path, "rel": "collection", - "type": ContentType.APP_JSON, "title": "Complete job listing (no filtering queries applied)."}, - {"href": base_url + sd.jobs_service.path, "rel": "search", - "type": ContentType.APP_JSON, "title": "Generic query endpoint to search for jobs."}, - {"href": job_path + "?detail=false", "rel": "preview", - "type": ContentType.APP_JSON, "title": "Job listing summary (UUID and count only)."}, - {"href": job_path, "rel": "http://www.opengis.net/def/rel/ogc/1.0/job-list", - "type": ContentType.APP_JSON, "title": "List of registered jobs."}, - {"href": get_path_kvp(job_path, page=cur_page, **kvp_params), "rel": "current", - "type": ContentType.APP_JSON, "title": "Current page of job query listing."}, - {"href": get_path_kvp(job_path, page=0, **kvp_params), "rel": "first", - "type": ContentType.APP_JSON, "title": "First page of job query listing."}, - {"href": get_path_kvp(job_path, page=max_page, **kvp_params), "rel": "last", - "type": ContentType.APP_JSON, "title": "Last page of job query listing."}, - ] - if cur_page > 0: - links.append({ - "href": get_path_kvp(job_path, page=cur_page - 1, **kvp_params), "rel": "prev", - "type": ContentType.APP_JSON, "title": "Previous page of job query listing." - }) - if cur_page < max_page: - links.append({ - "href": get_path_kvp(job_path, page=cur_page + 1, **kvp_params), "rel": "next", - "type": ContentType.APP_JSON, "title": "Next page of job query listing." - }) - if parent_url: - links.append({ - "href": parent_url, "rel": "up", - "type": ContentType.APP_JSON, "title": "Parent collection for which listed jobs apply." - }) - return links - - -def get_results(job, container, value_key=None, schema=JobOutputsSchema.OLD): - # type: (Job, AnySettingsContainer, Optional[str], JobOutputsSchemaType) -> Union[List[JSON], JSON] - """ - Obtains the job results with extended full WPS output URL as applicable and according to configuration settings. - - :param job: job from which to retrieve results. - :param container: any container giving access to instance settings (to resolve reference output location). - :param value_key: - If not specified, the returned values will have the appropriate ``data``/``href`` key according to the content. - Otherwise, all values will have the specified key. - :param schema: - Selects which schema to employ for representing the output results. - :returns: list of all outputs each with minimally an ID and value under the requested key. - """ - wps_url = get_wps_output_url(container) - if not wps_url.endswith("/"): - wps_url = wps_url + "/" - schema = JobOutputsSchema.get(str(schema).lower(), default=JobOutputsSchema.OLD) - strict = schema.endswith("+strict") - schema = schema.split("+")[0] - ogc_api = schema == JobOutputsSchema.OGC - outputs = {} if ogc_api else [] - fmt_key = "mediaType" if ogc_api else "mimeType" - for result in job.results: - rtype = "data" if any(k in result for k in ["data", "value"]) else "href" - value = get_any_value(result) - out_id = get_any_id(result) - out_key = rtype - if rtype == "href": - # fix paths relative to instance endpoint, but leave explicit links as is (eg: S3 bucket, remote HTTP, etc.) 
- if value.startswith("/"): - value = str(value).lstrip("/") - if "://" not in value: - value = wps_url + value - elif ogc_api: - out_key = "value" - elif value_key: - out_key = value_key - output = {out_key: value} - if rtype == "href": # required for the rest to be there, other fields optional - if "mimeType" not in result: - result["mimeType"] = get_format(value, default=ContentType.TEXT_PLAIN).mime_type - if ogc_api or not strict: - output["type"] = result["mimeType"] - if not ogc_api or not strict: - output["format"] = {fmt_key: result["mimeType"]} - for field in ["encoding", "schema"]: - if field in result: - output["format"][field] = result[field] - elif rtype != "href": - # literal data - # FIXME: BoundingBox not implemented (https://github.com/crim-ca/weaver/issues/51) - dtype = result.get("dataType", any2wps_literal_datatype(value, is_value=True) or "string") - if ogc_api: - output["dataType"] = {"name": dtype} - else: - output["dataType"] = dtype - - if ogc_api: - if out_id in outputs: - output_list = outputs[out_id] - if not isinstance(output_list, list): - output_list = [output_list] - output_list.append(output) - outputs[out_id] = output_list - else: - outputs[out_id] = output - else: - # if ordered insert supported by python version, insert ID first - output = dict([("id", out_id)] + list(output.items())) # noqa - outputs.append(output) - return outputs - - -def validate_service_process(request): - # type: (Request) -> Tuple[Optional[str], Optional[str]] - """ - Verifies that service or process specified by path or query will raise the appropriate error if applicable. - """ - service_name = ( - request.matchdict.get("provider_id", None) or - request.params.get("provider", None) or - request.params.get("service", None) # backward compatibility - ) - process_name = ( - request.matchdict.get("process_id", None) or - request.params.get("process", None) or - request.params.get("processID", None) # OGC-API conformance - ) - item_test = None - item_type = None - - try: - service = None - if service_name: - forbid_local_only(request) - item_type = "Service" - item_test = service_name - store = get_db(request).get_store(StoreServices) - service = store.fetch_by_name(service_name, visibility=Visibility.PUBLIC) - if process_name: - item_type = "Process" - item_test = process_name - # local process - if not service: - store = get_db(request).get_store(StoreProcesses) - store.fetch_by_id(process_name, visibility=Visibility.PUBLIC) - # remote process - else: - processes = service.processes(request) - if process_name not in [p.id for p in processes]: - raise ProcessNotFound - except (ServiceNotFound, ProcessNotFound): - raise HTTPNotFound(json={ - "code": "NoSuch{}".format(item_type), - "description": "{} of id '{}' cannot be found.".format(item_type, item_test) - }) - except (ServiceNotAccessible, ProcessNotAccessible): - raise HTTPUnauthorized(json={ - "code": "Unauthorized{}".format(item_type), - "description": "{} of id '{}' is not accessible.".format(item_type, item_test) - }) - except InvalidIdentifierValue as ex: - raise HTTPBadRequest(json={ - "code": InvalidIdentifierValue.__name__, - "description": str(ex) - }) - - return service_name, process_name - - @sd.provider_jobs_service.get(tags=[sd.TAG_JOBS, sd.TAG_PROVIDERS], renderer=OutputFormat.JSON, schema=sd.GetProviderJobsEndpoint(), response_schemas=sd.get_prov_all_jobs_responses) @sd.process_jobs_service.get(tags=[sd.TAG_PROCESSES, sd.TAG_JOBS], renderer=OutputFormat.JSON, @@ -353,7 +42,7 @@ def 
validate_service_process(request): schema=sd.GetJobsEndpoint(), response_schemas=sd.get_all_jobs_responses) @log_unhandled_exceptions(logger=LOGGER, message=sd.InternalServerErrorResponseSchema.description) def get_queried_jobs(request): - # type: (Request) -> HTTPOk + # type: (PyramidRequest) -> HTTPOk """ Retrieve the list of jobs which can be filtered, sorted, paged and categorized using query parameters. """ @@ -442,7 +131,7 @@ def _job_list(jobs): # type: (Iterable[Job]) -> List[JSON] schema=sd.JobEndpoint(), response_schemas=sd.get_single_job_status_responses) @log_unhandled_exceptions(logger=LOGGER, message=sd.InternalServerErrorResponseSchema.description) def get_job_status(request): - # type: (Request) -> HTTPOk + # type: (PyramidRequest) -> HTTPOk """ Retrieve the status of a job. """ @@ -451,133 +140,6 @@ def get_job_status(request): return HTTPOk(json=job_status) -def raise_job_bad_status(job, container=None): - # type: (Job, Optional[AnySettingsContainer]) -> None - """ - Raise the appropriate message for :term:`Job` not ready or unable to retrieve output results due to status. - """ - if job.status != Status.SUCCEEDED: - links = job.links(container=container) - if job.status == Status.FAILED: - err_code = None - err_info = None - err_known_modules = [ - "pywps.exceptions", - "owslib.wps", - "weaver.exceptions", - "weaver.owsexceptions", - ] - # try to infer the cause, fallback to generic error otherwise - for error in job.exceptions: - try: - if isinstance(error, dict): - err_code = error.get("Code") - err_info = error.get("Text") - elif isinstance(error, str) and any(error.startswith(mod) for mod in err_known_modules): - err_code, err_info = error.split(":", 1) - err_code = err_code.split(".")[-1].strip() - err_info = err_info.strip() - except Exception: - err_code = None - if err_code: - break - if not err_code: # default - err_code = OWSNoApplicableCode.code - err_info = "unknown" - # /req/core/job-results-failed - raise HTTPBadRequest(json={ - "title": "JobResultsFailed", - "type": err_code, - "detail": "Job results not available because execution failed.", - "status": HTTPBadRequest.code, - "cause": err_info, - "links": links - }) - - # /req/core/job-results-exception/results-not-ready - raise HTTPBadRequest(json={ - "title": "JobResultsNotReady", - "type": "http://www.opengis.net/def/exceptions/ogcapi-processes-1/1.0/result-not-ready", - "detail": "Job is not ready to obtain results.", - "status": HTTPBadRequest.code, - "cause": {"status": job.status}, - "links": links - }) - - -def raise_job_dismissed(job, container=None): - # type: (Job, Optional[AnySettingsContainer]) -> None - """ - Raise the appropriate messages for dismissed :term:`Job` status. - """ - if job.status == Status.DISMISSED: - # provide the job status links since it is still available for reference - settings = get_settings(container) - job_links = job.links(settings) - job_links = [link for link in job_links if link["rel"] in ["status", "alternate", "collection", "up"]] - raise JobGone( - json={ - "title": "JobDismissed", - "type": "JobDismissed", - "status": JobGone.code, - "detail": "Job was dismissed and artifacts have been removed.", - "cause": {"status": job.status}, - "value": str(job.id), - "links": job_links - } - ) - - -def dismiss_job_task(job, container): - # type: (Job, AnySettingsContainer) -> Job - """ - Cancels any pending or running :mod:`Celery` task and removes completed job artifacts. - - .. note:: - The :term:`Job` object itself is not deleted, only its artifacts. 
- Therefore, its inputs, outputs, logs, exceptions, etc. are still available in the database, - but corresponding files that would be exposed by ``weaver.wps_output`` configurations are removed. - - :param job: Job to cancel or cleanup. - :param container: Application settings. - :return: Updated and dismissed job. - """ - raise_job_dismissed(job, container) - if job.status in JOB_STATUS_CATEGORIES[StatusCategory.RUNNING]: - # signal to stop celery task. Up to it to terminate remote if any. - LOGGER.debug("Job [%s] dismiss operation: Canceling task [%s]", job.id, job.task_id) - celery_app.control.revoke(job.task_id, terminate=True) - - wps_out_dir = get_wps_output_dir(container) - job_out_dir = os.path.join(wps_out_dir, str(job.id)) - job_out_log = os.path.join(wps_out_dir, str(job.id) + ".log") - job_out_xml = os.path.join(wps_out_dir, str(job.id) + ".xml") - if os.path.isdir(job_out_dir): - LOGGER.debug("Job [%s] dismiss operation: Removing output results.", job.id) - shutil.rmtree(job_out_dir, onerror=lambda func, path, _exc: LOGGER.warning( - "Job [%s] dismiss operation: Failed to delete [%s] due to [%s]", job.id, job_out_dir, _exc - )) - if os.path.isfile(job_out_log): - LOGGER.debug("Job [%s] dismiss operation: Removing output logs.", job.id) - try: - os.remove(job_out_log) - except OSError as exc: - LOGGER.warning("Job [%s] dismiss operation: Failed to delete [%s] due to [%s]", job.id, job_out_log, exc) - if os.path.isfile(job_out_xml): - LOGGER.debug("Job [%s] dismiss operation: Removing output WPS status.", job.id) - try: - os.remove(job_out_xml) - except OSError as exc: - LOGGER.warning("Job [%s] dismiss operation: Failed to delete [%s] due to [%s]", job.id, job_out_xml, exc) - - LOGGER.debug("Job [%s] dismiss operation: Updating job status.") - store = get_db(container).get_store(StoreJobs) - job.status_message = "Job {}.".format(Status.DISMISSED) - job.status = map_status(Status.DISMISSED) - job = store.update_job(job) - return job - - @sd.provider_job_service.delete(tags=[sd.TAG_JOBS, sd.TAG_DISMISS, sd.TAG_PROVIDERS], renderer=OutputFormat.JSON, schema=sd.ProviderJobEndpoint(), response_schemas=sd.delete_prov_job_responses) @sd.process_job_service.delete(tags=[sd.TAG_JOBS, sd.TAG_DISMISS, sd.TAG_PROCESSES], renderer=OutputFormat.JSON, @@ -586,6 +148,7 @@ def dismiss_job_task(job, container): schema=sd.JobEndpoint(), response_schemas=sd.delete_job_responses) @log_unhandled_exceptions(logger=LOGGER, message=sd.InternalServerErrorResponseSchema.description) def cancel_job(request): + # type: (PyramidRequest) -> AnyResponseType """ Dismiss a planned or running job execution, or remove result artifacts of a completed job. @@ -611,6 +174,7 @@ def cancel_job(request): schema=sd.DeleteJobsEndpoint(), response_schemas=sd.delete_jobs_responses) @log_unhandled_exceptions(logger=LOGGER, message=sd.InternalServerErrorResponseSchema.description) def cancel_job_batch(request): + # type: (PyramidRequest) -> AnyResponseType """ Dismiss operation for multiple jobs. @@ -652,15 +216,21 @@ def cancel_job_batch(request): schema=sd.JobInputsEndpoint(), response_schemas=sd.get_job_inputs_responses) @log_unhandled_exceptions(logger=LOGGER, message=sd.InternalServerErrorResponseSchema.description) def get_job_inputs(request): - # type: (Request) -> HTTPException + # type: (PyramidRequest) -> AnyResponseType """ - Retrieve the inputs of a job. + Retrieve the inputs values and outputs definitions of a job. 
""" job = get_job(request) - inputs = {"inputs": job.inputs} - inputs.update({"links": job.links(request, self_link="inputs")}) - inputs = sd.JobInputsBody().deserialize(inputs) - return HTTPOk(json=inputs) + schema = get_schema_query(request.params.get("schema"), strict=False) + job_inputs = job.inputs + job_outputs = job.outputs + if schema: + job_inputs = convert_input_values_schema(job_inputs, schema) + job_outputs = convert_output_params_schema(job_outputs, schema) + body = {"inputs": job_inputs, "outputs": job_outputs} + body.update({"links": job.links(request, self_link="inputs")}) + body = sd.JobInputsBody().deserialize(body) + return HTTPOk(json=body) @sd.provider_outputs_service.get(tags=[sd.TAG_JOBS, sd.TAG_RESULTS, sd.TAG_PROCESSES], renderer=OutputFormat.JSON, @@ -671,15 +241,16 @@ def get_job_inputs(request): schema=sd.JobOutputsEndpoint(), response_schemas=sd.get_job_outputs_responses) @log_unhandled_exceptions(logger=LOGGER, message=sd.InternalServerErrorResponseSchema.description) def get_job_outputs(request): - # type: (Request) -> HTTPException + # type: (PyramidRequest) -> AnyResponseType """ - Retrieve the outputs of a job. + Retrieve the output values resulting from a job execution. """ job = get_job(request) raise_job_dismissed(job, request) raise_job_bad_status(job, request) - schema = request.params.get("schema") - outputs = {"outputs": get_results(job, request, schema=str(schema).replace(" ", "+"))} # unescape query + schema = get_schema_query(request.params.get("schema")) + results, _ = get_results(job, request, schema=schema, link_references=False) + outputs = {"outputs": results} outputs.update({"links": job.links(request, self_link="outputs")}) outputs = sd.JobOutputsBody().deserialize(outputs) return HTTPOk(json=outputs) @@ -693,23 +264,13 @@ def get_job_outputs(request): schema=sd.JobResultsEndpoint(), response_schemas=sd.get_job_results_responses) @log_unhandled_exceptions(logger=LOGGER, message=sd.InternalServerErrorResponseSchema.description) def get_job_results(request): - # type: (Request) -> HTTPException + # type: (PyramidRequest) -> AnyResponseType """ Retrieve the results of a job. """ job = get_job(request) - raise_job_dismissed(job, request) - raise_job_bad_status(job, request) - job_status = map_status(job.status) - if job_status in JOB_STATUS_CATEGORIES[StatusCategory.RUNNING]: - raise HTTPNotFound(json={ - "code": "ResultsNotReady", - "description": "Job status is '{}'. Results are not yet available.".format(job_status) - }) - results = get_results(job, request, value_key="value", schema=JobOutputsSchema.OGC) - # note: cannot add links in this case because variable OutputID keys are directly at the root - results = sd.Result().deserialize(results) - return HTTPOk(json=results) + resp = get_job_results_response(job, request) + return resp @sd.provider_exceptions_service.get(tags=[sd.TAG_JOBS, sd.TAG_EXCEPTIONS, sd.TAG_PROVIDERS], @@ -721,6 +282,7 @@ def get_job_results(request): schema=sd.ProcessExceptionsEndpoint(), response_schemas=sd.get_exceptions_responses) @log_unhandled_exceptions(logger=LOGGER, message=sd.InternalServerErrorResponseSchema.description) def get_job_exceptions(request): + # type: (PyramidRequest) -> AnyResponseType """ Retrieve the exceptions of a job. 
""" @@ -738,6 +300,7 @@ def get_job_exceptions(request): schema=sd.ProcessLogsEndpoint(), response_schemas=sd.get_logs_responses) @log_unhandled_exceptions(logger=LOGGER, message=sd.InternalServerErrorResponseSchema.description) def get_job_logs(request): + # type: (PyramidRequest) -> AnyResponseType """ Retrieve the logs of a job. """ @@ -758,6 +321,7 @@ def get_job_logs(request): response_schemas=sd.get_result_redirect_responses) @log_unhandled_exceptions(logger=LOGGER, message=sd.InternalServerErrorResponseSchema.description) def redirect_job_result(request): + # type: (PyramidRequest) -> AnyResponseType """ Deprecated job result endpoint that is now returned by corresponding outputs path with added links. """ diff --git a/weaver/wps_restapi/jobs/utils.py b/weaver/wps_restapi/jobs/utils.py new file mode 100644 index 000000000..ccb0eaebf --- /dev/null +++ b/weaver/wps_restapi/jobs/utils.py @@ -0,0 +1,699 @@ +import math +import os +import shutil +from copy import deepcopy +from typing import TYPE_CHECKING + +from celery.utils.log import get_task_logger +from pyramid.httpexceptions import ( + HTTPBadRequest, + HTTPCreated, + HTTPNoContent, + HTTPNotFound, + HTTPNotImplemented, + HTTPOk, + HTTPUnauthorized +) +from pyramid.response import FileResponse +from pyramid_celery import celery_app + +from weaver.database import get_db +from weaver.datatype import Job +from weaver.exceptions import ( + InvalidIdentifierValue, + JobGone, + JobInvalidParameter, + JobNotFound, + ProcessNotAccessible, + ProcessNotFound, + ServiceNotAccessible, + ServiceNotFound +) +from weaver.execute import ExecuteResponse, ExecuteTransmissionMode +from weaver.formats import ContentType, get_format +from weaver.owsexceptions import OWSNoApplicableCode, OWSNotFound +from weaver.processes.convert import any2wps_literal_datatype, convert_output_params_schema, get_field +from weaver.status import JOB_STATUS_CATEGORIES, Status, StatusCategory, map_status +from weaver.store.base import StoreJobs, StoreProcesses, StoreServices +from weaver.utils import ( + get_any_id, + get_any_value, + get_file_headers, + get_header, + get_path_kvp, + get_settings, + get_weaver_url, + is_uuid +) +from weaver.visibility import Visibility +from weaver.wps.utils import get_wps_output_dir, get_wps_output_url, map_wps_output_location +from weaver.wps_restapi import swagger_definitions as sd +from weaver.wps_restapi.constants import JobInputsOutputsSchema +from weaver.wps_restapi.providers.utils import forbid_local_only + +if TYPE_CHECKING: + from typing import Dict, List, Optional, Tuple, Union + + from weaver.typedefs import ( + AnyHeadersContainer, + AnyRequestType, + AnyResponseType, + AnySettingsContainer, + AnyUUID, + AnyValueType, + ExecutionResultArray, + ExecutionResultObject, + ExecutionResults, + HeadersTupleType, + JSON, + PyramidRequest, + SettingsType + ) + from weaver.wps_restapi.constants import JobInputsOutputsSchemaType + +LOGGER = get_task_logger(__name__) + + +def get_job(request): + # type: (PyramidRequest) -> Job + """ + Obtain a job from request parameters. + + :returns: Job information if found. + :raise HTTPNotFound: with JSON body details on missing/non-matching job, process, provider IDs. 
+ """ + job_id = request.matchdict.get("job_id") + try: + if not is_uuid(job_id): + raise JobInvalidParameter + store = get_db(request).get_store(StoreJobs) + job = store.fetch_by_id(job_id) + except (JobInvalidParameter, JobNotFound) as exc: + exception = type(exc) + if exception is JobInvalidParameter: + desc = "Invalid job reference is not a valid UUID." + else: + desc = "Could not find job with specified reference." + title = "NoSuchJob" + raise exception( + # new format: https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_job-exception-no-such-job + json={ + "title": title, + "type": "http://www.opengis.net/def/exceptions/ogcapi-processes-1/1.0/no-such-job", + "detail": desc, + "status": exception.code, + "cause": str(job_id) + }, + code=title, locator="JobID", description=desc # old format + ) + + provider_id = request.matchdict.get("provider_id", job.service) + process_id = request.matchdict.get("process_id", job.process) + if provider_id: + forbid_local_only(request) + + if job.service != provider_id: + title = "NoSuchProvider" + desc = "Could not find job reference corresponding to specified provider reference." + raise OWSNotFound( + # new format: https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_job-exception-no-such-job + json={ + "title": title, + "type": "http://www.opengis.net/def/exceptions/ogcapi-processes-1/1.0/no-such-job", + "detail": desc, + "status": OWSNotFound.code, + "cause": str(process_id) + }, + code=title, locator="provider", description=desc # old format + ) + if job.process != process_id: + title = "NoSuchProcess" + desc = "Could not find job reference corresponding to specified process reference." + raise OWSNotFound( + # new format: https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_job-exception-no-such-job + # note: although 'no-such-process' error, return 'no-such-job' because process could exist, only mismatches + json={ + "title": title, + "type": "http://www.opengis.net/def/exceptions/ogcapi-processes-1/1.0/no-such-job", + "detail": desc, + "status": OWSNotFound.code, + "cause": str(process_id) + }, + code=title, locator="process", description=desc # old format + ) + return job + + +def get_job_list_links(job_total, filters, request): + # type: (int, Dict[str, AnyValueType], AnyRequestType) -> List[JSON] + """ + Obtains a list of all relevant links for the corresponding job listing defined by query parameter filters. + + :raises IndexError: if the paging values are out of bounds compared to available total :term:`Job` matching search. + """ + base_url = get_weaver_url(request) + + # reapply queries that must be given to obtain the same result in case of subsequent requests (sort, limits, etc.) 
+ kvp_params = {param: value for param, value in request.params.items() if param != "page"} + # patch datetime that have some extra character manipulation (reapply '+' auto-converted to ' ' by params parser) + if "datetime" in kvp_params: + kvp_params["datetime"] = kvp_params["datetime"].replace(" ", "+") + alt_kvp = deepcopy(kvp_params) + + # request job uses general endpoint, obtain the full path if any service/process was given as alternate location + if request.path.startswith(sd.jobs_service.path): + job_path = base_url + sd.jobs_service.path + alt_path = None + parent_url = None + # cannot generate full path apply for 'service' by itself + if filters["process"] and filters["service"]: + alt_path = base_url + sd.provider_jobs_service.path.format( + provider_id=filters["service"], process_id=filters["process"] + ) + parent_url = alt_path.rsplit("/", 1)[0] + elif filters["process"]: + alt_path = base_url + sd.process_jobs_service.path.format(process_id=filters["process"]) + parent_url = alt_path.rsplit("/", 1)[0] + for param in ["service", "provider", "process"]: + alt_kvp.pop(param, None) + # path is whichever specific service/process endpoint, jobs are pre-filtered by them + # transform sub-endpoints into matching query parameters and use generic path as alternate location + else: + job_path = base_url + request.path + alt_path = base_url + sd.jobs_service.path + alt_kvp["process"] = filters["process"] + if filters["service"]: + alt_kvp["provider"] = filters["service"] + parent_url = job_path.rsplit("/", 1)[0] + + cur_page = filters["page"] + per_page = filters["limit"] + max_page = max(math.ceil(job_total / per_page) - 1, 0) + if cur_page < 0 or cur_page > max_page: + raise IndexError(f"Page index {cur_page} is out of range from [0,{max_page}].") + + alt_links = [] + if alt_path: + alt_links = [{ + "href": get_path_kvp(alt_path, page=cur_page, **alt_kvp), "rel": "alternate", + "type": ContentType.APP_JSON, "title": "Alternate endpoint with equivalent set of filtered jobs." + }] + + links = alt_links + [ + {"href": job_path, "rel": "collection", + "type": ContentType.APP_JSON, "title": "Complete job listing (no filtering queries applied)."}, + {"href": base_url + sd.jobs_service.path, "rel": "search", + "type": ContentType.APP_JSON, "title": "Generic query endpoint to search for jobs."}, + {"href": job_path + "?detail=false", "rel": "preview", + "type": ContentType.APP_JSON, "title": "Job listing summary (UUID and count only)."}, + {"href": job_path, "rel": "http://www.opengis.net/def/rel/ogc/1.0/job-list", + "type": ContentType.APP_JSON, "title": "List of registered jobs."}, + {"href": get_path_kvp(job_path, page=cur_page, **kvp_params), "rel": "current", + "type": ContentType.APP_JSON, "title": "Current page of job query listing."}, + {"href": get_path_kvp(job_path, page=0, **kvp_params), "rel": "first", + "type": ContentType.APP_JSON, "title": "First page of job query listing."}, + {"href": get_path_kvp(job_path, page=max_page, **kvp_params), "rel": "last", + "type": ContentType.APP_JSON, "title": "Last page of job query listing."}, + ] + if cur_page > 0: + links.append({ + "href": get_path_kvp(job_path, page=cur_page - 1, **kvp_params), "rel": "prev", + "type": ContentType.APP_JSON, "title": "Previous page of job query listing." + }) + if cur_page < max_page: + links.append({ + "href": get_path_kvp(job_path, page=cur_page + 1, **kvp_params), "rel": "next", + "type": ContentType.APP_JSON, "title": "Next page of job query listing." 
+ })
+ if parent_url:
+ links.append({
+ "href": parent_url, "rel": "up",
+ "type": ContentType.APP_JSON, "title": "Parent collection for which listed jobs apply."
+ })
+ return links
+
+
+def get_schema_query(schema, strict=True):
+ # type: (Optional[JobInputsOutputsSchemaType], bool) -> Optional[JobInputsOutputsSchemaType]
+ if not schema:
+ return None
+ # unescape query (eg: "OGC+strict" becomes "OGC strict" from URL parsing)
+ schema_checked = str(schema).replace(" ", "+").lower()
+ if JobInputsOutputsSchema.get(schema_checked) is None:
+ raise HTTPBadRequest(json={
+ "type": "InvalidParameterValue",
+ "detail": "Query parameter 'schema' value is invalid.",
+ "status": HTTPBadRequest.code,
+ "locator": "query",
+ "value": str(schema),
+ })
+ if not strict:
+ return schema_checked.split("+")[0]
+ return schema_checked
+
+
+def make_result_link(result_id, result, job_id, settings):
+ # type: (str, Union[ExecutionResultObject, ExecutionResultArray], AnyUUID, SettingsType) -> List[str]
+ """
+ Convert a result definition as ``value`` into the corresponding ``reference`` for output transmission.
+
+ .. seealso::
+ :rfc:`8288`: HTTP ``Link`` header specification.
+ """
+ values = result if isinstance(result, list) else [result]
+ suffixes = list(f".{idx}" for idx in range(len(values))) if isinstance(result, list) else [""]
+ wps_url = get_wps_output_url(settings).strip("/")
+ links = []
+ for suffix, value in zip(suffixes, values):
+ key = get_any_value(result, key=True)
+ if key != "href":
+ # literal data to be converted to link
+ # plain text file must be created containing the raw literal data
+ typ = ContentType.TEXT_PLAIN # as per '/rec/core/process-execute-sync-document-ref'
+ enc = "UTF-8"
+ out = get_wps_output_dir(settings)
+ val = get_any_value(value, data=True, file=False)
+ loc = os.path.join(job_id, result_id + suffix + ".txt")
+ url = f"{wps_url}/{loc}"
+ path = os.path.join(out, loc)
+ with open(path, mode="w", encoding=enc) as out_file:
+ out_file.write(val)
+ else:
+ fmt = get_field(result, "format", default={"mediaType": ContentType.TEXT_PLAIN})
+ typ = get_field(fmt, "mime_type", search_variations=True, default=ContentType.TEXT_PLAIN)
+ enc = get_field(fmt, "encoding", search_variations=True, default=None)
+ url = get_any_value(value, data=False, file=True) # should already include full path
+ if typ == ContentType.TEXT_PLAIN and not enc: # only if text, otherwise binary content could differ
+ enc = "UTF-8" # default when both omitted/empty
+ encoding = f"; charset={enc}" if enc else ""
+ links.append(f"<{url}>; rel=\"{result_id}{suffix}\"; type={typ}{encoding}")
+ return links
+
+
+def get_results(job, # type: Job
+ container, # type: AnySettingsContainer
+ value_key=None, # type: Optional[str]
+ schema=JobInputsOutputsSchema.OLD, # type: JobInputsOutputsSchemaType
+ link_references=False, # type: bool
+ ): # type: (...) -> Tuple[ExecutionResults, HeadersTupleType]
+ """
+ Obtains the job results with extended full WPS output URL as applicable and according to configuration settings.
+
+ :param job: job from which to retrieve results.
+ :param container: any container giving access to instance settings (to resolve reference output location).
+ :param value_key:
+ If not specified, the returned values will have the appropriate ``data``/``href`` key according to the content.
+ Otherwise, all values will have the specified key.
+ :param schema:
+ Selects which schema to employ for representing the output results (listing or mapping).
+ :param link_references: + If enabled, an output that was requested by reference instead of value will be returned as ``Link`` reference. + :returns: + Tuple with: + - List or mapping of all outputs each with minimally an ID and value under the requested key. + - List of ``Link`` headers for reference outputs when requested. Empty otherwise. + """ + settings = get_settings(container) + wps_url = get_wps_output_url(settings) + if not wps_url.endswith("/"): + wps_url = wps_url + "/" + schema = JobInputsOutputsSchema.get(str(schema).lower(), default=JobInputsOutputsSchema.OLD) + strict = schema.endswith("+strict") + schema = schema.split("+")[0] + ogc_api = schema == JobInputsOutputsSchema.OGC + outputs = {} if ogc_api else [] + fmt_key = "mediaType" if ogc_api else "mimeType" + out_ref = convert_output_params_schema(job.outputs, JobInputsOutputsSchema.OGC) if link_references else {} + references = {} + for result in job.results: + rtype = "data" if any(k in result for k in ["data", "value"]) else "href" + value = get_any_value(result) + out_key = rtype + out_id = get_any_id(result) + out_mode = out_ref.get(out_id, {}).get("transmissionMode") + as_ref = link_references and out_mode == ExecuteTransmissionMode.REFERENCE + if rtype == "href": + # fix paths relative to instance endpoint, but leave explicit links as is (eg: S3 bucket, remote HTTP, etc.) + if value.startswith("/"): + value = str(value).lstrip("/") + if "://" not in value: + value = wps_url + value + elif ogc_api: + out_key = "value" + elif value_key: + out_key = value_key + output = {out_key: value} + if rtype == "href": # required for the rest to be there, other fields optional + if "mimeType" not in result: + result["mimeType"] = get_format(value, default=ContentType.TEXT_PLAIN).mime_type + if ogc_api or not strict: + output["type"] = result["mimeType"] + if not ogc_api or not strict or as_ref: + output["format"] = {fmt_key: result["mimeType"]} + for field in ["encoding", "schema"]: + if field in result: + output["format"][field] = result[field] + elif rtype != "href": + # literal data + # FIXME: BoundingBox not implemented (https://github.com/crim-ca/weaver/issues/51) + dtype = result.get("dataType", any2wps_literal_datatype(value, is_value=True) or "string") + if ogc_api: + output["dataType"] = {"name": dtype} + else: + output["dataType"] = dtype + + if ogc_api or as_ref: + mapping = references if as_ref else outputs + if out_id in mapping: + output_list = mapping[out_id] + if not isinstance(output_list, list): + output_list = [output_list] + output_list.append(output) + mapping[out_id] = output_list + else: + mapping[out_id] = output + else: + # if ordered insert supported by python version, insert ID first + output = dict([("id", out_id)] + list(output.items())) # noqa + outputs.append(output) + + # needed to collect and aggregate outputs of same ID first in case of array + # convert any requested link references using indices if needed + headers = [] + for out_id, output in references.items(): + res_links = make_result_link(out_id, output, job.id, settings) + headers.extend([("Link", link) for link in res_links]) + + return outputs, headers + + +def get_job_results_response(job, container, headers=None): + # type: (Job, AnySettingsContainer, Optional[AnyHeadersContainer]) -> AnyResponseType + """ + Generates the :term:`OGC` compliant :term:`Job` results response according to submitted execution parameters. + + Parameters that impact the format of the response are: + - Amount of outputs to be returned. 
+ - Parameter ``response: raw|document`` + - Parameter ``transmissionMode: value|reference`` per output if ``response: raw``. + + .. seealso:: + More details available for each combination: + - https://docs.ogc.org/is/18-062r2/18-062r2.html#sc_execute_response + - https://docs.ogc.org/is/18-062r2/18-062r2.html#_response_7 + + :param job: Job for which to generate the results response. + :param container: Application settings. + :param headers: Additional headers to provide in the response. + """ + raise_job_dismissed(job, container) + raise_job_bad_status(job, container) + job_status = map_status(job.status) + if job_status in JOB_STATUS_CATEGORIES[StatusCategory.RUNNING]: + raise HTTPNotFound(json={ + "code": "ResultsNotReady", + "title": "JobResultsNotReady", + "type": "http://www.opengis.net/def/exceptions/ogcapi-processes-1/1.0/result-not-ready", + "detail": "Job is not ready to obtain results.", + "status": HTTPNotFound.code, + "cause": {"status": job.status}, + }) + + # when 'response=document', ignore 'transmissionMode=value|reference', respect it when 'response=raw' + # See: + # - https://docs.ogc.org/is/18-062r2/18-062r2.html#_response_7 (/req/core/job-results-async-document) + # - https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_process-execute-sync-document + is_raw = job.execution_response == ExecuteResponse.RAW + results, refs = get_results(job, container, value_key="value", + schema=JobInputsOutputsSchema.OGC, # not strict to provide more format details + link_references=is_raw) # type: Union[ExecutionResults, HeadersTupleType] + headers = headers or {} + if "location" not in headers: + headers["Location"] = job.status_url(container) + + if not is_raw: + # note: + # Cannot add "links" field in response body because variable Output ID keys are directly at the root + # Possible conflict with an output that would be named "links". 
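# Illustrative sketch (assumption, not part of this patch) of the two response shapes handled by this function:
#   response=document -> HTTP 200 with a JSON mapping keyed by output ID, e.g.:
#       {"out-csv": {"href": "https://weaver.example.com/wpsoutputs/<job-id>/out.csv", "type": "text/csv"},
#        "out-count": {"value": "42", "dataType": {"name": "integer"}}}
#   response=raw with transmissionMode=reference -> HTTP 204 with one 'Link' header per output, e.g.:
#       Link: <https://weaver.example.com/wpsoutputs/<job-id>/out.csv>; rel="out-csv"; type=text/csv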
+ results = sd.Result().deserialize(results) + return HTTPOk(json=results, headers=headers) + + if not results: # avoid schema validation error if all by reference + # Status code 204 for empty body + # see: + # - https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_process-execute-sync-raw-ref + refs.extend(headers.items()) + return HTTPNoContent(headers=refs) + + # raw response can be only data value, only link or a mix of them + if results: + # https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_process-execute-sync-raw-value-one + out_info = list(results.items())[0][-1] + out_type = get_any_value(out_info, key=True) + out_data = get_any_value(out_info) + + # FIXME: https://github.com/crim-ca/weaver/issues/376 + # implement multipart, both for multi-output IDs and array-output under same ID + if len(results) > 1 or (isinstance(out_data, list) and len(out_data) > 1): + # https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_process-execute-sync-raw-value-multi + raise HTTPNotImplemented(json={ + "code": "NotImplemented", + "type": "NotImplemented", + "detail": "Multipart results with 'transmissionMode=value' and 'response=raw' not implemented.", + }) + + # single value only + out_data = out_data[0] if isinstance(out_data, list) else out_data + if out_type == "href": + out_path = map_wps_output_location(out_data, container, exists=True, url=False) + out_type = out_info.get("type") # noqa + out_headers = get_file_headers(out_path, download_headers=True, content_headers=True, content_type=out_type) + resp = FileResponse(out_path) + resp.headers.update(out_headers) + resp.headers.update(headers) + else: + resp = HTTPOk(body=out_data, charset="UTF-8", content_type=ContentType.TEXT_PLAIN, headers=headers) + else: + resp = HTTPOk(headers=headers) + if refs: + # https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_process-execute-sync-raw-ref + # https://docs.ogc.org/is/18-062r2/18-062r2.html#req_core_process-execute-sync-raw-mixed-multi + resp.headerlist.extend(refs) + return resp + + +def get_job_submission_response(body, headers, error=False): + # type: (JSON, AnyHeadersContainer, bool) -> Union[HTTPOk, HTTPCreated] + """ + Generates the successful response from contents returned by :term:`Job` submission process. + + If :term:`Job` already finished processing within requested ``Prefer: wait=X`` seconds delay (and if allowed by + the :term:`Process` ``jobControlOptions``), return the successful status immediately instead of created status. + + Otherwise, return the status monitoring location of the created :term:`Job` to be monitored asynchronously. + + .. 
seealso:: + :func:`weaver.processes.execution.submit_job` + :func:`weaver.processes.execution.submit_job_handler` + """ + status = map_status(body.get("status")) + location = get_header("location", headers) + if status in JOB_STATUS_CATEGORIES[StatusCategory.FINISHED]: + if error: + http_class = HTTPBadRequest + http_desc = sd.FailedSyncJobResponse.description + else: + http_class = HTTPOk + http_desc = sd.CompletedJobResponse.description + body = sd.CompletedJobStatusSchema().deserialize(body) + + body["description"] = http_desc + return http_class(location=location, json=body, headers=headers) + + body["description"] = sd.CreatedLaunchJobResponse.description + body = sd.CreatedJobStatusSchema().deserialize(body) + return HTTPCreated(location=location, json=body, headers=headers) + + +def validate_service_process(request): + # type: (PyramidRequest) -> Tuple[Optional[str], Optional[str]] + """ + Verifies that service or process specified by path or query will raise the appropriate error if applicable. + """ + service_name = ( + request.matchdict.get("provider_id", None) or + request.params.get("provider", None) or + request.params.get("service", None) # backward compatibility + ) + process_name = ( + request.matchdict.get("process_id", None) or + request.params.get("process", None) or + request.params.get("processID", None) # OGC-API conformance + ) + item_test = None + item_type = None + + try: + service = None + if service_name: + forbid_local_only(request) + item_type = "Service" + item_test = service_name + store = get_db(request).get_store(StoreServices) + service = store.fetch_by_name(service_name, visibility=Visibility.PUBLIC) + if process_name: + item_type = "Process" + item_test = process_name + # local process + if not service: + store = get_db(request).get_store(StoreProcesses) + store.fetch_by_id(process_name, visibility=Visibility.PUBLIC) + # remote process + else: + processes = service.processes(request) + if process_name not in [p.id for p in processes]: + raise ProcessNotFound + except (ServiceNotFound, ProcessNotFound): + raise HTTPNotFound(json={ + "code": "NoSuch{}".format(item_type), + "description": "{} of id '{}' cannot be found.".format(item_type, item_test) + }) + except (ServiceNotAccessible, ProcessNotAccessible): + raise HTTPUnauthorized(json={ + "code": "Unauthorized{}".format(item_type), + "description": "{} of id '{}' is not accessible.".format(item_type, item_test) + }) + except InvalidIdentifierValue as ex: + raise HTTPBadRequest(json={ + "code": InvalidIdentifierValue.__name__, + "description": str(ex) + }) + + return service_name, process_name + + +def raise_job_bad_status(job, container=None): + # type: (Job, Optional[AnySettingsContainer]) -> None + """ + Raise the appropriate message for :term:`Job` not ready or unable to retrieve output results due to status. 
+ """ + if job.status != Status.SUCCEEDED: + links = job.links(container=container) + if job.status == Status.FAILED: + err_code = None + err_info = None + err_known_modules = [ + "pywps.exceptions", + "owslib.wps", + "weaver.exceptions", + "weaver.owsexceptions", + ] + # try to infer the cause, fallback to generic error otherwise + for error in job.exceptions: + try: + if isinstance(error, dict): + err_code = error.get("Code") + err_info = error.get("Text") + elif isinstance(error, str) and any(error.startswith(mod) for mod in err_known_modules): + err_code, err_info = error.split(":", 1) + err_code = err_code.split(".")[-1].strip() + err_info = err_info.strip() + except Exception: + err_code = None + if err_code: + break + if not err_code: # default + err_code = OWSNoApplicableCode.code + err_info = "unknown" + # /req/core/job-results-failed + raise HTTPBadRequest(json={ + "title": "JobResultsFailed", + "type": err_code, + "detail": "Job results not available because execution failed.", + "status": HTTPBadRequest.code, + "cause": err_info, + "links": links + }) + + # /req/core/job-results-exception/results-not-ready + raise HTTPNotFound(json={ + "title": "JobResultsNotReady", + "type": "http://www.opengis.net/def/exceptions/ogcapi-processes-1/1.0/result-not-ready", + "detail": "Job is not ready to obtain results.", + "status": HTTPNotFound.code, + "cause": {"status": job.status}, + "links": links + }) + + +def raise_job_dismissed(job, container=None): + # type: (Job, Optional[AnySettingsContainer]) -> None + """ + Raise the appropriate messages for dismissed :term:`Job` status. + """ + if job.status == Status.DISMISSED: + # provide the job status links since it is still available for reference + settings = get_settings(container) + job_links = job.links(settings) + job_links = [link for link in job_links if link["rel"] in ["status", "alternate", "collection", "up"]] + raise JobGone( + json={ + "title": "JobDismissed", + "type": "JobDismissed", + "status": JobGone.code, + "detail": "Job was dismissed and artifacts have been removed.", + "cause": {"status": job.status}, + "value": str(job.id), + "links": job_links + } + ) + + +def dismiss_job_task(job, container): + # type: (Job, AnySettingsContainer) -> Job + """ + Cancels any pending or running :mod:`Celery` task and removes completed job artifacts. + + .. note:: + The :term:`Job` object itself is not deleted, only its artifacts. + Therefore, its inputs, outputs, logs, exceptions, etc. are still available in the database, + but corresponding files that would be exposed by ``weaver.wps_output`` configurations are removed. + + :param job: Job to cancel or cleanup. + :param container: Application settings. + :return: Updated and dismissed job. + """ + raise_job_dismissed(job, container) + if job.status in JOB_STATUS_CATEGORIES[StatusCategory.RUNNING]: + # signal to stop celery task. Up to it to terminate remote if any. 
+ LOGGER.debug("Job [%s] dismiss operation: Canceling task [%s]", job.id, job.task_id) + celery_app.control.revoke(job.task_id, terminate=True) + + wps_out_dir = get_wps_output_dir(container) + job_out_dir = os.path.join(wps_out_dir, str(job.id)) + job_out_log = os.path.join(wps_out_dir, str(job.id) + ".log") + job_out_xml = os.path.join(wps_out_dir, str(job.id) + ".xml") + if os.path.isdir(job_out_dir): + LOGGER.debug("Job [%s] dismiss operation: Removing output results.", job.id) + shutil.rmtree(job_out_dir, onerror=lambda func, path, _exc: LOGGER.warning( + "Job [%s] dismiss operation: Failed to delete [%s] due to [%s]", job.id, job_out_dir, _exc + )) + if os.path.isfile(job_out_log): + LOGGER.debug("Job [%s] dismiss operation: Removing output logs.", job.id) + try: + os.remove(job_out_log) + except OSError as exc: + LOGGER.warning("Job [%s] dismiss operation: Failed to delete [%s] due to [%s]", job.id, job_out_log, exc) + if os.path.isfile(job_out_xml): + LOGGER.debug("Job [%s] dismiss operation: Removing output WPS status.", job.id) + try: + os.remove(job_out_xml) + except OSError as exc: + LOGGER.warning("Job [%s] dismiss operation: Failed to delete [%s] due to [%s]", job.id, job_out_xml, exc) + + LOGGER.debug("Job [%s] dismiss operation: Updating job status.") + store = get_db(container).get_store(StoreJobs) + job.status_message = "Job {}.".format(Status.DISMISSED) + job.status = map_status(Status.DISMISSED) + job = store.update_job(job) + return job diff --git a/weaver/wps_restapi/processes/processes.py b/weaver/wps_restapi/processes/processes.py index 954afcb08..0f4659c43 100644 --- a/weaver/wps_restapi/processes/processes.py +++ b/weaver/wps_restapi/processes/processes.py @@ -18,7 +18,7 @@ from weaver.formats import OutputFormat, repr_json from weaver.processes import opensearch from weaver.processes.execution import submit_job -from weaver.processes.utils import deploy_process_from_payload, get_job_submission_response, get_process +from weaver.processes.utils import deploy_process_from_payload, get_process from weaver.status import Status from weaver.store.base import StoreJobs, StoreProcesses from weaver.utils import fully_qualified_name, get_any_id @@ -274,5 +274,4 @@ def submit_local_job(request): Execution location and method is according to deployed Application Package. """ process = get_process(request=request) - body = submit_job(request, process, tags=["wps-rest"]) - return get_job_submission_response(body) + return submit_job(request, process, tags=["wps-rest"]) diff --git a/weaver/wps_restapi/providers/providers.py b/weaver/wps_restapi/providers/providers.py index a4be46d9f..ef8579a80 100644 --- a/weaver/wps_restapi/providers/providers.py +++ b/weaver/wps_restapi/providers/providers.py @@ -17,8 +17,6 @@ from weaver.exceptions import ServiceNotFound, ServiceParsingError, log_unhandled_exceptions from weaver.formats import OutputFormat from weaver.owsexceptions import OWSMissingParameterValue, OWSNotImplemented -from weaver.processes.execution import submit_job -from weaver.processes.utils import get_job_submission_response from weaver.store.base import StoreServices from weaver.utils import get_any_id, get_settings from weaver.wps.utils import get_wps_client @@ -212,8 +210,9 @@ def submit_provider_job(request): """ Execute a remote provider process. 
""" + from weaver.processes.execution import submit_job # isort:skip # noqa: E402 # pylint: disable=C0413 + store = get_db(request).get_store(StoreServices) provider_id = request.matchdict.get("provider_id") service = store.fetch_by_name(provider_id) - body = submit_job(request, service, tags=["wps-rest"]) - return get_job_submission_response(body) + return submit_job(request, service, tags=["wps-rest"]) diff --git a/weaver/wps_restapi/quotation/quotes.py b/weaver/wps_restapi/quotation/quotes.py index 1dd630216..0811bf531 100644 --- a/weaver/wps_restapi/quotation/quotes.py +++ b/weaver/wps_restapi/quotation/quotes.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING import colander +from celery.exceptions import TimeoutError as CeleryTaskTimeoutError from pyramid.httpexceptions import HTTPAccepted, HTTPBadRequest, HTTPCreated, HTTPNotFound, HTTPOk from weaver.config import WeaverFeature, get_weaver_configuration @@ -15,13 +16,15 @@ from weaver.quotation.estimation import process_quote_estimator from weaver.sort import Sort from weaver.store.base import StoreBills, StoreProcesses, StoreQuotes -from weaver.utils import get_settings, parse_prefer_header_execute_mode +from weaver.utils import as_int, get_header, get_settings, parse_prefer_header_execute_mode from weaver.wps_restapi import swagger_definitions as sd from weaver.wps_restapi.processes.processes import submit_local_job if TYPE_CHECKING: from weaver.datatype import Process + from weaver.typedefs import AnyResponseType, PyramidRequest + LOGGER = logging.getLogger(__name__) @@ -29,6 +32,7 @@ schema=sd.PostProcessQuoteRequestEndpoint(), response_schemas=sd.post_quotes_responses) @log_unhandled_exceptions(logger=LOGGER, message=sd.InternalServerErrorResponseSchema.description) def request_quote(request): + # type: (PyramidRequest) -> AnyResponseType """ Request a quotation for a process. """ @@ -81,13 +85,17 @@ def request_quote(request): } quote = Quote(**quote_info) quote = quote_store.save_quote(quote) - mode, wait, applied = parse_prefer_header_execute_mode(request.headers, process.jobControlOptions) + max_wait = as_int(settings.get("weaver.quote_sync_max_wait"), default=20) + mode, wait, applied = parse_prefer_header_execute_mode(request.headers, process.jobControlOptions, max_wait) result = process_quote_estimator.delay(quote.id) LOGGER.debug("Celery pending task [%s] for quote [%s].", result.id, quote.id) if mode == ExecuteMode.SYNC and wait: LOGGER.debug("Celery task requested as sync if it completes before (wait=%ss)", wait) - result.wait(timeout=wait) + try: + result.wait(timeout=wait) + except CeleryTaskTimeoutError: + pass if result.ready(): quote = quote_store.fetch_by_id(quote.id) data = quote.json() @@ -95,6 +103,13 @@ def request_quote(request): data.update({"links": quote.links(settings)}) data = sd.CreatedQuoteResponse().deserialize(data) return HTTPCreated(json=data) + else: + LOGGER.debug("Celery task requested as sync took too long to complete (wait=%ss). 
Continue in async.", wait) + # sync not respected, therefore must drop it + # since both could be provided as alternative preferences, drop only async with limited subset + prefer = get_header("Preference-Applied", applied, pop=True) + _, _, async_applied = parse_prefer_header_execute_mode({"Prefer": prefer}, [ExecuteMode.ASYNC]) + applied = async_applied data = quote.partial() data.update({"description": sd.AcceptedQuoteResponse.description}) @@ -109,6 +124,7 @@ def request_quote(request): schema=sd.QuotesEndpoint(), response_schemas=sd.get_quote_list_responses) @log_unhandled_exceptions(logger=LOGGER, message=sd.InternalServerErrorResponseSchema.description) def get_quote_list(request): + # type: (PyramidRequest) -> AnyResponseType """ Get list of quotes IDs. """ @@ -137,6 +153,7 @@ def get_quote_list(request): schema=sd.QuoteEndpoint(), response_schemas=sd.get_quote_responses) @log_unhandled_exceptions(logger=LOGGER, message=sd.InternalServerErrorResponseSchema.description) def get_quote_info(request): + # type: (PyramidRequest) -> AnyResponseType """ Get quote information. """ @@ -155,6 +172,7 @@ def get_quote_info(request): schema=sd.PostQuote(), response_schemas=sd.post_quote_responses) @log_unhandled_exceptions(logger=LOGGER, message=sd.InternalServerErrorResponseSchema.description) def execute_quote(request): + # type: (PyramidRequest) -> AnyResponseType """ Execute a quoted process. """ diff --git a/weaver/wps_restapi/swagger_definitions.py b/weaver/wps_restapi/swagger_definitions.py index 19f3de6a5..4152f7372 100644 --- a/weaver/wps_restapi/swagger_definitions.py +++ b/weaver/wps_restapi/swagger_definitions.py @@ -50,6 +50,7 @@ from weaver.wps_restapi.colander_extras import ( AllOfKeywordSchema, AnyOfKeywordSchema, + EmptyMappingSchema, ExtendedBoolean as Boolean, ExtendedFloat as Float, ExtendedInteger as Integer, @@ -66,7 +67,7 @@ StringRange, XMLObject ) -from weaver.wps_restapi.constants import JobOutputsSchema +from weaver.wps_restapi.constants import JobInputsOutputsSchema from weaver.wps_restapi.patches import ServiceOnlyExplicitGetHead as Service # warning: don't use 'cornice.Service' if TYPE_CHECKING: @@ -563,6 +564,11 @@ class LinkLanguage(ExtendedMappingSchema): hreflang = Language(missing=drop, description="Language of the content located at the link.") +class LinkHeader(ExtendedSchemaNode): + schema_type = String + example = "; rel=\"relation\"; type=text/plain" + + class MetadataBase(ExtendedMappingSchema): title = ExtendedSchemaNode(String(), missing=drop) @@ -1411,8 +1417,8 @@ class JobResponseOptionsEnum(ExtendedSchemaNode): class TransmissionModeEnum(ExtendedSchemaNode): schema_type = String title = "TransmissionMode" - default = ExecuteTransmissionMode.REFERENCE - example = ExecuteTransmissionMode.REFERENCE + default = ExecuteTransmissionMode.VALUE + example = ExecuteTransmissionMode.VALUE validator = OneOf(ExecuteTransmissionMode.values()) @@ -2324,21 +2330,45 @@ class ProviderInputsEndpoint(ProviderPath, ProcessPath, JobPath): header = RequestHeaders() +class JobInputsOutputsQuery(ExtendedMappingSchema): + schema = ExtendedSchemaNode( + String(), + title="JobInputsOutputsQuerySchema", + example=JobInputsOutputsSchema.OGC, + default=JobInputsOutputsSchema.OLD, + validator=OneOfCaseInsensitive(JobInputsOutputsSchema.values()), + summary="Selects the schema employed for representation of submitted job inputs and outputs.", + description=( + "Selects the schema employed for representing job inputs and outputs that were submitted for execution. 
" + f"When '{JobInputsOutputsSchema.OLD}' is employed, listing of object with IDs is returned. " + f"When '{JobInputsOutputsSchema.OGC}' is employed, mapping of object definitions is returned. " + "If no schema is requested, the original formats from submission are employed, which could be a mix of " + "both representations. Providing a schema forces their corresponding conversion as applicable." + ) + ) + + class JobInputsEndpoint(JobPath): header = RequestHeaders() + querystring = JobInputsOutputsQuery() class JobOutputQuery(ExtendedMappingSchema): schema = ExtendedSchemaNode( - String(), example=JobOutputsSchema.OGC, default=JobOutputsSchema.OLD, - validator=OneOfCaseInsensitive(JobOutputsSchema.values()), + String(), + title="JobOutputResultsSchema", + example=JobInputsOutputsSchema.OGC, + default=JobInputsOutputsSchema.OLD, + validator=OneOfCaseInsensitive(JobInputsOutputsSchema.values()), summary="Selects the schema employed for representation of job outputs.", description=( "Selects the schema employed for representation of job outputs for providing file Content-Type details. " - f"When '{JobOutputsSchema.OLD}' is employed, 'format.mimeType' is used and 'type' is reported as well. " - f"When '{JobOutputsSchema.OGC}' is employed, 'format.mediaType' is used and 'type' is reported as well. " + f"When '{JobInputsOutputsSchema.OLD}' is employed, " + "'format.mimeType' is used and 'type' is reported as well. " + f"When '{JobInputsOutputsSchema.OGC}' is employed, " + "'format.mediaType' is used and 'type' is reported as well. " "When the '+strict' value is added, only the 'format' or 'type' will be represented according to the " - f"reference standard ({JobOutputsSchema.OGC}, {JobOutputsSchema.OLD}) representation." + f"reference standard ({JobInputsOutputsSchema.OGC}, {JobInputsOutputsSchema.OLD}) representation." 
) ) @@ -2453,9 +2483,16 @@ class ExecuteOutputSpecList(ExtendedSequenceSchema): output = ExecuteOutputItem() -class ExecuteOutputSpecMap(ExtendedMappingSchema): - input_id = ExecuteOutputDefinition(variable="{input-id}", title="ExecuteOutputSpecMap", - description="Desired output reporting method.") +class ExecuteOutputMapAdditionalProperties(ExtendedMappingSchema): + output_id = ExecuteOutputDefinition(variable="{output-id}", title="ExecuteOutputSpecMap", + description="Desired output reporting method.") + + +class ExecuteOutputSpecMap(AnyOfKeywordSchema): + _any_of = [ + ExecuteOutputMapAdditionalProperties(), # normal {"": {...}} + EmptyMappingSchema(), # allows explicitly provided {} + ] class ExecuteOutputSpec(OneOfKeywordSchema): @@ -2509,8 +2546,8 @@ class ExceptionReportType(ExtendedMappingSchema): class ProcessControl(ExtendedMappingSchema): jobControlOptions = JobControlOptionsList(missing=[ExecuteControlOption.ASYNC], default=[ExecuteControlOption.ASYNC]) - outputTransmission = TransmissionModeList(missing=[ExecuteTransmissionMode.REFERENCE], - default=[ExecuteTransmissionMode.REFERENCE]) + outputTransmission = TransmissionModeList(missing=[ExecuteTransmissionMode.VALUE], + default=[ExecuteTransmissionMode.VALUE]) class ProcessLocations(ExtendedMappingSchema): @@ -2966,12 +3003,19 @@ class ExecuteInputData(OneOfKeywordSchema): # items: # $ref: "inlineOrRefData.yaml" # -class ExecuteInputMapValues(ExtendedMappingSchema): +class ExecuteInputMapAdditionalProperties(ExtendedMappingSchema): schema_ref = f"{OGC_API_SCHEMA_URL}/{OGC_API_SCHEMA_VERSION}/core/openapi/schemas/execute.yaml" input_id = ExecuteInputData(variable="{input-id}", title="ExecuteInputValue", description="Received mapping input value definition during job submission.") +class ExecuteInputMapValues(AnyOfKeywordSchema): + _any_of = [ + ExecuteInputMapAdditionalProperties(), # normal {"": {...}} + EmptyMappingSchema(), # allows explicitly provided {} + ] + + class ExecuteInputValues(OneOfKeywordSchema): _one_of = [ # OLD format: {"inputs": [{"id": "", "value": }, ...]} @@ -2997,26 +3041,48 @@ class ExecuteInputOutputs(ExtendedMappingSchema): # - 'tests.wps_restapi.test_providers.WpsRestApiProcessesTest.test_execute_process_no_error_not_required_params' # - 'tests.wps_restapi.test_providers.WpsRestApiProcessesTest.test_get_provider_process_no_inputs' # - 'tests.wps_restapi.test_colander_extras.test_oneof_variable_dict_or_list' + # + # OGC 'execute.yaml' also does not enforce any required item. + schema_ref = f"{OGC_API_SCHEMA_URL}/{OGC_API_SCHEMA_VERSION}/core/openapi/schemas/execute.yaml" inputs = ExecuteInputValues(default={}, description="Values submitted for execution.") outputs = ExecuteOutputSpec( - # FIXME: add documentation reference link OGC/Weaver for further details. - description="Defines which outputs to be obtained from the execution (filtered or all), " - "as well as the reporting method for each output according to 'transmissionMode', " - "the 'response' type, and the execution 'mode' provided.", - # FIXME: allow omitting 'outputs' (https://github.com/crim-ca/weaver/issues/375) - # maybe this is good enough, but should have a proper test for it - # default={} + description=( + "Defines which outputs to be obtained from the execution (filtered or all), " + "as well as the reporting method for each output according to 'transmissionMode', " + "the 'response' type, and the execution 'mode' provided " + "(see for more details: https://pavics-weaver.readthedocs.io/en/latest/processes.html#execution-body)." 
+ ), + default={} ) class Execute(ExecuteInputOutputs): - mode = JobExecuteModeEnum() + mode = JobExecuteModeEnum( + missing=drop, + default=ExecuteMode.AUTO, + deprecated=True, + description=( + "Desired execution mode specified directly. This is intended for backward compatibility support. " + "To obtain more control over execution mode selection, employ the official Prefer header instead " + "(see for more details: https://pavics-weaver.readthedocs.io/en/latest/processes.html#execution-mode)." + ), + validator=OneOf(ExecuteMode.values()) + ) + response = JobResponseOptionsEnum( + missing=drop, + default=ExecuteResponse.DOCUMENT, + description=( + "Indicates the desired representation format of the response. " + "(see for more details: https://pavics-weaver.readthedocs.io/en/latest/processes.html#execution-body)." + ), + validator=OneOf(ExecuteResponse.values()) + ) notification_email = ExtendedSchemaNode( String(), missing=drop, validator=Email(), - description="Optionally send a notification email when the job is done.") - response = JobResponseOptionsEnum() + description="Optionally send a notification email when the job is done." + ) class QuoteStatusSchema(ExtendedSchemaNode): @@ -3665,8 +3731,7 @@ class Result(ExtendedMappingSchema): ) -class JobInputsBody(ExtendedMappingSchema): - inputs = ExecuteInputValues() +class JobInputsBody(ExecuteInputOutputs): links = LinkList(missing=drop) @@ -4289,8 +4354,20 @@ class NotImplementedPostProviderResponse(ExtendedMappingSchema): description = "Provider registration not supported using specified definition." +class PreferenceAppliedHeader(ExtendedSchemaNode): + description = "Applied preferences from submitted 'Prefer' header after validation." + name = "Preference-Applied" + schema_type = String + example = "wait=10s, respond-async" + + +class LocationHeader(URL): + name = "Location" + + class CreatedJobLocationHeader(ResponseHeaders): - Location = URL(description="Status monitoring location of the job execution.") + location = LocationHeader(description="Status monitoring location of the job execution.") + prefer_applied = PreferenceAppliedHeader(missing=drop) class CreatedLaunchJobResponse(ExtendedMappingSchema): @@ -4299,6 +4376,25 @@ class CreatedLaunchJobResponse(ExtendedMappingSchema): body = CreatedJobStatusSchema() +class CompletedJobLocationHeader(ResponseHeaders): + location = LocationHeader(description="Status location of the completed job execution.") + prefer_applied = PreferenceAppliedHeader(missing=drop) + + +class CompletedJobStatusSchema(DescriptionSchema, JobStatusInfo): + pass + + +class CompletedJobResponse(ExtendedMappingSchema): + description = "Job submitted and completed execution synchronously." + header = CompletedJobLocationHeader() + body = CompletedJobStatusSchema() + + +class FailedSyncJobResponse(CompletedJobResponse): + description = "Job submitted and failed synchronous execution. See server logs for more details." + + class OkDeleteProcessJobResponse(ExtendedMappingSchema): header = ResponseHeaders() body = DismissedJobSchema() @@ -4377,6 +4473,21 @@ class OkGetJobResultsResponse(ExtendedMappingSchema): body = Result() +class NoContentJobResultsHeaders(NoContent): + content_length = ContentLengthHeader(example="0") + link = LinkHeader(description=( + "Link to a result requested by reference output transmission. " + "Link relation indicates the result ID. " + "Additional parameters indicate expected content-type of the resource. 
" + "Literal data requested by reference are returned with contents dumped to plain text file." + )) + + +class NoContentJobResultsResponse(ExtendedMappingSchema): + header = NoContentJobResultsHeaders() + body = NoContent(default="") + + class CreatedQuoteExecuteResponse(ExtendedMappingSchema): header = ResponseHeaders() body = CreatedQuotedJobStatusSchema() @@ -4695,12 +4806,18 @@ class GoneVaultFileDownloadResponse(ExtendedMappingSchema): "501": NotImplementedPostProviderResponse(), } post_provider_process_job_responses = { + "200": CompletedJobResponse(description="success"), "201": CreatedLaunchJobResponse(description="success"), + "204": NoContentJobResultsResponse(description="success"), + "400": FailedSyncJobResponse(), "403": ForbiddenProviderAccessResponseSchema(), "500": InternalServerErrorResponseSchema(), } post_process_jobs_responses = { + "200": CompletedJobResponse(description="success"), "201": CreatedLaunchJobResponse(description="success"), + "204": NoContentJobResultsResponse(description="success"), + "400": FailedSyncJobResponse(), "403": ForbiddenProviderAccessResponseSchema(), "500": InternalServerErrorResponseSchema(), } @@ -4794,6 +4911,7 @@ class GoneVaultFileDownloadResponse(ExtendedMappingSchema): "value": EXAMPLES["job_results.json"], } }), + "204": NoContentJobResultsResponse(description="success"), "400": InvalidJobResponseSchema(), "404": NotFoundJobResponseSchema(), "410": GoneJobResponseSchema(),