diff --git a/CHANGELOG.md b/CHANGELOG.md index 8419877..9578e8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- CASMCMS-8979 - add a status endpoint for the remote build nodes. ## [3.16.2] - 2024-07-25 ### Dependencies diff --git a/api/openapi.yaml b/api/openapi.yaml index cf2e6ab..b7f048e 100644 --- a/api/openapi.yaml +++ b/api/openapi.yaml @@ -629,6 +629,46 @@ paths: $ref: '#/components/responses/NotFound' '500': $ref: '#/components/responses/InternalServerError' + /v3/remote-build-nodes/status/{remote_build_node_xname}: + parameters: + - $ref: '#/components/parameters/remote_build_node_xname' + get: + summary: Retrieve the status of a remote build node + operationId: get_v3_remote_build_status + tags: + - remote build node status + - v3 + description: Retrieve the status of the remote build node with the given xname. + responses: + '200': + description: The status of the remote build node + content: + application/json: + schema: + $ref: '#/components/schemas/RemoteBuildNodeStatus' + '404': + $ref: '#/components/responses/NotFound' + '500': + $ref: '#/components/responses/InternalServerError' + /v3/remote-build-nodes/status: + get: + summary: List remote build node status objects + operationId: get_all_v3_remote_build_status + tags: + - remote build node status + - v3 + description: Retrieve the status of all remote build nodes that are registered with IMS. 
+ responses: + '200': + description: A collection of the status of each remote build node + content: + application/json: + schema: + items: + $ref: '#/components/schemas/RemoteBuildNodeStatus' + type: array + '500': + $ref: '#/components/responses/InternalServerError' /v3/jobs: get: summary: Retrieve a list of JobRecords that are registered with IMS @@ -2072,6 +2112,41 @@ components: example: x3000c1s10b1n0 type: string minLength: 1 + RemoteBuildNodeStatus: + description: A Remote Build Node Status + type: object + required: + - xname + properties: + xname: + description: Xname of the remote build node + example: x3000c1s10b1n0 + type: string + minLength: 1 + nodeArch: + description: Architecture of the remote build node + example: x86_64 + type: string + minLength: 1 + numCurrentJobs: + description: Number of current jobs running on the remote build node + example: 15 + type: integer + minimum: 0 + podmanStatus: + description: Status of the podman executable on the remote build node + example: Podman present at /usr/bin/podman + type: string + minLength: 1 + sshStatus: + description: Status of the ssh connection to the remote build node + example: SSH connection established + type: string + minLength: 1 + ableToRunJobs: + description: If the node is able to run new jobs + example: true + type: boolean ArtifactLinkRecord: description: An Artifact Link Record type: object diff --git a/src/server/models/jobs.py b/src/server/models/jobs.py index 8a0642f..076226e 100644 --- a/src/server/models/jobs.py +++ b/src/server/models/jobs.py @@ -34,6 +34,7 @@ from marshmallow.validate import Length, OneOf, Range from src.server.helper import ARCH_ARM64, ARCH_X86_64 +from src.server.models.remote_build_nodes import RemoteNodeStatus JOB_TYPE_CREATE = 'create' JOB_TYPE_CUSTOMIZE = 'customize' @@ -259,13 +260,14 @@ def find_remote_node_for_job(app, job: V2JobRecordSchema) -> str: """ app.logger.info(f"Checking for remote build node for job") best_node = "" - 
best_node_job_count = 10000 + best_node_job_count = RemoteNodeStatus.UNKNOWN_NUM_JOBS - 1 + for xname, remote_node in app.data['remote_build_nodes'].items(): - arch, numJobs = remote_node.getStatus() - if arch != None and arch == job.arch: - app.logger.info(f"Matching remote node: {xname}, current jobs on node: {numJobs}") - # matching arch - can use the node, now pick the best - if best_node == "" or numJobs < best_node_job_count: + nodeStatus = remote_node.getStatus() + if nodeStatus.ableToRunJobs and nodeStatus.nodeArch == job.arch: + app.logger.info(f"Matching remote node: {xname}, current jobs on node: {nodeStatus.numCurrentJobs}") + # matching arch - can use the node, now pick the node with the least jobs running + if best_node == "" or nodeStatus.numCurrentJobs < best_node_job_count: best_node = remote_node.xname - best_node_job_count = numJobs + best_node_job_count = nodeStatus.numCurrentJobs return best_node diff --git a/src/server/models/remote_build_nodes.py b/src/server/models/remote_build_nodes.py index 7130250..9ee3998 100644 --- a/src/server/models/remote_build_nodes.py +++ b/src/server/models/remote_build_nodes.py @@ -26,6 +26,7 @@ """ import socket +import json from flask import current_app as app from marshmallow import Schema, fields, post_load, RAISE @@ -38,6 +39,24 @@ from src.server.helper import ARCH_ARM64, ARCH_X86_64 +class RemoteNodeStatus: + """ Object to hold the current status of a remote build node """ + + # sentinel job count used while the number of jobs on a node is unknown + UNKNOWN_NUM_JOBS = 10000 + + def __init__(self, xname: str) -> None: + self.xname = xname + self.sshStatus = "Unknown" + self.podmanStatus = "Unknown" + self.nodeArch = "Unknown" + self.numCurrentJobs = self.UNKNOWN_NUM_JOBS + self.ableToRunJobs = False + + def toJson(self): + """ Return the status attributes as a dict (not a JSON string) """ + return self.__dict__ + class V3RemoteBuildNodeRecord: """ The RemoteBuildNodeRecord object """ @@ -49,21 +68,19 @@ def __init__(self, 
xname): def __repr__(self): return ''.format(self=self) - def getStatus(self) -> (str, int): #(arch, current jobs) + def getStatus(self) -> RemoteNodeStatus: """ Utility function to verify that a node is set up and available for remote builds. If the node can not be contacted or is not set up for running IMS jobs, this will return (None,None) Returns: - Archetecture of the node if it can be determined - Number of jobs currently running on the node - + RemoteNodeStatus object with details about the current state of the + remote build node. """ # start with status Invalid - arch = None - numJobs = None + status = RemoteNodeStatus(self.xname) # connect to the remote node connect_kwargs = {"key_filename": "/app/id_ecdsa"} @@ -75,7 +92,9 @@ def getStatus(self) -> (str, int): #(arch, current jobs) except (BadHostKeyException, AuthenticationException, NoValidConnectionsError, SSHException, socket.error) as error: app.logger.error(f"Unable to connect to node: {self.xname}, Error: {error}") - return arch, numJobs + status.sshStatus = f"Unable to connect to node. Error: {error}" + return status + status.sshStatus = "SSH connection established." # make sure the above connection gets closed on exit try: @@ -86,20 +105,23 @@ def getStatus(self) -> (str, int): #(arch, current jobs) # check result if result.exited != 0: - app.logger.error(f"Unable to determine archecture of node: {self.xname}, Error: {result.stdout} {result.stderr}") - return arch, numJobs + app.logger.error(f"Unable to determine architecture of node: {self.xname}, Error: {result.stdout} {result.stderr}") + status.nodeArch = f"Unable to determine architecture of node. 
Error: {result.stdout} {result.stderr}" + return status # see if we can pull out a known arch type if "aarch64" in result.stdout: - arch = ARCH_ARM64 + status.nodeArch = ARCH_ARM64 elif "x86" in result.stdout: - arch = ARCH_X86_64 + status.nodeArch = ARCH_X86_64 else: - app.logger.error(f"Unable to determine archecture of node: {self.xname}, Error: {result.stdout}") - return arch, numJobs + app.logger.error(f"Undefined architecture type for node: {self.xname}, Error: {result.stdout}") + status.nodeArch = f"Undefined architecture type for node, result: {result.stdout}" + return status except (UnexpectedExit, Failure) as error: - app.logger.error(f"Unable to determine archecture of node: {self.xname}, Error: {error}") - return arch, numJobs + app.logger.error(f"Unable to determine architecture of node: {self.xname}, Error: {error}") + status.nodeArch = f"Unable to determine architecture of node. Error: {error}" + return status # insure it has podman installed try: @@ -109,16 +131,26 @@ def getStatus(self) -> (str, int): #(arch, current jobs) # check result if result.exited != 0: app.logger.error(f"Unable to determine if podman is installed on node: {self.xname}, Error: {result.stdout} {result.stderr}") - return None,None + status.podmanStatus = f"Unable to determine if podman is installed on node. Error: {result.stdout} {result.stderr}" + return status # see if we can pull out a known arch type if "/usr/bin/podman" not in result.stdout: app.logger.error(f"Podman not installed on node: {self.xname}, Error: {result.stdout}") - return + status.podmanStatus = f"Podman not installed on node." + return status + + # report podman is present + status.podmanStatus = f"Podman present at /usr/bin/podman" except (UnexpectedExit, Failure) as error: app.logger.error(f"Unable determine if tools are installed on node: {self.xname}, Error: {error}") - return None,None + status.podmanStatus = f"Unable determine if tools are installed on node. 
Error: {error}" + return status + # Don't fail the remote node over gathering number of current jobs - mark + # the node as valid now. + status.ableToRunJobs = True + # Every running IMS job will create a working directory '/tmp/ims_(IMS_JOB_ID)'. # Count the number of these directories to find the number of running jobs on # the node - they are cleaned up when the job is complete on the node. @@ -128,19 +160,16 @@ def getStatus(self) -> (str, int): #(arch, current jobs) if result.exited != 0: # let this go through and schedule a job on the node app.logger.error(f"Unable to determine number of jobs on node: {self.xname}, Error: {result.stdout} {result.stderr}") - numJobs = 0 else: - numJobs = int(result.stdout) + status.numCurrentJobs = int(result.stdout) except (UnexpectedExit, Failure) as error: # Just log this, but allow the job to run app.logger.error(f"Unable determine number of running jobs on node: {self.xname}, Error: {error}") - numJobs = 0 finally: # close tha active connection c.close() - return arch, numJobs - + return status class V3RemoteBuildNodeRecordInputSchema(Schema): """ A schema specifically for defining and validating user input """ diff --git a/src/server/v3/__init__.py b/src/server/v3/__init__.py index 39792d8..1ec3e5c 100644 --- a/src/server/v3/__init__.py +++ b/src/server/v3/__init__.py @@ -1,7 +1,7 @@ # # MIT License # -# (C) Copyright 2020-2022 Hewlett Packard Enterprise Development LP +# (C) Copyright 2020-2022, 2024 Hewlett Packard Enterprise Development LP # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), @@ -41,7 +41,7 @@ V3RecipeResource, V3RecipeCollection, \ V3DeletedRecipeResource, V3DeletedRecipeCollection from src.server.v3.resources.remote_build_nodes import V3RemoteBuildNodeResource, \ - V3RemoteBuildNodeCollection + V3RemoteBuildNodeCollection, V3RemoteBuildStatus, V3RemoteBuildStatusCollection app_errors = { # Custom 405 
error format to conform to RFC 7807 'MethodNotAllowed': json.loads( @@ -60,6 +60,12 @@ apiv3.add_resource(V3RemoteBuildNodeCollection, '/'.join([uri_prefix, 'remote-build-nodes']), endpoint='_'.join([endpoint_prefix, 'remote_build_nodes_collection'])) + apiv3.add_resource(V3RemoteBuildStatus, + '/'.join([uri_prefix, 'remote-build-nodes/status/<remote_build_node_xname>']), + endpoint='_'.join([endpoint_prefix, 'remote_build_status'])) + apiv3.add_resource(V3RemoteBuildStatusCollection, + '/'.join([uri_prefix, 'remote-build-nodes/status']), + endpoint='_'.join([endpoint_prefix, 'remote_build_status_collection'])) apiv3.add_resource(V3PublicKeyResource, '/'.join([uri_prefix, 'public-keys/']), diff --git a/src/server/v3/resources/remote_build_nodes.py b/src/server/v3/resources/remote_build_nodes.py index 0ff0a3d..6c122be 100644 --- a/src/server/v3/resources/remote_build_nodes.py +++ b/src/server/v3/resources/remote_build_nodes.py @@ -32,12 +32,48 @@ from src.server.errors import problemify, generate_missing_input_response, generate_data_validation_failure, \ generate_resource_not_found_response from src.server.helper import get_log_id -from src.server.models.remote_build_nodes import V3RemoteBuildNodeRecordInputSchema, V3RemoteBuildNodeRecordSchema, V3RemoteBuildNodeRecord +from src.server.models.remote_build_nodes import V3RemoteBuildNodeRecordInputSchema, V3RemoteBuildNodeRecordSchema, V3RemoteBuildNodeRecord, RemoteNodeStatus from src.server.v3.models import PATCH_OPERATION_UNDELETE remote_build_node_user_input_schema = V3RemoteBuildNodeRecordInputSchema() remote_build_node_schema = V3RemoteBuildNodeRecordSchema() +class V3RemoteBuildStatus(Resource): + """ + Class for querying the current status of a single remote build node + """ + + def get(self, remote_build_node_xname): + """ Retrieve the status of a remote build node. 
""" + log_id = get_log_id() + current_app.logger.info("%s ++ remote_build_status.v3.GET %s", log_id, remote_build_node_xname) + + if remote_build_node_xname not in current_app.data['remote_build_nodes']: + current_app.logger.info("%s no IMS remote build node matches xname=%s", log_id, remote_build_node_xname) + return generate_resource_not_found_response() + + # toJson() turns the status object returned by getStatus() into a plain dict for jsonify + return_json = current_app.data['remote_build_nodes'][remote_build_node_xname].getStatus().toJson() + current_app.logger.info("%s Returning json response: %s", log_id, return_json) + return jsonify(return_json) + +class V3RemoteBuildStatusCollection(Resource): + """ + Class for querying the current status of all the remote build nodes + """ + + def get(self): + """ Retrieve the status of every registered remote build node. """ + log_id = get_log_id() + current_app.logger.info("%s ++ remote_build_status_collection.v3.GET", log_id) + + return_json = [] + for remote_node in current_app.data['remote_build_nodes'].values(): + return_json.append(remote_node.getStatus().toJson()) + + current_app.logger.info("%s Returning json response: %s", log_id, return_json) + return jsonify(return_json) + class V3RemoteBuildNodeCollection(Resource): """ Class representing the operations that can be taken on a collection of remote builds nodes @@ -113,7 +149,7 @@ def get(self, remote_build_node_xname): current_app.logger.info("%s ++ remote_build_nodes.v3.GET %s", log_id, remote_build_node_xname) if remote_build_node_xname not in current_app.data['remote_build_nodes']: - current_app.logger.info("%s no IMS remote bild node matches xname=%s", log_id, remote_build_node_xname) + current_app.logger.info("%s no IMS remote build node matches xname=%s", log_id, remote_build_node_xname) return generate_resource_not_found_response() return_json = remote_build_node_schema.dump(current_app.data['remote_build_nodes'][remote_build_node_xname]) diff --git 
a/tests/v3/test_v3_remote_build_nodes.py b/tests/v3/test_v3_remote_build_nodes.py index 55e1cba..be14a70 100644 --- a/tests/v3/test_v3_remote_build_nodes.py +++ b/tests/v3/test_v3_remote_build_nodes.py @@ -1,7 +1,7 @@ # # MIT License # -# (C) Copyright 2023 Hewlett Packard Enterprise Development LP +# (C) Copyright 2023-2024 Hewlett Packard Enterprise Development LP # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), @@ -143,5 +143,53 @@ def test_post_422_name_is_blank(self): check_error_responses(self, response, 422, ['status', 'title', 'detail', 'errors']) self.assertIn("xname", response.json["errors"], "Expected xname to be listed in error detail") +class TestV3RemoteBuildStatusEndpoint(TestCase): + """ + Test the remote-build-nodes/status/{remote_build_node_xname} endpoint (ims.v3.resources.remote_build_nodes.V3RemoteBuildStatus) + """ + + def setUp(self): + super(TestV3RemoteBuildStatusEndpoint, self).setUp() + self.app = self.useFixture(V3FlaskTestClientFixture()).client + self.data = { + 'xname': self.getUniqueString() + } + self.useFixture(V3RemoteBuildNodesDataFixture(initial_data=self.data)) + self.test_uri = '/v3/remote-build-nodes/status/{}'.format(self.data['xname']) + + def test_get(self): + """ Test the remote-build-nodes/status/{remote_build_node_xname} resource retrieval """ + response = self.app.get(self.test_uri) + self.assertEqual(response.status_code, 200, 'status code was not 200') + response_data = json.loads(response.data) + self.assertEqual(response_data['xname'], self.data['xname']) + + def test_get_404_bad_id(self): + """ Test the remote-build-nodes/status/{remote_build_node_xname} resource retrieval with an unknown id """ + response = self.app.get('/v3/remote-build-nodes/status/{}'.format(str(uuid.uuid4()))) + check_error_responses(self, response, 404, ['status', 'title', 'detail']) + +class 
TestV3RemoteBuildStatusCollectionEndpoint(TestCase): + """ + Test the remote-build-nodes/status collection endpoint (ims.v3.resources.remote_build_nodes.V3RemoteBuildStatusCollection) + """ + + def setUp(self): + super(TestV3RemoteBuildStatusCollectionEndpoint, self).setUp() + self.test_uri = '/v3/remote-build-nodes/status' + self.app = self.useFixture(V3FlaskTestClientFixture()).client + self.data = { + 'xname': self.getUniqueString() + } + self.test_remote_build_nodes = self.useFixture(V3RemoteBuildNodesDataFixture(initial_data=self.data)).datastore + + def test_get(self): + """ Test happy path GET """ + response = self.app.get(self.test_uri) + self.assertEqual(response.status_code, 200, 'status code was not 200') + self.assertThat(json.loads(response.data), HasLength(1), 'collection did not have an entry') + response_data = json.loads(response.data)[0] + self.assertEqual(response_data['xname'], self.data['xname']) + if __name__ == '__main__': unittest.main()