Skip to content

Commit

Permalink
CASMCMS-8979 - add remote build node status endpoint.
Browse files Browse the repository at this point in the history
  • Loading branch information
dlaine-hpe committed Aug 21, 2024
1 parent cbe7a3b commit d0f0bca
Show file tree
Hide file tree
Showing 7 changed files with 234 additions and 35 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Added
- CASMCMS-8979 - add a status endpoint for the remote build nodes.

## [3.16.2] - 2024-07-25
### Dependencies
Expand Down
76 changes: 76 additions & 0 deletions api/openapi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,46 @@ paths:
$ref: '#/components/responses/NotFound'
'500':
$ref: '#/components/responses/InternalServerError'
# Status of a single remote build node, addressed by its xname.
# NOTE: operationId values must be unique across the whole document, so this
# operation must not reuse the id of the collection endpoint below.
/v3/remote-build-nodes/status/{remote_build_node_xname}:
  parameters:
    - $ref: '#/components/parameters/remote_build_node_xname'
  get:
    summary: Retrieve the status of a remote build node
    operationId: get_v3_remote_build_status
    tags:
      - remote build node status
      - v3
    description: Retrieve the status of a single remote build node that is registered with IMS.
    responses:
      '200':
        description: The status of the requested remote build node
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/RemoteBuildNodeStatus'
      # returned when no registered remote build node matches the xname
      '404':
        $ref: '#/components/responses/NotFound'
      '500':
        $ref: '#/components/responses/InternalServerError'
# Collection endpoint: returns the status of every remote build node that is
# registered with IMS as a JSON array of RemoteBuildNodeStatus objects.
/v3/remote-build-nodes/status:
get:
summary: List remote build node status objects
operationId: get_all_v3_remote_build_status
tags:
- remote build node status
- v3
description: Retrieve the status of all remote build nodes that are registered with IMS.
responses:
'200':
description: A collection of the status of each remote build node
content:
application/json:
schema:
items:
$ref: '#/components/schemas/RemoteBuildNodeStatus'
type: array
'500':
$ref: '#/components/responses/InternalServerError'
/v3/jobs:
get:
summary: Retrieve a list of JobRecords that are registered with IMS
Expand Down Expand Up @@ -2072,6 +2112,42 @@ components:
example: x3000c1s10b1n0
type: string
minLength: 1
# Status report for one remote build node.
# `minLength` is a string-only keyword, so it is used only on string properties.
RemoteBuildNodeStatus:
  description: A Remote Build Node Status
  type: object
  required:
    - xname
  properties:
    xname:
      description: Xname of the remote build node
      example: x3000c1s10b1n0
      type: string
      minLength: 1
    nodeArch:
      description: Architecture of the remote build node
      example: x86_64
      type: string
      minLength: 1
    numCurrentJobs:
      description: Number of current jobs running on the remote build node
      example: 15
      type: integer
    podmanStatus:
      description: Status of the podman executable on the remote build node
      example: Podman present at /usr/bin/podman
      type: string
      minLength: 1
    sshStatus:
      description: Status of the ssh connection to the remote build node
      example: SSH connection established
      type: string
      minLength: 1
    ableToRunJobs:
      description: If the node is able to run new jobs
      example: true
      type: boolean
ArtifactLinkRecord:
description: An Artifact Link Record
type: object
Expand Down
16 changes: 9 additions & 7 deletions src/server/models/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from marshmallow.validate import Length, OneOf, Range

from src.server.helper import ARCH_ARM64, ARCH_X86_64
from src.server.models.remote_build_nodes import RemoteNodeStatus

JOB_TYPE_CREATE = 'create'
JOB_TYPE_CUSTOMIZE = 'customize'
Expand Down Expand Up @@ -259,13 +260,14 @@ def find_remote_node_for_job(app, job: V2JobRecordSchema) -> str:
"""
app.logger.info(f"Checking for remote build node for job")
best_node = ""
best_node_job_count = 10000
best_node_job_count = RemoteNodeStatus.UNKNOWN_NUM_JOBS - 1

for xname, remote_node in app.data['remote_build_nodes'].items():
arch, numJobs = remote_node.getStatus()
if arch != None and arch == job.arch:
app.logger.info(f"Matching remote node: {xname}, current jobs on node: {numJobs}")
# matching arch - can use the node, now pick the best
if best_node == "" or numJobs < best_node_job_count:
nodeStatus = remote_node.getStatus()
if nodeStatus.ableToRunJobs and nodeStatus.nodeArch == job.arch:
app.logger.info(f"Matching remote node: {xname}, current jobs on node: {nodeStatus.numCurrentJobs}")
# matching arch - can use the node, now pick the node with the least jobs running
if best_node == "" or nodeStatus.numCurrentJobs < best_node_job_count:
best_node = remote_node.xname
best_node_job_count = numJobs
best_node_job_count = nodeStatus.numCurrentJobs
return best_node
75 changes: 52 additions & 23 deletions src/server/models/remote_build_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
"""

import socket
import json
from flask import current_app as app

from marshmallow import Schema, fields, post_load, RAISE
Expand All @@ -38,6 +39,24 @@

from src.server.helper import ARCH_ARM64, ARCH_X86_64

class RemoteNodeStatus:
    """ Object to hold the current status of a remote build node.

    A new instance describes a node about which nothing is known yet: the
    textual status fields are "Unknown", the job count is UNKNOWN_NUM_JOBS
    and the node is not marked as able to run jobs.
    """

    # sentinel value used to represent an unknown number of jobs on a node
    UNKNOWN_NUM_JOBS = 10000

    def __init__(self, xname: str) -> None:
        """ Create a status object for the node identified by `xname`. """
        self.xname = xname
        self.sshStatus = "Unknown"
        self.podmanStatus = "Unknown"
        self.nodeArch = "Unknown"
        self.numCurrentJobs = self.UNKNOWN_NUM_JOBS
        self.ableToRunJobs = False

    def toJson(self) -> dict:
        """ Return the status fields as a JSON-serializable dictionary.

        A shallow copy is returned so that callers modifying the result do
        not change this status object.
        """
        return dict(vars(self))

class V3RemoteBuildNodeRecord:
""" The RemoteBuildNodeRecord object """

Expand All @@ -49,21 +68,19 @@ def __init__(self, xname):
def __repr__(self):
return '<V3RemoteBuildNodeRecord(xname={self.xname!r})>'.format(self=self)

def getStatus(self) -> (str, int): #(arch, current jobs)
def getStatus(self) -> RemoteNodeStatus:
"""
Utility function to verify that a node is set up and available for remote
builds. If the node cannot be contacted or is not set up for running IMS
jobs, the returned status will have ableToRunJobs set to False.
Returns:
Archetecture of the node if it can be determined
Number of jobs currently running on the node
RemoteNodeStatus object with details about the current state of the
remote build node.
"""

# start with status Invalid
arch = None
numJobs = None
status = RemoteNodeStatus(self.xname)

# connect to the remote node
connect_kwargs = {"key_filename": "/app/id_ecdsa"}
Expand All @@ -75,7 +92,9 @@ def getStatus(self) -> (str, int): #(arch, current jobs)
except (BadHostKeyException, AuthenticationException, NoValidConnectionsError,
SSHException, socket.error) as error:
app.logger.error(f"Unable to connect to node: {self.xname}, Error: {error}")
return arch, numJobs
status.sshStatus = f"Unable to connect to node. Error: {error}"
return status
status.sshStatus = "SSH connection established."

# make sure the above connection gets closed on exit
try:
Expand All @@ -86,20 +105,23 @@ def getStatus(self) -> (str, int): #(arch, current jobs)

# check result
if result.exited != 0:
app.logger.error(f"Unable to determine archecture of node: {self.xname}, Error: {result.stdout} {result.stderr}")
return arch, numJobs
app.logger.error(f"Unable to determine architecture of node: {self.xname}, Error: {result.stdout} {result.stderr}")
status.nodeArch = f"Unable to determine architecture of node. Error: {result.stdout} {result.stderr}"
return status

# see if we can pull out a known arch type
if "aarch64" in result.stdout:
arch = ARCH_ARM64
status.nodeArch = ARCH_ARM64
elif "x86" in result.stdout:
arch = ARCH_X86_64
status.nodeArch = ARCH_X86_64
else:
app.logger.error(f"Unable to determine archecture of node: {self.xname}, Error: {result.stdout}")
return arch, numJobs
app.logger.error(f"Undefined architecture type for node: {self.xname}, Error: {result.stdout}")
status.nodeArch = f"Undefined architecture type for node, result: {result.stdout}"
return status
except (UnexpectedExit, Failure) as error:
app.logger.error(f"Unable to determine archecture of node: {self.xname}, Error: {error}")
return arch, numJobs
app.logger.error(f"Unable to determine architecture of node: {self.xname}, Error: {error}")
status.nodeArch = f"Unable to determine architecture of node. Error: {error}"
return status

# ensure it has podman installed
try:
Expand All @@ -109,16 +131,26 @@ def getStatus(self) -> (str, int): #(arch, current jobs)
# check result
if result.exited != 0:
app.logger.error(f"Unable to determine if podman is installed on node: {self.xname}, Error: {result.stdout} {result.stderr}")
return None,None
status.podmanStatus = f"Unable to determine if podman is installed on node. Error: {result.stdout} {result.stderr}"
return status

# make sure podman was found at the expected location
if "/usr/bin/podman" not in result.stdout:
app.logger.error(f"Podman not installed on node: {self.xname}, Error: {result.stdout}")
return
status.podmanStatus = f"Podman not installed on node."
return status

# report podman is present
status.podmanStatus = f"Podman present at /usr/bin/podman"
except (UnexpectedExit, Failure) as error:
app.logger.error(f"Unable determine if tools are installed on node: {self.xname}, Error: {error}")
return None,None
status.podmanStatus = f"Unable determine if tools are installed on node. Error: {error}"
return status

# Don't fail the remote node over gathering number of current jobs - mark
# the node as valid now.
status.ableToRunJobs = True

# Every running IMS job will create a working directory '/tmp/ims_(IMS_JOB_ID)'.
# Count the number of these directories to find the number of running jobs on
# the node - they are cleaned up when the job is complete on the node.
Expand All @@ -128,19 +160,16 @@ def getStatus(self) -> (str, int): #(arch, current jobs)
if result.exited != 0:
# let this go through and schedule a job on the node
app.logger.error(f"Unable to determine number of jobs on node: {self.xname}, Error: {result.stdout} {result.stderr}")
numJobs = 0
else:
numJobs = int(result.stdout)
status.numCurrentJobs = int(result.stdout)
except (UnexpectedExit, Failure) as error:
# Just log this, but allow the job to run
app.logger.error(f"Unable determine number of running jobs on node: {self.xname}, Error: {error}")
numJobs = 0
finally:
# close the active connection
c.close()

return arch, numJobs

return status

class V3RemoteBuildNodeRecordInputSchema(Schema):
""" A schema specifically for defining and validating user input """
Expand Down
10 changes: 8 additions & 2 deletions src/server/v3/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#
# MIT License
#
# (C) Copyright 2020-2022 Hewlett Packard Enterprise Development LP
# (C) Copyright 2020-2022, 2024 Hewlett Packard Enterprise Development LP
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
Expand Down Expand Up @@ -41,7 +41,7 @@
V3RecipeResource, V3RecipeCollection, \
V3DeletedRecipeResource, V3DeletedRecipeCollection
from src.server.v3.resources.remote_build_nodes import V3RemoteBuildNodeResource, \
V3RemoteBuildNodeCollection
V3RemoteBuildNodeCollection, V3RemoteBuildStatus, V3RemoteBuildStatusCollection
app_errors = {
# Custom 405 error format to conform to RFC 7807
'MethodNotAllowed': json.loads(
Expand All @@ -60,6 +60,12 @@
apiv3.add_resource(V3RemoteBuildNodeCollection,
'/'.join([uri_prefix, 'remote-build-nodes']),
endpoint='_'.join([endpoint_prefix, 'remote_build_nodes_collection']))
apiv3.add_resource(V3RemoteBuildStatus,
'/'.join([uri_prefix, 'remote-build-nodes/status/<remote_build_node_xname>']),
endpoint='_'.join([endpoint_prefix, 'remote_build_status']))
apiv3.add_resource(V3RemoteBuildStatusCollection,
'/'.join([uri_prefix, 'remote-build-nodes/status']),
endpoint='_'.join([endpoint_prefix, 'remote_build_status_collection']))

apiv3.add_resource(V3PublicKeyResource,
'/'.join([uri_prefix, 'public-keys/<public_key_id>']),
Expand Down
40 changes: 38 additions & 2 deletions src/server/v3/resources/remote_build_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,48 @@
from src.server.errors import problemify, generate_missing_input_response, generate_data_validation_failure, \
generate_resource_not_found_response
from src.server.helper import get_log_id
from src.server.models.remote_build_nodes import V3RemoteBuildNodeRecordInputSchema, V3RemoteBuildNodeRecordSchema, V3RemoteBuildNodeRecord
from src.server.models.remote_build_nodes import V3RemoteBuildNodeRecordInputSchema, V3RemoteBuildNodeRecordSchema, V3RemoteBuildNodeRecord, RemoteNodeStatus
from src.server.v3.models import PATCH_OPERATION_UNDELETE

remote_build_node_user_input_schema = V3RemoteBuildNodeRecordInputSchema()
remote_build_node_schema = V3RemoteBuildNodeRecordSchema()

class V3RemoteBuildStatus(Resource):
    """
    Resource for querying the current status of a single remote build node
    """

    def get(self, remote_build_node_xname):
        """ Retrieve the status of the remote build node with the given xname. """
        log_id = get_log_id()
        current_app.logger.info("%s ++ remote_build_status.v3.GET %s", log_id, remote_build_node_xname)

        # known node - query its live status and hand it back as json
        if remote_build_node_xname in current_app.data['remote_build_nodes']:
            node = current_app.data['remote_build_nodes'][remote_build_node_xname]
            return_json = node.getStatus().toJson()
            current_app.logger.info("%s Returning json response: %s", log_id, return_json)
            return jsonify(return_json)

        # nothing registered under this xname
        current_app.logger.info("%s no IMS remote build node matches xname=%s", log_id, remote_build_node_xname)
        return generate_resource_not_found_response()

class V3RemoteBuildStatusCollection(Resource):
    """
    Resource for querying the current status of every registered remote build node
    """

    def get(self):
        """ Retrieve the status of all remote build nodes. """
        log_id = get_log_id()
        current_app.logger.info("%s ++ remote_build_status_collection.v3.GET", log_id)

        # one status entry per registered node
        return_json = [
            node.getStatus().toJson()
            for node in current_app.data['remote_build_nodes'].values()
        ]

        current_app.logger.info("%s Returning json response: %s", log_id, return_json)
        return jsonify(return_json)

class V3RemoteBuildNodeCollection(Resource):
"""
Class representing the operations that can be taken on a collection of remote build nodes
Expand Down Expand Up @@ -113,7 +149,7 @@ def get(self, remote_build_node_xname):
current_app.logger.info("%s ++ remote_build_nodes.v3.GET %s", log_id, remote_build_node_xname)

if remote_build_node_xname not in current_app.data['remote_build_nodes']:
current_app.logger.info("%s no IMS remote bild node matches xname=%s", log_id, remote_build_node_xname)
current_app.logger.info("%s no IMS remote build node matches xname=%s", log_id, remote_build_node_xname)
return generate_resource_not_found_response()

return_json = remote_build_node_schema.dump(current_app.data['remote_build_nodes'][remote_build_node_xname])
Expand Down
Loading

0 comments on commit d0f0bca

Please sign in to comment.