Skip to content

Commit

Permalink
Merge branch 'RESTAPI-1237-Investigate-Filesystem-check' into 'master'
Browse files Browse the repository at this point in the history
Improved file system check

See merge request firecrest/firecrest!319
  • Loading branch information
Ivano Bonesana committed Sep 11, 2024
2 parents 1fb8775 + acd09d6 commit 3421e6d
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 64 deletions.
6 changes: 5 additions & 1 deletion deploy/test-build/cluster/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ RUN set -x \
&& chown -R slurm:slurm /var/log/slurm \
&& chown -R slurm:slurm /var/spool/slurm*


COPY cluster/slurm/*.conf /etc/slurm/
COPY cluster/slurm/*.sh /
# slurmdbd requires slurmdbd.conf with 600
Expand Down Expand Up @@ -95,6 +94,11 @@ RUN echo 'test1:test11' | chpasswd && echo 'test2:test22' | chpasswd
RUN useradd -m -s /bin/bash testuser && echo 'testuser:testuser' | chpasswd
RUN useradd -m -s /bin/bash service-account-firecrest-sample && echo 'service-account-firecrest-sample:service-account-firecrest-sample' | chpasswd

# Create per-user scratch directories owned by the matching test accounts.
# Chain with && (not ;) so a failed mkdir aborts the build instead of
# silently chown-ing a missing path; a single RUN also avoids three extra
# image layers.
RUN mkdir -p /scratch/test1 && chown test1 /scratch/test1 \
 && mkdir -p /scratch/test2 && chown test2 /scratch/test2 \
 && mkdir -p /scratch/testuser && chown testuser /scratch/testuser \
 && mkdir -p /scratch/service-account-firecrest-sample && chown service-account-firecrest-sample /scratch/service-account-firecrest-sample

ADD cluster/ssh/* /etc/ssh/
ADD environment/keys/ca-key.pub /etc/ssh/
RUN chmod -R 400 /etc/ssh/ && chown -R root:root /etc/ssh/
Expand Down
110 changes: 54 additions & 56 deletions src/status/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,26 +165,25 @@ def test_service(servicename, status_list, trace_header=None):


# test individual system function
def test_system(machinename, headers, status_list=[]):
def test_system(machinename, headers, status_list=[], test_filesystem=False):

app.logger.info(f"Testing {machinename} system status")

if machinename not in SYSTEMS_PUBLIC:
# System does not exist
status_list.append( {"status": -3, "system": machinename} )
return

if machinename not in FILESYSTEMS:
# System does not exist
status_list.append( {"status": -3, "system": machinename} )
return

for i in range(len(SYSTEMS_PUBLIC)):
if SYSTEMS_PUBLIC[i] == machinename:
machine = SYSTEMS[i]
break

mounted_fs = FILESYSTEMS[machinename]


# try to connect (unsuccessfully) with dummy user and pwd, catching SSH exception
try:
client = paramiko.SSHClient()
Expand Down Expand Up @@ -212,32 +211,51 @@ def test_system(machinename, headers, status_list=[]):
is_username_ok = get_username(headers[AUTH_HEADER_NAME])
if not is_username_ok["result"]:
app.logger.error(f"Couldn't extract username from JWT token: {is_username_ok['reason']}")
status_list.append({"status": -5, "system": machinename, "filesystem": fs, "reason": is_username_ok['reason']})
# Report system does not accept connections
status_list.append({"status": -1, "system": machinename, "reason": is_username_ok['reason']})
return

failfs = []
for fs in mounted_fs:
try:
r = requests.get(f"{UTILITIES_URL}/ls",
params={"targetPath": fs["path"], "numericUid": "True"},
headers=headers,
verify=(SSL_CRT if SSL_ENABLED else False),
timeout=(int(UTILITIES_TIMEOUT) + 1))
if not r.ok:
failfs.append(fs["path"])
else:
j = json.loads(r.content)
if len(j['output']) == 0:
# Test command submission with "whoami"
try:
r = requests.get(f"{UTILITIES_URL}/whoami",
params={},
headers=headers,
verify=(SSL_CRT if SSL_ENABLED else False),
timeout=(int(UTILITIES_TIMEOUT) + 1))
if not r.ok:
# Report system does not accept connections
status_list.append({"status": -1, "system": machinename, "reason": is_username_ok['reason']})
else:
# Report system Ok
status_list.append({"status": 0, "system": machinename})
except:
# Report system down
status_list.append({"status": -2, "system": machinename, "reason": is_username_ok['reason']})

# Test filesystems on request
if test_filesystem:
failfs = []
for fs in FILESYSTEMS[machinename]:
try:
r = requests.get(f"{UTILITIES_URL}/ls",
params={"targetPath": fs["path"], "numericUid": "True", "unsorted": "True",
"notListing": "True"},
headers=headers,
verify=(SSL_CRT if SSL_ENABLED else False),
timeout=(int(UTILITIES_TIMEOUT) + 1))
if not r.ok:
failfs.append(fs["path"])
except:
failfs.append(fs["path"])

if len(failfs) > 0:
app.logger.error("Status: -4")
status_list.append({"status": -4, "system": machinename, "filesystem": ",".join(failfs)})
return
else:
j = json.loads(r.content)
if len(j['output']) == 0:
failfs.append(fs["path"])
except:
failfs.append(fs["path"])

status_list.append({"status": 0, "system": machinename})
if len(failfs) > 0:
app.logger.error("Status: -4")
# Report filesystem error
status_list.append({"status": -4, "system": machinename, "filesystem": ",".join(failfs)})

except paramiko.ssh_exception.NoValidConnectionsError as e:
# host up but SSH not working
Expand All @@ -263,16 +281,13 @@ def test_system(machinename, headers, status_list=[]):

finally:
client.close()

return

def check_fs(system,filesystem, headers):

headers["X-Machine-Name"] = system

try:
r = requests.get(f"{UTILITIES_URL}/ls",
params={"targetPath": filesystem, "numericUid": "True"},
params={"targetPath": filesystem, "numericUid": "True", "unsorted": "True", "notListing": "True"},
headers=headers,
verify=(SSL_CRT if SSL_ENABLED else False),
timeout=(int(UTILITIES_TIMEOUT) + 1))
Expand All @@ -284,22 +299,18 @@ def check_fs(system,filesystem, headers):
return 400
except:
return 400

return 200


def check_filesystem(system, filesystems,headers):

resp_json = []
try:

for fs in filesystems:
resp_fs = {}

resp_fs["name"] = fs["name"]
resp_fs["path"] = fs["path"]
resp_fs["description"] = fs["description"]


status_code = check_fs(system, fs["path"], headers)
resp_fs["status_code"] = status_code
Expand All @@ -321,42 +332,28 @@ def check_filesystem(system, filesystems,headers):
# Flask view (route decorator not visible in this diff fragment): reports
# filesystem information for every system configured in FILESYSTEMS.
# Response is cached for CACHE_TIMEOUT_15m unless no_cache forces an update.
@check_auth_header
@cache.cached(timeout=CACHE_TIMEOUT_15m, forced_update=no_cache)
def get_all_filesystems():
    """Check the filesystems of all configured systems and return a JSON report.

    For each system in FILESYSTEMS, delegates the per-filesystem checks to
    check_filesystem() and collects its "out" payload under the system's name.

    Returns:
        (flask.Response, int): 200 with {system: [per-filesystem results]},
        or 404 if a configured system is not in SYSTEMS_PUBLIC / raises KeyError.
    """

    # Tracing headers propagate the request ID to downstream service calls.
    [headers, ID] = get_tracing_headers(request)

    # resp_json json to fill with responses from each system
    resp_json = {}

    for system in FILESYSTEMS:

        # A filesystem entry for a system that is not publicly configured is
        # a configuration error: abort the whole request with 404.
        if system not in SYSTEMS_PUBLIC:
            return jsonify(description="Filesystem information", out=f"System '{system}' doesn't exist"), 404

        if DEBUG_MODE:
            app.logger.debug(f"Checking filesystems in {system}")

        # Pre-seed with an empty list so the key exists even if the check fails.
        resp_json[system] = []
        try:

            filesystems = FILESYSTEMS[system]

            # NOTE(review): this DEBUG_MODE block duplicates the one above —
            # this text comes from a scraped diff, so one copy is presumably
            # the removed line and the other the added line; confirm against
            # the real source before relying on this function as shown.
            if DEBUG_MODE:
                app.logger.debug(f"Checking filesystems in {system}")

            resp_system = check_filesystem(system,filesystems,headers)

            resp_json[system] = resp_system["out"]

        except KeyError as ke:
            # FILESYSTEMS lookup (or the helper) referenced a missing key.
            app.logger.error(ke.args)
            return jsonify(description="Filesystem information", out=f"Machine {system} doesn't exist"), 404

    return jsonify(description="Filesystem information", out=resp_json), 200






# get information about a specific system
Expand Down Expand Up @@ -398,7 +395,7 @@ def status_system(machinename):
[headers, ID] = get_tracing_headers(request)

status_list = []
test_system(machinename,headers,status_list)
test_system(machinename, headers, status_list, True)

# possible responses:
# 0: host up and SSH running
Expand All @@ -415,11 +412,6 @@ def status_system(machinename):
out={"system":machinename, "status":"not available", "description": f"Error on JWT token: {reason}"}
return jsonify(description="Filesystem is not available.", out=out), 200

if status == -4:
filesystem = status_list[0]["filesystem"]
out={"system":machinename, "status":"not available", "description": f"Filesystem {filesystem} is not available"}
return jsonify(description="Filesystem is not available.", out=out), 200

if status == -3:
return jsonify(description="System does not exists."), 404

Expand All @@ -431,8 +423,14 @@ def status_system(machinename):
out={"system":machinename, "status":"not available", "description":"System does not accept connections"}
return jsonify(description="System information", out=out), 200


# System is available
out = {"system": machinename, "status": "available", "description": "System ready"}

# Check filesystem separately
for st in status_list:
if st["status"] == -4:
filesystem_status = st["filesystem"]
out["description"] = f"Filesystem {filesystem_status} is not available"
return jsonify(description="System information", out=out), 200


Expand Down
34 changes: 27 additions & 7 deletions src/utilities/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def list_directory():


## parse ls output
def ls_parse_folder(folder_content:str,path:str=""):
def ls_parse_folder(folder_content:str, path:str=""):
# Example of ls output
# total 3
# lrwxrwxrwx 1 username groupname 46 2023-07-25T14:18:00 "filename" -> "target link"
Expand All @@ -179,12 +179,21 @@ def ls_parse_folder(folder_content:str,path:str=""):
file_list = []
file_pattern = (r'^(?P<type>\S)(?P<permissions>\S+)\s+\d+\s+(?P<user>\S+)\s+'
r'(?P<group>\S+)\s+(?P<size>\d+)\s+(?P<last_modified>(\d|-|T|:)+)\s+(?P<filename>.+)$')
escape_pattern = r'^[a-zA-Z]+\s+[0-9]+' # ignore "total 3" line

lines = folder_content.splitlines()

file_list = []
filename_list = []
matching_list = False
# If the ls -l groups are matching, matching_list value is True and file_list is being populated.
# Otherwise, filename_list is used to store entries since a ls -1f is considered to have been requested
for entry in lines:
matches = re.finditer(file_pattern, entry)
for m in matches:
# Matching ls -l patterns: set matching_list flat to True and clear filename_list.
matching_list = True
filename_list.clear()
tokens = shlex.split(m.group("filename"))
if len(tokens) == 1:
name = tokens[0]
Expand All @@ -208,7 +217,12 @@ def ls_parse_folder(folder_content:str,path:str=""):
"last_modified": m.group("last_modified"),
"size": m.group("size")
})
return file_list
if not matching_list:
if not re.match(escape_pattern, entry):
filename_list.append(entry)

# Return the file_list when matching the pattern, otherwise return the filename_list
return file_list if matching_list else filename_list


## parse ls output
Expand All @@ -229,7 +243,7 @@ def remove_prefix(text, prefix):

file_list = []
#Check if ls has recursive folders
if(re.match(r'\"(.+)\":\n',retval["msg"])):
if re.match(r'\"(.+)\":\n', retval["msg"]):
folders = re.split(r'\"(.+)\":\n',retval["msg"])
root_folder = ""
for i in range(1,len(folders),2):
Expand All @@ -243,7 +257,6 @@ def remove_prefix(text, prefix):
else:
file_list += ls_parse_folder(retval["msg"])


totalSize = len(file_list)
logging.info(f"Length of file list: {len(file_list)}")

Expand Down Expand Up @@ -520,9 +533,11 @@ def common_fs_operation(request, command):
file_transfer = 'download'
elif command == "ls":
options = ""
# long listing format enabled by default
l = "-l"
if get_boolean_var(request.args.get("showhidden", False)):
# if set shows entrys starting with . (not including . and/or .. dirs)
options = "-A "
options += "-A "
if get_boolean_var(request.args.get("numericUid", False)):
# do not resolve UID and GID to names
options += "--numeric-uid-gid "
Expand All @@ -532,7 +547,13 @@ def common_fs_operation(request, command):
if get_boolean_var(request.args.get("followLinks", False)):
# follow symbolic links
options += "-L "
action = f"ls -l --quoting-style=c {options} --time-style=+%Y-%m-%dT%H:%M:%S -- '{targetPath}'"
if get_boolean_var(request.args.get("unsorted", False)):
# do not sort lines
options += "-1f "
if get_boolean_var(request.args.get("notListing", False)):
# do not use long listing format
l = ""
action = f"ls {l} --quoting-style=c {options} --time-style=+%Y-%m-%dT%H:%M:%S -- '{targetPath}'"
elif command == "mkdir":
try:
p = request.form["p"]
Expand Down Expand Up @@ -609,7 +630,6 @@ def common_fs_operation(request, command):
except:
return jsonify(description=ret_data["description"]), ret_data["status_code"], ret_data["header"]


description = f"Success to {command} file."
output = ''
if command == 'checksum':
Expand Down

0 comments on commit 3421e6d

Please sign in to comment.