Skip to content

Commit

Permalink
Merge pull request #319 from KPMP/KPMP-5665_Move_Derived
Browse files Browse the repository at this point in the history
Kpmp 5665 move derived
  • Loading branch information
rlreamy authored Dec 19, 2024
2 parents 2c2fd64 + 6d00790 commit 5457d75
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 42 deletions.
117 changes: 76 additions & 41 deletions scripts/packageChecker/packageChecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,27 @@
import logging
import requests
import os
import shutil
import csv
from dotenv import load_dotenv
import numpy as np
import argparse

load_dotenv()

slack_passcode = os.environ.get('slack_passcode')
logger = logging.getLogger("packageChecker")
logging.basicConfig(level=logging.ERROR)
logging.basicConfig(level=logging.DEBUG)
slack_url = "https://hooks.slack.com/services/" + slack_passcode
data_directory = os.environ.get('data_directory')


class PackageChecker:

def __init__(self):
try:
mongo_client = pymongo.MongoClient("mongodb://localhost:27017/", serverSelectionTimeoutMS=5000)
self.dataLake= mongo_client['dataLake']
self.dataLake = mongo_client['dataLake']
except:
print("Unable to connect to database")

Expand All @@ -30,92 +33,96 @@ def get_expected_files(self, package):
expected_file_names.append(file_info['fileName'])
return expected_file_names

def find_empty_packages(self):
def find_empty_packages(self, move_derived=False):
empty_package_list = []
missing_package_list = []
extra_package_list = []

missing_files_header = ["Package ID", "Missing Files"]
missing_files_csv = open("missing_files.csv", "w")
missing_writer = csv.writer(missing_files_csv)
missing_writer.writerow(missing_files_header)

extra_files_header = ['Package ID', 'Extra Files']
extra_files_csv = open("extra_files.csv", "w")
extra_writer = csv.writer(extra_files_csv)
extra_writer.writerow(extra_files_header)
packages = self.dataLake.packages.find({})
mongo_files = self.dataLake.files.find({})
mongo_files_col_list = []

for file_names in mongo_files:
file_name = file_names['fileName']
mongo_files_col_list.append(file_name)


file_name = file_names['fileName']
mongo_files_col_list.append(file_name)

for package in packages:
package_id = package["_id"]
package_states = self.dataLake.state.find({"packageId": package_id}).sort("stateChangeDate", -1).limit(1)
for state in package_states:
if state['state'] == "UPLOAD_SUCCEEDED":
try:
directory = data_directory + "/package_";
files = os.listdir(directory + package_id)
directory = data_directory + "/package_" + package_id;
files = os.listdir(directory)
expected_file_names = self.get_expected_files(package)
actual_file_names = []
if len(files) == 0:
empty_package_list.append(package_id)
else:
for file in files:
ext = os.path.splitext(file)
if not ext[1] == ".bfmemo" and not "expression_matrix" in ext[0]:
actual_file_names.append(file)
if not ext[1] == ".bfmemo" and not "derived" in ext[0]:
actual_file_names.append(file)

if file == "metadata.json" and len(files) == 1:
empty_package_list.append(package_id)

if (not set(expected_file_names).issubset(set(actual_file_names))) and not all(p == "metadata.json" for p in actual_file_names):
if (not set(expected_file_names).issubset(set(actual_file_names))) and not all(
p == "metadata.json" for p in actual_file_names):
empty_package_list.append(package_id)
missing_files_list = set(expected_file_names).difference(set(actual_file_names))

missing_files_list = set(expected_file_names).difference(set(actual_file_names))
missing_files_list = ', '.join(missing_files_list)
disk_files = set(actual_file_names).difference(set(expected_file_names))

if "metadata.json" in disk_files:
disk_files.remove("metadata.json")
disk_files.remove("metadata.json")

disk_files = ", ".join(disk_files)
files_list = np.setdiff1d(disk_files, mongo_files_col_list)
extra_files_list = list(files_list)

if '' in extra_files_list:
extra_files_list.remove("")
extra_files_list.remove("")

if len(missing_files_list) != 0 and file_name not in missing_files_list:
data = [
[package_id, missing_files_list]
]
missing_writer.writerows(data)
data = [
[package_id, missing_files_list]
]
missing_writer.writerows(data)

if len(extra_files_list) != 0 and file_name not in extra_files_list:
extra_package_list.append(package_id)
data = [
[package_id, extra_files_list]
]
extra_writer.writerows(data)

extra_package_list.append(package_id)
data = [
[package_id, extra_files_list]
]
extra_writer.writerows(data)
if (move_derived):
for file_name in str(extra_files_list).split(", "):
clean_file_name = file_name.replace("[", "").replace("]", "").replace(",", "").replace("'", "")
if clean_file_name != 'derived' and 'svs' not in clean_file_name:
self.move_file_to_derived(directory, clean_file_name)

except:
missing_package_list.append(package_id)

missing_files_csv.close()
extra_files_csv.close()
if len(empty_package_list) > 0:
message = "Missing files in packages: " + ', '.join(empty_package_list)
requests.post(
slack_url,
headers={'Content-type': 'application/json', },
data='{"text":"' + message+'"}')
data='{"text":"' + message + '"}')
if len(missing_package_list) > 0:
message = "Missing package directories for packages: " + ', '.join(missing_package_list)
requests.post(
Expand All @@ -125,11 +132,39 @@ def find_empty_packages(self):
if len(extra_package_list) > 0:
message = "Extra files for packages: " + ', '.join(extra_package_list)
requests.post(
slack_url,
headers={'Content-type': 'application/json', },
slack_url,
headers={'Content-type': 'application/json', },
data='{"text":"' + message + '"}')

def move_file_to_derived(self, package_directory, file_name):
derived_dir = package_directory + "/derived"
file_path = package_directory + "/" + file_name
if os.path.isfile(file_path):
print(f"Moving file '{file_path}' to '{derived_dir}'.")
if not os.path.exists(derived_dir):
os.makedirs(derived_dir)

try:
shutil.move(file_path, derived_dir)
print(f"File '{file_path}' moved to '{derived_dir}' successfully.")
except FileNotFoundError:
print(f"Error: File '{source_file}' not found.")
except Exception as e:
print(f"An error occurred: {e}")
else:
print(f"'{file_path}' is not a file. Skipping.")



if __name__ == "__main__":
    # Command-line entry point: optionally relocate extra files into each
    # package's 'derived' folder while checking package contents.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-m",
        "--move_derived",
        required=False,
        action='store_true',
        help='move extra files to derived folder'
    )
    parsed_args = arg_parser.parse_args()
    PackageChecker().find_empty_packages(parsed_args.move_derived)
3 changes: 2 additions & 1 deletion scripts/packageChecker/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pymongo
slack_sdk
numpy
numpy
# NOTE: shutil is part of the Python standard library and must not be listed
# as a pip requirement ('pip install shutil' fails).

0 comments on commit 5457d75

Please sign in to comment.