Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add method to update pipeline image cache #63

Open
wants to merge 11 commits into
base: develop
Choose a base branch
from
79 changes: 76 additions & 3 deletions beaglecli
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ from urllib.parse import urljoin
from datetime import datetime
import traceback
import csv
import git
import subprocess
import re
import tempfile

from apps.access import access_commands
from apps.cmoch import cmoch_commands
Expand Down Expand Up @@ -75,6 +79,7 @@ Usage:
beaglecli files patch <file_id> [--file-path=<file_path>] [--file-type=<file_type>] [--file-group=<file_group_id>] [--metadata=<metadata>]... [--size=<size>]
beaglecli files list [--page-size=<page_size>] [--path=<path>]... [--metadata=<metadata>]... [--file-group=<file_group>]... [--file-name=<file_name>]... [--filename-regex=<filename_regex>] [--file-type=<file_type>]... [--all]... [--packaged]... [--force]...
beaglecli files delete --file-id=<file_id>...
beaglecli pipeline update-cache <cache_path>
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wouldn't mind trying the feature out on an example though. Is there a good cache path I could test it out on?

beaglecli sample create <sample-id>
beaglecli sample list [--sample-id=<sample-id>]
beaglecli sample redact <sample-id> [--value=<redact>]
Expand Down Expand Up @@ -157,6 +162,11 @@ def files_commands(arguments, config):
return _patch_file(arguments, config)


def pipeline_commands(arguments, config):
if arguments.get('update-cache'):
return _update_cache(arguments, config)


def storage_commands(arguments, config):
if arguments.get('list'):
return _list_storage(arguments, config)
Expand Down Expand Up @@ -221,6 +231,8 @@ def sample_commands(arguments, config):
def command(arguments, config):
if arguments.get('files'):
return files_commands(arguments, config)
if arguments.get('pipeline'):
return pipeline_commands(arguments, config)
if arguments.get('storage'):
return storage_commands(arguments, config)
if arguments.get('file-types'):
Expand Down Expand Up @@ -351,6 +363,33 @@ def _get_latest_runs(run_dict):
return run_list


def _get_cwl_apps():
url = urljoin(BEAGLE_ENDPOINT, API['pipelines'])
params = dict()
params['page_size'] = 1000000
Copy link
Collaborator

@buehlere buehlere Jan 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've seen some slow downs / bad responses when setting this parameter too high for the files system and runs endpoints. I don't think that should be an issue here since the pipelines list is relatively short. Nonetheless would we maybe want to make this an adjustable parameter somewhere? Magic numbers like these can sometimes be a bit tricky to track down when there's an issue.

response = requests.get(url, headers={
'Authorization': 'Bearer %s' % config.token, 'Content-Type': 'application/json'}, params=params)
cwl_set = set()
if response.ok:
response_json = response.json()
if "results" in response_json:
result_list = response_json["results"]
for single_pipeline in result_list:
pipeline_type = single_pipeline["pipeline_type"]
if pipeline_type == 0:
github = single_pipeline["github"]
version = single_pipeline["version"]
entrypoint = single_pipeline["entrypoint"]
cwl_set.add((github, version, entrypoint))
return cwl_set
else:
print("Error: beagle returned an empty response")
exit(1)
else:
print("ERROR: Could not retrieve app list")
exit(1)


def _get_apps_dict():
url = urljoin(BEAGLE_ENDPOINT, API['pipelines'])
params = dict()
Expand Down Expand Up @@ -595,9 +634,10 @@ def _list_files(arguments, config):
params['filename_regex'] = filename_regex
params['file_type'] = file_type
if all_pages:
count_params = params
count_params['count'] = True
params['page_size'] = requests.get(urljoin(BEAGLE_ENDPOINT, API['files']), headers={'Authorization': 'Bearer %s' % config.token}, params=count_params).json()['count']
count_params = params
count_params['count'] = True
params['page_size'] = requests.get(urljoin(BEAGLE_ENDPOINT, API['files']), headers={
'Authorization': 'Bearer %s' % config.token}, params=count_params).json()['count']
response = requests.get(urljoin(BEAGLE_ENDPOINT, API['files']), headers={
'Authorization': 'Bearer %s' % config.token}, params=params)
response_json = json.dumps(response.json(), indent=4)
Expand Down Expand Up @@ -712,6 +752,38 @@ def _create_sample(arguments, config):
response_json = json.dumps(response.json(), indent=4)
return response_json

# Pipeline


def _update_cache(arguments, config):
cache_path = arguments.get('<cache_path>')
cwl_apps = _get_cwl_apps()
num_cwl_apps = len(cwl_apps)
current_cwl_app = 0
find_pipeline_folder = re.compile("Cloning into '(\S+)'")
for repo, version, cwl in cwl_apps:
with tempfile.TemporaryDirectory() as tmpDir:
print("Working on cwl {} of {}".format(
current_cwl_app, num_cwl_apps))
current_cwl_app += 1
try:
git_repo = git.Git(tmpDir).clone(
repo, "--branch", version, "--recurse-submodules")
except git.exc.GitCommandError:
print("Could not find repo {} version {}".format(repo, version))
pipeline_folder = find_pipeline_folder.match(git_repo).group(1)
cwl_path = os.path.join(tmpDir, pipeline_folder, cwl)
if os.path.exists(cwl_path):
cache_command = "module load singularity/3.7.1; docker_extract.py -s {} {}".format(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would we also maybe want to make the singularity version a parameter?

cache_path, cwl_path)
try:
subprocess.check_output(cache_command, shell=True)
except:
print("Malformed CWL: CWL {} for repo {} version {} might not be valid".format(
cwl, repo, version))
else:
print("Malformed pipeline: Could not find cwl: {} in {} version {}".format(
cwl, repo, version))

# Update

Expand Down Expand Up @@ -959,6 +1031,7 @@ def _redact_sample(arguments, config):
response_json = json.dumps(response.json(), indent=4)
return response_json


if __name__ == '__main__':
config = Config.load()
authenticate_command(config)
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
docopt==0.6.2
requests==2.22.0
git+https://github.com/mskcc/cwl-utils.git
GitPython==3.0.8
pandas