Skip to content

Commit 3708d29

Browse files
committed
updates
1 parent df1c39c commit 3708d29

File tree

1 file changed

+103
-1
lines changed

1 file changed

+103
-1
lines changed

expansion/expansion.py

Lines changed: 103 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import requests, json, fnmatch, os, os.path, sys, subprocess, glob, ntpath, copy, re, operator, statistics, datetime
1+
import requests, json, fnmatch, os, os.path, sys, subprocess, glob, ntpath, copy, re, operator, statistics, datetime, hashlib, uuid
22
import pandas as pd
33
from os import path
44
from pandas import json_normalize
@@ -9,6 +9,7 @@
99
from itertools import cycle
1010
import random
1111
from random import randrange
12+
from pathlib import Path
1213

1314
import numpy as np
1415
import scipy
@@ -5121,6 +5122,9 @@ def create_mock_tsv(self,
51215122
# save dataframe to TSV file
51225123
if filename is None:
51235124
filename = "{}_mock_{}.tsv".format(node,dd_version)
5125+
5126+
Path(outdir).mkdir(parents=True, exist_ok=True)
5127+
51245128
output = "{}/{}".format(outdir,filename)
51255129
df.to_csv(output,sep='\t',index=False)
51265130

@@ -5129,3 +5133,101 @@ def create_mock_tsv(self,
51295133
output = "{}/{}".format(outdir,filename)
51305134
self.submit_file(project_id="DEV-test",filename=output)
51315135
return df
5136+
5137+
5138+
5139+
5140+
def create_mock_project(self,
    dd,
    node_counts=None,
    project_id=None,
    outdir="mock_tsvs",
    excluded_props = [
        "id",
        "submitter_id",
        "type",
        "project_id",
        "created_datetime",
        "updated_datetime",
        "state",
        "file_state",
        "error_type"],
    file_props = [
        "file_name",
        "file_size",
        "md5sum",
        "object_id",
        "storage_urls"],
    excluded_nodes=[],
    submit_tsvs=False
    ):
    """
    Create a mock / simulated data project for a set of nodes in the data dictionary.

    This is a wrapper around Gen3Expansion.create_mock_tsv(): it works out "node",
    "count" and "parent_tsvs" for every node and calls create_mock_tsv() once per node.
    The "project" node is skipped when node_counts is built automatically, so make
    sure the program/project exist first.

    Args:
        dd (dict): the Gen3 data dictionary you get with Gen3Submission.get_dictionary_all().
        node_counts(dict): node_ids as keys, values is number of records to create for that node.
            For example: {"case":3,"imaging_study":6}
            TSVs are created in the iteration order of this dict.
            NOTE(review): parents presumably need to come before their children so the
            parent TSVs exist when a child is generated -- confirm against create_mock_tsv().
            If None, counts are derived from self.get_submission_order(): each node gets
            the square of its order value, and nodes in excluded_nodes are left out.
        project_id(str): If no project_id is provided, using the generic 'DEV-test' project_id
        outdir(str): the local directory to write simulated TSV data to.
        excluded_props(list): a list of properties in data dictionary to ignore / exclude from TSVs.
            NOTE(review): accepted but not forwarded to create_mock_tsv() by this method.
        file_props(list): a list of file_properties to be simulated; unlikely to change from default.
            NOTE(review): accepted but not forwarded to create_mock_tsv() by this method.
        excluded_nodes(list): a list of nodes to not create mock TSVs for. Links that
            target an excluded node are also dropped from "parent_tsvs".
        submit_tsvs(boolean): if true, will use sdk to submit the DataFrames via sheepdog

    Raises:
        ValueError: if project_id is not of the form "<program>-<project>".
    """
    dd_version = dd["_settings"]["_dict_version"]
    if project_id is None:
        print("\tNo 'project_id' provided; using the generic 'DEV-test' as the project_id.")
        project_id = "DEV-test"
    # Fail early, with a readable message, on a project_id that cannot be split
    # into program and project.
    if "-" not in project_id:
        raise ValueError(
            "project_id must look like '<program>-<project>', got: {!r}".format(project_id)
        )

    # Never mutate the (shared) default list; a set also gives O(1) membership tests.
    excluded = set(excluded_nodes or ())

    # for the create_mock_tsv() func, we need "node", "count" and "parent_tsvs".

    # Build node_counts only if the caller did not provide them; this gets us "node" and "count"
    if node_counts is None:
        node_counts = {}
        for node in self.get_submission_order():
            node_id = node[0]
            node_count = node[1]
            print(node_id)
            if node_id == "project" or node_id in excluded: # skip project node
                continue
            # get progressively larger counts as you go down in data model hierarchy
            node_counts[node_id] = node_count * node_count
        print("\tNo node_counts provided; using the following node_counts:\n\t{}".format(node_counts))

    # Now build "parent_tsvs" for each node in "node_counts":
    all_parent_tsvs = {}
    for node in node_counts:
        print(node)
        link_targets = {}
        # Every entry of 'links' is either a direct link or a subgroup of links;
        # handle both kinds wherever they appear in the list.
        for link in dd[node].get('links', []):
            if 'subgroup' in link:
                candidates = {
                    sub['name']: sub['target_type']
                    for sub in link['subgroup']
                    if sub['target_type'] not in excluded
                }
                # An exclusive subgroup allows only one of its links, so pick one
                # at random -- unless every candidate was excluded.
                if link.get('exclusive') and candidates:
                    name, target = random.choice(list(candidates.items()))
                    candidates = {name: target}
                link_targets.update(candidates)
            elif link['target_type'] not in excluded:
                link_targets[link['name']] = link['target_type']

        all_parent_tsvs[node] = {
            name: "{}/{}_mock_{}.tsv".format(outdir, target, dd_version)
            for name, target in link_targets.items()
        }

    # Create the TSVs
    for node in node_counts:
        # Create the node TSV / DataFrame
        df = self.create_mock_tsv(
            dd=dd,
            node=node,
            count=node_counts[node],
            parent_tsvs=all_parent_tsvs[node],
            project_id=project_id,
            outdir=outdir,
        )
        if submit_tsvs:
            self.submit_df(project_id=project_id, df=df, chunk_size=250)

0 commit comments

Comments
 (0)