Merge pull request #98 from mwang87/support_networking_inputs
[WIP] Support networking inputs
mwang87 authored Aug 26, 2019
2 parents a130f1c + ee04320 commit 07ed037
Showing 4 changed files with 148 additions and 62 deletions.
4 changes: 4 additions & 0 deletions code/Dockerfile
@@ -28,5 +28,9 @@ RUN pip3 install seaborn
#RUN export DEBIAN_FRONTEND=noninteractive
#RUN apt-get install -y r-base r-base-dev

RUN apt-get update -y
RUN apt-get install -y git-core
RUN pip3 install git+https://github.com/mwang87/CCMS_ProteoSAFe_pythonAPI.git

COPY . /app
WORKDIR /app
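
The new Dockerfile lines install git and the CCMS ProteoSAFe Python API straight from GitHub; code/views.py below imports it as ccmsproteosafepythonapi and calls proteosafe.get_task_information. A minimal sketch of that usage, assuming only the call visible later in this commit (the task ID is the library-search example used in the new tests):

from ccmsproteosafepythonapi import proteosafe

# Look up GNPS task metadata for a finished workflow run
task_id = "f39c94cb7afe4568950bf61cdb8fee0d"  # library-search task from the new tests
task_information = proteosafe.get_task_information("gnps.ucsd.edu", task_id)

# views.py branches on the workflow type reported here
print(task_information["workflow"])  # expected to be "MOLECULAR-LIBRARYSEARCH-V2" for this task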
37 changes: 29 additions & 8 deletions code/redu_pca.py
@@ -102,15 +102,36 @@ def project_new_data(new_file_occurrence_table, output_file):

#reformat the occurrence table for the new data being fed in
new_data = pd.read_csv(new_file_occurrence_table, sep = "\t")
all_compound_occurances = new_data["Compound_Name"]
all_file_occurances = new_data["full_CCMS_path"]

new_compound_list = new_data["LibraryID"].tolist()
new_data.drop(labels=["LibraryID", "TotalFiles"], axis=1, inplace=True)
just_matrix = new_data.values

new_sample_list = list(new_data.columns.values)

new_sparse_occ_matrix = pd.DataFrame(data = just_matrix, index = new_compound_list, columns = new_sample_list)

#create a new dataframe with only the information needed to reconstruct the matrix; redundant, but easier to follow
compounds_filname_df = pd.DataFrame({"Compound_Name" : all_compound_occurances, "full_CCMS_path" : all_file_occurances})

#sort the dataframe by compound name to help speed up the reconstruction below
compounds_filname_df.sort_values(by = "Compound_Name", axis = 0, inplace = True)

#determine the unique compounds (row index) for the new table
unique_compounds, compound_index = np.unique(compounds_filname_df["Compound_Name"], return_inverse = True)

#determine the unique samples for the new table
unique_sample, file_index = np.unique(compounds_filname_df["full_CCMS_path"], return_inverse = True)

all_compounds = list(compounds_filname_df["Compound_Name"])
all_samples = list(compounds_filname_df["full_CCMS_path"])

#create a matrix from the coordinates given
data = [1] * len(compound_index)

matrix = sps.coo_matrix((data, (compound_index, file_index)), shape = None).todok().toarray()
#handle duplicate (compound, file) pairs by collapsing counts to presence/absence
matrix[matrix > 0] = 1

#convert it into the correct format for the return
new_sparse_occ_matrix = pd.DataFrame(index = list(unique_compounds), columns = list(unique_sample), data = matrix)

new_compound_list = list(unique_compounds)
new_sample_list = list(unique_sample)
#determine which compounds are common between the original and new datasets
find_common_compounds = [item for item in new_compound_list if item in old_compound_list]

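The heart of the new project_new_data code above is turning the (Compound_Name, full_CCMS_path) pairs into a binary compound-by-file occurrence matrix with np.unique and scipy.sparse. A self-contained sketch of the same technique on hypothetical toy data (not the module's real inputs):

import numpy as np
import pandas as pd
import scipy.sparse as sps

# Hypothetical (compound, file) identification pairs standing in for the GNPS table
compounds = ["CompoundA", "CompoundB", "CompoundA", "CompoundA"]
files = ["sample1.mzXML", "sample1.mzXML", "sample2.mzXML", "sample1.mzXML"]

# np.unique with return_inverse yields both the axis labels and the integer coordinates
unique_compounds, compound_index = np.unique(compounds, return_inverse=True)
unique_samples, file_index = np.unique(files, return_inverse=True)

# Build a sparse matrix from the coordinates, then collapse duplicate hits to presence/absence
data = [1] * len(compound_index)
matrix = sps.coo_matrix((data, (compound_index, file_index))).todok().toarray()
matrix[matrix > 0] = 1

occurrence_df = pd.DataFrame(data=matrix, index=unique_compounds, columns=unique_samples)
print(occurrence_df)  # CompoundA present in both samples, CompoundB only in sample1
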
@@ -15,6 +15,28 @@
URL_SEVEN = "attribute/MassSpectrometer/attributeterms?filters=%5B%5D"
URL_EIGHT = "dump"

def test_pca_library_search():
query_url = BASE_URL + "processcomparemultivariate?task={}".format("f39c94cb7afe4568950bf61cdb8fee0d")
r = requests.get(query_url)
r.raise_for_status()

return 0

def test_pca_metabolomics_snets():
query_url = BASE_URL + "processcomparemultivariate?task={}".format("1ad7bc366aef45ce81d2dfcca0a9a5e7")
r = requests.get(query_url)
r.raise_for_status()

return 0

def test_pca_feature_based():
query_url = BASE_URL + "processcomparemultivariate?task={}".format("bb49a839face44cbb5ec3e6f855e7285")
r = requests.get(query_url)
r.raise_for_status()

return 0


def test_data_dump():
query_url = BASE_URL + URL_EIGHT
response = requests.get(query_url)
@@ -23,8 +45,6 @@ def test_data_dump():

if file_size < 17762000:
sys.exit(1)
else:
sys.exit(0)

def test_attribute_filtration():
query_url = BASE_URL + URL_SEVEN
@@ -34,11 +54,11 @@ def test_attribute_filtration():
print(key_value)
expected_keys = ["attributename", "attributeterm", "ontologyterm", "countfiles"]

if (key_value == expected_keys):
sys.exit(0)
else:
if (key_value != expected_keys):
sys.exit(1)

return 0


def test_attribute_terms_display():
query_url = BASE_URL + URL_FOUR
@@ -48,12 +68,10 @@ def test_attribute_terms_display():

expected_keys = ["attribute_name", "attributedisplay", "countterms"]

if (key_value == expected_keys):
sys.exit(0)

else:
if (key_value != expected_keys):
sys.exit(1)

return 0

def test_file_enrichment():
query_url = BASE_URL + URL_FIVE
@@ -63,37 +81,36 @@ def test_file_enrichment():

key_value = next(iter(data[0]))

if (key_value == 'filepath'):
sys.exit(0)
if (key_value != 'filepath'):
sys.exit(1)

else:
sys.exit(1)
return 0

def test_compound_enrichment():
query_url = BASE_URL + URL_SIX
params = {'compoundname' : TEST_COMPOUND}
response = requests.post(query_url, params )
data = json.loads(response.content)
key_value = list(data[0].keys())
query_url = BASE_URL + URL_SIX
params = {'compoundname' : TEST_COMPOUND}
response = requests.post(query_url, params )
data = json.loads(response.content)
key_value = list(data[0].keys())

expected_keys = ["attribute_name", "attribute_term", "totalfiles", "compoundfiles", "percentage"]
expected_keys = ["attribute_name", "attribute_term", "totalfiles", "compoundfiles", "percentage"]

if key_value == expected_keys:
sys.exit(0)
else:
sys.exit(1)
if key_value != expected_keys:
sys.exit(1)

return 0

def test_your_pca():
params = {'task': SAMPLE_TASK_ID}
query_url = BASE_URL + URL_TWO
response = requests.get(query_url, params = params)
data = response.content
file_size = sys.getsizeof(data)
params = {'task': SAMPLE_TASK_ID}
query_url = BASE_URL + URL_TWO
response = requests.get(query_url, params = params)
data = response.content
file_size = sys.getsizeof(data)

if (file_size < 28000000):
if (file_size < 28000000):
sys.exit(1)
else:
sys.exit(0)

return 0


def test_global_pca():
@@ -104,11 +121,4 @@ def test_global_pca():
if (file_size < 27762100):
sys.exit(1)

else:
sys.exit(0)

def main():
test_attribute_filtration()

if __name__ == "__main__":
main()
return 0
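
The old main() runner, which only invoked test_attribute_filtration, is removed in this change. If the module is still meant to be runnable as a script, a hypothetical replacement runner for the new PCA checks might look like this (illustration only, not part of the commit):

def main():
    test_pca_library_search()
    test_pca_metabolomics_snets()
    test_pca_feature_based()

if __name__ == "__main__":
    main()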
81 changes: 66 additions & 15 deletions code/views.py
@@ -14,6 +14,9 @@
import requests_cache
import metadata_validator
import config
import pandas as pd

from ccmsproteosafepythonapi import proteosafe

requests_cache.install_cache('demo_cache', allowable_codes=(200, 404, 500))

@@ -693,28 +696,76 @@ def displayglobalmultivariate():

@app.route('/processcomparemultivariate', methods=['GET'])
def processcomparemultivariate():
if not os.path.isfile(config.PATH_TO_GLOBAL_OCCURRENCES):
print("Missing Global Data")
return abort(500)
# if not os.path.isfile(config.PATH_TO_GLOBAL_OCCURRENCES):
# print("Missing Global Data")
# return abort(500)

#Making sure we calculate global data
if not os.path.isfile(config.PATH_TO_COMPONENT_MATRIX):
print("Retrieving Global Identifications")
# #Making sure we calculate global data
# if not os.path.isfile(config.PATH_TO_COMPONENT_MATRIX):
# print("Retrieving Global Identifications")

redu_pca.calculate_master_projection(config.PATH_TO_GLOBAL_OCCURRENCES)
# redu_pca.calculate_master_projection(config.PATH_TO_GLOBAL_OCCURRENCES)

#Making sure we grab the user's query
task_id = request.args['task']
new_analysis_filename = os.path.join(app.config['UPLOAD_FOLDER'], task_id)
if not os.path.isfile(new_analysis_filename):
#Making sure we have the local compound name
remote_url = "https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=compound_filename_occurences/" % (task_id)

r = requests.get(remote_url)
with open(new_analysis_filename, 'wb') as f:
f.write(r.content)

#Actually doing Analysis
if not os.path.isfile(new_analysis_filename):
#TODO: Check the task type, get the URL specific for it, then reformat...
task_information = proteosafe.get_task_information("gnps.ucsd.edu", task_id)

print(task_information)

task_type = task_information["workflow"]

if task_type == "MOLECULAR-LIBRARYSEARCH-V2":
remote_url = "https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task={}&block=main&file=DB_result/".format(task_id)
df = pd.read_csv(remote_url, sep="\t")
df = df[["full_CCMS_path", "Compound_Name"]]
df.to_csv(new_analysis_filename, sep="\t", index=False)
#TODO: demangle with params filename
elif task_type == "METABOLOMICS-SNETS-V2":
clusters_url = "https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task={}&block=main&file=clusterinfo/".format(task_id)
clusters_df = pd.read_csv(clusters_url, sep="\t")

identifications_url = "https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task={}&block=main&file=result_specnets_DB/".format(task_id)
identifications_df = pd.read_csv(identifications_url, sep="\t")

inner_df = clusters_df.merge(identifications_df, how="inner", left_on="#ClusterIdx", right_on="#Scan#")
inner_df = inner_df[["#Filename", "Compound_Name"]]
inner_df["full_CCMS_path"] = inner_df["#Filename"]
inner_df = inner_df[["full_CCMS_path", "Compound_Name"]]

inner_df.to_csv(new_analysis_filename, sep="\t", index=False)
elif task_type == "FEATURE-BASED-MOLECULAR-NETWORKING":
quantification_url = "https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task={}&block=main&file=quantification_table_reformatted/".format(task_id)
#quantification_url = "https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task={}&block=main&file=quantification_table/".format(task_id)
quantification_df = pd.read_csv(quantification_url, sep=",")

quantification_records = quantification_df.to_dict(orient="records")

compound_presence_records = []
for record in quantification_records:
for key in record:
if "Peak area" in key:
if record[key] > 0:
presence_dict = {}
presence_dict["full_CCMS_path"] = key.replace("Peak area", "")
presence_dict["#ClusterIdx"] = record["row ID"]

compound_presence_records.append(presence_dict)

compound_presence_df = pd.DataFrame(compound_presence_records)

identifications_url = "https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task={}&block=main&file=DB_result/".format(task_id)
identifications_df = pd.read_csv(identifications_url, sep="\t")

inner_df = compound_presence_df.merge(identifications_df, how="inner", left_on="#ClusterIdx", right_on="#Scan#")
inner_df = inner_df[["full_CCMS_path", "Compound_Name"]]

inner_df.to_csv(new_analysis_filename, sep="\t", index=False)

#Actually doing Analysis
output_folder = ("./tempuploads")
redu_pca.project_new_data(new_analysis_filename, output_folder)
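
Of the three workflow branches above, FEATURE-BASED-MOLECULAR-NETWORKING does the most reshaping: it melts the wide quantification table (one "Peak area" column per input file) into per-file presence records keyed by cluster index, which are then joined with the DB_result identifications. A small sketch of that wide-to-long step on hypothetical data (column names mimic the FBMN export, not a live GNPS download):

import pandas as pd

# Hypothetical FBMN quantification table: one row per feature ("row ID"),
# one "Peak area" column per input file
quantification_df = pd.DataFrame({
    "row ID": [1, 2],
    "sampleA.mzML Peak area": [0.0, 1250.0],
    "sampleB.mzML Peak area": [830.0, 0.0],
})

compound_presence_records = []
for record in quantification_df.to_dict(orient="records"):
    for key in record:
        # any positive peak area means the feature was observed in that file
        if "Peak area" in key and record[key] > 0:
            compound_presence_records.append({
                "full_CCMS_path": key.replace("Peak area", ""),
                "#ClusterIdx": record["row ID"],
            })

compound_presence_df = pd.DataFrame(compound_presence_records)
print(compound_presence_df)
# Joining on "#ClusterIdx" == "#Scan#" against DB_result then gives the
# (full_CCMS_path, Compound_Name) pairs fed to redu_pca.project_new_data.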

