Merge pull request #98 from mwang87/support_networking_inputs
[WIP] Support networking inputs
mwang87 authored Aug 26, 2019
2 parents a130f1c + ee04320 commit 07ed037
Showing 4 changed files with 148 additions and 62 deletions.
4 changes: 4 additions & 0 deletions code/Dockerfile
@@ -28,5 +28,9 @@ RUN pip3 install seaborn
#RUN export DEBIAN_FRONTEND=noninteractive
#RUN apt-get install -y r-base r-base-dev

RUN apt-get update -y
RUN apt-get install -y git-core
RUN pip3 install git+https://github.com/mwang87/CCMS_ProteoSAFe_pythonAPI.git

COPY . /app
WORKDIR /app
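
The new Dockerfile lines install git and the CCMS ProteoSAFe Python API straight from GitHub; code/views.py below imports it as ccmsproteosafepythonapi and calls proteosafe.get_task_information. A minimal sketch of that usage, assuming only the call visible later in this commit (the task ID is the library-search example used in the new tests):

from ccmsproteosafepythonapi import proteosafe

# Look up GNPS task metadata for a finished workflow run
task_id = "f39c94cb7afe4568950bf61cdb8fee0d"  # library-search task from the new tests
task_information = proteosafe.get_task_information("gnps.ucsd.edu", task_id)

# views.py branches on the workflow type reported here
print(task_information["workflow"])  # expected to be "MOLECULAR-LIBRARYSEARCH-V2" for this task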
37 changes: 29 additions & 8 deletions code/redu_pca.py
@@ -102,15 +102,36 @@ def project_new_data(new_file_occurrence_table, output_file):

#reformat the occurrence table for the new data being fed in
new_data = pd.read_csv(new_file_occurrence_table, sep = "\t")
all_compound_occurances = new_data["Compound_Name"]
all_file_occurances = new_data["full_CCMS_path"]

new_compound_list = new_data["LibraryID"].tolist()
new_data.drop(labels=["LibraryID", "TotalFiles"], axis=1, inplace=True)
just_matrix = new_data.values

new_sample_list = list(new_data.columns.values)

new_sparse_occ_matrix = pd.DataFrame(data = just_matrix, index = new_compound_list, columns = new_sample_list)

#create a new dataframe with only the information needed to reconstruct the matrix; redundant, but easier to follow
compounds_filname_df = pd.DataFrame({"Compound_Name" : all_compound_occurances, "full_CCMS_path" : all_file_occurances})

#sort the dataframe by compound name to help speed up the reconstruction below
compounds_filname_df.sort_values(by = "Compound_Name", axis = 0, inplace = True)

#determine the unique compounds (row index) for the new table
unique_compounds, compound_index = np.unique(compounds_filname_df["Compound_Name"], return_inverse = True)

#determine the unique samples for the new table
unique_sample, file_index = np.unique(compounds_filname_df["full_CCMS_path"], return_inverse = True)

all_compounds = list(compounds_filname_df["Compound_Name"])
all_samples = list(compounds_filname_df["full_CCMS_path"])

#create a matrix from the coordinates given
data = [1] * len(compound_index)

matrix = sps.coo_matrix((data, (compound_index, file_index)), shape = None).todok().toarray()
#handle duplicate (compound, file) pairs by collapsing counts to presence/absence
matrix[matrix > 0] = 1

#convert it into the correct format for the return
new_sparse_occ_matrix = pd.DataFrame(index = list(unique_compounds), columns = list(unique_sample), data = matrix)

new_compound_list = list(unique_compounds)
new_sample_list = list(unique_sample)
#determine which compounds are common between the original and new datasets
find_common_compounds = [item for item in new_compound_list if item in old_compound_list]

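The heart of the new project_new_data code above is turning the (Compound_Name, full_CCMS_path) pairs into a binary compound-by-file occurrence matrix with np.unique and scipy.sparse. A self-contained sketch of the same technique on hypothetical toy data (not the module's real inputs):

import numpy as np
import pandas as pd
import scipy.sparse as sps

# Hypothetical (compound, file) identification pairs standing in for the GNPS table
compounds = ["CompoundA", "CompoundB", "CompoundA", "CompoundA"]
files = ["sample1.mzXML", "sample1.mzXML", "sample2.mzXML", "sample1.mzXML"]

# np.unique with return_inverse yields both the axis labels and the integer coordinates
unique_compounds, compound_index = np.unique(compounds, return_inverse=True)
unique_samples, file_index = np.unique(files, return_inverse=True)

# Build a sparse matrix from the coordinates, then collapse duplicate hits to presence/absence
data = [1] * len(compound_index)
matrix = sps.coo_matrix((data, (compound_index, file_index))).todok().toarray()
matrix[matrix > 0] = 1

occurrence_df = pd.DataFrame(data=matrix, index=unique_compounds, columns=unique_samples)
print(occurrence_df)  # CompoundA present in both samples, CompoundB only in sample1
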
@@ -15,6 +15,28 @@
URL_SEVEN = "attribute/MassSpectrometer/attributeterms?filters=%5B%5D"
URL_EIGHT = "dump"

def test_pca_library_search():
query_url = BASE_URL + "processcomparemultivariate?task={}".format("f39c94cb7afe4568950bf61cdb8fee0d")
r = requests.get(query_url)
r.raise_for_status()

return 0

def test_pca_metabolomics_snets():
query_url = BASE_URL + "processcomparemultivariate?task={}".format("1ad7bc366aef45ce81d2dfcca0a9a5e7")
r = requests.get(query_url)
r.raise_for_status()

return 0

def test_pca_feature_based():
query_url = BASE_URL + "processcomparemultivariate?task={}".format("bb49a839face44cbb5ec3e6f855e7285")
r = requests.get(query_url)
r.raise_for_status()

return 0


def test_data_dump():
query_url = BASE_URL + URL_EIGHT
response = requests.get(query_url)
@@ -23,8 +45,6 @@ def test_data_dump():

if file_size < 17762000:
sys.exit(1)
else:
sys.exit(0)

def test_attribute_filtration():
query_url = BASE_URL + URL_SEVEN
@@ -34,11 +54,11 @@ def test_attribute_filtration():
print(key_value)
expected_keys = ["attributename", "attributeterm", "ontologyterm", "countfiles"]

if (key_value == expected_keys):
sys.exit(0)
else:
if (key_value != expected_keys):
sys.exit(1)

return 0


def test_attribute_terms_display():
query_url = BASE_URL + URL_FOUR
@@ -48,12 +68,10 @@ def test_attribute_terms_display():

expected_keys = ["attribute_name", "attributedisplay", "countterms"]

if (key_value == expected_keys):
sys.exit(0)

else:
if (key_value != expected_keys):
sys.exit(1)

return 0

def test_file_enrichment():
query_url = BASE_URL + URL_FIVE
@@ -63,37 +81,36 @@ def test_file_enrichment():

key_value = next(iter(data[0]))

if (key_value == 'filepath'):
sys.exit(0)
if (key_value != 'filepath'):
sys.exit(1)

else:
sys.exit(1)
return 0

def test_compound_enrichment():
query_url = BASE_URL + URL_SIX
params = {'compoundname' : TEST_COMPOUND}
response = requests.post(query_url, params )
data = json.loads(response.content)
key_value = list(data[0].keys())
query_url = BASE_URL + URL_SIX
params = {'compoundname' : TEST_COMPOUND}
response = requests.post(query_url, params )
data = json.loads(response.content)
key_value = list(data[0].keys())

expected_keys = ["attribute_name", "attribute_term", "totalfiles", "compoundfiles", "percentage"]
expected_keys = ["attribute_name", "attribute_term", "totalfiles", "compoundfiles", "percentage"]

if key_value == expected_keys:
sys.exit(0)
else:
sys.exit(1)
if key_value != expected_keys:
sys.exit(1)

return 0

def test_your_pca():
params = {'task': SAMPLE_TASK_ID}
query_url = BASE_URL + URL_TWO
response = requests.get(query_url, params = params)
data = response.content
file_size = sys.getsizeof(data)
params = {'task': SAMPLE_TASK_ID}
query_url = BASE_URL + URL_TWO
response = requests.get(query_url, params = params)
data = response.content
file_size = sys.getsizeof(data)

if (file_size < 28000000):
if (file_size < 28000000):
sys.exit(1)
else:
sys.exit(0)

return 0


def test_global_pca():
@@ -104,11 +121,4 @@ def test_global_pca():
if (file_size < 27762100):
sys.exit(1)

else:
sys.exit(0)

def main():
test_attribute_filtration()

if __name__ == "__main__":
main()
return 0
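
The old main() runner, which only invoked test_attribute_filtration, is removed in this change. If the module is still meant to be runnable as a script, a hypothetical replacement runner for the new PCA checks might look like this (illustration only, not part of the commit):

def main():
    test_pca_library_search()
    test_pca_metabolomics_snets()
    test_pca_feature_based()

if __name__ == "__main__":
    main()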
81 changes: 66 additions & 15 deletions code/views.py
@@ -14,6 +14,9 @@
import requests_cache
import metadata_validator
import config
import pandas as pd

from ccmsproteosafepythonapi import proteosafe

requests_cache.install_cache('demo_cache', allowable_codes=(200, 404, 500))

@@ -693,28 +696,76 @@ def displayglobalmultivariate():

@app.route('/processcomparemultivariate', methods=['GET'])
def processcomparemultivariate():
if not os.path.isfile(config.PATH_TO_GLOBAL_OCCURRENCES):
print("Missing Global Data")
return abort(500)
# if not os.path.isfile(config.PATH_TO_GLOBAL_OCCURRENCES):
# print("Missing Global Data")
# return abort(500)

#Making sure we calculate global data
if not os.path.isfile(config.PATH_TO_COMPONENT_MATRIX):
print("Retrieving Global Identifications")
# #Making sure we calculate global data
# if not os.path.isfile(config.PATH_TO_COMPONENT_MATRIX):
# print("Retrieving Global Identifications")

redu_pca.calculate_master_projection(config.PATH_TO_GLOBAL_OCCURRENCES)
# redu_pca.calculate_master_projection(config.PATH_TO_GLOBAL_OCCURRENCES)

#Making sure we grab the user's query
task_id = request.args['task']
new_analysis_filename = os.path.join(app.config['UPLOAD_FOLDER'], task_id)
if not os.path.isfile(new_analysis_filename):
#Making sure we have the local compound name
remote_url = "https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=compound_filename_occurences/" % (task_id)

r = requests.get(remote_url)
with open(new_analysis_filename, 'wb') as f:
f.write(r.content)

#Actually doing Analysis
if not os.path.isfile(new_analysis_filename):
#TODO: Check the task type, get the URL specific for it, then reformat...
task_information = proteosafe.get_task_information("gnps.ucsd.edu", task_id)

print(task_information)

task_type = task_information["workflow"]

if task_type == "MOLECULAR-LIBRARYSEARCH-V2":
remote_url = "https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task={}&block=main&file=DB_result/".format(task_id)
df = pd.read_csv(remote_url, sep="\t")
df = df[["full_CCMS_path", "Compound_Name"]]
df.to_csv(new_analysis_filename, sep="\t", index=False)
#TODO: demangle with params filename
elif task_type == "METABOLOMICS-SNETS-V2":
clusters_url = "https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task={}&block=main&file=clusterinfo/".format(task_id)
clusters_df = pd.read_csv(clusters_url, sep="\t")

identifications_url = "https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task={}&block=main&file=result_specnets_DB/".format(task_id)
identifications_df = pd.read_csv(identifications_url, sep="\t")

inner_df = clusters_df.merge(identifications_df, how="inner", left_on="#ClusterIdx", right_on="#Scan#")
inner_df = inner_df[["#Filename", "Compound_Name"]]
inner_df["full_CCMS_path"] = inner_df["#Filename"]
inner_df = inner_df[["full_CCMS_path", "Compound_Name"]]

inner_df.to_csv(new_analysis_filename, sep="\t", index=False)
elif task_type == "FEATURE-BASED-MOLECULAR-NETWORKING":
quantification_url = "https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task={}&block=main&file=quantification_table_reformatted/".format(task_id)
#quantification_url = "https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task={}&block=main&file=quantification_table/".format(task_id)
quantification_df = pd.read_csv(quantification_url, sep=",")

quantification_records = quantification_df.to_dict(orient="records")

compound_presence_records = []
for record in quantification_records:
for key in record:
if "Peak area" in key:
if record[key] > 0:
presence_dict = {}
presence_dict["full_CCMS_path"] = key.replace("Peak area", "")
presence_dict["#ClusterIdx"] = record["row ID"]

compound_presence_records.append(presence_dict)

compound_presence_df = pd.DataFrame(compound_presence_records)

identifications_url = "https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task={}&block=main&file=DB_result/".format(task_id)
identifications_df = pd.read_csv(identifications_url, sep="\t")

inner_df = compound_presence_df.merge(identifications_df, how="inner", left_on="#ClusterIdx", right_on="#Scan#")
inner_df = inner_df[["full_CCMS_path", "Compound_Name"]]

inner_df.to_csv(new_analysis_filename, sep="\t", index=False)

#Actually doing Analysis
output_folder = ("./tempuploads")
redu_pca.project_new_data(new_analysis_filename, output_folder)
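
Of the three workflow branches above, FEATURE-BASED-MOLECULAR-NETWORKING does the most reshaping: it melts the wide quantification table (one "Peak area" column per input file) into per-file presence records keyed by cluster index, which are then joined with the DB_result identifications. A small sketch of that wide-to-long step on hypothetical data (column names mimic the FBMN export, not a live GNPS download):

import pandas as pd

# Hypothetical FBMN quantification table: one row per feature ("row ID"),
# one "Peak area" column per input file
quantification_df = pd.DataFrame({
    "row ID": [1, 2],
    "sampleA.mzML Peak area": [0.0, 1250.0],
    "sampleB.mzML Peak area": [830.0, 0.0],
})

compound_presence_records = []
for record in quantification_df.to_dict(orient="records"):
    for key in record:
        # any positive peak area means the feature was observed in that file
        if "Peak area" in key and record[key] > 0:
            compound_presence_records.append({
                "full_CCMS_path": key.replace("Peak area", ""),
                "#ClusterIdx": record["row ID"],
            })

compound_presence_df = pd.DataFrame(compound_presence_records)
print(compound_presence_df)
# Joining on "#ClusterIdx" == "#Scan#" against DB_result then gives the
# (full_CCMS_path, Compound_Name) pairs fed to redu_pca.project_new_data.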

