Skip to content

Commit

Permalink
Merge branch 'master' of github.com:mwang87/ReDU-MS2-GNPS
Browse files Browse the repository at this point in the history
  • Loading branch information
mwang87 committed Sep 9, 2019
2 parents 62ee407 + 2101c67 commit 23c6297
Show file tree
Hide file tree
Showing 15 changed files with 855 additions and 139 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Welcome to ReDU :)
# Welcome to ReDU
## Reanalysis of Data User Interface

[ReDU](https://redu.ucsd.edu/) is a community- and data-driven approach to find and reuse public data containing tandem MS data at the repository scale. ReDU is a launchpad for co- or re-analysis of public data via the Global Natural Product Social Molecular Networking Platform [(GNPS)](https://gnps.ucsd.edu/ProteoSAFe/static/gnps-splash.jsp). Our aim is to empower researchers to put their data in the context of public data as well as explore questions using public data at the repository scale.
Expand Down
4 changes: 4 additions & 0 deletions code/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,9 @@ RUN pip3 install seaborn
#RUN export DEBIAN_FRONTEND=noninteractive
#RUN apt-get install -y r-base r-base-dev

RUN apt-get update -y
RUN apt-get install -y git-core
RUN pip3 install git+https://github.com/mwang87/CCMS_ProteoSAFe_pythonAPI.git

COPY . /app
WORKDIR /app
37 changes: 29 additions & 8 deletions code/redu_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,15 +102,36 @@ def project_new_data(new_file_occurrence_table, output_file):

#reformat the occurance table for the new data being fed in
new_data = pd.read_csv(new_file_occurrence_table, sep = "\t")
all_compound_occurances = new_data["Compound_Name"]
all_file_occurances = new_data["full_CCMS_path"]

new_compound_list = new_data["LibraryID"].tolist()
new_data.drop(labels=["LibraryID", "TotalFiles"], axis=1, inplace=True)
just_matrix = new_data.values

new_sample_list = list(new_data.columns.values)

new_sparse_occ_matrix = pd.DataFrame(data = just_matrix, index = new_compound_list, columns = new_sample_list)

#create a new dataframe with only the information needed to reconstruct, redundant but easier to see
compounds_filname_df = pd.DataFrame({"Compound_Name" : all_compound_occurances, "full_CCMS_path" : all_file_occurances})

#sorting dataframe by sample in order to help? speed up things
compounds_filname_df.sort_values(by = "Compound_Name", axis = 0, inplace = True)

#determine the header for the new table
unique_compounds, compound_index = np.unique(compounds_filname_df["Compound_Name"], return_inverse = True)

#determine the unique samples for the new table
unique_sample, file_index = np.unique(compounds_filname_df["full_CCMS_path"], return_inverse = True)

all_compounds = list(compounds_filname_df["Compound_Name"])
all_samples = list(compounds_filname_df["full_CCMS_path"])

#create a matrix from the coordinates given
data = [1] * len(compound_index)

matrix = sps.coo_matrix((data, (compound_index, file_index)), shape = None).todok().toarray()
#handling duplicates within the array
matrix[matrix > 0] = 1

#convert it into the correct format for the return
new_sparse_occ_matrix = pd.DataFrame(index = list(unique_compounds), columns = list(unique_sample), data = matrix)

new_compound_list = list(unique_compounds)
new_sample_list = list(unique_sample)
#determine which compounds are common between the original and new datasets
find_common_compounds = [item for item in new_compound_list if item in old_compound_list]

Expand Down
Binary file modified code/static/img/redulogo.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3 changes: 3 additions & 0 deletions code/templates/layout.html
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@
<li class="nav-item">
<a class="nav-link" href="/datalookup">File Query - Sample Information</a>
</li>
<li class="nav-item">
<a class="nav-link" href="ftp://massive.ucsd.edu/MSV000084206/other/ReDU_all_identifications.tsv">Download Annotations</a>
</li>
</ul>

</nav>
Expand Down
114 changes: 0 additions & 114 deletions code/test/integration_tests.py

This file was deleted.

117 changes: 117 additions & 0 deletions code/test/test_redu_integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import os
import sys
import requests
import json

# Root URL of the live ReDU deployment these integration tests hit.
BASE_URL = "https://redu.ucsd.edu/"
# Task ID passed to the processcomparemultivariate endpoint in test_your_pca.
SAMPLE_TASK_ID = "ffa003f6c4d844188f1f751d34c649b0"
# Compound name used by the enrichment/lookup tests; presumably known to
# exist in the ReDU library — TODO confirm it stays present on the server.
TEST_COMPOUND = "2,5-Dimethoxyphenethylamine"

def test_pca_library_search():
    """PCA comparison endpoint responds OK for a library-search task.

    raise_for_status() raises requests.HTTPError on a 4xx/5xx response,
    which is what fails the test; a bare `return 0` is ignored by pytest,
    so it has been dropped.
    """
    query_url = BASE_URL + "processcomparemultivariate?task={}".format("f39c94cb7afe4568950bf61cdb8fee0d")
    r = requests.get(query_url)
    r.raise_for_status()

def test_pca_metabolomics_snets():
    """PCA comparison endpoint responds OK for a metabolomics-SNETS task.

    Failure is signalled by requests.HTTPError from raise_for_status();
    pytest ignores return values, so the old `return 0` was dead code.
    """
    query_url = BASE_URL + "processcomparemultivariate?task={}".format("1ad7bc366aef45ce81d2dfcca0a9a5e7")
    r = requests.get(query_url)
    r.raise_for_status()

def test_pca_feature_based():
    """PCA comparison endpoint responds OK for a feature-based task.

    Failure is signalled by requests.HTTPError from raise_for_status();
    pytest ignores return values, so the old `return 0` was dead code.
    """
    query_url = BASE_URL + "processcomparemultivariate?task={}".format("bb49a839face44cbb5ec3e6f855e7285")
    r = requests.get(query_url)
    r.raise_for_status()


def test_data_dump():
    """The full data dump downloads and is at least the expected size.

    BUG FIX: the original returned 1 on failure, but pytest ignores return
    values, so an undersized dump could never fail the test. An assert is
    the only thing pytest actually reports.
    """
    response = requests.get(BASE_URL + "dump")
    response.raise_for_status()
    # len() measures the payload itself; sys.getsizeof also counts Python
    # object overhead and is not a size check on the downloaded data.
    assert len(response.content) >= 17762000, "data dump smaller than expected"

def test_attribute_filtration():
    """Attribute-term filtering returns records with the expected fields.

    BUG FIX: the original returned 1 on a key mismatch, which pytest
    silently ignores; an assert actually fails the test.
    """
    query_url = BASE_URL + "attribute/MassSpectrometer/attributeterms?filters=%5B%5D"
    response = requests.get(query_url)
    response.raise_for_status()
    # response.json() replaces json.loads(response.content) — same result,
    # and it honors the response encoding.
    data = response.json()
    expected_keys = ["attributename", "attributeterm", "ontologyterm", "countfiles"]
    assert list(data[0].keys()) == expected_keys


def test_attribute_terms_display():
    """The attributes listing returns records with the expected fields.

    BUG FIX: the original returned 1 on a key mismatch, which pytest
    silently ignores; an assert actually fails the test.
    """
    # BASE_URL already ends with "/", so the path must not start with one
    # (the original produced ".../​/attributes" with a double slash).
    query_url = BASE_URL + "attributes"
    response = requests.get(query_url)
    response.raise_for_status()
    data = response.json()
    expected_keys = ["attributename", "attributedisplay", "countterms"]
    assert list(data[0].keys()) == expected_keys

def test_file_enrichment():
    """Compound filename lookup returns records keyed by 'filepath'.

    BUG FIX: the original returned 1 on a mismatch, which pytest silently
    ignores; an assert actually fails the test.
    """
    query_url = BASE_URL + "compoundfilename"
    params = {'compoundname': TEST_COMPOUND}
    response = requests.get(query_url, params=params)
    response.raise_for_status()
    data = response.json()
    # Check the first key of the first record.
    assert next(iter(data[0])) == 'filepath'

def test_compound_enrichment():
    """Compound enrichment returns records with the expected fields.

    BUG FIX: the original returned 1 on a key mismatch, which pytest
    silently ignores; an assert actually fails the test.
    """
    query_url = BASE_URL + "compoundenrichment"
    params = {'compoundname': TEST_COMPOUND}
    # The original passed `params` positionally to requests.post, which
    # binds it to `data=` (a form-encoded body, not query parameters).
    # Made explicit to preserve that behavior deliberately.
    response = requests.post(query_url, data=params)
    response.raise_for_status()
    data = response.json()
    expected_keys = ["attribute_name", "attribute_term", "totalfiles", "compoundfiles", "percentage"]
    assert list(data[0].keys()) == expected_keys

def test_your_pca():
    """Per-task PCA projection downloads and is at least the expected size.

    BUG FIX: the original returned 1 on failure, but pytest ignores return
    values, so the size check could never fail the test.
    """
    params = {'task': SAMPLE_TASK_ID}
    query_url = BASE_URL + "processcomparemultivariate"
    response = requests.get(query_url, params=params)
    response.raise_for_status()
    # len() measures the payload itself, unlike sys.getsizeof which adds
    # Python object overhead.
    assert len(response.content) >= 28000000, "per-task PCA response smaller than expected"


def test_global_pca():
    """Global multivariate view downloads and is at least the expected size.

    BUG FIX: the original returned 1 on failure, but pytest ignores return
    values, so the size check could never fail the test.
    """
    response = requests.get(BASE_URL + "displayglobalmultivariate")
    response.raise_for_status()
    # len() measures the payload itself, unlike sys.getsizeof which adds
    # Python object overhead.
    assert len(response.content) >= 27760000, "global PCA response smaller than expected"
Loading

0 comments on commit 23c6297

Please sign in to comment.