Merge pull request #272 from SiDDyy007/new-branch
New branch
mwang87 committed Apr 13, 2023
2 parents a5b2a75 + 0814c90 commit b834254
Showing 6 changed files with 275 additions and 1 deletion.
28 changes: 28 additions & 0 deletions code/README.md
@@ -0,0 +1,28 @@
**GNPS Metadata Tools** <br>
This repository contains three Python scripts for working with GNPS metadata files: _gnps_downloader.py_, _gnps_validator.py_, and _gnps_name_matcher.py_. Together they let you aggregate a list of GNPS metadata files, validate those files with the GNPS metadata validator, and match them to their respective CCMS peak dataset names.<br>

**Installation**<br>
To use these scripts, you'll need to have Python 3 installed on your system. <br>
You can download Python from the official Python website: https://www.python.org/downloads/<br>

You can install the third-party dependencies with pip (`urllib` is part of the Python standard library and does not need to be installed):<br>
_pip install pandas requests_<br>
<br>
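If you prefer a pinned setup, a minimal _requirements.txt_ covering just the third-party packages the scripts import (a sketch; versions left unpinned) would be:

```
pandas
requests
```
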
**Usage**<br>
_**gnps_downloader.py**_<br>
This script aggregates a list of GNPS metadata files, sorts them by creation time, and downloads the latest metadata file for each dataset. It then writes each local file name and its server path to a TSV file (_file_paths.tsv_).<br>

To run the script, use the following command: <br>
_python3 gnps_downloader.py_
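
The downloaded files are saved locally as _0_gnps_metadata.tsv_, _1_gnps_metadata.tsv_, and so on, and _file_paths.tsv_ maps each local name to its server path. An illustrative excerpt (the server path shown here is made up):

```
LocalName	ServerName
0_gnps_metadata.tsv	MSV000012345/metadata/gnps_metadata.tsv
```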

_**gnps_validator.py**_<br>
This script runs each downloaded GNPS metadata file through the metadata validator, rejects files that fail, and writes the names of the files that pass to a TSV file (_passed_file_names.tsv_).

To run the script, use the following command:<br>
_python3 gnps_validator.py_
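
_passed_file_names.tsv_ is a single header-less column of the local file names that passed validation, e.g. (illustrative):

```
0_gnps_metadata.tsv
3_gnps_metadata.tsv
```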

_**gnps_name_matcher.py**_<br>
This script matches the GNPS metadata files to their respective CCMS peak dataset names and produces a TSV file (_check.tsv_) that contains only the names that match unambiguously.

To run the script, use the following command:<br>
_python3 gnps_name_matcher.py_
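
The scripts are designed to run in this order, since each one reads the TSV written by the one before it. A minimal driver (a sketch, assuming the scripts sit in the current working directory, which is also where they read and write their TSV files):

```python
import subprocess

# Run the pipeline in order; each script reads the TSV the previous one wrote
for script in ["gnps_downloader.py", "gnps_validator.py", "gnps_name_matcher.py"]:
    subprocess.run(["python3", script], check=True)  # check=True stops on the first failure
```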
77 changes: 77 additions & 0 deletions code/gnps_downloader.py
@@ -0,0 +1,77 @@
# Import necessary libraries
import os
import pandas as pd
import urllib
import requests
import io
import sys
import collections
import argparse
from pandas.errors import EmptyDataError

# Print message to indicate importing is done
os.system("echo Importing Done!")

# Set the URL for GNPS metadata CSV file and read it using pandas
gnps_metadata_link = "https://gnps-datasetcache.ucsd.edu/database/filename.csv?_sort=filepath&filepath__endswith=gnps_metadata.tsv&_size=max"
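# The query string filters the dataset cache to files whose path ends with
# gnps_metadata.tsv, sorted by filepath; _size=max lifts the default row limit.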
gnps_df = pd.read_csv(gnps_metadata_link)

# Print message to indicate that the CSV file has been read
os.system("echo GNPS CSV Read Done!")

# Convert the create time column to a datetime format
gnps_df['create_time'] = pd.to_datetime(gnps_df['create_time'])

# Sort the DataFrame by the create time column
gnps_df = gnps_df.sort_values(by='create_time')

# Group the DataFrame by the dataset column
groups = gnps_df.groupby('dataset')

# Select the row with the highest create time value for each group
selected_rows = []
for name, group in groups:
    idx = group['create_time'].idxmax()
    selected_rows.append(group.loc[idx])
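
# Note: an equivalent, more concise pandas idiom (a sketch; it should select the
# same rows) would be:
#   latest = gnps_df.sort_values('create_time').drop_duplicates('dataset', keep='last')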

# Names of GNPS Files
gnps_list = [row['filepath'] for row in selected_rows]
print("Total gnps file are ", len(gnps_list))
os.system("echo Filepath list generated ... ")

# Set the download link for GNPS files
download_link = "https://massive.ucsd.edu/ProteoSAFe/DownloadResultFile?forceDownload=true&file=f."
os.system("echo We are downloading now ...")

# Create a defaultdict to store file paths
file_paths = collections.defaultdict(list)

# Download files from the links in the gnps_list and store them locally
for index, link in enumerate(gnps_list):
    download_url = download_link + link
    r = requests.get(download_url)
    file_name = str(index) + "_gnps_metadata.tsv"
    file_paths["sys_name"].append(file_name)
    file_paths["svr_name"].append(link)
    with open(file_name, 'wb') as f:
        f.write(r.content)  # Write the downloaded contents to disk
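
# Note: requests does not raise on HTTP errors by default, so a failed request
# would silently write an error page to disk; calling r.raise_for_status()
# inside the loop would surface such failures.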

# Print message to indicate that downloading has been completed successfully
os.system("echo Download has been completed successfully!")

# Print message to indicate that a TSV file for file paths is being created
os.system("echo Creating tsv for file paths ...")

# Write file paths to a TSV file
with open("file_paths.tsv", "w") as f:
f.write("LocalName\tServerName\n")
for i in range(len(file_paths["sys_name"])):
local = file_paths["sys_name"][i]
server = file_paths["svr_name"][i]
f.write(f"{local}\t{server}\n")
f.close()

# Print message to indicate that the TSV file for file paths has been created
os.system("echo Tsv file created for path names!!")
# Print message to indicate that the execution of the code is complete
os.system("echo EXECUTION COMPLETE ! HAVE A GOOD DAY ")
84 changes: 84 additions & 0 deletions code/gnps_name_matcher.py
@@ -0,0 +1,84 @@
import os
import pandas as pd
import urllib
import requests
import io
import sys
import argparse
import subprocess
import collections
from subprocess import PIPE, run

ccms_peak_link_start = "https://gnps-datasetcache.ucsd.edu/datasette/database/filename.csv?_sort=filepath&collection__exact=ccms_peak&dataset__exact="
ccms_peak_link_end = "&_size=max"
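# These two fragments bracket a MassIVE dataset accession (e.g. MSV000012345,
# an illustrative ID) to query the cache for that dataset's ccms_peak files.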
name = ""
os.system("echo CCMS Read Done!")
current_dir = os.getcwd()
ccms_filenames = collections.defaultdict(set)
# print(current_dir)
if True:
# Read the TSV file and specify the delimiter as a tab
df = pd.read_csv(current_dir + '/passed_file_names.tsv', delimiter='\t', header=None, names=['Name'])
# Extract the names from a specific column (e.g., column 'Name')
passed_file_names = df['Name'].tolist()


os.system("echo Iterating though rows now")
merged_rows = {}
# unique = 0
# dupli = 0

visit = set()
for file in passed_file_names:
    print("Working on ", file)
    df = pd.read_csv(current_dir + "/" + file, delimiter='\t')
    try:
        # Get the dataset accession from the first row of the 'MassiveID' column
        dataset = df['MassiveID'].iloc[0]

        ccms_df = pd.read_csv(ccms_peak_link_start + dataset + ccms_peak_link_end)
    except TypeError:
        # A missing MassiveID (NaN) makes the URL concatenation raise TypeError
        print(f"Skipping file {file} due to a TypeError.")
        continue

    # Collect every ccms_peak filepath recorded for this dataset
    for index, row in ccms_df.iterrows():
        filepath = row["filepath"]
        ccms_filenames[dataset].add(filepath)

    # Match each metadata row to a ccms_peak filepath by filename suffix
    for index, row in df.iterrows():
        filename = row["filename"]
        # Also try the name with its last three characters replaced by "ML",
        # to catch capitalization variants of the extension (e.g. .mzML)
        filename2 = row["filename"][:-3] + "ML"
        for key in ccms_filenames[dataset]:
            if key.endswith(filename) or key.endswith(filename2):
                if key in visit:
                    # Seen before: the match is ambiguous, so blank it out
                    merged_rows[key] = []
                else:
                    visit.add(key)
                    merged_rows[key] = [key] + list(row)
    print("Worked on ", file)
os.system("echo Merged Rows list complete")
non_empty_values = [v for v in merged_rows.values() if v]
print("Length of Entries in Final file -> ",len(non_empty_values))

# Build the final table from the unambiguous matches
fnt = pd.DataFrame(non_empty_values)
# Save the DataFrame to a TSV file without column names
fnt.to_csv('check.tsv', sep='\t', header=False, index=False)

46 changes: 46 additions & 0 deletions code/gnps_validator.py
@@ -0,0 +1,46 @@
import os
import pandas as pd
import urllib
import requests
import io
import sys
import argparse
import subprocess
import collections
from subprocess import PIPE, run

# Print message to indicate importing is done
os.system("echo Importing Done ...")
current_dir = os.getcwd()
# Read the TSV file containing file paths into a pandas DataFrame
df = pd.read_csv(current_dir + '/file_paths.tsv', delimiter='\t')

# Get a list of all the values in the first column (file names)
file_names = df.iloc[:, 0].tolist()

# List to store names of files that have passed validation
passed_file_names = []

# Loop through each file in the list of file names and validate each one
os.system("echo Validating Files now ...")

# Resolve metadata_validator.py relative to this script so the path is not
# machine-specific (the original hard-coded an absolute local path)
validator_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "metadata_validator.py")

for file_name in file_names:
    # Call the metadata_validator.py script and pass the file path as an argument
    output = os.popen("python3 " + validator_path + " " + current_dir + '/' + file_name).read()
    # If the script output starts with "S" ("Success"), the file passed validation
    if output and output.startswith("S"):
        passed_file_names.append(file_name)
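
# Note: subprocess.run (imported above as `run`) would be a sturdier way to
# invoke the validator than os.popen, e.g. (a sketch):
#   result = run(["python3", validator_path, current_dir + "/" + file_name],
#                stdout=PIPE, universal_newlines=True)
#   passed = result.stdout.startswith("S")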

# Print message to indicate that validation is complete
os.system("echo Validation Completed !")

# Convert the list of passed file names to a pandas DataFrame
passed = pd.DataFrame(passed_file_names)

# Print message to indicate that a TSV file is being created for path names
os.system("echo Now creating tsv file for Path names ...")

# Write the passed file names DataFrame to a TSV file
passed.to_csv('passed_file_names.tsv', sep='\t', index=False, header=False)

# Print message to indicate that the TSV file has been created and the script is complete
os.system("echo TSV File created! Have a good day.")
7 changes: 6 additions & 1 deletion code/metadata_validator.py
@@ -151,6 +151,11 @@ def perform_validation(filename):

    my_validator = Vlad(source=LocalFile(filename), delimiter="\t", ignore_missing_validators=True, validators=validators)
    passes_validation = my_validator.validate()

    if passes_validation:
        print("Success")
    else:
        print("Fail")

    errors_list = []
    for column in my_validator.failures:
@@ -227,4 +232,4 @@ def main():
# print(row)

if __name__ == "__main__":
    main()
34 changes: 34 additions & 0 deletions code/nextflow_sid.nf
@@ -0,0 +1,34 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl=2

params.input = "README.md"

TOOL_FOLDER = "$baseDir"

process processData {
    publishDir "./nf_output", mode: 'copy'

    conda "$TOOL_FOLDER/conda_env.yml"

    input:
    file input

    output:
    file 'outputs.tsv.*'
    file 'file_paths.tsv'
    file 'passed_file_names.tsv'
    file 'check.tsv'

    """
    python $TOOL_FOLDER/gnps_downloader.py $input
    python $TOOL_FOLDER/gnps_validator.py $input passed_file_names.tsv
    python $TOOL_FOLDER/gnps_name_matcher.py $input check.tsv
    """
}

workflow {
    data = Channel.fromPath(params.input)
    processData(data)
}
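
// Example invocation (a sketch, assuming Nextflow with conda available):
//   nextflow run nextflow_sid.nf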

