diff --git a/code/README.md b/code/README.md
new file mode 100644
index 0000000..de9e6bc
--- /dev/null
+++ b/code/README.md
@@ -0,0 +1,71 @@
+**GNPS Metadata Tools**
+
+This repository contains three Python scripts for working with GNPS metadata files: _gnps_downloader.py_, _gnps_validator.py_, and _gnps_name_matcher.py_. Together they form a small pipeline: the downloader aggregates and fetches the latest GNPS metadata files, the validator checks them with the GNPS metadata validator, and the name matcher links them to their respective CCMS peak dataset file names. Run the scripts in that order, since each step reads the TSV file written by the one before it.
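+
+The repository also includes a Nextflow workflow, _nextflow_sid.nf_, which runs all three scripts in sequence. Assuming Nextflow and conda are available, it can be launched with:
+
+_nextflow run nextflow_sid.nf_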
+
+**Installation**
+
+To use these scripts, you'll need to have Python 3 installed on your system.
+You can download Python from the official Python website: https://www.python.org/downloads/
+
+You can install the third-party dependencies using pip:
+
+_pip install pandas requests_
+
+**Usage**
+
+_**gnps_downloader.py**_
+
+This script aggregates the list of GNPS metadata files registered in the GNPS dataset cache, sorts them by creation time, and downloads the most recent metadata file for each dataset. It then writes each local file name and its server path to _file_paths.tsv_.
+
+To run the script, use the following command:
+
+_python3 gnps_downloader.py_
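+
+The resulting _file_paths.tsv_ has two tab-separated columns: _LocalName_, the numbered local copy, and _ServerName_, the file's path on the server. A sketch of its layout (the server path is an illustrative placeholder):
+
+```
+LocalName	ServerName
+0_gnps_metadata.tsv	MSV000000000/metadata/gnps_metadata.tsv
+```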
+
+_**gnps_validator.py**_
+
+This script runs each downloaded GNPS metadata file through the metadata validator (_metadata_validator.py_), keeps the names of the files that pass, and rejects the rest. The names of the passing files are written to _passed_file_names.tsv_.
+
+To run the script, use the following command:
+
+_python3 gnps_validator.py_
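+
+A file counts as passing when the validator prints a line starting with "Success". The output _passed_file_names.tsv_ is a single, headerless column of local file names, for example (assuming the files numbered 0 and 3 passed):
+
+```
+0_gnps_metadata.tsv
+3_gnps_metadata.tsv
+```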
+
+_**gnps_name_matcher.py**_
+
+This script matches the validated GNPS metadata files to their respective CCMS peak dataset file names and writes _check.tsv_, which contains only the names that match unambiguously.
+
+To run the script, use the following command:
+
+_python3 gnps_name_matcher.py_
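+
+The matching rule, roughly: a CCMS peak file path counts as a match when it ends with a metadata row's filename (or the same name with an upper-cased _.mzML_ extension), and any path claimed by more than one row is dropped as ambiguous. A simplified sketch of the idea (the function and variable names are illustrative, not part of the scripts):
+
+```python
+def unambiguous_matches(metadata_names, ccms_paths):
+    """Keep only the CCMS paths that end with exactly one metadata filename."""
+    hits = {}
+    for path in ccms_paths:
+        for name in metadata_names:
+            alt = name[:-3] + "ML"  # e.g. "sample.mzml" -> "sample.mzML"
+            if path.endswith(name) or path.endswith(alt):
+                # A second match makes the path ambiguous, so void it
+                hits[path] = None if path in hits else name
+    return {path: name for path, name in hits.items() if name is not None}
+```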
diff --git a/code/gnps_downloader.py b/code/gnps_downloader.py
new file mode 100644
index 0000000..43fe6be
--- /dev/null
+++ b/code/gnps_downloader.py
@@ -0,0 +1,70 @@
+# Import necessary libraries
+import collections
+import os
+
+import pandas as pd
+import requests
+
+# Print message to indicate importing is done
+os.system("echo Importing Done!")
+
+# Set the URL for the GNPS metadata CSV file and read it using pandas
+gnps_metadata_link = "https://gnps-datasetcache.ucsd.edu/database/filename.csv?_sort=filepath&filepath__endswith=gnps_metadata.tsv&_size=max"
+gnps_df = pd.read_csv(gnps_metadata_link)
+
+# Print message to indicate that the CSV file has been read
+os.system("echo GNPS CSV Read Done!")
+
+# Convert the create time column to a datetime format
+gnps_df['create_time'] = pd.to_datetime(gnps_df['create_time'])
+
+# Sort the DataFrame by the create time column
+gnps_df = gnps_df.sort_values(by='create_time')
+
+# Group the DataFrame by the dataset column
+groups = gnps_df.groupby('dataset')
+
+# Select the row with the latest create time for each dataset
+selected_rows = []
+for name, group in groups:
+    idx = group['create_time'].idxmax()
+    selected_rows.append(group.loc[idx])
+
+# Server paths of the GNPS metadata files
+gnps_list = [row['filepath'] for row in selected_rows]
+print("Total GNPS files:", len(gnps_list))
+os.system("echo Filepath list generated ...")
+
+# Set the download link for GNPS files
+download_link = "https://massive.ucsd.edu/ProteoSAFe/DownloadResultFile?forceDownload=true&file=f."
+os.system("echo We are downloading now ...")
+
+# Create a defaultdict to store file paths
+file_paths = collections.defaultdict(list)
+
+# Download each file in gnps_list and store it locally under a numbered name
+for index, link in enumerate(gnps_list):
+    download_url = download_link + link
+    r = requests.get(download_url)
+    file_name = str(index) + "_gnps_metadata.tsv"
+    file_paths["sys_name"].append(file_name)
+    file_paths["svr_name"].append(link)
+    with open(file_name, 'wb') as f:
+        f.write(r.content)  # Write the downloaded contents to disk
+
+# Print message to indicate that downloading has been completed successfully
+os.system("echo Download has been completed successfully!")
+
+# Print message to indicate that a TSV file for file paths is being created
+os.system("echo Creating tsv for file paths ...")
+
+# Write local file names and their server paths to a TSV file
+with open("file_paths.tsv", "w") as f:
+    f.write("LocalName\tServerName\n")
+    for local, server in zip(file_paths["sys_name"], file_paths["svr_name"]):
+        f.write(f"{local}\t{server}\n")
+
+# Print message to indicate that the TSV file for file paths has been created
+os.system("echo Tsv file created for path names!!")
+# Print message to indicate that the execution of the code is complete
+os.system("echo EXECUTION COMPLETE! HAVE A GOOD DAY")
\ No newline at end of file
diff --git a/code/gnps_name_matcher.py b/code/gnps_name_matcher.py
new file mode 100644
index 0000000..befca4c
--- /dev/null
+++ b/code/gnps_name_matcher.py
@@ -0,0 +1,66 @@
+# Import necessary libraries
+import collections
+import os
+
+import pandas as pd
+
+# URL pieces for querying the dataset cache for a dataset's ccms_peak file paths
+ccms_peak_link_start = "https://gnps-datasetcache.ucsd.edu/datasette/database/filename.csv?_sort=filepath&collection__exact=ccms_peak&dataset__exact="
+ccms_peak_link_end = "&_size=max"
+
+os.system("echo CCMS Read Done!")
+current_dir = os.getcwd()
+
+# Map each dataset to the set of its ccms_peak file paths
+ccms_filenames = collections.defaultdict(set)
+
+# Read the list of validated metadata files (one name per line, no header)
+df = pd.read_csv(current_dir + '/passed_file_names.tsv', delimiter='\t', header=None, names=['Name'])
+passed_file_names = df['Name'].tolist()
+
+os.system("echo Iterating through rows now")
+merged_rows = {}
+
+# Paths that have already been matched; a second match makes a path ambiguous
+visit = set()
+for file in passed_file_names:
+    print("Working on ", file)
+    df = pd.read_csv(current_dir + "/" + file, delimiter='\t')
+    try:
+        # Get the dataset accession from the first row of the 'MassiveID' column
+        dataset = df['MassiveID'].iloc[0]
+        ccms_df = pd.read_csv(ccms_peak_link_start + dataset + ccms_peak_link_end)
+    except TypeError:
+        # The MassiveID was missing, so the query URL could not be built
+        print(f"Skipping file {file} due to a TypeError.")
+        continue
+
+    # Collect the ccms_peak file paths for this dataset
+    for index, row in ccms_df.iterrows():
+        filepath = row["filepath"]
+        ccms_filenames[dataset].add(filepath)
+
+    # Match each metadata row to a ccms_peak path by filename suffix
+    for index, row in df.iterrows():
+        filename = row["filename"]
+        # Also try the name with an upper-cased extension, e.g. .mzml -> .mzML
+        filename2 = row["filename"][:-3] + "ML"
+        for key in ccms_filenames[dataset]:
+            if key.endswith(filename) or key.endswith(filename2):
+                if key in visit:
+                    # The path matched more than one row, so drop it as ambiguous
+                    merged_rows[key] = []
+                else:
+                    visit.add(key)
+                    merged_rows[key] = [key] + list(row)
+    print("Worked on ", file)
+
+os.system("echo Merged Rows list complete")
+
+# Keep only the unambiguous matches
+non_empty_values = [v for v in merged_rows.values() if v]
+print("Length of Entries in Final file -> ", len(non_empty_values))
+
+# Save the matched rows to a TSV file without column names
+fnt = pd.DataFrame(non_empty_values)
+fnt.to_csv('check.tsv', sep='\t', header=False, index=False)
diff --git a/code/gnps_validator.py b/code/gnps_validator.py
new file mode 100644
index 0000000..692dea0
--- /dev/null
+++ b/code/gnps_validator.py
@@ -0,0 +1,44 @@
+# Import necessary libraries
+import os
+
+import pandas as pd
+
+# Print message to indicate importing is done
+os.system("echo Importing Done ...")
+current_dir = os.getcwd()
+
+# Read the TSV file containing file paths into a pandas DataFrame
+df = pd.read_csv(current_dir + '/file_paths.tsv', delimiter='\t')
+
+# Get a list of all the values in the first column (local file names)
+file_names = df.iloc[:, 0].tolist()
+
+# List to store names of files that have passed validation
+passed_file_names = []
+
+# Locate metadata_validator.py, which lives in the same directory as this script
+validator_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "metadata_validator.py")
+
+# Loop through each file in the list of file names and validate each one
+os.system("echo Validating Files now ...")
+for file_name in file_names:
+    # Call the metadata_validator.py script and pass the file name as an argument
+    output = os.popen("python3 " + validator_path + " " + current_dir + '/' + file_name).read()
+    # The validator prints "Success" when a file passes validation
+    if output.startswith("Success"):
+        passed_file_names.append(file_name)
+
+# Print message to indicate that validation is complete
+os.system("echo Validation Completed !")
+
+# Convert the list of passed file names to a pandas DataFrame
+passed = pd.DataFrame(passed_file_names)
+
+# Print message to indicate that a TSV file is being created for path names
+os.system("echo Now creating tsv file for Path names ...")
+
+# Write the passed file names DataFrame to a TSV file (no header, no index)
+passed.to_csv('passed_file_names.tsv', sep='\t', index=False, header=False)
+
+# Print message to indicate that the TSV file has been created and the script is complete
+os.system("echo TSV File created! Have a good day.")
diff --git a/code/metadata_validator.py b/code/metadata_validator.py
index 2dc6c12..b12f18e 100644
--- a/code/metadata_validator.py
+++ b/code/metadata_validator.py
@@ -151,6 +151,12 @@ def perform_validation(filename):
     my_validator = Vlad(source=LocalFile(filename),delimiter="\t",ignore_missing_validators=True,validators=validators)
     passes_validation = my_validator.validate()
 
+    # Report the overall result so callers (e.g. gnps_validator.py) can parse it
+    if passes_validation:
+        print("Success")
+    else:
+        print("Fail")
 
     errors_list = []
     for column in my_validator.failures:
@@ -227,4 +233,4 @@ def main():
         # print(row)
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/code/nextflow_sid.nf b/code/nextflow_sid.nf
new file mode 100644
index 0000000..e91c186
--- /dev/null
+++ b/code/nextflow_sid.nf
@@ -0,0 +1,34 @@
+#!/usr/bin/env nextflow
+nextflow.enable.dsl=2
+
+params.input = "README.md"
+
+TOOL_FOLDER = "$baseDir"
+
+process processData {
+    publishDir "./nf_output", mode: 'copy'
+
+    conda "$TOOL_FOLDER/conda_env.yml"
+
+    input:
+    file input
+
+    output:
+    file '*_gnps_metadata.tsv'
+    file 'file_paths.tsv'
+    file 'passed_file_names.tsv'
+    file 'check.tsv'
+
+    """
+    python $TOOL_FOLDER/gnps_downloader.py $input
+    python $TOOL_FOLDER/gnps_validator.py $input passed_file_names.tsv
+    python $TOOL_FOLDER/gnps_name_matcher.py $input check.tsv
+    """
+}
+
+workflow {
+    data = Channel.fromPath(params.input)
+    processData(data)
+}
+
+//
\ No newline at end of file