diff --git a/d3b_dff_cli/modules/dewrangle/download_job.py b/d3b_dff_cli/modules/dewrangle/download_job.py index 68ccf5c..e1a866f 100644 --- a/d3b_dff_cli/modules/dewrangle/download_job.py +++ b/d3b_dff_cli/modules/dewrangle/download_job.py @@ -17,8 +17,10 @@ def download_job(jobid, token=None): def main(args): """Main function.""" - status, job_df = download_job(args.jobid) + status, job_res = download_job(args.jobid) if status == "Complete": - job_df.to_csv(args.outfile) + with open(args.outfile, "w") as f: + for line in job_res: + f.write("%s\n" % ",".join(line)) else: print("Job incomplete, please check again later.") diff --git a/d3b_dff_cli/modules/dewrangle/helper_functions.py b/d3b_dff_cli/modules/dewrangle/helper_functions.py index 230a44f..900946c 100644 --- a/d3b_dff_cli/modules/dewrangle/helper_functions.py +++ b/d3b_dff_cli/modules/dewrangle/helper_functions.py @@ -1,11 +1,8 @@ """Dewrangle helper functions""" import os -import sys -import traceback import configparser import requests -import pandas as pd from gql import gql, Client from gql.transport.aiohttp import AIOHTTPTransport from datetime import datetime @@ -592,22 +589,6 @@ def get_job_info(jobid, client=None): return result -def request_to_df(url, **kwargs): - """Call api and return response as a pandas dataframe.""" - my_data = [] - with requests.get(url, **kwargs) as response: - # check if the request was successful - if response.status_code == 200: - for line in response.iter_lines(): - my_data.append(line.decode().split(",")) - else: - print(f"Failed to fetch the CSV. Status code: {response.status_code}") - - my_cols = my_data.pop(0) - df = pd.DataFrame(my_data, columns=my_cols) - return df - - def download_job_result(jobid, client=None, api_key=None): """Check if a job is complete, download results if it is. If the job is a list and hash job, only download the hash result.""" @@ -616,7 +597,7 @@ def download_job_result(jobid, client=None, api_key=None): job_status = None - job_result = None + job_result = [] job_info = get_job_info(jobid, client) @@ -643,7 +624,17 @@ def download_job_result(jobid, client=None, api_key=None): if child_job["operation"] == "VOLUME_HASH": jobid = child_job["id"] url = endpoint + jobid + "/result" - job_result = request_to_df(url, headers=req_header, stream=True) + + # Query Dewrangle REST API and get the job results + with requests.get(url, headers=req_header, stream=True) as response: + # check if the request was successful + if response.status_code == 200: + for line in response.iter_lines(): + job_result.append(line.decode().split(",")) + + else: + print(f"Failed to fetch the CSV. Status code: {response.status_code}") + else: print("Job type {} does not have results to download".format(job_type)) diff --git a/d3b_dff_cli/modules/validation/check_manifest.py b/d3b_dff_cli/modules/validation/check_manifest.py index 79d0c74..08e8c74 100644 --- a/d3b_dff_cli/modules/validation/check_manifest.py +++ b/d3b_dff_cli/modules/validation/check_manifest.py @@ -1,5 +1,4 @@ import json -import pandas as pd import argparse import csv diff --git a/requirements.txt b/requirements.txt index e82c1bb..9a35e83 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,6 @@ graphql-core==3.2.3 idna==3.6 multidict==6.0.5 numpy==1.24.4 -pandas==2.0.3 pysam==0.22.0 python-dateutil==2.8.2 pytz==2023.3.post1