Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/remove pandas #8

Merged
merged 4 commits on Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions d3b_dff_cli/modules/dewrangle/download_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ def download_job(jobid, token=None):
def main(args):
"""Main function."""

- status, job_df = download_job(args.jobid)
+ status, job_res = download_job(args.jobid)
if status == "Complete":
- job_df.to_csv(args.outfile)
+ with open(args.outfile, "w") as f:
+ for line in job_res:
+ f.write("%s\n" % ",".join(line))
else:
print("Job incomplete, please check again later.")
33 changes: 12 additions & 21 deletions d3b_dff_cli/modules/dewrangle/helper_functions.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
"""Dewrangle helper functions"""

import os
import sys
import traceback
import configparser
import requests
import pandas as pd
from gql import gql, Client
from gql.transport.aiohttp import AIOHTTPTransport
from datetime import datetime
Expand Down Expand Up @@ -592,22 +589,6 @@ def get_job_info(jobid, client=None):
return result


- def request_to_df(url, **kwargs):
- """Call api and return response as a pandas dataframe."""
- my_data = []
- with requests.get(url, **kwargs) as response:
- # check if the request was successful
- if response.status_code == 200:
- for line in response.iter_lines():
- my_data.append(line.decode().split(","))
- else:
- print(f"Failed to fetch the CSV. Status code: {response.status_code}")
-
- my_cols = my_data.pop(0)
- df = pd.DataFrame(my_data, columns=my_cols)
- return df


def download_job_result(jobid, client=None, api_key=None):
"""Check if a job is complete, download results if it is.
If the job is a list and hash job, only download the hash result."""
Expand All @@ -616,7 +597,7 @@ def download_job_result(jobid, client=None, api_key=None):

job_status = None

- job_result = None
+ job_result = []

job_info = get_job_info(jobid, client)

Expand All @@ -643,7 +624,17 @@ def download_job_result(jobid, client=None, api_key=None):
if child_job["operation"] == "VOLUME_HASH":
jobid = child_job["id"]
url = endpoint + jobid + "/result"
- job_result = request_to_df(url, headers=req_header, stream=True)
+
+ # Query Dewrangle REST API and get the job results
+ with requests.get(url, headers=req_header, stream=True) as response:
+ # check if the request was successful
+ if response.status_code == 200:
+ for line in response.iter_lines():
+ job_result.append(line.decode().split(","))
+
+ else:
+ print(f"Failed to fetch the CSV. Status code: {response.status_code}")
+
else:
print("Job type {} does not have results to download".format(job_type))

Expand Down
1 change: 0 additions & 1 deletion d3b_dff_cli/modules/validation/check_manifest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import json
- import pandas as pd
import argparse
import csv

Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ graphql-core==3.2.3
idna==3.6
multidict==6.0.5
numpy==1.24.4
- pandas==2.0.3
pysam==0.22.0
python-dateutil==2.8.2
pytz==2023.3.post1
Expand Down