Commit 75dbd31

Merge pull request #11 from uc-cdis/submit_file
added submit_file script for submitting structured data with error ha…
2 parents 8e89d4a + 7029877 commit 75dbd31

File tree: 2 files changed (+242, −6 lines)

gen3/submission.py

Lines changed: 240 additions & 6 deletions
@@ -1,8 +1,18 @@
 import json
 import requests
+import pandas as pd
+import os


-class Gen3SubmissionQueryError(Exception):
+class Gen3Error(Exception):
+    pass
+
+
+class Gen3SubmissionQueryError(Gen3Error):
+    pass
+
+
+class Gen3UserError(Gen3Error):
     pass


@@ -37,7 +47,7 @@ def __export_file(self, filename, output):
         outfile = open(filename, "w")
         outfile.write(output)
         outfile.close
-        print("\nOutput written to file: "+filename)
+        print("\nOutput written to file: " + filename)

     def query(self, query_txt, variables=None, max_tries=1):
         """Execute a GraphQL query against a data commons.
@@ -93,13 +103,17 @@ def export_record(self, program, project, uuid, fileformat, filename=None):
         >>> Gen3Submission.export_record("DCF", "CCLE", "d70b41b9-6f90-4714-8420-e043ab8b77b9", "json", filename="DCF-CCLE_one_record.json")

         """
-        assert fileformat in ["json","tsv"],"File format must be either 'json' or 'tsv'"
+        assert fileformat in [
+            "json",
+            "tsv",
+        ], "File format must be either 'json' or 'tsv'"
         api_url = "{}/api/v0/submission/{}/{}/export?ids={}&format={}".format(
             self._endpoint, program, project, uuid, fileformat
         )
         output = requests.get(api_url, auth=self._auth_provider).text
         if filename is None:
-            if fileformat == 'json': output = json.loads(output)
+            if fileformat == "json":
+                output = json.loads(output)
             return output
         else:
             self.__export_file(filename, output)
@@ -121,13 +135,17 @@ def export_node(self, program, project, node_type, fileformat, filename=None):
         >>> Gen3Submission.export_node("DCF", "CCLE", "sample", "tsv", filename="DCF-CCLE_sample_node.tsv")

         """
-        assert fileformat in ["json","tsv"],"File format must be either 'json' or 'tsv'"
+        assert fileformat in [
+            "json",
+            "tsv",
+        ], "File format must be either 'json' or 'tsv'"
         api_url = "{}/api/v0/submission/{}/{}/export/?node_label={}&format={}".format(
             self._endpoint, program, project, node_type, fileformat
         )
         output = requests.get(api_url, auth=self._auth_provider).text
         if filename is None:
-            if fileformat == 'json': output = json.loads(output)
+            if fileformat == "json":
+                output = json.loads(output)
             return output
         else:
             self.__export_file(filename, output)
@@ -284,3 +302,219 @@ def get_graphql_schema(self):
         output = requests.get(api_url).text
         data = json.loads(output)
         return data
+
+    def submit_file(self, project_id, filename, chunk_size=30, row_offset=0):
+        """Submit data in a spreadsheet file containing multiple records in rows to a Gen3 Data Commons.
+
+        Args:
+            project_id (str): The project_id to submit to.
+            filename (str): The file containing data to submit. The format can be TSV, CSV or XLSX (first worksheet only for now).
+            chunk_size (integer): The number of rows of data to submit for each request to the API.
+            row_offset (integer): The number of rows of data to skip; '0' starts submission from the first row and submits all data.
+
+        Examples:
+            This submits a spreadsheet file containing multiple records in rows to the CCLE project in the sandbox commons.
+
+            >>> Gen3Submission.submit_file("DCF-CCLE","data_spreadsheet.tsv")
+
+        """
+        # Read the file in as a pandas DataFrame
+        f = os.path.basename(filename)
+        if f.lower().endswith(".csv"):
+            df = pd.read_csv(filename, header=0, sep=",", dtype=str).fillna("")
+        elif f.lower().endswith(".xlsx"):
+            xl = pd.ExcelFile(filename, dtype=str)  # load excel file
+            sheet = xl.sheet_names[0]  # sheetname
+            df = xl.parse(sheet)  # save sheet as dataframe
+            converters = {
+                col: str for col in list(df)
+            }  # make sure int isn't converted to float
+            df = pd.read_excel(filename, converters=converters).fillna("")  # remove nan
+        elif filename.lower().endswith((".tsv", ".txt")):
+            df = pd.read_csv(filename, header=0, sep="\t", dtype=str).fillna("")
+        else:
+            raise Gen3UserError("Please upload a file in CSV, TSV, or XLSX format.")
+        df.rename(columns={c: c.lstrip("*") for c in df.columns}, inplace=True)  # remove any leading asterisks in the DataFrame column names
+
+        # Check uniqueness of submitter_ids:
+        if len(list(df.submitter_id)) != len(list(df.submitter_id.unique())):
+            raise Gen3Error(
+                "Warning: file contains duplicate submitter_ids. \nNote: submitter_ids must be unique within a node!"
+            )
+
+        # Chunk the file
print("\nSubmitting {} with {} records.".format(filename, str(len(df))))
347+
program, project = project_id.split("-", 1)
348+
api_url = "{}/api/v0/submission/{}/{}".format(self._endpoint, program, project)
349+
headers = {"content-type": "text/tab-separated-values"}
350+
351+
start = row_offset
352+
end = row_offset + chunk_size
353+
chunk = df[start:end]
354+
355+
count = 0
356+
357+
results = {
358+
"invalid": {}, # these are invalid records
359+
"other": [], # any unhandled API responses
360+
"details": [], # entire API response details
361+
"succeeded": [], # list of submitter_ids that were successfully updated/created
362+
"responses": [], # list of API response codes
363+
}
364+
365+
# Start the chunking loop:
366+
while (start + len(chunk)) <= len(df):
367+
368+
timeout = False
369+
valid_but_failed = []
370+
invalid = []
371+
count += 1
372+
print(
373+
"Chunk {} (chunk size: {}, submitted: {} of {})".format(
374+
str(count),
375+
str(chunk_size),
376+
str(len(results["succeeded"]) + len(results["invalid"])),
377+
str(len(df)),
378+
)
379+
)
380+
381+
+            try:
+                response = requests.put(
+                    api_url,
+                    auth=self._auth_provider,
+                    data=chunk.to_csv(sep="\t", index=False),
+                    headers=headers,
+                ).text
+            except requests.exceptions.ConnectionError as e:
+                results["details"].append(str(e))
+
+            # Handle the API response
+            if (
+                "Request Timeout" in response
+                or "413 Request Entity Too Large" in response
+                or "Connection aborted." in response
+                or "service failure - try again later" in response
+            ):  # time-out, response is not valid JSON at the moment
+
+                print("\t Reducing Chunk Size: {}".format(response))
+                results["responses"].append("Reducing Chunk Size: {}".format(response))
+                timeout = True
+
+            else:
+                try:
+                    json_res = json.loads(response)
+                except json.JSONDecodeError as e:
+                    print(response)
+                    print(str(e))
+                    raise Gen3Error("Unable to parse API response as JSON!")
+
if "message" in json_res and "code" not in json_res:
412+
print(
413+
"\t No code in the API response for Chunk {}: {}".format(
414+
str(count), res.get("message")
415+
)
416+
)
417+
print("\t {}".format(str(res.get("transactional_errors"))))
418+
results["responses"].append(
419+
"Error Chunk {}: {}".format(str(count), res.get("message"))
420+
)
421+
results["other"].append(res.get("transactional_errors"))
422+
423+
elif "code" not in json_res:
424+
print("\t Unhandled API-response: {}".format(response))
425+
results["responses"].append(
426+
"Unhandled API response: {}".format(response)
427+
)
428+
429+
elif json_res["code"] == 200: # success
430+
431+
entities = json_res.get("entities", [])
432+
print("\t Succeeded: {} entities.".format(str(len(entities))))
433+
results["responses"].append(
434+
"Chunk {} Succeeded: {} entities.".format(
435+
str(count), str(len(entities))
436+
)
437+
)
438+
439+
for entity in entities:
440+
sid = entity["unique_keys"][0]["submitter_id"]
441+
results["succeeded"].append(sid)
442+
443+
+                elif (
+                    json_res["code"] == 400
+                    or json_res["code"] == 403
+                    or json_res["code"] == 404
+                ):  # failure
+
+                    entities = json_res.get("entities", [])
+                    print("\tChunk Failed: {} entities.".format(str(len(entities))))
+                    results["responses"].append(
+                        "Chunk {} Failed: {} entities.".format(
+                            str(count), str(len(entities))
+                        )
+                    )
+
+                    for entity in entities:
+                        sid = entity["unique_keys"][0]["submitter_id"]
+                        if entity["valid"]:  # valid but failed
+                            valid_but_failed.append(sid)
+                        else:  # invalid and failed
+                            message = str(entity["errors"])
+                            results["invalid"][sid] = message
+                            invalid.append(sid)
+                    print(
+                        "\tInvalid records in this chunk: {}".format(str(len(invalid)))
+                    )
+
elif json_res["code"] == 500: # internal server error
470+
471+
print("\t Internal Server Error: {}".format(response))
472+
results["responses"].append(
473+
"Internal Server Error: {}".format(response)
474+
)
475+
476+
+            if (
+                len(valid_but_failed) > 0 and len(invalid) > 0
+            ):  # if valid entities failed bc grouped with invalid, retry submission
+                chunk = chunk.loc[
+                    df["submitter_id"].isin(valid_but_failed)
+                ]  # these are records that weren't successful because they were part of a chunk that failed, but are valid and can be resubmitted without changes
+                print(
+                    "Retrying submission of valid entities from failed chunk: {} valid entities.".format(
+                        str(len(chunk))
+                    )
+                )
+
+            elif (
+                len(valid_but_failed) > 0 and len(invalid) == 0
+            ):  # if all entities are valid but submission still failed, probably due to duplicate submitter_ids. Can remove this section once the API response is fixed: https://ctds-planx.atlassian.net/browse/PXP-3065
+                raise Gen3Error(
+                    "Please check your data for correct file encoding, special characters, or duplicate submitter_ids or ids."
+                )
+
+            elif timeout is False:  # get new chunk if didn't timeout
+                start += chunk_size
+                end = start + chunk_size
+                chunk = df[start:end]
+
+            else:  # if timeout, reduce chunk size and retry smaller chunk
+                if chunk_size >= 2:
+                    chunk_size = int(chunk_size / 2)
+                    end = start + chunk_size
+                    chunk = df[start:end]
+                    print(
+                        "Retrying Chunk with reduced chunk_size: {}".format(
+                            str(chunk_size)
+                        )
+                    )
+                    timeout = False
+                else:
+                    raise Gen3Error(
+                        "Submission is timing out. Please contact the Helpdesk."
+                    )
+
print("Finished data submission.")
517+
print("Successful records: {}".format(str(len(set(results["succeeded"])))))
518+
print("Failed invalid records: {}".format(str(len(results["invalid"]))))
519+
520+
return results
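
For context, a minimal usage sketch of the new method. The endpoint URL, credentials file, and spreadsheet name below are illustrative assumptions, not part of this commit; the sketch relies on the SDK's existing Gen3Auth and Gen3Submission classes.

    from gen3.auth import Gen3Auth
    from gen3.submission import Gen3Submission

    endpoint = "https://nci-crdc-demo.datacommons.io"  # example commons URL
    auth = Gen3Auth(endpoint, refresh_file="credentials.json")  # API key file downloaded from the commons
    sub = Gen3Submission(endpoint, auth)

    # Submit a spreadsheet of records, 30 rows per API request.
    results = sub.submit_file("DCF-CCLE", "data_spreadsheet.tsv", chunk_size=30)
    print("Succeeded: {}".format(len(set(results["succeeded"]))))
    print("Invalid: {}".format(len(results["invalid"])))

The returned dict collects per-chunk outcomes under the keys "succeeded", "invalid", "other", "details", and "responses", so a caller can inspect or retry failed records after the run.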

requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -1,2 +1,4 @@
 requests
 -e git+https://github.com/uc-cdis/[email protected]#egg=indexclient
+
+pandas
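
The only functional change to requirements.txt is the new pandas dependency, which submit_file uses to read CSV, TSV, and XLSX input. Assuming a standard pip workflow for this repo, it can be picked up with, for example:

    pip install -r requirements.txt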
