From 18fdbc4847dc438d139e700cddfb77c47f5dc180 Mon Sep 17 00:00:00 2001
From: dzaslavskiy
Date: Thu, 13 Jun 2024 09:31:05 -0400
Subject: [PATCH] changes for export

---
 .github/workflows/deploy-app.yaml |   6 +-
 gdrive/api.py                     |  43 +++++--
 gdrive/drive_client.py            | 180 ++++++++++++++++++++++++++----
 gdrive/export_api.py              | 135 +++++++++++++++++-----
 gdrive/export_client.py           |  88 ++++++++++++++-
 gdrive/main.py                    |   7 +-
 gdrive/settings.py                |  20 ++--
 requirements.txt                  |   3 +
 8 files changed, 407 insertions(+), 75 deletions(-)

diff --git a/.github/workflows/deploy-app.yaml b/.github/workflows/deploy-app.yaml
index 0cc6eb4..267b199 100644
--- a/.github/workflows/deploy-app.yaml
+++ b/.github/workflows/deploy-app.yaml
@@ -34,6 +34,6 @@ jobs:
       - name: Add network policies
         run: |
-          cf add-network-policy gdrive outbound-proxy -s ${{ steps.cf-setup.outputs.target-environment }}-public --protocol tcp --port 8080
-          cf add-network-policy gdrive es-proxy -s ${{ steps.cf-setup.outputs.target-environment }} --protocol tcp --port 8080
-          cf add-network-policy gdrive qualtrix -s ${{ steps.cf-setup.outputs.target-environment }} --protocol tcp --port 8080
\ No newline at end of file
+          cf add-network-policy util outbound-proxy -s prod-public --protocol tcp --port 8080
+          cf add-network-policy util es-proxy -s prod --protocol tcp --port 8080
+          cf add-network-policy util qualtrix -s prod --protocol tcp --port 8080
\ No newline at end of file
diff --git a/gdrive/api.py b/gdrive/api.py
index 2e19aa8..9518651 100644
--- a/gdrive/api.py
+++ b/gdrive/api.py
@@ -53,13 +53,18 @@ async def upload_file(
     parent = drive_client.create_folder(id, settings.ROOT_DIRECTORY)
 
     if zip:
-        with zipfile.ZipFile(stream) as archive:
-            files = archive.filelist
-            for file in files:
-                image = io.BytesIO(archive.read(file))
-                drive_client.upload_basic(
-                    f"{filename}_{file.filename}", parent, image
-                )
+        try:
+            with zipfile.ZipFile(stream) as archive:
+                files = archive.filelist
+                for file in files:
+                    image = io.BytesIO(archive.read(file))
+                    drive_client.upload_basic(
+                        f"{filename}_{file.filename}", parent, image
+                    )
+        except zipfile.BadZipFile as error:
+            drive_client.upload_basic(filename, parent, stream)
+            log.error(f"An error occurred: {error}")
+            response.status_code = status.HTTP_400_BAD_REQUEST
     else:
         drive_client.upload_basic(filename, parent, stream)
 
@@ -85,3 +90,27 @@ async def delete_file(filename, response: Response):
     except HttpError as error:
         log.error(f"An error occurred: {error}")
         response.status_code = error.status_code
+
+
+@router.get("/list")
+async def list_files():
+    """Log the files under the given Drive folder (debug helper)."""
+    s = drive_client.list(count=200, parent="0AFrX3czp_UwZUk9PVA")
+    return s
+
+
+@router.get("/count")
+async def count_files():
+    """Return the number of files under the given Drive folder (debug helper)."""
+    s = drive_client.count(parent="1dz8UklyVsBDLP0wC5HPVSOuKpYGqrNEI")
+    return s
+
+
+@router.post("/move")
+async def move_file(file: str, source: str, dest: str):
+    return drive_client.move(file, source, dest)
+
+
+@router.post("/create_folder")
+async def create_folder(name: str, dest: str):
+    return drive_client.create_folder(name, dest)
diff --git a/gdrive/drive_client.py b/gdrive/drive_client.py
index c5e59c8..937938b 100644
--- a/gdrive/drive_client.py
+++ b/gdrive/drive_client.py
@@ -2,11 +2,13 @@
 import logging
 import json
 import mimetypes
+import time
 from typing import List
 
 from google.oauth2 import service_account
 from googleapiclient.discovery import build
 from googleapiclient.http import MediaIoBaseUpload
+from googleapiclient.errors import HttpError
 
 from gdrive import settings, error
 
@@ -30,14 +32,17 @@ def init():
     log.info(f"Connected to Root Directory {driveId}")
 
 
-def list(count: int = 10, shared: bool = True) -> None:
+def list(count: int = 10, shared: bool = True, parent: str | None = None) -> None:
     """
     Prints the names and ids of the first files the user has access to.
     """
 
+    mq = f"'{parent}' in parents" if parent else ""
+
     results = (
         service.files()
         .list(
+            q=mq,
             pageSize=count,
             fields="*",
             supportsAllDrives=shared,
@@ -63,6 +68,37 @@ def list(count: int = 10, shared: bool = True) -> None:
         log.info(f"No such key: {error} in {item}")
 
 
+def count(shared: bool = True, parent: str | None = None) -> int:
+    """
+    Returns the number of files under the given parent, paging through all results.
+    """
+
+    mq = f"'{parent}' in parents" if parent else ""
+
+    results = __count(mq, "", shared)
+    count = len(results["files"])
+
+    while results.get("nextPageToken", None):
+        results = __count(mq, results["nextPageToken"], shared)
+        count += len(results["files"])
+    return count
+
+
+def __count(query, nextPageToken, shared):
+    results = (
+        service.files()
+        .list(
+            q=query,
+            pageSize=1000,
+            supportsAllDrives=shared,
+            includeItemsFromAllDrives=shared,
+            pageToken=nextPageToken,
+        )
+        .execute()
+    )
+    return results
+
+
 def create_empty_spreadsheet(filename: str, parent_id: str) -> str:
     file_metadata = {
         "name": filename,
@@ -180,17 +216,31 @@ def get_files_by_drive_id(filename: str, drive_id: str):
     Get list of files by filename
     """
 
-    results = (
-        service.files()
-        .list(
-            q=f"name = '{filename}'",
-            corpora="drive",
-            driveId=drive_id,
-            includeTeamDriveItems=True,
-            supportsTeamDrives=True,
-        )
-        .execute()
-    )
+    try:
+        results = (
+            service.files()
+            .list(
+                q=f"name = '{filename}'",
+                corpora="drive",
+                driveId=drive_id,
+                includeTeamDriveItems=True,
+                supportsTeamDrives=True,
+            )
+            .execute()
+        )
+    except HttpError:  # manual retry hack: wait briefly, then try once more
+        time.sleep(2)
+        results = (
+            service.files()
+            .list(
+                q=f"name = '{filename}'",
+                corpora="drive",
+                driveId=drive_id,
+                includeTeamDriveItems=True,
+                supportsTeamDrives=True,
+            )
+            .execute()
+        )
 
     return results["files"]
 
@@ -202,17 +252,35 @@ def get_files_in_folder(id: str) -> List:
     files = []
     page_token = None
     while True:
-        results = (
-            service.files()
-            .list(
-                q=f"'{id}' in parents and trashed=false",
-                supportsAllDrives=True,
-                includeItemsFromAllDrives=True,
-                fields="nextPageToken, files(*)",
-                pageToken=page_token,
-            )
-            .execute()
-        )
+        try:
+            results = (
+                service.files()
+                .list(
+                    q=f"'{id}' in parents and trashed=false",
+                    pageSize=1000,
+                    supportsAllDrives=True,
+                    includeItemsFromAllDrives=True,
+                    fields="nextPageToken, files(*)",
+                    pageToken=page_token,
+                    orderBy="createdTime",
+                )
+                .execute()
+            )
+        except HttpError:  # manual retry hack: wait briefly, then try once more
+            time.sleep(2)
+            results = (
+                service.files()
+                .list(
+                    q=f"'{id}' in parents and trashed=false",
+                    pageSize=1000,
+                    supportsAllDrives=True,
+                    includeItemsFromAllDrives=True,
+                    fields="nextPageToken, files(*)",
+                    pageToken=page_token,
+                    orderBy="createdTime",
+                )
+                .execute()
+            )
         files.extend(results.get("files", []))
         page_token = results.get("nextPageToken")
         if not page_token:
@@ -230,3 +298,73 @@ def delete_file(id: str) -> None:
 
 def export(id: str) -> any:
     return service.files().get_media(fileId=id).execute()
+
+
+def copy(fileId: str, new_location: str):
+    new_file = {"parents": [new_location]}
+
+    result = (
+        service.files()
+        .copy(body=new_file, fileId=fileId, supportsAllDrives=True)
+        .execute()
+    )
+
+    return result
+
+
+def search(responseId: str, source_drive: str):
+    """
+    Finds analytics.json files in the source drive whose full text contains the given responseId.
+    """
+
+    mq = f"fullText contains '{responseId}' and name = 'analytics.json'"
+
+    results = (
+        service.files()
+        .list(
+            q=mq,
+            driveId=source_drive,
+            corpora="drive",
+            fields="*",
+            supportsAllDrives=True,
+            includeItemsFromAllDrives=True,
+        )
+        .execute()
+    )
+    items = results.get("files", [])
+    return items
+
+
+def exists(filename: str, parent: str):
+    # , source_drive: str
+    """
+    Returns any files named filename under the given parent (empty list if none).
+    """
+
+    results = (
+        service.files()
+        .list(
+            q=f"'{parent}' in parents and name = '{filename}'",
+            # driveId=source_drive,
+            # corpora='drive',
+            fields="*",
+            supportsAllDrives=True,
+            includeItemsFromAllDrives=True,
+        )
+        .execute()
+    )
+    # return results["files"]
+    items = results.get("files", [])
+    return items
+
+
+def move(fileId: str, source: str, dest: str):
+    result = (
+        service.files()
+        .update(
+            addParents=dest, removeParents=source, fileId=fileId, supportsAllDrives=True
+        )
+        .execute()
+    )
+
+    return result
diff --git a/gdrive/export_api.py b/gdrive/export_api.py
index 4de53f6..d26d0ff 100644
--- a/gdrive/export_api.py
+++ b/gdrive/export_api.py
@@ -2,14 +2,17 @@
 gdrive rest api
 """
 
+from datetime import datetime
 import io
 import json
 import logging
 import sys
+import time
 
 import fastapi
 from pydantic import BaseModel, Field
 from fastapi import BackgroundTasks, responses
+import requests
 
 from gdrive import export_client, drive_client, sheets_client, settings, error
 from gdrive.database import database, crud, models
@@ -19,16 +22,28 @@
 router = fastapi.APIRouter()
 
 
+@router.get("/dead")
+async def dead(interactionId):
+    export_client.export_dead()
+
+
 @router.post("/export")
 async def upload_file(interactionId):
-    log.info(f"Export interaction {interactionId}")
-    export_data = export_client.export(interactionId)
-    export_bytes = io.BytesIO(
-        export_client.codename(json.dumps(export_data, indent=2)).encode()
-    )
     parent = drive_client.create_folder(interactionId, settings.ROOT_DIRECTORY)
-    drive_client.upload_basic("analytics.json", parent, export_bytes)
-    log.info(f"Uploading {sys.getsizeof(export_bytes)} bytes to drive folder {parent}")
+    files = drive_client.exists("analytics.json", parent)
+    if files:
+        log.warning(f"analytics.json for {interactionId} already exists in {parent}")
+    else:
+        log.info(f"Export interaction {interactionId}")
+        export_data = export_client.export(interactionId)
+        export_bytes = io.BytesIO(
+            export_client.codename(json.dumps(export_data, indent=2)).encode()
+        )
+
+        drive_client.upload_basic("analytics.json", parent, export_bytes)
+        log.info(
+            f"Uploading {sys.getsizeof(export_bytes)} bytes to drive folder {parent}"
+        )
 
 
 class ParticipantModel(BaseModel):
@@ -66,7 +81,54 @@ async def survey_upload_response(
     )
 
 
-async def survey_upload_response_task(request):
+@router.post("/manual-survey-export")
+async def manual_survey_upload_response(
+    responseId: str, surveyId: str, background_tasks: BackgroundTasks
+):
+    request = SurveyParticipantModel(surveyId=surveyId, responseId=responseId)
+    await survey_upload_response_task(request, fetchParticipantInfo=True)
+
+
+@router.post("/bulk-citr-export")
+async def bulk_citer_export(ids: list[str]):
+    i = 0
+    j = len(ids)
+    for id in ids:
+        i += 1
+        log.info(f"{i} / {j} {id}")
+
+        # survey export
+        # request = SurveyParticipantModel(surveyId="SV_9H7s2QQiAWFpQIS", responseId=id)
+        # await survey_upload_response_task(request)
+
+        # citer export
+        await citer_export(id)
+
+
+@router.post("/citr-export/{responseId}")
+async def citer_export(responseId):
+    citer_folder = "1Zk7xFackMpN1-4Nl8x1M9GuI4peC5Ymv"
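+    # hard-coded Drive ids: the citr export destination folder above, the shared source drive below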
+    source_drive = "0AFrX3czp_UwZUk9PVA"
+
+    files = drive_client.search(responseId, source_drive)
+
+    # folder_name = datetime.strftime( datetime.now(),'%m/%d')
+    folder_name = "incompletes"
+
+    parent = "1dz8UklyVsBDLP0wC5HPVSOuKpYGqrNEI"  # drive_client.create_folder(folder_name, citer_folder)
+
+    if not files:
+        log.warning(f"no analytics for {responseId}")
+    else:
+        responseId_folder = drive_client.create_folder(responseId, parent)
+
+        for file in files:
+            drive_client.copy(file["id"], responseId_folder)
+            # TimeoutError has been seen here on large copies
+
+
+async def survey_upload_response_task(request, fetchParticipantInfo=False):
     """
     Background task that handles qualtrics response fetching and exporting
     """
@@ -88,30 +149,42 @@
     # throws exception in get_qualtrics_response
     survey_resp = response["response"]
 
-    if request.participant:
-        participant = request.participant
-        upload_result = sheets_client.upload_participant(
-            participant.first,
-            participant.last,
-            participant.email,
-            request.responseId,
-            participant.time,
-            participant.date,
-            survey_resp["ethnicity"],
-            ", ".join(
-                survey_resp["race"]
-            ),  # Can have more than one value in a list
-            survey_resp["gender"],
-            survey_resp["age"],
-            survey_resp["income"],
-            survey_resp["skin_tone"],
-        )
-
-        result_sheet_id = upload_result["spreadsheetId"]
-        if upload_result:
-            log.info(
-                f"Uploaded response: {request.responseId} to completions spreadsheet {result_sheet_id}"
-            )
+    if request.participant or fetchParticipantInfo:
+        if fetchParticipantInfo:
+            pdict = export_client.get_qualtrics_contact(survey_resp["contactId"])[
+                "result"
+            ]
+            participant = ParticipantModel(
+                first=pdict["firstName"],
+                last=pdict["lastName"],
+                email=pdict["email"],
+                time=pdict["embeddedData"]["time"],
+                date=pdict["embeddedData"]["Date"],
+            )
+        else:
+            participant = request.participant
+        upload_result = sheets_client.upload_participant(
+            participant.first,
+            participant.last,
+            participant.email,
+            request.responseId,
+            participant.time,
+            participant.date,
+            survey_resp["ethnicity"],
+            ", ".join(
+                survey_resp["race"]
+            ),  # Can have more than one value in a list
+            survey_resp["gender"],
+            survey_resp["age"],
+            survey_resp["income"],
+            survey_resp["skin_tone"],
+        )
+
+        if upload_result:
+            result_sheet_id = upload_result["spreadsheetId"]
+            log.info(
+                f"Uploaded response: {request.responseId} to completions spreadsheet {result_sheet_id}"
+            )
 
     crud.create_participant(
         models.ParticipantModel(
diff --git a/gdrive/export_client.py b/gdrive/export_client.py
index 03df509..ed97b2b 100644
--- a/gdrive/export_client.py
+++ b/gdrive/export_client.py
@@ -1,6 +1,7 @@
 import logging
 import json
 import re
+from types import SimpleNamespace
 
 import requests
 from opensearchpy import OpenSearch
@@ -151,7 +152,7 @@ def export_response(responseId, survey_response):
     )
 
     results_update = es.update_by_query(
-        index="_all", body=query_response_data, refresh=True
+        index="_all", body=query_response_data, refresh=True, scroll="18s"
     )
 
     return list(map(lambda id: id["match"]["interactionId"], interactionIds_match))
@@ -174,6 +175,91 @@ def get_qualtrics_response(surveyId: str, responseId: str):
     return r.json()
 
 
+def get_qualtrics_contact(contactId: str):
+    url = f"http://{settings.QUALTRICS_APP_URL}:{settings.QUALTRICS_APP_PORT}/contact/{contactId}"
+
+    r = requests.post(
+        url,
+        timeout=30,  # qualtrics microservice retries as it waits for response to become available
+    )
+    if r.status_code != 200:
+        raise error.ExportError(f"No contact response found for contactId: {contactId}")
+    return r.json()
+
+
+def export_dead():
+    es = OpenSearch(
+        hosts=[{"host": settings.ES_HOST, "port": settings.ES_PORT}], timeout=300
+    )
+
+    query_interactionId = {
+        # "size": 1000,
+        "query": {
+            "bool": {
+                "must": [
+                    {
+                        "match_phrase": {
+                            "properties.outcomeType.value": "survey_response"
+                        }
+                    },
+                    {"match": {"properties.outcomeDescription.value": "N/A"}},
+                ]
+            }
+        },
+        "_source": ["interactionId"],
+    }
+
+    results_interactionId = es.search(
+        body=json.dumps(query_interactionId), index="_all"
+    )
+
+    interactionIds_match = list(
+        map(
+            lambda res: {
+                "match": {"interactionId": f'{res["_source"]["interactionId"]}'}
+            },
+            results_interactionId["hits"]["hits"],
+        )
+    )
+
+    query_interactionId = {
+        # "size": 1000,
+        "query": {
+            "bool": {
+                "must": [
+                    {"match_phrase": {"properties.outcomeType.value": "survey_data"}}
+                ],
+                "should": interactionIds_match,
+            }
+        }
+    }
+
+    responseIds_result = es.search(body=json.dumps(query_interactionId), index="_all")
+
+    for item in responseIds_result["hits"]["hits"]:
+        survey_data = json.loads(
+            item["_source"]["properties"]["outcomeDescription"]["value"]
+        )
+        responseId = survey_data["responseId"]
+
+        # query qualtrics for the survey result
+        # if present and/or older than 24 hours
+        # run export on the data
+
+        logging.info(f"found dead survey response {responseId}")
+
+    # if len(interactionIds_match) == 0:
+    #     raise error.ExportError(
+    #         f"No flow interactionId match for responseId: {responseId}"
+    #     )
+
+    # results_update = es.update_by_query(
+    #     index="_all", body=query_response_data, refresh=True
+    # )
+
+    # return list(map(lambda id: id["match"]["interactionId"], interactionIds_match))
+
+
 def get_all_InteractionIds(responseId):
     es = OpenSearch(
         hosts=[{"host": settings.ES_HOST, "port": settings.ES_PORT}], timeout=300
diff --git a/gdrive/main.py b/gdrive/main.py
index b116022..095eddc 100644
--- a/gdrive/main.py
+++ b/gdrive/main.py
@@ -4,9 +4,13 @@
 
 import fastapi
 import starlette_prometheus
+import logging
 
+from . import api, export_api, analytics_api, debug_api, settings
 
-from . import api, export_api, analytics_api, settings
 
+log = logging.getLogger()
+log.addHandler(logging.FileHandler("errors.log"))
 
 app = fastapi.FastAPI()
@@ -16,3 +20,4 @@
 app.include_router(api.router)
 app.include_router(export_api.router)
 app.include_router(analytics_api.router)
+app.include_router(debug_api.router)
diff --git a/gdrive/settings.py b/gdrive/settings.py
index e45d42e..16ed995 100644
--- a/gdrive/settings.py
+++ b/gdrive/settings.py
@@ -21,9 +21,7 @@
     "https://www.googleapis.com/auth/drive",
     "https://www.googleapis.com/auth/spreadsheets",
 ]
-
-SERVICE_ACCOUNT_FILE = "credentials.json"
-
+SERVICE_ACCOUNT_FILE = "../identity-idva-utils/secrets/gdrive/credentials-prod.json"
 ROOT_DIRECTORY = ""
 CODE_NAMES = None
 CREDENTIALS = None
@@ -31,11 +29,11 @@
 ANALYTICS_PROPERTY_ID = None
 ANALYTICS_CREDENTIALS = None
 
-ES_HOST = os.getenv("ES_HOST")
-ES_PORT = os.getenv("ES_PORT")
+ES_HOST = os.getenv("ES_HOST", "localhost")
+ES_PORT = os.getenv("ES_PORT", "8090")
 
-QUALTRICS_APP_URL = os.getenv("QUALTRICS_APP_URL")
-QUALTRICS_APP_PORT = os.getenv("QUALTRICS_APP_PORT")
+QUALTRICS_APP_URL = os.getenv("QUALTRICS_APP_URL", "localhost")
+QUALTRICS_APP_PORT = os.getenv("QUALTRICS_APP_PORT", "8060")
 
 RAW_COMPLETIONS_SHEET_NAME = os.getenv("GDRIVE_RAW_COMPLETIONS_SHEET_NAME", "Sheet1")
 
@@ -48,15 +46,15 @@
 config = {}
 if vcap_services:
     user_services = json.loads(vcap_services)["user-provided"]
-    DB_URI = json.loads(vcap_services)["aws-rds"][0]["credentials"]["uri"]
-    # Sqlalchemy requires 'postgresql' as the protocol
-    DB_URI = DB_URI.replace("postgres://", "postgresql://", 1)
     for service in user_services:
         if service["name"] == "gdrive":
             log.info("Loading credentials from env var")
             config = service["credentials"]
             break
+    db = json.loads(vcap_services)["aws-rds"][0]["credentials"]["uri"]
+    # Sqlalchemy requires 'postgresql' as the protocol
+    DB_URI = db.replace("postgres://", "postgresql://", 1)
 else:
     with open(SERVICE_ACCOUNT_FILE) as file:
         log.info("Loading credentials from creds file")
@@ -72,4 +70,4 @@
 except (json.JSONDecodeError, KeyError, FileNotFoundError) as err:
     log.warning("Unable to load credentials from VCAP_SERVICES")
-    log.debug("Error: %s", str(err))
+    log.error("Error: %s", str(err))
diff --git a/requirements.txt b/requirements.txt
index 54279f9..1e52f61 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,3 +14,6 @@ pandas==2.2.2
 sqlalchemy==1.4.*
 psycopg2==2.9.9
 alembic==1.13.1
+wheel
+gspread
+jinja2
\ No newline at end of file