Nomic Code Clean-up #238

Merged
merged 11 commits on Mar 29, 2024
started updates for convo logging
star-nox committed Mar 26, 2024
commit fc69d4ccf911d074f0be82510bdaff42b6d75b5d
22 changes: 9 additions & 13 deletions ai_ta_backend/beam/nomic_logging.py
@@ -15,6 +15,8 @@
supabase_url=os.getenv('SUPABASE_URL'), # type: ignore
supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore

NOMIC_MAP_NAME_PREFIX = 'Document Map for '

## -------------------------------- DOCUMENT MAP FUNCTIONS --------------------------------- ##

def create_document_map(course_name: str):
@@ -33,21 +35,15 @@ def create_document_map(course_name: str):
"""
print("in create_document_map()")
nomic.login(os.getenv('NOMIC_API_KEY'))
NOMIC_MAP_NAME_PREFIX = 'Document Map for '

# initialize supabase
supabase_client = supabase.create_client( # type: ignore
supabase_url=os.getenv('SUPABASE_URL'), # type: ignore
supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore


try:
# check if map exists
response = supabase_client.table("projects").select("doc_map_id").eq("course_name", course_name).execute()
response = SUPABASE_CLIENT.table("projects").select("doc_map_id").eq("course_name", course_name).execute()
if response.data:
return "Map already exists for this course."

# fetch relevant document data from Supabase
response = supabase_client.table("documents").select("id",
response = SUPABASE_CLIENT.table("documents").select("id",
count="exact").eq("course_name",
course_name).order('id',
desc=False).execute()
@@ -71,7 +67,7 @@ def create_document_map(course_name: str):
# iteratively query in batches of 25
while curr_total_doc_count < total_doc_count:

response = supabase_client.table("documents").select(
response = SUPABASE_CLIENT.table("documents").select(
"id, created_at, s3_path, url, base_url, readable_filename, contexts").eq("course_name", course_name).gte(
'id', first_id).order('id', desc=False).limit(25).execute()
df = pd.DataFrame(response.data)
@@ -105,7 +101,7 @@ def create_document_map(course_name: str):
project_id = project.id
last_id = int(final_df['id'].iloc[-1])
project_info = {'course_name': course_name, 'doc_map_id': project_id, 'last_uploaded_doc_id': last_id}
update_response = supabase_client.table("projects").insert(project_info).execute()
update_response = SUPABASE_CLIENT.table("projects").insert(project_info).execute()
print("Response from supabase: ", update_response)

else:
@@ -118,7 +114,7 @@ def create_document_map(course_name: str):
# update the last uploaded id in supabase
last_id = int(final_df['id'].iloc[-1])
info = {'last_uploaded_doc_id': last_id}
update_response = supabase_client.table("projects").update(info).eq("course_name", course_name).execute()
update_response = SUPABASE_CLIENT.table("projects").update(info).eq("course_name", course_name).execute()
print("Response from supabase: ", update_response)

# reset variables
@@ -148,7 +144,7 @@ def create_document_map(course_name: str):
last_id = int(final_df['id'].iloc[-1])
project_info = {'last_uploaded_doc_id': last_id}
print("project_info: ", project_info)
update_response = supabase_client.table("projects").update(project_info).eq("course_name", course_name).execute()
update_response = SUPABASE_CLIENT.table("projects").update(project_info).eq("course_name", course_name).execute()
print("Response from supabase: ", update_response)


20 changes: 17 additions & 3 deletions ai_ta_backend/database/sql.py
@@ -73,9 +73,14 @@ def getAllFromTableForDownloadType(self, course_name: str, download_type: str, f

return response

def getAllConversationsBetweenIds(self, course_name: str, first_id: int, last_id: int):
return self.supabase_client.table("llm-convo-monitor").select("*").eq("course_name", course_name).gte(
'id', first_id).lte('id', last_id).order('id', desc=False).limit(25).execute()
def getAllConversationsBetweenIds(self, course_name: str, first_id: int, last_id: int, limit: int = 50):
if last_id == 0:
return self.supabase_client.table("llm-convo-monitor").select("*").eq("course_name", course_name).gte(
'id', first_id).order('id', desc=False).limit(limit).execute()
else:
return self.supabase_client.table("llm-convo-monitor").select("*").eq("course_name", course_name).gte(
'id', first_id).lte('id', last_id).order('id', desc=False).limit(limit).execute()


def getDocsForIdsGte(self, course_name: str, first_id: int, fields: str = "*", limit: int = 100):
return self.supabase_client.table("documents").select(fields).eq("course_name", course_name).gte(
@@ -86,3 +91,12 @@ def insertProjectInfo(self, project_info):

def getAllFromLLMConvoMonitor(self, course_name: str):
return self.supabase_client.table("llm-convo-monitor").select("*").eq("course_name", course_name).execute()

def getCountFromLLMConvoMonitor(self, course_name: str):
return self.supabase_client.table("llm-convo-monitor").select("id", count='exact').eq("course_name", course_name).execute()

def getDocMapFromProjects(self, course_name: str):
return self.supabase_client.table("projects").select("doc_map_id").eq("course_name", course_name).execute()

def getConvoMapFromProjects(self, course_name: str):
return self.supabase_client.table("projects").select("convo_map_id").eq("course_name", course_name).execute()
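
The helpers added above are thin wrappers around Supabase queries. A hedged sketch of how they might be combined for batched conversation fetching — the `sql_db` instance name and the batch-advance logic are illustrative, not part of this commit:

```python
# Assumes `sql_db` is an instance of the SQL service class edited above
# and `course_name` is already defined.
count_resp = sql_db.getCountFromLLMConvoMonitor(course_name)
total = count_resp.count or 0

if total == 0:
    print("No conversations found for this course.")
else:
    first_id = count_resp.data[0]['id']
    fetched = 0
    batches = []
    while fetched < total:
        # last_id == 0 means "no upper bound" in the new method signature
        batch = sql_db.getAllConversationsBetweenIds(course_name, first_id, 0, 50)
        if not batch.data:
            break
        batches.append(batch.data)
        fetched += len(batch.data)
        first_id = batch.data[-1]['id'] + 1  # advance past the last row seen
```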
14 changes: 14 additions & 0 deletions ai_ta_backend/main.py
@@ -197,6 +197,20 @@ def createDocumentMap(service: NomicService):
response.headers.add('Access-Control-Allow-Origin', '*')
return response

@app.route('/createConversationMap', methods=['GET'])
def createConversationMap(service: NomicService):
course_name: str = request.args.get('course_name', default='', type=str)

if course_name == '':
# proper web error "400 Bad request"
abort(400, description=f"Missing required parameter: 'course_name' must be provided. Course name: `{course_name}`")

map_id = service.create_conversation_map(course_name)

response = jsonify(map_id)
response.headers.add('Access-Control-Allow-Origin', '*')
return response


@app.route('/onResponseCompletion', methods=['POST'])
def logToNomic(service: NomicService, flaskExecutor: ExecutorInterface):
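
The new `/createConversationMap` route above takes `course_name` as a query parameter and returns the created map id as JSON. A minimal client-side example, assuming the Flask app is reachable locally on port 5000 (the host, port, and course name are placeholders):

```python
import requests

resp = requests.get(
    "http://localhost:5000/createConversationMap",
    params={"course_name": "example-course"},
)
print(resp.status_code, resp.json())
```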
436 changes: 292 additions & 144 deletions ai_ta_backend/service/nomic_service.py
@@ -20,7 +20,6 @@
'Project is currently indexing and cannot ingest new datums. Try again later.'
]


def giveup_hdlr(e):
"""
Function to handle giveup conditions in backoff decorator
@@ -252,157 +251,309 @@ def get_nomic_map(self, course_name: str, type: str):
self.sentry.capture_exception(e)
return {"map_id": None, "map_link": None}

def create_nomic_map(self, course_name: str, log_data: list):
# def create_nomic_map(self, course_name: str, log_data: list):
# """
# Creates a Nomic map for new courses and those which previously had < 20 queries.
# 1. fetches supabase conversations for course
# 2. appends current embeddings and metadata to it
# 2. creates map if there are at least 20 queries
# """
# nomic.login(os.environ['NOMIC_API_KEY']) # login during start of flask app
# NOMIC_MAP_NAME_PREFIX = 'Conversation Map for '

# print(f"in create_nomic_map() for {course_name}")

# try:
# # fetch all conversations with this new course (we expect <=20 conversations, because otherwise the map should be made already)

# response = self.sql.getAllFromLLMConvoMonitor(course_name)
# data = response.data
# df = pd.DataFrame(data)

# if len(data) < 19:
# return None
# else:
# # get all queries for course and create metadata
# user_queries = []
# metadata = []
# i = 1
# conversation_exists = False

# # current log details
# log_messages = log_data['conversation']['messages'] # type: ignore
# log_user_email = log_data['conversation']['user_email'] # type: ignore
# log_conversation_id = log_data['conversation']['id'] # type: ignore

# for _index, row in df.iterrows():
# user_email = row['user_email']
# created_at = pd.to_datetime(row['created_at']).strftime('%Y-%m-%d %H:%M:%S')
# convo = row['convo']
# messages = convo['messages']

# first_message = messages[0]['content']
# if isinstance(first_message, list):
# first_message = first_message[0]['text']

# user_queries.append(first_message)

# # create metadata for multi-turn conversation
# conversation = ""
# for message in messages:
# # string of role: content, role: content, ...
# if message['role'] == 'user': # type: ignore
# emoji = "🙋 "
# else:
# emoji = "🤖 "

# if isinstance(message['content'], list):
# text = message['content'][0]['text']
# else:
# text = message['content']

# conversation += "\n>>> " + emoji + message['role'] + ": " + text + "\n"

# # append current chat to previous chat if convo already exists
# if convo['id'] == log_conversation_id:
# conversation_exists = True

# for m in log_messages:
# if m['role'] == 'user': # type: ignore
# emoji = "🙋 "
# else:
# emoji = "🤖 "

# if isinstance(m['content'], list):
# text = m['content'][0]['text']
# else:
# text = m['content']
# conversation += "\n>>> " + emoji + m['role'] + ": " + text + "\n"

# # adding modified timestamp
# current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# # add to metadata
# metadata_row = {
# "course": row['course_name'],
# "conversation": conversation,
# "conversation_id": convo['id'],
# "id": i,
# "user_email": user_email,
# "first_query": first_message,
# "created_at": created_at,
# "modified_at": current_time
# }
# metadata.append(metadata_row)
# i += 1

# # add current log as a new data point if convo doesn't exist
# if not conversation_exists:
# user_queries.append(log_messages[0]['content'])
# conversation = ""
# for message in log_messages:
# if message['role'] == 'user':
# emoji = "🙋 "
# else:
# emoji = "🤖 "

# if isinstance(message['content'], list):
# text = message['content'][0]['text']
# else:
# text = message['content']
# conversation += "\n>>> " + emoji + message['role'] + ": " + text + "\n"

# # adding timestamp
# current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# metadata_row = {
# "course": course_name,
# "conversation": conversation,
# "conversation_id": log_conversation_id,
# "id": i,
# "user_email": log_user_email,
# "first_query": log_messages[0]['content'],
# "created_at": current_time,
# "modified_at": current_time
# }
# metadata.append(metadata_row)

# metadata = pd.DataFrame(metadata)
# embeddings_model = OpenAIEmbeddings(openai_api_type=os.environ['OPENAI_API_TYPE'])
# embeddings = embeddings_model.embed_documents(user_queries)

# # create Atlas project
# project_name = NOMIC_MAP_NAME_PREFIX + course_name
# index_name = course_name + "_convo_index"
# project = atlas.map_embeddings(
# embeddings=np.array(embeddings),
# data=metadata, # type: ignore - this is the correct type, the func signature from Nomic is incomplete
# id_field='id',
# build_topic_model=True,
# topic_label_field='first_query',
# name=project_name,
# colorable_fields=['conversation_id', 'first_query'])
# project.create_index(index_name, build_topic_model=True)
# return f"Successfully created Nomic map for {course_name}"
# except Exception as e:
# # Error: ValueError: You must specify a unique_id_field when creating a new project.
# if str(e) == 'You must specify a unique_id_field when creating a new project.': # type: ignore
# print("Nomic map does not exist yet, probably because you have less than 20 queries on your project: ", e)
# else:
# print("ERROR in create_nomic_map():", e)
# self.sentry.capture_exception(e)

# return "failed"

def create_conversation_map(self, course_name: str):
"""
Creates a Nomic map for new courses and those which previously had < 20 queries.
1. fetches supabase conversations for course
2. appends current embeddings and metadata to it
2. creates map if there are at least 20 queries
"""
nomic.login(os.environ['NOMIC_API_KEY']) # login during start of flask app
This function creates a conversation map for a given course from scratch.
"""
nomic.login(os.getenv('NOMIC_API_KEY'))
NOMIC_MAP_NAME_PREFIX = 'Conversation Map for '
# check if map exists
response = self.sql.getConvoMapFromProjects(course_name)

print(f"in create_nomic_map() for {course_name}")
if response.data[0]['convo_map_id']:
return "Map already exists for this course."

try:
# fetch all conversations with this new course (we expect <=20 conversations, because otherwise the map should be made already)
# if no, fetch total count of records
response = self.sql.getCountFromLLMConvoMonitor(course_name)

# if <20, return message that map cannot be created
if not response.count:
return "No conversations found for this course."
elif response.count < 20:
return "Cannot create a map because there are less than 20 conversations in the course."

# if >20, iteratively fetch records in batches of 100
total_convo_count = response.count
print("Total number of conversations in Supabase: ", total_convo_count)
first_id = response.data[0]['id']
combined_dfs = []
current_convo_count = 0
convo_count = 0
first_batch = True

# iteratively query in batches of 100
while current_convo_count < total_convo_count:
response = self.sql.getAllConversationsBetweenIds(course_name, first_id, 0, 50)
df = pd.DataFrame(response.data)
combined_dfs.append(df)
current_convo_count += len(response.data)
convo_count += len(response.data)

if convo_count >= 5:
# concat all dfs from the combined_dfs list
final_df = pd.concat(combined_dfs, ignore_index=True)

response = self.sql.getAllFromLLMConvoMonitor(course_name)
data = response.data
df = pd.DataFrame(data)
# prep data for nomic upload
embeddings, metadata = self.data_prep_for_convo_map(final_df)

if len(data) < 19:
return None
else:
# get all queries for course and create metadata
user_queries = []
metadata = []
i = 1
conversation_exists = False

# current log details
log_messages = log_data['conversation']['messages'] # type: ignore
log_user_email = log_data['conversation']['user_email'] # type: ignore
log_conversation_id = log_data['conversation']['id'] # type: ignore

for _index, row in df.iterrows():
user_email = row['user_email']
created_at = pd.to_datetime(row['created_at']).strftime('%Y-%m-%d %H:%M:%S')
convo = row['convo']
messages = convo['messages']

first_message = messages[0]['content']
if isinstance(first_message, list):
first_message = first_message[0]['text']

user_queries.append(first_message)

# create metadata for multi-turn conversation
conversation = ""
for message in messages:
# string of role: content, role: content, ...
if message['role'] == 'user': # type: ignore
emoji = "🙋 "
else:
emoji = "🤖 "
if first_batch:
# create a new map
print("Creating new map...")
project_name = NOMIC_MAP_NAME_PREFIX + course_name
index_name = course_name + "_convo_index"
topic_label_field = "first_query"
colorable_fields = ["user_email", "first_query", "conversation_id", "created_at"]
result = create_map(embeddings, metadata, project_name, index_name, topic_label_field, colorable_fields)

if result == "success":
# update flag
first_batch = False
# log project info to supabase
project = AtlasProject(name=project_name, add_datums_if_exists=True)
project_id = project.id
last_id = int(final_df['id'].iloc[-1])
project_info = {'course_name': course_name, 'convo_map_id': project_id, 'last_uploaded_convo_id': last_id}
# if entry already exists, update it
projects_record = self.sql.getConvoMapFromProjects(course_name)
update_response = SUPABASE_CLIENT.table("projects").insert(project_info).execute()
print("Response from supabase: ", update_response)
else:
# append to existing map

if isinstance(message['content'], list):
text = message['content'][0]['text']
else:
text = message['content']

conversation += "\n>>> " + emoji + message['role'] + ": " + text + "\n"

# append current chat to previous chat if convo already exists
if convo['id'] == log_conversation_id:
conversation_exists = True

for m in log_messages:
if m['role'] == 'user': # type: ignore
emoji = "🙋 "
else:
emoji = "🤖 "

if isinstance(m['content'], list):
text = m['content'][0]['text']
else:
text = m['content']
conversation += "\n>>> " + emoji + m['role'] + ": " + text + "\n"

# adding modified timestamp
current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# add to metadata
metadata_row = {
"course": row['course_name'],
"conversation": conversation,
"conversation_id": convo['id'],
"id": i,
"user_email": user_email,
"first_query": first_message,
"created_at": created_at,
"modified_at": current_time
}
metadata.append(metadata_row)
i += 1

# add current log as a new data point if convo doesn't exist
if not conversation_exists:
user_queries.append(log_messages[0]['content'])
conversation = ""
for message in log_messages:
if message['role'] == 'user':
emoji = "🙋 "
else:
emoji = "🤖 "

if isinstance(message['content'], list):
text = message['content'][0]['text']
else:
text = message['content']
conversation += "\n>>> " + emoji + message['role'] + ": " + text + "\n"

# adding timestamp
current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

metadata_row = {
"course": course_name,
"conversation": conversation,
"conversation_id": log_conversation_id,
"id": i,
"user_email": log_user_email,
"first_query": log_messages[0]['content'],
"created_at": current_time,
"modified_at": current_time
}
metadata.append(metadata_row)

metadata = pd.DataFrame(metadata)
embeddings_model = OpenAIEmbeddings(openai_api_type=os.environ['OPENAI_API_TYPE'])
embeddings = embeddings_model.embed_documents(user_queries)

# create Atlas project
project_name = NOMIC_MAP_NAME_PREFIX + course_name
index_name = course_name + "_convo_index"
project = atlas.map_embeddings(
embeddings=np.array(embeddings),
data=metadata, # type: ignore - this is the correct type, the func signature from Nomic is incomplete
id_field='id',
build_topic_model=True,
topic_label_field='first_query',
name=project_name,
colorable_fields=['conversation_id', 'first_query'])
project.create_index(index_name, build_topic_model=True)
return f"Successfully created Nomic map for {course_name}"
except Exception as e:
# Error: ValueError: You must specify a unique_id_field when creating a new project.
if str(e) == 'You must specify a unique_id_field when creating a new project.': # type: ignore
print("Nomic map does not exist yet, probably because you have less than 20 queries on your project: ", e)


return "success"

def data_prep_for_convo_map(self, df: pd.DataFrame):
"""
This function prepares embeddings and metadata for nomic upload in conversation map creation.
Args:
df: pd.DataFrame - the dataframe of documents from Supabase
Returns:
embeddings: np.array of embeddings
metadata: pd.DataFrame of metadata
"""
print("in data_prep_for_convo_map()")

metadata = []
embeddings = []
texts = []

for _index, row in df.iterrows():

print("Row: ", row)

current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
created_at = datetime.datetime.strptime(row['created_at'], "%Y-%m-%dT%H:%M:%S.%f%z").strftime("%Y-%m-%d %H:%M:%S")
conversation_exists = False
conversation = ""
emoji = ""
user_queries = []

if row['user_email'] is None:
user_email = ""
else:
print("ERROR in create_nomic_map():", e)
self.sentry.capture_exception(e)
user_email = row['user_email']

messages = row['convo']['messages']
first_message = messages[0]['content']
# some conversations include images, so the data structure is different
if isinstance(first_message, list):
first_message = first_message[0]['text']
user_queries.append(first_message)

# construct metadata for multi-turn conversation
for message in messages:
if message['role'] == 'user':
emoji = "🙋 "
else:
emoji = "🤖 "

if isinstance(message['content'], list):
text = message['content'][0]['text']
else:
text = message['content']

conversation += "\n>>> " + emoji + message['role'] + ": " + text + "\n"

meta_row = {
"course": row['course_name'],
"conversation": conversation,
"conversation_id": row['convo']['id'],
"id": row['id'],
"user_email": user_email,
"first_query": first_message,
"created_at": created_at,
"modified_at": current_time
}

metadata.append(meta_row)
texts.append(user_queries)

embeddings_model = OpenAIEmbeddings(openai_api_type="openai",
openai_api_base="https://api.openai.com/v1/",
openai_api_key=os.environ['VLADS_OPENAI_KEY'])
embeddings = embeddings_model.embed_documents(texts)

metadata = pd.DataFrame(metadata)
embeddings = np.array(embeddings)

return embeddings, metadata

return "failed"

## -------------------------------- DOCUMENT MAP FUNCTIONS --------------------------------- ##

@@ -707,9 +858,6 @@ def data_prep_for_doc_map(self, df: pd.DataFrame):
# check dimension if embeddings_np is (n, 1536)
if len(embeddings_np.shape) < 2:
print("Creating new embeddings...")
# embeddings_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE,
# openai_api_base=os.getenv('AZURE_OPENAI_BASE'),
# openai_api_key=os.getenv('AZURE_OPENAI_KEY')) # type: ignore
embeddings_model = OpenAIEmbeddings(openai_api_type="openai",
openai_api_base="https://api.openai.com/v1/",
openai_api_key=os.environ['VLADS_OPENAI_KEY'])
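
`create_conversation_map()` delegates the actual upload to a `create_map(...)` helper that is not shown in this commit. A plausible sketch of such a helper, assembled only from the `atlas.map_embeddings` and `create_index` calls that already appear in this file; the real helper's signature, batching, and error handling may differ:

```python
import numpy as np
from nomic import atlas

def create_map(embeddings, metadata, map_name, index_name,
               topic_label_field, colorable_fields):
    """Upload embeddings plus per-conversation metadata to Nomic Atlas and build an index."""
    try:
        project = atlas.map_embeddings(
            embeddings=np.array(embeddings),
            data=metadata,  # DataFrame of per-conversation metadata
            id_field='id',
            build_topic_model=True,
            topic_label_field=topic_label_field,
            name=map_name,
            colorable_fields=colorable_fields)
        project.create_index(index_name, build_topic_model=True)
        return "success"
    except Exception as e:
        print("Error in create_map():", e)
        return "failed"
```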