Update mongodb_connector.py #10

Open · wants to merge 6 commits into main
98 changes: 63 additions & 35 deletions maps/mongodb_connector.py
@@ -1,58 +1,86 @@
-import pymongo as pm
-from nomic import AtlasProject
+import nomic
+from nomic import AtlasDataset
 from sentence_transformers import SentenceTransformer
+from pymongo.mongo_client import MongoClient
 import numpy as np
 import pandas as pd
 from pathlib import Path
-import nomic
 
-# replace with your mongodb connect string / cert
-client = pm.MongoClient('mongodb+srv://cluster0.l3jhqfs.mongodb.net/'
-                        '?authSource=%24external&authMechanism=MONGODB-X509&retryWrites=true&w=majority',
-                        tls=True,
-                        tlsCertificateKeyFile='mongocert.pem')
+# MongoDB connection string
+client = MongoClient('mongodb+srv://<USERNAME>:<PASSWORD>@<APPNAME>.1fy6rp1.mongodb.net/?appName=<APPNAME>',
+                     tls=True)
 
-collection = client.testdb.testcoll
+# Replace with your actual API key
+nomic.login('YOUR_KEY_HERE')
+
+# MongoDB collection
+collection = client.sample_mflix.comments
 
 # Delete current content of collection
 collection.delete_many({})
 
-# Load embedding data into mongodb
+# Load embedding data into MongoDB from parquet file
 mongo_so = pd.read_parquet(Path.cwd() / 'data' / 'mongo-so.parquet')
 
+# Initialize SentenceTransformer model
 model = SentenceTransformer('all-MiniLM-L6-v2')
-title_embeds = model.encode(mongo_so['title'])
-mso_te = mongo_so.assign(title_embedding=list(title_embeds))
 
-data = list(r._asdict() for r in mso_te.itertuples())
-for d in data:
-    del d['Index']
-    d['title_embedding'] = d['title_embedding'].tolist()
-data[0]
-collection.insert_many(data)
 
-# Read a mongodb collection with embeddings in it and map it:
-project = AtlasProject(
-    name='MongoDB Stack Overflow Questions',
-    unique_id_field='mongo_id',
-    reset_project_if_exists=True,
-    is_public=True,
-    modality='embedding',
-)
+# Encode titles using SentenceTransformer
+title_embeds = model.encode(mongo_so['title'].tolist())
+mongo_so['title_embedding'] = title_embeds.tolist()  # plain Python lists so BSON can encode the documents
+
+# Convert DataFrame to list of dictionaries for MongoDB insertion
+data = mongo_so.to_dict(orient='records')
+
+# Insert data into MongoDB collection
+collection.insert_many(data)
+
+# Fetch all items from MongoDB collection
 all_items = list(collection.find())
+
+# Extract embeddings into numpy array
 embs = np.array([d['title_embedding'] for d in all_items])
+
+# Remove 'title_embedding' field from each item, and convert '_id' to string
 for d in all_items:
-    d['mongo_id'] = str(d['_id'])
+    d['_id'] = str(d['_id'])
     del d['title_embedding']
-    del d['_id']
 
-project.add_embeddings(all_items, embs)
+# Create an AtlasDataset instance
+dataset = AtlasDataset(
+    identifier='sample-mflix-comments',  # Unique identifier for your dataset
+    description='MongoDB Movie Comments',
+    unique_id_field='_id',
+    is_public=True
+)
+
+# Add data and embeddings to the AtlasDataset
+dataset.add_data(data=all_items, embeddings=embs)
 
-project.rebuild_maps()
-project.create_index(
-    name='MongoDB Stack Overflow Questions',
-    topic_label_field='body',
-    build_topic_model=True,
+# Create an index and map
+dataset.create_index(
+    name='MongoDB Movie Comments',
+    indexed_field='body',  # Replace with your topic label field
+    modality='embedding',
+    topic_model={
+        'build_topic_model': True,
+        'topic_label_field': 'body'  # Replace with the field used for topic labeling
+    },
+    duplicate_detection={
+        'tag_duplicates': True,
+        'duplicate_cutoff': 0.95  # Adjust as needed
+    },
+    projection={
+        'n_neighbors': 15,  # Example value, adjust as needed
+        'n_epochs': 100,  # Example value, adjust as needed
+        'model': 'nomic-project-v2',
+        'local_neighborhood_size': 30,
+        'spread': 1.0,
+        'rho': 0.5
+    },
+    embedding_model='NomicEmbed'  # Specify the embedding model if needed
 )
 
-print(project)
+# Print the dataset to confirm
+print(dataset)
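
A quick way for reviewers to sanity-check the insert step locally is a small smoke test against the same collection. The sketch below is not part of the diff: the placeholder connection string and the sample_mflix.comments collection are taken from the new code, count_documents and find_one are standard pymongo calls, and the 384-dimension check assumes the all-MiniLM-L6-v2 model used above.

# Hypothetical smoke test (not part of this PR): run after collection.insert_many(data)
# to confirm the documents landed with their embeddings before building the Atlas map.
from pymongo.mongo_client import MongoClient

client = MongoClient('mongodb+srv://<USERNAME>:<PASSWORD>@<APPNAME>.1fy6rp1.mongodb.net/?appName=<APPNAME>',
                     tls=True)
collection = client.sample_mflix.comments

# One document per row of mongo-so.parquet should be present.
print(collection.count_documents({}))

# Each document keeps its 'title_embedding' list in MongoDB; only the copies sent
# to Atlas have that field stripped. all-MiniLM-L6-v2 produces 384-dimensional vectors.
doc = collection.find_one()
assert 'title_embedding' in doc and len(doc['title_embedding']) == 384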