Skip to content

Commit

Permalink
Merge pull request #111 from hsavran/main
Browse files Browse the repository at this point in the history
PostgreSQL and Generate vector for Mongo DB
  • Loading branch information
codingbandit authored Apr 18, 2024
2 parents d93bd18 + 3be9df4 commit 537d732
Show file tree
Hide file tree
Showing 23 changed files with 774 additions and 12 deletions.
15 changes: 15 additions & 0 deletions .github/actions/build-with-plugins/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,21 @@ runs:
-p:PublishReadyToRun=false \
-p:PublishTrimmed=false \
-p:Version=${{ inputs.build-version }}
- name: Build PostgreSQL Extension
shell: bash
run: |
dotnet publish \
Extensions/PostgreSQL/Cosmos.DataTransfer.PostgresqlExtension.csproj \
--configuration Release \
--output ${{ inputs.platform-short }}/Extensions \
--self-contained false \
--runtime ${{ inputs.runtime }} \
-p:PublishSingleFile=false \
-p:DebugType=embedded \
-p:EnableCompressionInSingleFile=true \
-p:PublishReadyToRun=false \
-p:PublishTrimmed=false \
-p:Version=${{ inputs.build-version }}
- name: Upload package
uses: actions/upload-artifact@v3
with:
Expand Down
6 changes: 3 additions & 3 deletions Core/Cosmos.DataTransfer.Core/Cosmos.DataTransfer.Core.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,15 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Azure.Core" Version="1.31.0" />
<PackageReference Include="Microsoft.Data.SqlClient" Version="5.0.0" />
<PackageReference Include="Azure.Core" Version="1.36.0" />
<PackageReference Include="Microsoft.Data.SqlClient" Version="5.2.0" />
<PackageReference Include="Microsoft.Extensions.Configuration.UserSecrets" Version="6.0.1" />
<PackageReference Include="Microsoft.Extensions.Hosting" Version="6.0.1" />
<PackageReference Include="Microsoft.Extensions.Logging.Console" Version="6.0.0" />
<PackageReference Include="System.CommandLine" Version="2.0.0-beta4.22272.1" />
<PackageReference Include="System.CommandLine.Hosting" Version="0.4.0-alpha.22272.1" />
<PackageReference Include="System.ComponentModel.Composition" Version="6.0.0" />
<PackageReference Include="System.Configuration.ConfigurationManager" Version="6.0.0" />
<PackageReference Include="System.Configuration.ConfigurationManager" Version="8.0.0" />
</ItemGroup>

<ItemGroup>
Expand Down
10 changes: 6 additions & 4 deletions Core/Cosmos.DataTransfer.Core/migrationsettings.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
{
"Source": null,
"Sink": null,
"SourceSettings": {
"Source": "",
"Sink": "",
"SourceSettings": {

},
"SinkSettings": {
"SinkSettings": {

},
"Operations": [
//{
Expand Down
25 changes: 23 additions & 2 deletions CosmosDbDataMigrationTool.sln
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Mongo", "Mongo", "{F18E789A
Extensions\Mongo\README.md = Extensions\Mongo\README.md
EndProjectSection
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.MongoExtension", "Extensions\Mongo\Cosmos.DataTransfer.MongoExtension\Cosmos.DataTransfer.MongoExtension.csproj", "{F6EAC33B-9F7D-433B-9328-622FB8938C24}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.MongoVectorExtension", "Extensions\Mongo\Cosmos.DataTransfer.MongoVectorExtension\Cosmos.DataTransfer.MongoVectorExtension.csproj", "{F6EAC33B-9F7D-433B-9328-622FB8938C24}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.JsonExtension.UnitTests", "Extensions\Json\Cosmos.DataTransfer.JsonExtension.UnitTests\Cosmos.DataTransfer.JsonExtension.UnitTests.csproj", "{ED1E375E-A5A3-47EA-A7D5-07344C7E152F}"
EndProject
Expand Down Expand Up @@ -87,14 +87,24 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Csv", "Csv", "{39930280-DA2
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.CsvExtension", "Extensions\Csv\Cosmos.DataTransfer.CsvExtension\Cosmos.DataTransfer.CsvExtension.csproj", "{6A3FB90C-B837-4724-A406-214D4CEA686F}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Cosmos.DataTransfer.CsvExtension.UnitTests", "Extensions\Csv\Cosmos.DataTransfer.CsvExtension.UnitTests\Cosmos.DataTransfer.CsvExtension.UnitTests.csproj", "{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.CsvExtension.UnitTests", "Extensions\Csv\Cosmos.DataTransfer.CsvExtension.UnitTests\Cosmos.DataTransfer.CsvExtension.UnitTests.csproj", "{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{BCBBAF22-0CB5-416B-8C80-03AB2FC4D0A0}"
ProjectSection(SolutionItems) = preProject
Contributing.md = Contributing.md
ExampleConfigs.md = ExampleConfigs.md
README.md = README.md
EndProjectSection
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.PostgresqlExtension", "Extensions\PostgreSQL\Cosmos.DataTransfer.PostgresqlExtension.csproj", "{85820167-DB94-458B-B09B-9E823996C692}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "PostgreSQL", "PostgreSQL", "{1B927C5F-50FC-42A6-BAF6-B00E6D760543}"
ProjectSection(SolutionItems) = preProject
Extensions\PostgreSQL\README.md = Extensions\PostgreSQL\README.md
EndProjectSection
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.MongoExtension", "Extensions\Mongo\Cosmos.DataTransfer.MongoExtension\Cosmos.DataTransfer.MongoExtension.csproj", "{31BC84E1-55E5-45AA-BFAC-90732F20588B}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -181,6 +191,14 @@ Global
{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E}.Debug|Any CPU.Build.0 = Debug|Any CPU
{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E}.Release|Any CPU.ActiveCfg = Release|Any CPU
{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E}.Release|Any CPU.Build.0 = Release|Any CPU
{85820167-DB94-458B-B09B-9E823996C692}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{85820167-DB94-458B-B09B-9E823996C692}.Debug|Any CPU.Build.0 = Debug|Any CPU
{85820167-DB94-458B-B09B-9E823996C692}.Release|Any CPU.ActiveCfg = Release|Any CPU
{85820167-DB94-458B-B09B-9E823996C692}.Release|Any CPU.Build.0 = Release|Any CPU
{31BC84E1-55E5-45AA-BFAC-90732F20588B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{31BC84E1-55E5-45AA-BFAC-90732F20588B}.Debug|Any CPU.Build.0 = Debug|Any CPU
{31BC84E1-55E5-45AA-BFAC-90732F20588B}.Release|Any CPU.ActiveCfg = Release|Any CPU
{31BC84E1-55E5-45AA-BFAC-90732F20588B}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -212,6 +230,9 @@ Global
{39930280-DA29-4814-837B-FA7F252EB3EC} = {A8A1CEAB-2D82-460C-9B86-74ABD17CD201}
{6A3FB90C-B837-4724-A406-214D4CEA686F} = {39930280-DA29-4814-837B-FA7F252EB3EC}
{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E} = {39930280-DA29-4814-837B-FA7F252EB3EC}
{85820167-DB94-458B-B09B-9E823996C692} = {1B927C5F-50FC-42A6-BAF6-B00E6D760543}
{1B927C5F-50FC-42A6-BAF6-B00E6D760543} = {A8A1CEAB-2D82-460C-9B86-74ABD17CD201}
{31BC84E1-55E5-45AA-BFAC-90732F20588B} = {F18E789A-D32D-48D3-B75F-1196D7215F74}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {662B3F27-70D8-45E6-A1C0-1438A9C8A542}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Azure.Identity" Version="1.6.0" />
<PackageReference Include="Azure.Identity" Version="1.10.3" />
<PackageReference Include="Microsoft.Azure.Cosmos" Version="3.34.0" />
<PackageReference Include="Microsoft.Extensions.Configuration" Version="6.0.1" />
<PackageReference Include="Microsoft.Extensions.Configuration.Abstractions" Version="6.0.0" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<Project Sdk="Microsoft.NET.Sdk">
<!--
Project file for an extension that combines the MongoDB driver with the
Azure OpenAI client library and builds on the existing Mongo extension project.
-->

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<!-- Built as an executable rather than a class library. -->
<OutputType>Exe</OutputType>
</PropertyGroup>

<ItemGroup>
<!-- Azure OpenAI client (pre-release package) used to request embeddings. -->
<PackageReference Include="Azure.AI.OpenAI" Version="1.0.0-beta.12" />
<PackageReference Include="Microsoft.Extensions.Configuration.Binder" Version="6.0.0" />
<PackageReference Include="MongoDB.Driver" Version="2.19.1" />
<!-- Provides the [Export] attribute used for extension discovery. -->
<PackageReference Include="System.ComponentModel.Composition" Version="6.0.0" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\..\..\Interfaces\Cosmos.DataTransfer.Interfaces\Cosmos.DataTransfer.Interfaces.csproj" />
<!-- Reuses types from the base Mongo extension project. -->
<ProjectReference Include="..\Cosmos.DataTransfer.MongoExtension\Cosmos.DataTransfer.MongoExtension.csproj" />
</ItemGroup>

<!-- After every Debug build, publish the already-built output using the PublishToExtensionsFolder publish profile. -->
<Target Name="PublishToExtensionsFolder" AfterTargets="Build" Condition=" '$(Configuration)' == 'Debug' ">
<Exec Command="dotnet publish --configuration $(Configuration) --no-build -p:PublishProfile=PublishToExtensionsFolder" />
</Target>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
using System.ComponentModel.Composition;
using Azure;
using Azure.AI.OpenAI;
using Cosmos.DataTransfer.Interfaces;
using Cosmos.DataTransfer.MongoExtension;
using Cosmos.DataTransfer.MongoVectorExtension.Settings;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Logging;
using MongoDB.Bson;

namespace Cosmos.DataTransfer.MongoVectorExtension;
/// <summary>
/// Sink extension that writes data items to a MongoDB collection in batches and,
/// when embedding generation is fully configured, adds an Azure OpenAI embedding
/// of one source property to each document before it is written.
/// </summary>
[Export(typeof(IDataSinkExtension))]
public class MongoVectorDataSinkExtension : IDataSinkExtensionWithSettings
{
    /// <summary>Number of documents buffered per insert when <c>BatchSize</c> is not set or is not positive.</summary>
    private const int DefaultBatchSize = 1000;

    /// <summary>Source values with this many characters or more are written without an embedding.</summary>
    private const int MaxEmbeddingInputLength = 8192;

    /// <summary>Name used to select this sink in configuration.</summary>
    public string DisplayName => $"MongoDB-Vector{ExtensionExtensions.BetaExtensionTag}";

    /// <summary>
    /// Streams <paramref name="dataItems"/> into the configured MongoDB collection.
    /// </summary>
    /// <param name="dataItems">Items to write; enumerated once, honoring <paramref name="cancellationToken"/>.</param>
    /// <param name="config">Configuration section bound to <see cref="MongoVectorSinkSettings"/>.</param>
    /// <param name="dataSource">The source extension that produced the items (not used by this sink).</param>
    /// <param name="logger">Receives progress, warning and summary messages.</param>
    /// <param name="cancellationToken">Cancels enumeration and embedding requests.</param>
    public async Task WriteAsync(IAsyncEnumerable<IDataItem> dataItems, IConfiguration config, IDataSourceExtension dataSource, ILogger logger, CancellationToken cancellationToken = default)
    {
        var settings = config.Get<MongoVectorSinkSettings>();
        settings.Validate();

        // Nothing can be written without a target; keep the original silent no-op for this case.
        if (string.IsNullOrEmpty(settings.ConnectionString) || string.IsNullOrEmpty(settings.DatabaseName) || string.IsNullOrEmpty(settings.Collection))
        {
            return;
        }

        // The OpenAI client is created only when embeddings are requested AND fully configured.
        // A null client means "write documents without embeddings".
        OpenAIClient? client = null;
        if (settings.GenerateEmbedding == true)
        {
            if (settings.SourcePropEmbedding != null
                && settings.DestPropEmbedding != null
                && !string.IsNullOrEmpty(settings.OpenAIUrl)
                && !string.IsNullOrEmpty(settings.OpenAIKey)
                && !string.IsNullOrEmpty(settings.OpenAIDeploymentName))
            {
                client = new OpenAIClient(new Uri(settings.OpenAIUrl), new AzureKeyCredential(settings.OpenAIKey));
                logger.LogInformation("OpenAI Embedding settings are valid.");
            }
            else
            {
                // Previously this misconfiguration was skipped silently.
                logger.LogWarning("GenerateEmbedding is enabled but OpenAIUrl, OpenAIKey, OpenAIDeploymentName, SourcePropEmbedding and DestPropEmbedding are not all set; items will be written without embeddings.");
            }
        }

        var context = new Context(settings.ConnectionString, settings.DatabaseName);
        var repo = context.GetRepository<BsonDocument>(settings.Collection);

        // A zero or negative batch size would never trigger a flush and would buffer
        // the entire source in memory, so fall back to the default in that case.
        var batchSize = settings.BatchSize is > 0 ? settings.BatchSize.Value : DefaultBatchSize;
        var objects = new List<BsonDocument>();
        int itemCount = 0;

        await foreach (var item in dataItems.WithCancellation(cancellationToken))
        {
            var dict = item.BuildDynamicObjectTree();

            if (client != null)
            {
                // Non-null here: client is only created after these settings were checked above.
                var sourceProperty = settings.SourcePropEmbedding!;
                var destinationProperty = settings.DestPropEmbedding!;

                var text = item.GetValue(sourceProperty)?.ToString();
                if (!string.IsNullOrEmpty(text) && text.Length < MaxEmbeddingInputLength)
                {
                    var options = new EmbeddingsOptions()
                    {
                        DeploymentName = settings.OpenAIDeploymentName,
                        Input = { text }
                    };
                    var response = await client.GetEmbeddingsAsync(options, cancellationToken);

                    // Guard against an empty result instead of indexing Data[0] blindly.
                    var data = response?.Value?.Data;
                    if (data != null && data.Count > 0)
                    {
                        // TryAdd leaves an existing property with the same name untouched.
                        dict?.TryAdd(destinationProperty, data[0].Embedding.ToArray());
                    }
                }
            }

            objects.Add(new BsonDocument(dict));
            itemCount++;

            if (objects.Count >= batchSize)
            {
                await repo.AddRange(objects);
                logger.LogInformation("Added {ItemCount} items to collection '{Collection}'", itemCount, settings.Collection);
                objects.Clear();
            }
        }

        // Flush the final, partially filled batch.
        if (objects.Any())
        {
            await repo.AddRange(objects);
        }

        if (itemCount > 0)
            logger.LogInformation("Added {ItemCount} total items to collection '{Collection}'", itemCount, settings.Collection);
        else
            logger.LogWarning("No items added to collection '{Collection}'", settings.Collection);
    }

    /// <summary>Returns an empty settings instance describing the options this sink accepts.</summary>
    public IEnumerable<IDataExtensionSettings> GetSettings()
    {
        yield return new MongoVectorSinkSettings();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
// Top-level program statement: writes a startup banner to standard output and exits.
Console.WriteLine("Starting Mongo extension");
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
https://go.microsoft.com/fwlink/?LinkID=208121.
-->
<Project>
<!-- Debug configuration: publish into the Core project's Debug output, under its Extensions folder. -->
<PropertyGroup Condition=" '$(Configuration)' == 'Debug' ">
<Configuration>Debug</Configuration>
<Platform>Any CPU</Platform>
<PublishDir>..\..\..\Core\Cosmos.DataTransfer.Core\bin\Debug\net6.0\Extensions</PublishDir>
<PublishProtocol>FileSystem</PublishProtocol>
<_TargetId>Folder</_TargetId>
<TargetFramework>net6.0</TargetFramework>
<SelfContained>false</SelfContained>
</PropertyGroup>
<!-- Any other configuration is treated as Release and publishes into the Core project's Release output. -->
<PropertyGroup Condition=" '$(Configuration)' != 'Debug' ">
<Configuration>Release</Configuration>
<Platform>Any CPU</Platform>
<PublishDir>..\..\..\Core\Cosmos.DataTransfer.Core\bin\Release\net6.0\Extensions</PublishDir>
<PublishProtocol>FileSystem</PublishProtocol>
<_TargetId>Folder</_TargetId>
<TargetFramework>net6.0</TargetFramework>
<SelfContained>false</SelfContained>
</PropertyGroup>
</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
using System.ComponentModel.DataAnnotations;
using Cosmos.DataTransfer.MongoExtension.Settings;

namespace Cosmos.DataTransfer.MongoVectorExtension.Settings;
/// <summary>
/// Settings for the MongoDB vector sink: the base Mongo settings plus the target
/// collection, batch size and the options that control embedding generation.
/// </summary>
public class MongoVectorSinkSettings : MongoBaseSettings
{
/// <summary>Name of the collection the sink writes documents to.</summary>
[Required]
public string? Collection { get; set; }

/// <summary>Number of documents buffered per insert; the sink uses 1000 when this is not set.</summary>
public int? BatchSize { get; set; }

/// <summary>
/// When true, and all OpenAI settings and both embedding property names below are set,
/// the sink requests an embedding for each item.
/// </summary>
public bool? GenerateEmbedding { get; set; }

/// <summary>Endpoint URL passed to the OpenAI client.</summary>
public string? OpenAIUrl { get; set; }
/// <summary>API key passed to the OpenAI client.</summary>
public string? OpenAIKey { get; set; }

// name of the deployment for text-embedding-ada-002
public string? OpenAIDeploymentName { get; set; }
/// <summary>Name of the source item property whose value is sent for embedding.</summary>
public string? SourcePropEmbedding { get; set; }
/// <summary>Name of the property added to the written document to hold the embedding.</summary>
public string? DestPropEmbedding { get; set; }
}
38 changes: 37 additions & 1 deletion Extensions/Mongo/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,47 @@ Source and sink settings require both `ConnectionString` and `DatabaseName` para

### Sink

```json
{
"ConnectionString": "",
"DatabaseName": "",
"Collection": ""
}
```

# MongoDB Vector Extension (Beta)

The MongoDB Vector extension is a Sink only extension that builds on the MongoDB extension by providing additional capabilities for generating embeddings using Azure OpenAI APIs.

> **Note**: When specifying the MongoDB Vector extension as the Sink property in configuration, utilize the name **MongoDB-Vector(beta)**.
## Settings

The settings are based on the MongoDB extension settings with additional parameters for generating embeddings.

### Additional Sink Settings

The sink settings require the following additional parameters:

- `GenerateEmbedding`: If set to true, the sink will generate embeddings for the records before writing them to the database. The sink requires the `OpenAIUrl`, `OpenAIKey`, and `OpenAIDeploymentName` parameters to be set. The following parameters are required if this is true:
- `OpenAIUrl`: The URL of the OpenAI API
- `OpenAIKey`: The API key for the OpenAI API
- `OpenAIDeploymentName`: The name of the embedding model deployment to use for the OpenAI API
- `SourcePropEmbedding`: The property in the source data that should be used to generate the embeddings
- `DestPropEmbedding`: New property name that will be added to the source data with the generated embeddings

```json
{
"ConnectionString": "",
"DatabaseName": "",
"Collection": "",
"BatchSize: 100
"BatchSize": 100,
"GenerateEmbedding": true | false,
"OpenAIUrl": "",
"OpenAIKey": "",
"OpenAIDeploymentName": "",
"SourcePropEmbedding": "",
"DestPropEmbedding": ""
}
```

Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Project file for an extension built on the Npgsql PostgreSQL client library. -->

<PropertyGroup>
<!-- Built as an executable rather than a class library. -->
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Configuration.Binder" Version="6.0.0" />
<PackageReference Include="Npgsql" Version="7.0.6" />
<!-- NOTE(review): other projects in this change reference System.ComponentModel.Composition 6.0.0; confirm 7.0.0 is intended. -->
<PackageReference Include="System.ComponentModel.Composition" Version="7.0.0" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\..\Interfaces\Cosmos.DataTransfer.Interfaces\Cosmos.DataTransfer.Interfaces.csproj" />
</ItemGroup>
<!-- After every Debug build, publish the already-built output using the FolderProfile publish profile. -->
<Target Name="PublishToExtensionsFolder" AfterTargets="Build" Condition=" '$(Configuration)' == 'Debug' ">
<Exec Command="dotnet publish --configuration $(Configuration) --no-build -p:PublishProfile=FolderProfile" />
</Target>
</Project>
Loading

0 comments on commit 537d732

Please sign in to comment.