-
Notifications
You must be signed in to change notification settings - Fork 145
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[FSTORE-1047] Support Similarity Search in the Feature Store (#1437)
* [FSTORE-1047] Support Similarity Search in the Feature Store (#1609) * rename to embeddingIndex vector db client handle exception embedding controller get project fs address comment return feature name in dto set default project embedding set default project embedding add embedding modify opensearch client creation add vector db module * remove empty line * skip delete sql db when embedding (cherry picked from commit 9f0a98c) # Conflicts: # hopsworks-api/src/main/java/io/hops/hopsworks/api/featurestore/FeaturestoreService.java # hopsworks-common/src/main/java/io/hops/hopsworks/common/featurestore/featuregroup/online/OnlineFeaturegroupController.java # hopsworks-common/src/main/java/io/hops/hopsworks/common/util/Settings.java # pom.xml * [APPEND][FSTORE-1047] Add get fg endpoint for onlinefs (#1632) * rename to embeddingIndex vector db client handle exception embedding controller get project fs address comment return feature name in dto set default project embedding set default project embedding add embedding modify opensearch client creation add vector db module * remove empty line * return fg for onlinefs * allow job to get fs by id * add consumer group to offset table * remove jwt * fix unit test (cherry picked from commit 3f11dce) * [FSTORE-1127] Remove log4j (#1644) (cherry picked from commit e82d686)
- Loading branch information
1 parent
3050dd2
commit 672c238
Showing
33 changed files
with
1,405 additions
and
76 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
170 changes: 170 additions & 0 deletions
170
...on/src/main/java/io/hops/hopsworks/common/featurestore/embedding/EmbeddingController.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
/* | ||
* This file is part of Hopsworks | ||
* Copyright (C) 2023, Hopsworks AB. All rights reserved | ||
* | ||
* Hopsworks is free software: you can redistribute it and/or modify it under the terms of | ||
* the GNU Affero General Public License as published by the Free Software Foundation, | ||
* either version 3 of the License, or (at your option) any later version. | ||
* | ||
* Hopsworks is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; | ||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR | ||
* PURPOSE. See the GNU Affero General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Affero General Public License along with this program. | ||
* If not, see <https://www.gnu.org/licenses/>. | ||
*/ | ||
|
||
package io.hops.hopsworks.common.featurestore.embedding; | ||
|
||
import com.google.common.base.Strings; | ||
import com.google.common.collect.Lists; | ||
import com.google.common.collect.Sets; | ||
import io.hops.hopsworks.common.featurestore.featuregroup.EmbeddingDTO; | ||
import io.hops.hopsworks.common.hdfs.Utils; | ||
import io.hops.hopsworks.common.util.Settings; | ||
import io.hops.hopsworks.exceptions.FeaturestoreException; | ||
import io.hops.hopsworks.persistence.entity.featurestore.featuregroup.Embedding; | ||
import io.hops.hopsworks.persistence.entity.featurestore.featuregroup.EmbeddingFeature; | ||
import io.hops.hopsworks.persistence.entity.featurestore.featuregroup.Featuregroup; | ||
import io.hops.hopsworks.persistence.entity.project.Project; | ||
import io.hops.hopsworks.restutils.RESTCodes; | ||
import io.hops.hopsworks.vectordb.Index; | ||
import io.hops.hopsworks.vectordb.VectorDatabaseException; | ||
|
||
import javax.ejb.EJB; | ||
import javax.ejb.Stateless; | ||
import javax.ejb.TransactionAttribute; | ||
import javax.ejb.TransactionAttributeType; | ||
import java.util.Arrays; | ||
import java.util.Collection; | ||
import java.util.Comparator; | ||
import java.util.List; | ||
import java.util.Random; | ||
import java.util.Set; | ||
import java.util.logging.Level; | ||
import java.util.stream.Collectors; | ||
|
||
@Stateless | ||
@TransactionAttribute(TransactionAttributeType.NEVER) | ||
public class EmbeddingController { | ||
|
||
@EJB | ||
private Settings settings; | ||
@EJB | ||
private VectorDatabaseClient vectorDatabaseClient; | ||
|
||
public void createVectorDbIndex(Project project, Featuregroup featureGroup) | ||
throws FeaturestoreException { | ||
Index index = new Index(featureGroup.getEmbedding().getVectorDbIndexName()); | ||
try { | ||
vectorDatabaseClient.getClient().createIndex(index, createIndex(featureGroup.getEmbedding().getColPrefix(), | ||
featureGroup.getEmbedding().getEmbeddingFeatures()), true); | ||
if (isDefaultVectorDbIndex(project, index.getName())) { | ||
vectorDatabaseClient.getClient().addFields(index, createMapping(featureGroup.getEmbedding().getColPrefix(), | ||
featureGroup.getEmbedding().getEmbeddingFeatures())); | ||
} | ||
} catch (VectorDatabaseException e) { | ||
throw new FeaturestoreException(RESTCodes.FeaturestoreErrorCode.COULD_NOT_CREATE_FEATUREGROUP, | ||
Level.FINE, "Cannot create opensearch vectordb index: " + index.getName()); | ||
} | ||
} | ||
|
||
public Embedding getEmbedding(Project project, EmbeddingDTO embeddingDTO, Featuregroup featuregroup) | ||
throws FeaturestoreException { | ||
Embedding embedding = new Embedding(); | ||
embedding.setFeaturegroup(featuregroup); | ||
if (embeddingDTO.getIndexName() == null) { | ||
embedding.setVectorDbIndexName(getDefaultVectorDbIndex(project)); | ||
embedding.setColPrefix(getVectorDbColPrefix(featuregroup)); | ||
} else { | ||
String vectorDbIndexPrefix = getVectorDbIndexPrefix(project); | ||
// In hopsworks opensearch, users can only access indexes which start with specific prefix | ||
if (!embeddingDTO.getIndexName().startsWith(vectorDbIndexPrefix)) { | ||
embedding.setVectorDbIndexName( | ||
vectorDbIndexPrefix + "_" + embeddingDTO.getIndexName()); | ||
embedding.setColPrefix(""); | ||
} | ||
if (isDefaultVectorDbIndex(project, embeddingDTO.getIndexName())) { | ||
embedding.setColPrefix(getVectorDbColPrefix(featuregroup)); | ||
} | ||
} | ||
embedding.setEmbeddingFeatures( | ||
embeddingDTO.getFeatures() | ||
.stream() | ||
.map(mapping -> new EmbeddingFeature(embedding, mapping.getName(), mapping.getDimension(), | ||
mapping.getSimilarityFunctionType())) | ||
.collect(Collectors.toList()) | ||
); | ||
return embedding; | ||
} | ||
|
||
protected String createMapping(String prefix, Collection<EmbeddingFeature> features) { | ||
String mappingString = "{\n" + | ||
" \"properties\": {\n" + | ||
"%s\n" + | ||
" }\n" + | ||
" }"; | ||
String fieldString = " \"%s\": {\n" + | ||
" \"type\": \"knn_vector\",\n" + | ||
" \"dimension\": %d\n" + | ||
" }"; | ||
List<String> fieldMapping = Lists.newArrayList(); | ||
for (EmbeddingFeature feature : features) { | ||
fieldMapping.add(String.format( | ||
fieldString, prefix + feature.getName(), feature.getDimension())); | ||
} | ||
return String.format(mappingString, String.join(",\n", fieldMapping)); | ||
} | ||
|
||
protected String createIndex(String prefix, Collection<EmbeddingFeature> features) { | ||
String jsonString = "{\n" + | ||
" \"settings\": {\n" + | ||
" \"index\": {\n" + | ||
" \"knn\": \"true\",\n" + | ||
" \"knn.algo_param.ef_search\": 512\n" + | ||
" }\n" + | ||
" },\n" + | ||
" \"mappings\": %s\n" + | ||
"}"; | ||
return String.format(jsonString, createMapping(prefix, features)); | ||
|
||
} | ||
|
||
private String getDefaultVectorDbIndex(Project project) throws FeaturestoreException { | ||
Set<String> indexName = getAllDefaultVectorDbIndex(project); | ||
// randomly select an index | ||
return indexName.stream().sorted(Comparator.comparingInt(i -> new Random().nextInt())).findFirst().get(); | ||
} | ||
|
||
private boolean isDefaultVectorDbIndex(Project project, String index) throws FeaturestoreException { | ||
return getAllDefaultVectorDbIndex(project).contains(index); | ||
} | ||
|
||
private Set<String> getAllDefaultVectorDbIndex(Project project) throws FeaturestoreException { | ||
Set<String> indices; | ||
if (!Strings.isNullOrEmpty(settings.getOpensearchDefaultEmbeddingIndexName())) { | ||
indices = Arrays.stream(settings.getOpensearchDefaultEmbeddingIndexName().split(",")) | ||
.collect(Collectors.toSet()); | ||
} else { | ||
indices = Sets.newHashSet(); | ||
for (int i = 0; i < settings.getOpensearchNumDefaultEmbeddingIndex(); i++) { | ||
indices.add(getVectorDbIndexPrefix(project) + "_default_project_embedding_" + i); | ||
} | ||
} | ||
if (indices.size() == 0) { | ||
throw new FeaturestoreException( | ||
RESTCodes.FeaturestoreErrorCode.OPENSEARCH_DEFAULT_EMBEDDING_INDEX_SUFFIX_NOT_DEFINED, Level.FINE, | ||
"Default vector db index is not defined."); | ||
} | ||
return indices; | ||
} | ||
|
||
private String getVectorDbIndexPrefix(Project project) { | ||
return project.getId() + "__embedding"; | ||
} | ||
|
||
private String getVectorDbColPrefix(Featuregroup featuregroup) { | ||
return Utils.getFeaturegroupName(featuregroup) + "_"; | ||
} | ||
|
||
} |
67 changes: 67 additions & 0 deletions
67
...n/src/main/java/io/hops/hopsworks/common/featurestore/embedding/VectorDatabaseClient.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
/* | ||
* This file is part of Hopsworks | ||
* Copyright (C) 2023, Hopsworks AB. All rights reserved | ||
* | ||
* Hopsworks is free software: you can redistribute it and/or modify it under the terms of | ||
* the GNU Affero General Public License as published by the Free Software Foundation, | ||
* either version 3 of the License, or (at your option) any later version. | ||
* | ||
* Hopsworks is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; | ||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR | ||
* PURPOSE. See the GNU Affero General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Affero General Public License along with this program. | ||
* If not, see <https://www.gnu.org/licenses/>. | ||
*/ | ||
|
||
package io.hops.hopsworks.common.featurestore.embedding; | ||
|
||
import com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException; | ||
import io.hops.hopsworks.common.opensearch.OpenSearchClient; | ||
import io.hops.hopsworks.exceptions.FeaturestoreException; | ||
import io.hops.hopsworks.exceptions.OpenSearchException; | ||
import io.hops.hopsworks.restutils.RESTCodes; | ||
import io.hops.hopsworks.vectordb.VectorDatabase; | ||
import io.hops.hopsworks.vectordb.VectorDatabaseFactory; | ||
|
||
import javax.annotation.PreDestroy; | ||
import javax.ejb.ConcurrencyManagement; | ||
import javax.ejb.ConcurrencyManagementType; | ||
import javax.ejb.EJB; | ||
import javax.ejb.Singleton; | ||
import javax.ejb.TransactionAttribute; | ||
import javax.ejb.TransactionAttributeType; | ||
import java.util.logging.Level; | ||
import java.util.logging.Logger; | ||
|
||
@Singleton | ||
@TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED) | ||
@ConcurrencyManagement(ConcurrencyManagementType.BEAN) | ||
public class VectorDatabaseClient { | ||
|
||
@EJB | ||
private OpenSearchClient openSearchClient; | ||
private VectorDatabase vectorDatabase; | ||
private static final Logger LOG = Logger.getLogger(EmbeddingController.class.getName()); | ||
|
||
public synchronized VectorDatabase getClient() throws FeaturestoreException { | ||
if (vectorDatabase == null) { | ||
try { | ||
vectorDatabase = VectorDatabaseFactory.getOpensearchDatabase(openSearchClient.getClient()); | ||
} catch (OpenSearchException | ServiceDiscoveryException e) { | ||
throw new FeaturestoreException(RESTCodes.FeaturestoreErrorCode.COULD_NOT_CREATE_FEATUREGROUP, | ||
Level.FINE, "Cannot create opensearch vectordb"); | ||
} | ||
} | ||
return vectorDatabase; | ||
} | ||
|
||
@PreDestroy | ||
private void close() { | ||
try { | ||
vectorDatabase.close(); | ||
} catch (Exception ex) { | ||
LOG.log(Level.SEVERE, null, ex); | ||
} | ||
} | ||
} |
Oops, something went wrong.