-
Notifications
You must be signed in to change notification settings - Fork 143
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Remote Vector Index Build] Introduce RemoteIndexClient skeleton and Build Request construction #2560
base: main
Are you sure you want to change the base?
[Remote Vector Index Build] Introduce RemoteIndexClient skeleton and Build Request construction #2560
Changes from all commits
abc6075
6702c59
86f938d
7435e09
8d18301
8219b1a
0a0a3e5
85d1217
781b2d3
84e8ae2
050fbf7
daef151
a029933
446c678
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -165,4 +165,25 @@ public class KNNConstants { | |
public static final String DERIVED_VECTOR_FIELD_ATTRIBUTE_KEY = "knn-derived-source-enabled"; | ||
public static final String DERIVED_VECTOR_FIELD_ATTRIBUTE_TRUE_VALUE = "true"; | ||
public static final String DERIVED_VECTOR_FIELD_ATTRIBUTE_FALSE_VALUE = "false"; | ||
|
||
// Remote build constants | ||
public static final String BUILD_ENDPOINT = "/_build"; | ||
public static final String STATUS_ENDPOINT = "/_status"; | ||
public static final String S3 = "s3"; | ||
public static final String BUCKET = "bucket"; | ||
// Build request keys | ||
public static final String ALGORITHM = "algorithm"; | ||
public static final String ALGORITHM_PARAMETERS = "algorithm_parameters"; | ||
public static final String INDEX_PARAMETERS = "index_parameters"; | ||
public static final String DOC_COUNT = "doc_count"; | ||
public static final String TENANT_ID = "tenant_id"; | ||
public static final String DOC_ID_PATH = "doc_id_path"; | ||
public static final String VECTOR_PATH = "vector_path"; | ||
public static final String CONTAINER_NAME = "container_name"; | ||
public static final String REPOSITORY_TYPE = "repository_type"; | ||
// Server responses | ||
public static final String JOB_ID = "job_id"; | ||
public static final String TASK_STATUS = "task_status"; | ||
public static final String INDEX_PATH = "index_path"; | ||
public static final String ERROR_MESSAGE = "error_message"; | ||
Comment on lines
+169
to
+188
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. none of these are common constants and are only required at specific classes. I would suggest moving things
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,14 +6,19 @@ | |
package org.opensearch.knn.index.codec.nativeindex.remote; | ||
|
||
import lombok.extern.log4j.Log4j2; | ||
import org.apache.commons.lang.NotImplementedException; | ||
import org.opensearch.common.StopWatch; | ||
import org.opensearch.common.UUIDs; | ||
import org.opensearch.common.annotation.ExperimentalApi; | ||
import org.opensearch.index.IndexSettings; | ||
import org.opensearch.knn.index.KNNSettings; | ||
import org.opensearch.knn.index.codec.nativeindex.NativeIndexBuildStrategy; | ||
import org.opensearch.knn.index.codec.nativeindex.model.BuildIndexParams; | ||
import org.opensearch.knn.index.remote.HTTPRemoteBuildRequest; | ||
import org.opensearch.knn.index.remote.RemoteBuildRequest; | ||
import org.opensearch.knn.index.remote.RemoteBuildRequestBuilder; | ||
import org.opensearch.knn.index.remote.RemoteBuildResponse; | ||
import org.opensearch.knn.index.remote.RemoteIndexClient; | ||
import org.opensearch.knn.index.remote.RemoteIndexClientFactory; | ||
import org.opensearch.repositories.RepositoriesService; | ||
import org.opensearch.repositories.Repository; | ||
import org.opensearch.repositories.RepositoryMissingException; | ||
|
@@ -38,8 +43,8 @@ public class RemoteIndexBuildStrategy implements NativeIndexBuildStrategy { | |
private final NativeIndexBuildStrategy fallbackStrategy; | ||
private final IndexSettings indexSettings; | ||
|
||
static final String VECTOR_BLOB_FILE_EXTENSION = ".knnvec"; | ||
static final String DOC_ID_FILE_EXTENSION = ".knndid"; | ||
public static final String VECTOR_BLOB_FILE_EXTENSION = ".knnvec"; | ||
public static final String DOC_ID_FILE_EXTENSION = ".knndid"; | ||
Comment on lines
+46
to
+47
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why we need them to be public? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Used in client, good to draw directly from here in case the file structure changes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the point is that whenever there are changes like this, it usually hints at the layers of abstraction not being optimal. I think if we build the request separately this isn't needed then right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes that is correct. I feel if these constants required in more than 1 class better to create a constant class for RemoteIndexBuild service. Unresolving the comments |
||
static final String VECTORS_PATH = "_vectors"; | ||
|
||
/** | ||
|
@@ -125,18 +130,26 @@ public void buildAndWriteIndex(BuildIndexParams indexInfo) throws IOException { | |
time_in_millis = stopWatch.stop().totalTime().millis(); | ||
log.debug("Repository write took {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName()); | ||
|
||
// TODO future implementations will set the following two params depending on some setting to denote the protocol | ||
RemoteIndexClient client = RemoteIndexClientFactory.getRemoteIndexClient(RemoteIndexClientFactory.TYPE_HTTP); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why we need to pass the client type? why HTTP cannot be default? |
||
RemoteBuildRequest request = RemoteBuildRequestBuilder.builder(HTTPRemoteBuildRequest.class) | ||
.indexSettings(indexSettings) | ||
.indexInfo(indexInfo) | ||
.repositoryMetadata(getRepository().getMetadata()) | ||
.blobName(blobName) | ||
.build(); | ||
stopWatch = new StopWatch().start(); | ||
submitVectorBuild(); | ||
RemoteBuildResponse remoteBuildResponse = client.submitVectorBuild(request); | ||
time_in_millis = stopWatch.stop().totalTime().millis(); | ||
log.debug("Submit vector build took {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName()); | ||
|
||
stopWatch = new StopWatch().start(); | ||
String downloadPath = awaitVectorBuild(); | ||
RemoteStatusResponse remoteStatusResponse = client.awaitVectorBuild(remoteBuildResponse); | ||
time_in_millis = stopWatch.stop().totalTime().millis(); | ||
log.debug("Await vector build took {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName()); | ||
|
||
stopWatch = new StopWatch().start(); | ||
vectorRepositoryAccessor.readFromRepository(downloadPath, indexInfo.getIndexOutputWithBuffer()); | ||
vectorRepositoryAccessor.readFromRepository(remoteStatusResponse.getIndexPath(), indexInfo.getIndexOutputWithBuffer()); | ||
time_in_millis = stopWatch.stop().totalTime().millis(); | ||
log.debug("Repository read took {} ms for vector field [{}]", time_in_millis, indexInfo.getFieldName()); | ||
} catch (Exception e) { | ||
|
@@ -163,20 +176,4 @@ private BlobStoreRepository getRepository() throws RepositoryMissingException { | |
assert repository instanceof BlobStoreRepository : "Repository should be instance of BlobStoreRepository"; | ||
return (BlobStoreRepository) repository; | ||
} | ||
|
||
/** | ||
* Submit vector build request to remote vector build service | ||
* | ||
*/ | ||
private void submitVectorBuild() { | ||
throw new NotImplementedException(); | ||
} | ||
|
||
/** | ||
* Wait on remote vector build to complete | ||
* @return String The path from which we should perform download, delimited by "/" | ||
*/ | ||
private String awaitVectorBuild() throws NotImplementedException { | ||
throw new NotImplementedException(); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package org.opensearch.knn.index.codec.nativeindex.remote; | ||
|
||
import lombok.AllArgsConstructor; | ||
import lombok.Getter; | ||
|
||
@Getter | ||
@AllArgsConstructor | ||
public class RemoteStatusResponse { | ||
private String indexPath; | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,11 +15,29 @@ | |
import org.opensearch.knn.index.engine.NativeLibrary; | ||
import org.opensearch.knn.index.engine.ResolvedMethodContext; | ||
|
||
import java.util.HashMap; | ||
import java.util.Map; | ||
import java.util.function.Function; | ||
|
||
import static org.opensearch.knn.common.KNNConstants.ALGORITHM; | ||
import static org.opensearch.knn.common.KNNConstants.ALGORITHM_PARAMETERS; | ||
import static org.opensearch.knn.common.KNNConstants.INDEX_DESCRIPTION_PARAMETER; | ||
import static org.opensearch.knn.common.KNNConstants.METHOD_HNSW; | ||
import static org.opensearch.knn.common.KNNConstants.METHOD_IVF; | ||
import static org.opensearch.knn.common.KNNConstants.METHOD_PARAMETER_EF_CONSTRUCTION; | ||
import static org.opensearch.knn.common.KNNConstants.METHOD_PARAMETER_EF_SEARCH; | ||
import static org.opensearch.knn.common.KNNConstants.METHOD_PARAMETER_M; | ||
import static org.opensearch.knn.common.KNNConstants.METHOD_PARAMETER_NLIST; | ||
import static org.opensearch.knn.common.KNNConstants.METHOD_PARAMETER_NLIST_DEFAULT; | ||
import static org.opensearch.knn.common.KNNConstants.METHOD_PARAMETER_NPROBES; | ||
import static org.opensearch.knn.common.KNNConstants.METHOD_PARAMETER_NPROBES_DEFAULT; | ||
import static org.opensearch.knn.common.KNNConstants.METHOD_PARAMETER_SPACE_TYPE; | ||
import static org.opensearch.knn.common.KNNConstants.NAME; | ||
import static org.opensearch.knn.common.KNNConstants.PARAMETERS; | ||
import static org.opensearch.knn.common.KNNConstants.SPACE_TYPE; | ||
import static org.opensearch.knn.index.KNNSettings.INDEX_KNN_DEFAULT_ALGO_PARAM_EF_CONSTRUCTION; | ||
import static org.opensearch.knn.index.KNNSettings.INDEX_KNN_DEFAULT_ALGO_PARAM_EF_SEARCH; | ||
import static org.opensearch.knn.index.KNNSettings.INDEX_KNN_DEFAULT_SPACE_TYPE; | ||
|
||
/** | ||
* Implements NativeLibrary for the faiss native library | ||
|
@@ -109,6 +127,67 @@ public Float scoreToRadialThreshold(Float score, SpaceType spaceType) { | |
return spaceType.scoreToDistanceTranslation(score); | ||
} | ||
|
||
// TODO refactor to make the index parameter fetching more intelligent and less cumbersome | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes - not to block this PR, but I think we should avoid fetching method specific parameters outside of the method class (i.e. FaissHNSWMethod). We should loop back and make sure thats properly encapsulated in the future. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If this is a todo, better to have a github issue for this. I also think there should be a better way to fetch these details from fieldInfo. This logic is very fragile |
||
/** | ||
* Get the parameters that need to be passed to the remote build service for training | ||
* | ||
* @param indexInfoParameters result of indexInfo.getParameters() to parse | ||
* @return Map of parameters to be used as "index_parameters" | ||
*/ | ||
@Override | ||
public Map<String, Object> getRemoteIndexingParameters(Map<String, Object> indexInfoParameters) { | ||
Map<String, Object> indexParameters = new HashMap<>(); | ||
String methodName = (String) indexInfoParameters.get(NAME); | ||
indexParameters.put(ALGORITHM, methodName); | ||
indexParameters.put(METHOD_PARAMETER_SPACE_TYPE, indexInfoParameters.getOrDefault(SPACE_TYPE, INDEX_KNN_DEFAULT_SPACE_TYPE)); | ||
|
||
assert (indexInfoParameters.containsKey(PARAMETERS)); | ||
Object innerParams = indexInfoParameters.get(PARAMETERS); | ||
assert (innerParams instanceof Map); | ||
{ | ||
Map<String, Object> algorithmParams = new HashMap<>(); | ||
Map<String, Object> innerMap = (Map<String, Object>) innerParams; | ||
switch (methodName) { | ||
case METHOD_HNSW -> { | ||
algorithmParams.put( | ||
METHOD_PARAMETER_EF_CONSTRUCTION, | ||
innerMap.getOrDefault(METHOD_PARAMETER_EF_CONSTRUCTION, INDEX_KNN_DEFAULT_ALGO_PARAM_EF_CONSTRUCTION) | ||
); | ||
algorithmParams.put( | ||
METHOD_PARAMETER_EF_SEARCH, | ||
innerMap.getOrDefault(METHOD_PARAMETER_EF_SEARCH, INDEX_KNN_DEFAULT_ALGO_PARAM_EF_SEARCH) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why we are doing a getOrDefault. |
||
); | ||
Object indexDescription = indexInfoParameters.get(INDEX_DESCRIPTION_PARAMETER); | ||
assert indexDescription instanceof String; | ||
algorithmParams.put(METHOD_PARAMETER_M, getMFromIndexDescription((String) indexDescription)); | ||
} | ||
case METHOD_IVF -> { | ||
algorithmParams.put( | ||
METHOD_PARAMETER_NLIST, | ||
innerMap.getOrDefault(METHOD_PARAMETER_NLIST, METHOD_PARAMETER_NLIST_DEFAULT) | ||
); | ||
algorithmParams.put( | ||
METHOD_PARAMETER_NPROBES, | ||
innerMap.getOrDefault(METHOD_PARAMETER_NPROBES, METHOD_PARAMETER_NPROBES_DEFAULT) | ||
); | ||
} | ||
Comment on lines
+164
to
+173
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. since IVF is not getting used why we are parsing these parameters? |
||
} | ||
indexParameters.put(ALGORITHM_PARAMETERS, algorithmParams); | ||
} | ||
return indexParameters; | ||
} | ||
|
||
public static int getMFromIndexDescription(String indexDescription) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why this is public function? |
||
int commaIndex = indexDescription.indexOf(","); | ||
if (commaIndex == -1) { | ||
throw new IllegalArgumentException("Invalid index description: " + indexDescription); | ||
} | ||
String hnswPart = indexDescription.substring(0, commaIndex); | ||
int m = Integer.parseInt(hnswPart.substring(4)); | ||
assert (m > 1 && m < 100); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. from where these magic number of 1 and 100 are coming up? |
||
return m; | ||
} | ||
|
||
@Override | ||
public ResolvedMethodContext resolveMethod( | ||
KNNMethodContext knnMethodContext, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Have you tested your code by connecting to a HTTP URL? Because I don't think so these 3 dependencies are only needed to connect to remote endpoint. In the POC we have tested it requires much more. Ref: https://github.com/navneet1v/k-NN/blob/remote-vector-staging-2.19/build.gradle#L321-L345
I would suggest doing a test to a remote endpoint for this.