diff --git a/lucene/licenses/agrona-1.20.0.jar.sha1 b/lucene/licenses/agrona-1.20.0.jar.sha1 new file mode 100644 index 000000000000..badef8d6e169 --- /dev/null +++ b/lucene/licenses/agrona-1.20.0.jar.sha1 @@ -0,0 +1 @@ +00580b67864f7739bf7778162f418ada69fa3037 diff --git a/lucene/licenses/agrona-LICENSE-ASL.txt b/lucene/licenses/agrona-LICENSE-ASL.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/lucene/licenses/agrona-NOTICE.txt b/lucene/licenses/agrona-NOTICE.txt new file mode 100644 index 000000000000..795926439ada --- /dev/null +++ b/lucene/licenses/agrona-NOTICE.txt @@ -0,0 +1,6 @@ +This product includes software developed by the Agrona project. +https://github.com/real-logic/agrona + +Copyright © 2014-2023 Real Logic Limited + +Licensed under the Apache License, Version 2.0. diff --git a/lucene/licenses/commons-math3-3.6.1.jar.sha1 b/lucene/licenses/commons-math3-3.6.1.jar.sha1 new file mode 100644 index 000000000000..ed9a549757f5 --- /dev/null +++ b/lucene/licenses/commons-math3-3.6.1.jar.sha1 @@ -0,0 +1 @@ +e4ba98f1d4b3c80ec46392f25e094a6a2e58fcbf diff --git a/lucene/licenses/commons-math3-LICENSE-ASL.txt b/lucene/licenses/commons-math3-LICENSE-ASL.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/lucene/licenses/commons-math3-NOTICE.txt b/lucene/licenses/commons-math3-NOTICE.txt new file mode 100644 index 000000000000..5e2a2f91d48a --- /dev/null +++ b/lucene/licenses/commons-math3-NOTICE.txt @@ -0,0 +1,4 @@ +This product includes software developed by the Apache Commons Math project. +https://commons.apache.org/proper/commons-math/ + +Licensed under the Apache License, Version 2.0. 
diff --git a/lucene/licenses/jvector-4.0.0-beta.6.jar.sha1 b/lucene/licenses/jvector-4.0.0-beta.6.jar.sha1 new file mode 100644 index 000000000000..740284cac97f --- /dev/null +++ b/lucene/licenses/jvector-4.0.0-beta.6.jar.sha1 @@ -0,0 +1 @@ +31836df381e6cd9f101a30ec6d8a5d1e60c1adcf diff --git a/lucene/licenses/jvector-LICENSE-ASL.txt b/lucene/licenses/jvector-LICENSE-ASL.txt new file mode 100644 index 000000000000..d64569567334 --- /dev/null +++ b/lucene/licenses/jvector-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/lucene/licenses/jvector-NOTICE.txt b/lucene/licenses/jvector-NOTICE.txt new file mode 100644 index 000000000000..0542e27d7ef7 --- /dev/null +++ b/lucene/licenses/jvector-NOTICE.txt @@ -0,0 +1,6 @@ +This product includes software developed by the JVector project. +https://github.com/jbellis/jvector + +Copyright © 2023 Jonathan Ellis + +Licensed under the Apache License, Version 2.0. diff --git a/lucene/licenses/snakeyaml-2.4.jar.sha1 b/lucene/licenses/snakeyaml-2.4.jar.sha1 new file mode 100644 index 000000000000..8739f8c17629 --- /dev/null +++ b/lucene/licenses/snakeyaml-2.4.jar.sha1 @@ -0,0 +1 @@ +e0666b825b796f85521f02360e77f4c92c5a7a07 diff --git a/lucene/licenses/snakeyaml-LICENSE-ASL.txt b/lucene/licenses/snakeyaml-LICENSE-ASL.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/lucene/licenses/snakeyaml-NOTICE.txt b/lucene/licenses/snakeyaml-NOTICE.txt new file mode 100644 index 000000000000..c1e6931cc149 --- /dev/null +++ b/lucene/licenses/snakeyaml-NOTICE.txt @@ -0,0 +1,4 @@ +This product includes software developed by the SnakeYAML project. +https://bitbucket.org/snakeyaml/snakeyaml + +Licensed under the Apache License, Version 2.0. 
diff --git a/lucene/sandbox/build.gradle b/lucene/sandbox/build.gradle index daf952f84a8d..9efd8f1cd68f 100644 --- a/lucene/sandbox/build.gradle +++ b/lucene/sandbox/build.gradle @@ -16,12 +16,25 @@ */ +plugins { + id 'java-library' +} description = 'Various third party contributions and new ideas' +java { + modularity.inferModulePath = true +} + dependencies { moduleApi project(':lucene:core') moduleApi project(':lucene:queries') moduleApi project(':lucene:facet') moduleTestImplementation project(':lucene:test-framework') + + moduleImplementation('io.github.jbellis:jvector:4.0.0-beta.6') { + exclude group: 'org.slf4j', module: 'slf4j-api' + } + + moduleImplementation 'org.slf4j:slf4j-api:2.0.17' } diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index ee9be3227de2..df26b895050a 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -16,13 +16,16 @@ */ /** Various third party contributions and new ideas */ +@SuppressWarnings("requires-automatic") module org.apache.lucene.sandbox { requires org.apache.lucene.core; requires org.apache.lucene.queries; requires org.apache.lucene.facet; + requires jvector; exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.faiss; + exports org.apache.lucene.sandbox.codecs.jvector; exports org.apache.lucene.sandbox.codecs.idversion; exports org.apache.lucene.sandbox.codecs.quantization; exports org.apache.lucene.sandbox.document; @@ -41,5 +44,8 @@ provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; provides org.apache.lucene.codecs.KnnVectorsFormat with - org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat; + org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat, + org.apache.lucene.sandbox.codecs.jvector.JVectorFormat; + provides org.apache.lucene.codecs.Codec with + org.apache.lucene.sandbox.codecs.jvector.JVectorCodec; } diff 
--git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java new file mode 100644 index 000000000000..2e74da91c8d0 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/ForceMergesOnlyMergePolicy.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import org.apache.lucene.index.MergePolicy; +import org.apache.lucene.index.MergeTrigger; +import org.apache.lucene.index.SegmentCommitInfo; +import org.apache.lucene.index.SegmentInfos; + +/** + * A merge policy that only merges segments if they are forced. This is useful for testing and + * benchmarking purposes. Since it can be used for benchmarks, it is placed in the common codec + * module. 
+ */ +public class ForceMergesOnlyMergePolicy extends MergePolicy { + private final boolean useCompoundFile; + + public ForceMergesOnlyMergePolicy() { + this(false); + } + + public ForceMergesOnlyMergePolicy(boolean useCompoundFile) { + super(); + this.useCompoundFile = useCompoundFile; + } + + @Override + public MergeSpecification findMerges( + MergeTrigger mergeTrigger, SegmentInfos segmentInfos, MergeContext mergeContext) + throws IOException { + return null; + } + + @Override + public MergeSpecification findForcedMerges( + SegmentInfos segmentInfos, + int maxSegmentCount, + Map segmentsToMerge, + MergeContext mergeContext) + throws IOException { + // If the segments are already merged (e.g. there's only 1 segment), or + // there are segments = segmentInfos.asList(); + MergeSpecification spec = new MergeSpecification(); + + final OneMerge merge = new OneMerge(segments); + spec.add(merge); + return spec; + } + + @Override + public boolean useCompoundFile( + SegmentInfos segmentInfos, SegmentCommitInfo newSegment, MergeContext mergeContext) + throws IOException { + return useCompoundFile; + } + + @Override + public MergeSpecification findForcedDeletesMerges( + SegmentInfos segmentInfos, MergeContext mergeContext) throws IOException { + return null; + } + + /** + * Returns true if the number of segments eligible for merging is less than or equal to the + * specified {@code maxNumSegments}. 
+ */ + protected boolean isMerged( + SegmentInfos infos, + int maxNumSegments, + Map segmentsToMerge, + MergeContext mergeContext) + throws IOException { + final int numSegments = infos.size(); + int numToMerge = 0; + SegmentCommitInfo mergeInfo = null; + boolean segmentIsOriginal = false; + for (int i = 0; i < numSegments && numToMerge <= maxNumSegments; i++) { + final SegmentCommitInfo info = infos.info(i); + final Boolean isOriginal = segmentsToMerge.get(info); + if (isOriginal != null) { + segmentIsOriginal = isOriginal; + numToMerge++; + mergeInfo = info; + } + } + + return numToMerge <= maxNumSegments + && (numToMerge != 1 || !segmentIsOriginal || isMerged(infos, mergeInfo, mergeContext)); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorCodec.java new file mode 100644 index 000000000000..0eb26c419138 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorCodec.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.lucene103.Lucene103Codec; + +/** + * A custom {@link FilterCodec} that wraps the default Lucene codec with JVector vector indexing + * support. This codec registers the {@link JVectorFormat} as the k-NN vectors format used during + * indexing and searching. + */ +public class JVectorCodec extends FilterCodec { + + public static final String CODEC_NAME = "JVectorCodec"; + private int minBatchSizeForQuantization; + private boolean mergeOnDisk; + + public JVectorCodec() { + this( + CODEC_NAME, + new Lucene103Codec(), + JVectorFormat.DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION, + JVectorFormat.DEFAULT_MERGE_ON_DISK); + } + + public JVectorCodec(int minBatchSizeForQuantization, boolean mergeOnDisk) { + this(CODEC_NAME, new Lucene103Codec(), minBatchSizeForQuantization, mergeOnDisk); + } + + public JVectorCodec( + String codecName, Codec delegate, int minBatchSizeForQuantization, boolean mergeOnDisk) { + super(codecName, delegate); + this.minBatchSizeForQuantization = minBatchSizeForQuantization; + this.mergeOnDisk = mergeOnDisk; + } + + @Override + public KnnVectorsFormat knnVectorsFormat() { + return new JVectorFormat(minBatchSizeForQuantization, mergeOnDisk); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java new file mode 100644 index 000000000000..3c6b37f4050b --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFloatVectorValues.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import io.github.jbellis.jvector.graph.NodesIterator; +import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex; +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.github.jbellis.jvector.vector.VectorizationProvider; +import io.github.jbellis.jvector.vector.types.VectorFloat; +import io.github.jbellis.jvector.vector.types.VectorTypeSupport; +import java.io.IOException; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.search.VectorScorer; + +/** + * A {@link FloatVectorValues} implementation backed by a JVector {@link OnDiskGraphIndex}. Provides + * access to vector data and iteration over document vectors stored on disk. Supports both Lucene's + * float[] access API and JVector's native {@link VectorFloat} API. 
+ */ +public class JVectorFloatVectorValues extends FloatVectorValues { + private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = + VectorizationProvider.getInstance().getVectorTypeSupport(); + + private final OnDiskGraphIndex onDiskGraphIndex; + private final OnDiskGraphIndex.View view; + private final VectorSimilarityFunction similarityFunction; + + public JVectorFloatVectorValues( + OnDiskGraphIndex onDiskGraphIndex, VectorSimilarityFunction similarityFunction) + throws IOException { + this.onDiskGraphIndex = onDiskGraphIndex; + this.view = onDiskGraphIndex.getView(); + this.similarityFunction = similarityFunction; + } + + @Override + public int dimension() { + return onDiskGraphIndex.getDimension(); + } + + @Override + public int size() { + return onDiskGraphIndex.size(); + } + + public VectorFloat vectorFloatValue(int ord) { + if (!onDiskGraphIndex.containsNode(ord)) { + throw new RuntimeException("ord " + ord + " not found in graph"); + } + + return view.getVector(ord); + } + + @Override + public DocIndexIterator iterator() { + return new DocIndexIterator() { + private int docId = -1; + private final NodesIterator nodesIterator = onDiskGraphIndex.getNodes(0); + + @Override + public long cost() { + return size(); + } + + @Override + public int index() { + return docId; + } + + @Override + public int docID() { + return docId; + } + + @Override + public int nextDoc() throws IOException { + if (nodesIterator.hasNext()) { + docId = nodesIterator.next(); + } else { + docId = NO_MORE_DOCS; + } + + return docId; + } + + @Override + public int advance(int target) throws IOException { + return slowAdvance(target); + } + }; + } + + @Override + public float[] vectorValue(int i) throws IOException { + try { + final VectorFloat vector = vectorFloatValue(i); + return (float[]) vector.get(); + } catch (Throwable e) { + throw new RuntimeException(e); + } + } + + @Override + public FloatVectorValues copy() throws IOException { + return this; + } + + @Override + public 
VectorScorer scorer(float[] query) throws IOException { + return new JVectorVectorScorer( + this, VECTOR_TYPE_SUPPORT.createFloatVector(query), similarityFunction); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java new file mode 100644 index 000000000000..2f3bf63e7e68 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorFormat.java @@ -0,0 +1,174 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import java.io.IOException; +import java.util.function.Function; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; + +/** + * A Lucene {@link KnnVectorsFormat} implementation for the JVector indexing format. 
This format + * defines how vectors are stored, searched, and laid out on disk for maximum performance and + * flexibility. + */ +public class JVectorFormat extends KnnVectorsFormat { + public static final String NAME = "JVectorFormat"; + public static final String META_CODEC_NAME = "JVectorVectorsFormatMeta"; + public static final String VECTOR_INDEX_CODEC_NAME = "JVectorVectorsFormatIndex"; + public static final String JVECTOR_FILES_SUFFIX = "jvector"; + public static final String META_EXTENSION = "meta-" + JVECTOR_FILES_SUFFIX; + public static final String VECTOR_INDEX_EXTENSION = "data-" + JVECTOR_FILES_SUFFIX; + public static final int DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION = + 1024; // The minimum number of vectors required to trigger + // quantization + public static final int VERSION_START = 0; + public static final int VERSION_CURRENT = VERSION_START; + private static final int DEFAULT_MAX_CONN = 32; + private static final int DEFAULT_BEAM_WIDTH = 100; + private static final float DEFAULT_ALPHA = 2f; + static final boolean DEFAULT_MERGE_ON_DISK = true; + private static final float DEFAULT_NEIGHBOR_OVERFLOW = 2f; + + private final int maxConn; + private final int beamWidth; + private final int minBatchSizeForQuantization; + private final Function numberOfSubspacesPerVectorSupplier; + private final boolean mergeOnDisk; + private final float alpha; + private final float neighborOverflow; + + public JVectorFormat() { + this( + NAME, + DEFAULT_MAX_CONN, + DEFAULT_BEAM_WIDTH, + DEFAULT_NEIGHBOR_OVERFLOW, + DEFAULT_ALPHA, + JVectorFormat::getDefaultNumberOfSubspacesPerVector, + DEFAULT_MINIMUM_BATCH_SIZE_FOR_QUANTIZATION, + DEFAULT_MERGE_ON_DISK); + } + + public JVectorFormat(int minBatchSizeForQuantization, boolean mergeOnDisk) { + this( + NAME, + DEFAULT_MAX_CONN, + DEFAULT_BEAM_WIDTH, + DEFAULT_NEIGHBOR_OVERFLOW, + DEFAULT_ALPHA, + JVectorFormat::getDefaultNumberOfSubspacesPerVector, + minBatchSizeForQuantization, + mergeOnDisk); + 
} + + public JVectorFormat( + int maxConn, + int beamWidth, + float neighborOverflow, + float alpha, + int minBatchSizeForQuantization, + boolean mergeOnDisk) { + this( + NAME, + maxConn, + beamWidth, + neighborOverflow, + alpha, + JVectorFormat::getDefaultNumberOfSubspacesPerVector, + minBatchSizeForQuantization, + mergeOnDisk); + } + + public JVectorFormat( + String name, + int maxConn, + int beamWidth, + float neighborOverflow, + float alpha, + Function numberOfSubspacesPerVectorSupplier, + int minBatchSizeForQuantization, + boolean mergeOnDisk) { + super(name); + this.maxConn = maxConn; + this.beamWidth = beamWidth; + this.numberOfSubspacesPerVectorSupplier = numberOfSubspacesPerVectorSupplier; + this.minBatchSizeForQuantization = minBatchSizeForQuantization; + this.mergeOnDisk = mergeOnDisk; + this.alpha = alpha; + this.neighborOverflow = neighborOverflow; + } + + @Override + public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { + return new JVectorWriter( + state, + maxConn, + beamWidth, + neighborOverflow, + alpha, + numberOfSubspacesPerVectorSupplier, + minBatchSizeForQuantization, + mergeOnDisk); + } + + @Override + public KnnVectorsReader fieldsReader(SegmentReadState state) throws IOException { + return new JVectorReader(state, mergeOnDisk); + } + + @Override + public int getMaxDimensions(String dim) { + return 8192; + } + + /** + * This method returns the default number of subspaces per vector for a given original dimension. + * Should be used as a default value for the number of subspaces per vector in case no value is + * provided. 
+ * + * @param originalDimension original vector dimension + * @return default number of subspaces per vector + */ + public static int getDefaultNumberOfSubspacesPerVector(int originalDimension) { + int compressedBytes; + if (originalDimension <= 32) { + compressedBytes = originalDimension; + } else if (originalDimension <= 64) { + compressedBytes = 32; + } else if (originalDimension <= 200) { + compressedBytes = (int) (originalDimension * 0.5); + } else if (originalDimension <= 400) { + compressedBytes = 100; + } else if (originalDimension <= 768) { + compressedBytes = + 64; // used for benchmarks, cohere wikipedia-768 achieves high recall w/ greater indexing + // throughput + } else if (originalDimension <= 1536) { + compressedBytes = 192; + } else if (originalDimension <= 4096) { + compressedBytes = (int) (originalDimension * 0.0625); + } else { + return (int) (originalDimension * 0.0625); + } + return compressedBytes; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java new file mode 100644 index 000000000000..6483d7c71393 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorIndexWriter.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import io.github.jbellis.jvector.disk.IndexWriter; +import java.io.IOException; +import org.apache.lucene.store.IndexOutput; + +/** + * JVectorRandomAccessWriter is a wrapper around IndexOutput that implements RandomAccessWriter. + * Note: This is not thread safe! + */ +public class JVectorIndexWriter implements IndexWriter { + private final IndexOutput indexOutputDelegate; + + public JVectorIndexWriter(IndexOutput indexOutputDelegate) { + this.indexOutputDelegate = indexOutputDelegate; + } + + @Override + public long position() throws IOException { + return indexOutputDelegate.getFilePointer(); + } + + @Override + public void close() throws IOException { + indexOutputDelegate.close(); + } + + @Override + public void write(int b) throws IOException { + indexOutputDelegate.writeByte((byte) b); + } + + @Override + public void write(byte[] b) throws IOException { + indexOutputDelegate.writeBytes(b, 0, b.length); + } + + @Override + public void write(byte[] b, int off, int len) throws IOException { + indexOutputDelegate.writeBytes(b, off, len); + } + + @Override + public void writeBoolean(boolean v) throws IOException { + indexOutputDelegate.writeByte((byte) (v ? 
1 : 0)); + } + + @Override + public void writeByte(int v) throws IOException { + indexOutputDelegate.writeByte((byte) v); + } + + @Override + public void writeShort(int v) throws IOException { + indexOutputDelegate.writeShort((short) v); + } + + @Override + public void writeChar(int v) throws IOException { + throw new UnsupportedOperationException( + "JVectorRandomAccessWriter does not support writing chars"); + } + + @Override + public void writeInt(int v) throws IOException { + indexOutputDelegate.writeInt(v); + } + + @Override + public void writeLong(long v) throws IOException { + indexOutputDelegate.writeLong(v); + } + + @Override + public void writeFloat(float v) throws IOException { + indexOutputDelegate.writeInt(Float.floatToIntBits(v)); + } + + @Override + public void writeDouble(double v) throws IOException { + writeLong(Double.doubleToLongBits(v)); + } + + @Override + public void writeBytes(String s) throws IOException { + throw new UnsupportedOperationException( + "JVectorIndexWriter does not support writing String as bytes"); + } + + @Override + public void writeChars(String s) throws IOException { + throw new UnsupportedOperationException("JVectorIndexWriter does not support writing chars"); + } + + @Override + public void writeUTF(String s) throws IOException { + throw new UnsupportedOperationException( + "JVectorIndexWriter does not support writing UTF strings"); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java new file mode 100644 index 000000000000..5744170d4e5f --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorKnnCollector.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import java.util.Objects; +import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.knn.KnnSearchStrategy; + +/** + * Wrapper class for KnnCollector that provides passing of additional parameters specific for + * JVector. + */ +public final class JVectorKnnCollector implements KnnCollector { + private final KnnCollector delegate; + private final float threshold; + private final float rerankFloor; + private final int overQueryFactor; + private final boolean usePruning; + + /** + * Constructs a new JVectorKnnCollector. 
+ * + * @param delegate the underlying KnnCollector to delegate calls to + * @param threshold the similarity threshold for JVector + * @param rerankFloor the rerank floor value + * @param overQueryFactor the over-query factor + * @param usePruning whether to apply pruning + */ + public JVectorKnnCollector( + KnnCollector delegate, + float threshold, + float rerankFloor, + int overQueryFactor, + boolean usePruning) { + this.delegate = Objects.requireNonNull(delegate, "delegate must not be null"); + this.threshold = threshold; + this.rerankFloor = rerankFloor; + this.overQueryFactor = overQueryFactor; + this.usePruning = usePruning; + } + + public KnnCollector getDelegate() { + return delegate; + } + + public float getThreshold() { + return threshold; + } + + public float getRerankFloor() { + return rerankFloor; + } + + public int getOverQueryFactor() { + return overQueryFactor; + } + + public boolean isUsePruning() { + return usePruning; + } + + @Override + public boolean earlyTerminated() { + return delegate.earlyTerminated(); + } + + @Override + public void incVisitedCount(int count) { + delegate.incVisitedCount(count); + } + + @Override + public long visitedCount() { + return delegate.visitedCount(); + } + + @Override + public long visitLimit() { + return delegate.visitLimit(); + } + + @Override + public int k() { + return delegate.k(); + } + + @Override + public boolean collect(int docId, float similarity) { + return delegate.collect(docId, similarity); + } + + @Override + public float minCompetitiveSimilarity() { + return delegate.minCompetitiveSimilarity(); + } + + @Override + public TopDocs topDocs() { + return delegate.topDocs(); + } + + @Override + public KnnSearchStrategy getSearchStrategy() { + return null; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java new file mode 100644 index 
000000000000..d7c549fa621d --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorRandomAccessReader.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import io.github.jbellis.jvector.disk.RandomAccessReader; +import io.github.jbellis.jvector.disk.ReaderSupplier; +import java.io.EOFException; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.FloatBuffer; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.IOUtils; + +/** + * Provides random access to vector data stored in the JVector format. Implements {@link + * RandomAccessReader} to support efficient retrieval of individual vectors by document ID or + * ordinal. This class is used during query evaluation and rescoring to fetch exact vectors or + * quantized representations. 
+ */ +public class JVectorRandomAccessReader implements RandomAccessReader { + private final byte[] internalBuffer = new byte[Long.BYTES]; + private final byte[] internalFloatBuffer = new byte[Float.BYTES]; + private final IndexInput indexInputDelegate; + + public JVectorRandomAccessReader(IndexInput indexInputDelegate) { + this.indexInputDelegate = indexInputDelegate; + } + + @Override + public void seek(long offset) throws IOException { + indexInputDelegate.seek(offset); + } + + @Override + public long getPosition() throws IOException { + return indexInputDelegate.getFilePointer(); + } + + @Override + public int readInt() throws IOException { + return indexInputDelegate.readInt(); + } + + @Override + public float readFloat() throws IOException { + indexInputDelegate.readBytes(internalFloatBuffer, 0, Float.BYTES); + FloatBuffer buffer = ByteBuffer.wrap(internalFloatBuffer).asFloatBuffer(); + return buffer.get(); + } + + @Override + public long readLong() throws IOException { + return indexInputDelegate.readLong(); + } + + @Override + public void readFully(byte[] bytes) throws IOException { + indexInputDelegate.readBytes(bytes, 0, bytes.length); + } + + @Override + public void readFully(ByteBuffer buffer) throws IOException { + // validate that the requested bytes actually exist ---- + long remainingInFile = indexInputDelegate.length() - indexInputDelegate.getFilePointer(); + if (buffer.remaining() > remainingInFile) { + throw new EOFException( + "Requested " + buffer.remaining() + " bytes but only " + remainingInFile + " available"); + } + + // Heap buffers with a backing array can be filled in one call ---- + if (buffer.hasArray()) { + int off = buffer.arrayOffset() + buffer.position(); + int len = buffer.remaining(); + indexInputDelegate.readBytes(buffer.array(), off, len); + buffer.position(buffer.limit()); // advance fully + return; + } + + // Direct / non-array buffers: copy in reasonable chunks ---- + while (buffer.hasRemaining()) { + final int bytesToRead = 
Math.min(buffer.remaining(), Long.BYTES); + indexInputDelegate.readBytes(this.internalBuffer, 0, bytesToRead); + buffer.put(this.internalBuffer, 0, bytesToRead); + } + } + + @Override + public void readFully(long[] vector) throws IOException { + for (int i = 0; i < vector.length; i++) { + vector[i] = readLong(); + } + } + + @Override + public void read(int[] ints, int offset, int count) throws IOException { + for (int i = 0; i < count; i++) { + ints[offset + i] = readInt(); + } + } + + @Override + public void read(float[] floats, int offset, int count) throws IOException { + final ByteBuffer byteBuffer = ByteBuffer.allocate(Float.BYTES * count); + indexInputDelegate.readBytes(byteBuffer.array(), offset, Float.BYTES * count); + FloatBuffer buffer = byteBuffer.asFloatBuffer(); + buffer.get(floats, offset, count); + } + + @Override + public void close() throws IOException {} + + @Override + public long length() throws IOException { + return indexInputDelegate.length(); + } + + /** + * Supplies readers which are actually slices of the original IndexInput. We will vend out slices + * in order for us to easily find the footer of the jVector graph index. This is useful because + * our logic that reads the graph that the footer is always at {@link IndexInput#length()} of the + * slice. Which is how {@link + * io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex#load(ReaderSupplier, long)} is working + * behind the scenes. 
The header offset, on the other hand, is flexible because we can provide it + * as a parameter to {@link + * io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex#load(ReaderSupplier, long)} + */ + public static class Supplier implements ReaderSupplier { + private final AtomicInteger readerCount = new AtomicInteger(0); + private final IndexInput currentInput; + private final long sliceStartOffset; + private final long sliceLength; + private final ConcurrentHashMap readers = + new ConcurrentHashMap<>(); + + public Supplier(IndexInput indexInput, long sliceStartOffset, long sliceLength) + throws IOException { + this.currentInput = indexInput; + this.sliceStartOffset = sliceStartOffset; + this.sliceLength = sliceLength; + } + + @Override + public RandomAccessReader get() throws IOException { + synchronized (this) { + final IndexInput input = + currentInput + .slice("Input Slice for the jVector graph or PQ", sliceStartOffset, sliceLength) + .clone(); + + var reader = new JVectorRandomAccessReader(input); + int readerId = readerCount.getAndIncrement(); + readers.put(readerId, reader); + return reader; + } + } + + @Override + public void close() throws IOException { + // Close source of all cloned inputs + IOUtils.closeWhileHandlingException(currentInput); + + // Close all readers + for (RandomAccessReader reader : readers.values()) { + IOUtils.closeWhileHandlingException(reader::close); + } + readers.clear(); + readerCount.set(0); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java new file mode 100644 index 000000000000..7abdc3a73376 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorReader.java @@ -0,0 +1,339 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import io.github.jbellis.jvector.disk.ReaderSupplier; +import io.github.jbellis.jvector.graph.GraphSearcher; +import io.github.jbellis.jvector.graph.SearchResult; +import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex; +import io.github.jbellis.jvector.graph.similarity.DefaultSearchScoreProvider; +import io.github.jbellis.jvector.graph.similarity.ScoreFunction; +import io.github.jbellis.jvector.graph.similarity.SearchScoreProvider; +import io.github.jbellis.jvector.quantization.PQVectors; +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.github.jbellis.jvector.vector.VectorizationProvider; +import io.github.jbellis.jvector.vector.types.VectorFloat; +import io.github.jbellis.jvector.vector.types.VectorTypeSupport; +import java.io.Closeable; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil; +import org.apache.lucene.codecs.hnsw.FlatVectorsFormat; +import org.apache.lucene.codecs.hnsw.FlatVectorsReader; +import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat; +import 
org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.IOUtils; + +/** + * A KnnVectorsReader implementation for the JVector codec that supports reading and searching + * on-disk graph-based vector indices and optional product quantized vectors. Loads per-field vector + * metadata and exposes float vector values, similarity search, and integrity checking. Uses + * GraphSearcher with optional reranking for approximate or exact search. Falls back to Lucene's + * FlatVectorsReader during merge operations. + */ +public class JVectorReader extends KnnVectorsReader { + private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = + VectorizationProvider.getInstance().getVectorTypeSupport(); + private static final FlatVectorsFormat FLAT_VECTORS_FORMAT = + new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.getLucene99FlatVectorsScorer()); + + // TODO: Expose these values with JVectorFormat constructor args + public static final Double DEFAULT_QUERY_SIMILARITY_THRESHOLD = 0.0; + public static final Double DEFAULT_QUERY_RERANK_FLOOR = 0.0; + public static final Boolean DEFAULT_QUERY_USE_PRUNING = false; + public static final int DEFAULT_OVER_QUERY_FACTOR = 3; + + private final FieldInfos fieldInfos; + private final String baseDataFileName; + private final Map fieldEntryMap = new HashMap<>(1); + private final Directory directory; + private final SegmentReadState state; + private final FlatVectorsReader flatVectorsReader; + private final boolean mergeOnDisk; + + public JVectorReader(SegmentReadState state, boolean mergeOnDisk) throws IOException { + 
this.state = state; + this.mergeOnDisk = mergeOnDisk; + this.flatVectorsReader = FLAT_VECTORS_FORMAT.fieldsReader(state); + this.fieldInfos = state.fieldInfos; + this.baseDataFileName = state.segmentInfo.name + "_" + state.segmentSuffix; + String metaFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, JVectorFormat.META_EXTENSION); + this.directory = state.directory; + boolean success = false; + try (ChecksumIndexInput meta = state.directory.openChecksumInput(metaFileName)) { + CodecUtil.checkIndexHeader( + meta, + JVectorFormat.META_CODEC_NAME, + JVectorFormat.VERSION_START, + JVectorFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + readFields(meta); + CodecUtil.checkFooter(meta); + + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this); + } + } + } + + @Override + public void checkIntegrity() throws IOException { + for (FieldEntry fieldEntry : fieldEntryMap.values()) { + try (var indexInput = + state.directory.openInput(fieldEntry.vectorIndexFieldDataFileName, state.context)) { + CodecUtil.checksumEntireFile(indexInput); + } + } + } + + @Override + public FloatVectorValues getFloatVectorValues(String field) throws IOException { + if (mergeOnDisk) { + return flatVectorsReader.getFloatVectorValues(field); + } + final FieldEntry fieldEntry = fieldEntryMap.get(field); + return new JVectorFloatVectorValues(fieldEntry.index, fieldEntry.similarityFunction); + } + + @Override + public ByteVectorValues getByteVectorValues(String field) throws IOException { + return null; + } + + @Override + public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) + throws IOException { + final OnDiskGraphIndex index = fieldEntryMap.get(field).index; + final JVectorKnnCollector jvectorKnnCollector; + if (knnCollector instanceof JVectorKnnCollector) { + jvectorKnnCollector = (JVectorKnnCollector) knnCollector; + } else { + jvectorKnnCollector = + 
new JVectorKnnCollector( + knnCollector, + DEFAULT_QUERY_SIMILARITY_THRESHOLD.floatValue(), + DEFAULT_QUERY_RERANK_FLOOR.floatValue(), + DEFAULT_OVER_QUERY_FACTOR, + DEFAULT_QUERY_USE_PRUNING); + } + + VectorFloat query = VECTOR_TYPE_SUPPORT.createFloatVector(target); + final SearchScoreProvider ssp; + + try (var view = index.getView()) { + if (fieldEntryMap.get(field).pqVectors + != null) { // Quantized, use the precomputed score function + final PQVectors pqVectors = fieldEntryMap.get(field).pqVectors; + // SearchScoreProvider that does a first pass with the loaded-in-memory PQVectors, + // then reranks with the exact vectors that are stored on disk in the index + ScoreFunction.ApproximateScoreFunction asf = + pqVectors.precomputedScoreFunctionFor( + query, fieldEntryMap.get(field).similarityFunction); + ScoreFunction.ExactScoreFunction reranker = + view.rerankerFor(query, fieldEntryMap.get(field).similarityFunction); + ssp = new DefaultSearchScoreProvider(asf, reranker); + } else { // Not quantized, used typical searcher + ssp = + DefaultSearchScoreProvider.exact( + query, fieldEntryMap.get(field).similarityFunction, view); + } + // Acceptdocs document bits filtering + io.github.jbellis.jvector.util.Bits compatibleBits = + doc -> acceptDocs == null || acceptDocs.get(doc); + try (var graphSearcher = new GraphSearcher(index)) { + final var searchResults = + graphSearcher.search( + ssp, + jvectorKnnCollector.k(), + jvectorKnnCollector.k() * jvectorKnnCollector.getOverQueryFactor(), + jvectorKnnCollector.getThreshold(), + jvectorKnnCollector.getRerankFloor(), + compatibleBits); + for (SearchResult.NodeScore ns : searchResults.getNodes()) { + jvectorKnnCollector.collect(ns.node, ns.score); + } + } + } + } + + @Override + public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) + throws IOException { + // TODO + } + + @Override + public void close() throws IOException { + IOUtils.close(flatVectorsReader); + for (FieldEntry fieldEntry 
: fieldEntryMap.values()) { + IOUtils.close(fieldEntry); + } + } + + private void readFields(ChecksumIndexInput meta) throws IOException { + for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) { + final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); + JVectorWriter.VectorIndexFieldMetadata vectorIndexFieldMetadata = + new JVectorWriter.VectorIndexFieldMetadata(meta); + assert fieldInfo.number == vectorIndexFieldMetadata.getFieldNumber(); + fieldEntryMap.put(fieldInfo.name, new FieldEntry(fieldInfo, vectorIndexFieldMetadata)); + } + } + + class FieldEntry implements Closeable { + private final VectorSimilarityFunction similarityFunction; + private final long vectorIndexOffset; + private final long vectorIndexLength; + private final long pqCodebooksAndVectorsLength; + private final long pqCodebooksAndVectorsOffset; + private final String vectorIndexFieldDataFileName; + private final ReaderSupplier indexReaderSupplier; + private final ReaderSupplier pqCodebooksReaderSupplier; + private final OnDiskGraphIndex index; + private final PQVectors pqVectors; // The product quantized vectors with their codebooks + + public FieldEntry( + FieldInfo fieldInfo, JVectorWriter.VectorIndexFieldMetadata vectorIndexFieldMetadata) + throws IOException { + this.similarityFunction = + VectorSimilarityMapper.ordToDistFunc( + vectorIndexFieldMetadata.getVectorSimilarityFunction().ordinal()); + this.vectorIndexOffset = vectorIndexFieldMetadata.getVectorIndexOffset(); + this.vectorIndexLength = vectorIndexFieldMetadata.getVectorIndexLength(); + this.pqCodebooksAndVectorsLength = vectorIndexFieldMetadata.getPqCodebooksAndVectorsLength(); + this.pqCodebooksAndVectorsOffset = vectorIndexFieldMetadata.getPqCodebooksAndVectorsOffset(); + + this.vectorIndexFieldDataFileName = + baseDataFileName + "_" + fieldInfo.name + "." 
+ JVectorFormat.VECTOR_INDEX_EXTENSION; + + // For the slice we would like to include the Lucene header, unfortunately, we have to do this + // because jVector use global + // offsets instead of local offsets + final long sliceLength = + vectorIndexLength + + CodecUtil.indexHeaderLength( + JVectorFormat.VECTOR_INDEX_CODEC_NAME, state.segmentSuffix); + // Load the graph index + this.indexReaderSupplier = + new JVectorRandomAccessReader.Supplier( + directory.openInput(vectorIndexFieldDataFileName, state.context), 0, sliceLength); + this.index = OnDiskGraphIndex.load(indexReaderSupplier, vectorIndexOffset); + + // If quantized load the compressed product quantized vectors with their codebooks + if (pqCodebooksAndVectorsLength > 0) { + assert pqCodebooksAndVectorsOffset > 0; + if (pqCodebooksAndVectorsOffset < vectorIndexOffset) { + throw new IllegalArgumentException( + "pqCodebooksAndVectorsOffset must be greater than vectorIndexOffset"); + } + this.pqCodebooksReaderSupplier = + new JVectorRandomAccessReader.Supplier( + directory.openInput(vectorIndexFieldDataFileName, state.context), + pqCodebooksAndVectorsOffset, + pqCodebooksAndVectorsLength); + + try (final var randomAccessReader = pqCodebooksReaderSupplier.get()) { + this.pqVectors = PQVectors.load(randomAccessReader); + } + } else { + this.pqCodebooksReaderSupplier = null; + this.pqVectors = null; + } + } + + @Override + public void close() throws IOException { + if (indexReaderSupplier != null) { + IOUtils.close(indexReaderSupplier::close); + } + if (pqCodebooksReaderSupplier != null) { + IOUtils.close(pqCodebooksReaderSupplier::close); + } + } + } + + /** + * This is a list of vector similarity functions that are currently supported by this version of + * the JVector Lucene codec: Euclidean, Dot Product/Angular, Cosine Note: If benchmarking with + * luceneutil, these are selectable through knnPerfTest.py + */ + public static class VectorSimilarityMapper { + + public static final List 
JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS = + List.of( + VectorSimilarityFunction.EUCLIDEAN, + VectorSimilarityFunction.DOT_PRODUCT, + VectorSimilarityFunction.COSINE); + + public static final Map< + org.apache.lucene.index.VectorSimilarityFunction, VectorSimilarityFunction> + LUCENE_TO_JVECTOR_MAP = + Map.of( + org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN, + VectorSimilarityFunction.EUCLIDEAN, + org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT, + VectorSimilarityFunction.DOT_PRODUCT, + org.apache.lucene.index.VectorSimilarityFunction.COSINE, + VectorSimilarityFunction.COSINE); + + public static int distFuncToOrd(org.apache.lucene.index.VectorSimilarityFunction func) { + if (LUCENE_TO_JVECTOR_MAP.containsKey(func)) { + return JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.indexOf(LUCENE_TO_JVECTOR_MAP.get(func)); + } + + throw new IllegalArgumentException("Invalid distance function: " + func); + } + + public static VectorSimilarityFunction ordToDistFunc(int ord) { + return JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.get(ord); + } + + public static org.apache.lucene.index.VectorSimilarityFunction ordToLuceneDistFunc(int ord) { + if (ord < 0 || ord >= JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.size()) { + throw new IllegalArgumentException("Invalid ord: " + ord); + } + VectorSimilarityFunction jvectorFunc = JVECTOR_SUPPORTED_SIMILARITY_FUNCTIONS.get(ord); + for (Map.Entry + entry : LUCENE_TO_JVECTOR_MAP.entrySet()) { + if (entry.getValue().equals(jvectorFunc)) { + return entry.getKey(); + } + } + throw new IllegalStateException( + "No matching Lucene VectorSimilarityFunction found for ordinal: " + ord); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java new file mode 100644 index 000000000000..f7eb8d42c6ba --- /dev/null +++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorVectorScorer.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.github.jbellis.jvector.vector.types.VectorFloat; +import java.io.IOException; +import org.apache.lucene.index.KnnVectorValues; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.VectorScorer; + +/** + * A VectorScorer that computes similarity scores between a target query vector and document vectors + * using JVector's float vector representation. Uses a VectorSimilarityFunction to compare the + * target vector with each document vector, and iterates over candidates via the associated + * DocIndexIterator. 
+ */ +public class JVectorVectorScorer implements VectorScorer { + private final JVectorFloatVectorValues floatVectorValues; + private final KnnVectorValues.DocIndexIterator docIndexIterator; + private final VectorFloat target; + private final VectorSimilarityFunction similarityFunction; + + public JVectorVectorScorer( + JVectorFloatVectorValues vectorValues, + VectorFloat target, + VectorSimilarityFunction similarityFunction) { + this.floatVectorValues = vectorValues; + this.docIndexIterator = floatVectorValues.iterator(); + this.target = target; + this.similarityFunction = similarityFunction; + } + + @Override + public float score() throws IOException { + return similarityFunction.compare( + target, floatVectorValues.vectorFloatValue(docIndexIterator.index())); + } + + @Override + public DocIdSetIterator iterator() { + return docIndexIterator; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java new file mode 100644 index 000000000000..946775c89ec6 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/JVectorWriter.java @@ -0,0 +1,776 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.sandbox.codecs.jvector; + +import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding; + +import io.github.jbellis.jvector.graph.GraphIndexBuilder; +import io.github.jbellis.jvector.graph.OnHeapGraphIndex; +import io.github.jbellis.jvector.graph.RandomAccessVectorValues; +import io.github.jbellis.jvector.graph.disk.OnDiskSequentialGraphIndexWriter; +import io.github.jbellis.jvector.graph.disk.feature.Feature; +import io.github.jbellis.jvector.graph.disk.feature.FeatureId; +import io.github.jbellis.jvector.graph.disk.feature.InlineVectors; +import io.github.jbellis.jvector.graph.similarity.BuildScoreProvider; +import io.github.jbellis.jvector.quantization.ProductQuantization; +import io.github.jbellis.jvector.vector.VectorizationProvider; +import io.github.jbellis.jvector.vector.types.VectorFloat; +import io.github.jbellis.jvector.vector.types.VectorTypeSupport; +import java.io.DataOutput; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.function.Function; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.KnnFieldVectorsWriter; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.codecs.hnsw.FlatFieldVectorsWriter; +import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil; +import org.apache.lucene.codecs.hnsw.FlatVectorsFormat; +import org.apache.lucene.codecs.hnsw.FlatVectorsWriter; +import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.IndexFileNames; +import 
org.apache.lucene.index.KnnVectorValues; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Sorter; +import org.apache.lucene.index.VectorEncoding; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.hnsw.CloseableRandomVectorScorerSupplier; + +/** + * Writes vector data using the JVector format. This class is responsible for serializing vectors + * and building index structures such as graphs or quantization data, during the indexing process. + */ +public class JVectorWriter extends KnnVectorsWriter { + + private static final long SHALLOW_RAM_BYTES_USED = + RamUsageEstimator.shallowSizeOfInstance(JVectorWriter.class); + private static final FlatVectorsFormat FLAT_VECTORS_FORMAT = + new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.getLucene99FlatVectorsScorer()); + private final List> fields = new ArrayList<>(); + + private final IndexOutput meta; + private final IndexOutput vectorIndex; + private final FlatVectorsWriter flatVectorWriter; + private final String indexDataFileName; + private final String baseDataFileName; + private final SegmentWriteState segmentWriteState; + private final int maxConn; + private final int beamWidth; + private final float degreeOverflow; + private final float alpha; + private final Function + numberOfSubspacesPerVectorSupplier; // Number of subspaces used per vector for PQ quantization + // as a function of the original dimension + private final int + minimumBatchSizeForQuantization; // Threshold for the vector count above which we will trigger + // PQ quantization + private final boolean mergeOnDisk; + + private boolean finished = false; + + public JVectorWriter( + SegmentWriteState 
segmentWriteState, + int maxConn, + int beamWidth, + float degreeOverflow, + float alpha, + Function numberOfSubspacesPerVectorSupplier, + int minimumBatchSizeForQuantization, + boolean mergeOnDisk) + throws IOException { + this.segmentWriteState = segmentWriteState; + this.maxConn = maxConn; + this.beamWidth = beamWidth; + this.degreeOverflow = degreeOverflow; + this.alpha = alpha; + this.numberOfSubspacesPerVectorSupplier = numberOfSubspacesPerVectorSupplier; + this.minimumBatchSizeForQuantization = minimumBatchSizeForQuantization; + this.mergeOnDisk = mergeOnDisk; + this.flatVectorWriter = FLAT_VECTORS_FORMAT.fieldsWriter(segmentWriteState); + String metaFileName = + IndexFileNames.segmentFileName( + segmentWriteState.segmentInfo.name, + segmentWriteState.segmentSuffix, + JVectorFormat.META_EXTENSION); + + this.indexDataFileName = + IndexFileNames.segmentFileName( + segmentWriteState.segmentInfo.name, + segmentWriteState.segmentSuffix, + JVectorFormat.VECTOR_INDEX_EXTENSION); + this.baseDataFileName = + segmentWriteState.segmentInfo.name + "_" + segmentWriteState.segmentSuffix; + + boolean success = false; + try { + meta = segmentWriteState.directory.createOutput(metaFileName, segmentWriteState.context); + vectorIndex = + segmentWriteState.directory.createOutput(indexDataFileName, segmentWriteState.context); + CodecUtil.writeIndexHeader( + meta, + JVectorFormat.META_CODEC_NAME, + JVectorFormat.VERSION_CURRENT, + segmentWriteState.segmentInfo.getId(), + segmentWriteState.segmentSuffix); + + CodecUtil.writeIndexHeader( + vectorIndex, + JVectorFormat.VECTOR_INDEX_CODEC_NAME, + JVectorFormat.VERSION_CURRENT, + segmentWriteState.segmentInfo.getId(), + segmentWriteState.segmentSuffix); + + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this); + } + } + } + + @SuppressWarnings("unchecked") + @Override + public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException { + if (fieldInfo.getVectorEncoding() == 
VectorEncoding.BYTE) {
+      // Byte vectors are rejected up front; only float vectors are handled by this writer.
+      final String errorMessage =
+          "byte[] vectors are not supported in JVector. "
+              + "Instead you should only use float vectors and leverage product quantization during indexing. "
+              + "This can provide much greater savings in storage and memory";
+      throw new UnsupportedOperationException(errorMessage);
+    }
+    final FlatFieldVectorsWriter flatFieldVectorsWriter = flatVectorWriter.addField(fieldInfo);
+    FieldWriter newField =
+        new FieldWriter<>(fieldInfo, segmentWriteState.segmentInfo.name, flatFieldVectorsWriter);
+
+    // Registered in `fields` so that flush() and ramBytesUsed() iterate over this writer.
+    fields.add(newField);
+    return newField;
+  }
+
+  /**
+   * Creates a field writer for the merge path, backed by already-merged vector values instead of
+   * a flat field writer. Unlike {@link #addField(FieldInfo)}, the returned writer is not added to
+   * {@code fields}.
+   *
+   * @param fieldInfo the field being merged; must not use {@link VectorEncoding#BYTE}
+   * @param mergeFloatVector merged view of the field's float vectors
+   * @param ravv random-access view over the same vectors, used to build the graph
+   * @return a new field writer for {@code fieldInfo}
+   * @throws UnsupportedOperationException if the field uses byte vectors
+   */
+  @SuppressWarnings("unchecked")
+  public KnnFieldVectorsWriter addMergeField(
+      FieldInfo fieldInfo, FloatVectorValues mergeFloatVector, RandomAccessVectorValues ravv)
+      throws UnsupportedOperationException {
+    if (fieldInfo.getVectorEncoding() == VectorEncoding.BYTE) {
+      final String errorMessage =
+          "byte[] vectors are not supported in JVector. "
+              + "Instead you should only use float vectors and leverage product quantization during indexing. "
+              + "This can provide much greater savings in storage and memory";
+      throw new UnsupportedOperationException(errorMessage);
+    }
+    return new FieldWriter<>(fieldInfo, segmentWriteState.segmentInfo.name, mergeFloatVector, ravv);
+  }
+
+  /**
+   * Merges a single vector field. The flat vectors are merged first through {@code
+   * flatVectorWriter.mergeOneFieldToIndex}; the graph is then built and written with {@code
+   * writeField}. When {@code mergeOnDisk} is set, vectors are read through a {@link
+   * RandomAccessMergedFloatVectorValues} over the source readers; otherwise every merged vector
+   * is re-added through {@link #addField(FieldInfo)}.
+   *
+   * @param fieldInfo the field to merge
+   * @param mergeState the state of the running merge
+   * @throws IOException if reading or writing vector data fails
+   */
+  @SuppressWarnings("unchecked")
+  @Override
+  public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException {
+    CloseableRandomVectorScorerSupplier scorerSupplier =
+        flatVectorWriter.mergeOneFieldToIndex(fieldInfo, mergeState);
+    try {
+      switch (fieldInfo.getVectorEncoding()) {
+        case BYTE:
+          // addField rejects BYTE with UnsupportedOperationException, so this branch currently
+          // fails before any value is copied.
+          var byteWriter = (FieldWriter) addField(fieldInfo);
+          ByteVectorValues mergedBytes =
+              MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState);
+          var iterator = mergedBytes.iterator();
+          for (int doc = iterator.nextDoc();
+              doc != DocIdSetIterator.NO_MORE_DOCS;
+              doc = iterator.nextDoc()) {
+            // vectorValue takes a vector ordinal, not a doc id; the two differ for sparse fields.
+            byteWriter.addValue(doc, mergedBytes.vectorValue(iterator.index()));
+          }
+          writeField(byteWriter);
+          break;
+        case FLOAT32:
+          final FieldWriter floatVectorFieldWriter;
+          FloatVectorValues mergeFloatVector =
+              MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
+          if (mergeOnDisk) {
+            final var ravv =
+                new RandomAccessMergedFloatVectorValues(fieldInfo, mergeState, scorerSupplier);
+            floatVectorFieldWriter =
+                (FieldWriter) addMergeField(fieldInfo, mergeFloatVector, ravv);
+          } else {
+            floatVectorFieldWriter = (FieldWriter) addField(fieldInfo);
+            var itr = mergeFloatVector.iterator();
+            for (int doc = itr.nextDoc();
+                doc != DocIdSetIterator.NO_MORE_DOCS;
+                doc = itr.nextDoc()) {
+              // Look the vector up by the iterator's ordinal; passing the doc id is only correct
+              // when every document has a vector.
+              floatVectorFieldWriter.addValue(doc, mergeFloatVector.vectorValue(itr.index()));
+            }
+          }
+          writeField(floatVectorFieldWriter);
+          break;
+      }
+    } finally {
+      // The supplier returned by the flat writer must be released even if the merge fails.
+      IOUtils.close(scorerSupplier);
+    }
+  }
+
+  /**
+   * Flushes the flat vectors and then writes the graph of every field registered through {@link
+   * #addField(FieldInfo)}. Index sorting is not supported: a non-null {@code sortMap} results in
+   * an {@link UnsupportedOperationException}.
+   */
+  @Override
+  public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException {
+    flatVectorWriter.flush(maxDoc, sortMap);
+    for (FieldWriter field : fields) {
+      if (sortMap == null) {
+        writeField(field);
+      } else {
+        throw new
UnsupportedOperationException("Not implemented yet"); + } + } + } + + private void writeField(FieldWriter fieldData) throws IOException { + OnHeapGraphIndex graph = fieldData.getGraph(); + final var vectorIndexFieldMetadata = writeGraph(graph, fieldData); + meta.writeInt(fieldData.fieldInfo.number); + vectorIndexFieldMetadata.toOutput(meta); + } + + private VectorIndexFieldMetadata writeGraph(OnHeapGraphIndex graph, FieldWriter fieldData) + throws IOException { + final String vectorIndexFieldFileName = + baseDataFileName + + "_" + + fieldData.fieldInfo.name + + "." + + JVectorFormat.VECTOR_INDEX_EXTENSION; + + final int fieldNumber = fieldData.fieldInfo.number; + final VectorEncoding vectorEncoding = fieldData.fieldInfo.getVectorEncoding(); + final VectorSimilarityFunction vectorSimilarityFunction = + fieldData.fieldInfo.getVectorSimilarityFunction(); + final int vectorDimension = fieldData.fieldInfo.getVectorDimension(); + final long vectorIndexOffset; + final long vectorIndexLength; + final long pqCodebooksAndVectorsOffset; + final long pqCodebooksAndVectorsLength; + + try (IndexOutput indexOutput = + segmentWriteState.directory.createOutput( + vectorIndexFieldFileName, segmentWriteState.context); + final var jVectorIndexWriter = new JVectorIndexWriter(indexOutput); ) { + CodecUtil.writeIndexHeader( + indexOutput, + JVectorFormat.VECTOR_INDEX_CODEC_NAME, + JVectorFormat.VERSION_CURRENT, + segmentWriteState.segmentInfo.getId(), + segmentWriteState.segmentSuffix); + final long startOffset = indexOutput.getFilePointer(); + vectorIndexOffset = startOffset; + + try (var writer = + new OnDiskSequentialGraphIndexWriter.Builder(graph, jVectorIndexWriter) + .with(new InlineVectors(fieldData.randomAccessVectorValues.dimension())) + .build()) { + var suppliers = + Feature.singleStateFactory( + FeatureId.INLINE_VECTORS, + nodeId -> + new InlineVectors.State(fieldData.randomAccessVectorValues.getVector(nodeId))); + + writer.write(suppliers); + long endGraphOffset = 
jVectorIndexWriter.position(); + vectorIndexLength = endGraphOffset - startOffset; + + if (fieldData.randomAccessVectorValues.size() >= minimumBatchSizeForQuantization) { + writePQCodebooksAndVectors(jVectorIndexWriter, fieldData); + pqCodebooksAndVectorsLength = jVectorIndexWriter.position() - endGraphOffset; + pqCodebooksAndVectorsOffset = endGraphOffset; + } else { + pqCodebooksAndVectorsOffset = 0; + pqCodebooksAndVectorsLength = 0; + } + CodecUtil.writeFooter(indexOutput); + } + return new VectorIndexFieldMetadata( + fieldNumber, + vectorEncoding, + vectorSimilarityFunction, + vectorDimension, + vectorIndexOffset, + vectorIndexLength, + pqCodebooksAndVectorsOffset, + pqCodebooksAndVectorsLength); + } + } + + /** + * Writes the product quantization (PQ) codebooks and encoded vectors to a DataOutput stream. This + * method compresses the original vector data using product quantization and encodes all vector + * values into a smaller, compressed form for storage or transfer. + * + * @param out The DataOutput stream where the compressed PQ codebooks and encoded vectors will be + * written. + * @param fieldData The field writer object providing access to the vector data to be compressed. + * @throws IOException If an I/O error occurs during writing. 
+ */ + private void writePQCodebooksAndVectors(DataOutput out, FieldWriter fieldData) + throws IOException { + final var M = + numberOfSubspacesPerVectorSupplier.apply(fieldData.randomAccessVectorValues.dimension()); + final var numberOfClustersPerSubspace = + Math.min(256, fieldData.randomAccessVectorValues.size()); // number of centroids per + // subspace + ProductQuantization pq = + ProductQuantization.compute( + fieldData.randomAccessVectorValues, + M, // number of subspaces + numberOfClustersPerSubspace, // number of centroids per subspace + fieldData.fieldInfo.getVectorSimilarityFunction() + == VectorSimilarityFunction.EUCLIDEAN); // center the dataset + var pqv = pq.encodeAll(fieldData.randomAccessVectorValues); + // write the compressed vectors to disk + pqv.write(out); + } + + /** + * Metadata associated with a single field's vector index. Includes information such as offsets, + * lengths, encoding types, and other field-specific indexing data required during read and write + * phases. 
+   */
+  public static class VectorIndexFieldMetadata {
+    int fieldNumber;
+    VectorEncoding vectorEncoding;
+    VectorSimilarityFunction vectorSimilarityFunction;
+    int vectorDimension;
+    long vectorIndexOffset;
+    long vectorIndexLength;
+    long pqCodebooksAndVectorsOffset;
+    long pqCodebooksAndVectorsLength;
+
+    /**
+     * Serializes this metadata. The field order and widths written here must stay in sync with
+     * {@link #VectorIndexFieldMetadata(IndexInput)}, which reads them back in the same order.
+     *
+     * @param out the output to write to
+     * @throws IOException if writing fails
+     */
+    public void toOutput(IndexOutput out) throws IOException {
+      out.writeInt(fieldNumber);
+      // Encoding and similarity are both stored as 4-byte ints.
+      out.writeInt(vectorEncoding.ordinal());
+      out.writeInt(JVectorReader.VectorSimilarityMapper.distFuncToOrd(vectorSimilarityFunction));
+      out.writeVInt(vectorDimension);
+      out.writeVLong(vectorIndexOffset);
+      out.writeVLong(vectorIndexLength);
+      out.writeVLong(pqCodebooksAndVectorsOffset);
+      out.writeVLong(pqCodebooksAndVectorsLength);
+    }
+
+    /** Creates metadata from explicit values; each argument is stored as-is. */
+    public VectorIndexFieldMetadata(
+        int fieldNumber,
+        VectorEncoding vectorEncoding,
+        VectorSimilarityFunction vectorSimilarityFunction,
+        int vectorDimension,
+        long vectorIndexOffset,
+        long vectorIndexLength,
+        long pqCodebooksAndVectorsOffset,
+        long pqCodebooksAndVectorsLength) {
+      this.fieldNumber = fieldNumber;
+      this.vectorEncoding = vectorEncoding;
+      this.vectorSimilarityFunction = vectorSimilarityFunction;
+      this.vectorDimension = vectorDimension;
+      this.vectorIndexOffset = vectorIndexOffset;
+      this.vectorIndexLength = vectorIndexLength;
+      this.pqCodebooksAndVectorsOffset = pqCodebooksAndVectorsOffset;
+      this.pqCodebooksAndVectorsLength = pqCodebooksAndVectorsLength;
+    }
+
+    /**
+     * Reads metadata previously written by {@link #toOutput(IndexOutput)}.
+     *
+     * @param in the input positioned at the start of a serialized metadata entry
+     * @throws IOException if reading fails
+     */
+    public VectorIndexFieldMetadata(IndexInput in) throws IOException {
+      this.fieldNumber = in.readInt();
+      // toOutput stores the encoding ordinal with writeInt, so this call has to consume a 4-byte
+      // int, not a single byte.
+      // NOTE(review): confirm against Lucene99HnswVectorsReader.readVectorEncoding.
+      this.vectorEncoding = readVectorEncoding(in);
+      this.vectorSimilarityFunction =
+          JVectorReader.VectorSimilarityMapper.ordToLuceneDistFunc(in.readInt());
+      this.vectorDimension = in.readVInt();
+      this.vectorIndexOffset = in.readVLong();
+      this.vectorIndexLength = in.readVLong();
+      this.pqCodebooksAndVectorsOffset = in.readVLong();
+      this.pqCodebooksAndVectorsLength = in.readVLong();
+    }
+
+    public int getFieldNumber() {
+      return
fieldNumber; + } + + public VectorEncoding getVectorEncoding() { + return vectorEncoding; + } + + public VectorSimilarityFunction getVectorSimilarityFunction() { + return vectorSimilarityFunction; + } + + public int getVectorDimension() { + return vectorDimension; + } + + public long getVectorIndexOffset() { + return vectorIndexOffset; + } + + public long getVectorIndexLength() { + return vectorIndexLength; + } + + public long getPqCodebooksAndVectorsOffset() { + return pqCodebooksAndVectorsOffset; + } + + public long getPqCodebooksAndVectorsLength() { + return pqCodebooksAndVectorsLength; + } + } + + @Override + public void finish() throws IOException { + if (finished) { + throw new IllegalStateException("already finished"); + } + finished = true; + + if (meta != null) { + meta.writeInt(-1); + CodecUtil.writeFooter(meta); + } + + if (vectorIndex != null) { + CodecUtil.writeFooter(vectorIndex); + } + + flatVectorWriter.finish(); + } + + @Override + public void close() throws IOException { + IOUtils.close(meta, vectorIndex, flatVectorWriter); + } + + @Override + public long ramBytesUsed() { + long total = SHALLOW_RAM_BYTES_USED; + for (FieldWriter field : fields) { + total += field.ramBytesUsed(); + } + return total; + } + + class FieldWriter extends KnnFieldVectorsWriter { + private final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(FieldWriter.class); + private final FieldInfo fieldInfo; + private int lastDocID = -1; + private final GraphIndexBuilder graphIndexBuilder; + private final RandomAccessVectorValues randomAccessVectorValues; + private final FloatVectorValues mergedFloatVector; + private final FlatFieldVectorsWriter flatFieldVectorsWriter; + private final BuildScoreProvider buildScoreProvider; + + FieldWriter( + FieldInfo fieldInfo, + String segmentName, + FloatVectorValues mergedFloatVector, + RandomAccessVectorValues ravv) { + this.flatFieldVectorsWriter = null; + this.randomAccessVectorValues = ravv; + this.mergedFloatVector = 
mergedFloatVector;
+      this.fieldInfo = fieldInfo;
+      this.buildScoreProvider =
+          BuildScoreProvider.randomAccessScoreProvider(
+              randomAccessVectorValues, getVectorSimilarityFunction(fieldInfo));
+      this.graphIndexBuilder =
+          new GraphIndexBuilder(
+              buildScoreProvider,
+              fieldInfo.getVectorDimension(),
+              maxConn,
+              beamWidth,
+              degreeOverflow,
+              alpha,
+              true);
+    }
+
+    /**
+     * Creates a writer for the indexing path: values are buffered by {@code
+     * flatFieldVectorsWriter} and exposed to the graph builder through {@link
+     * RandomAccessVectorValuesOverFlatFields}. No merged view is attached.
+     */
+    FieldWriter(
+        FieldInfo fieldInfo, String segmentName, FlatFieldVectorsWriter flatFieldVectorsWriter) {
+      this.flatFieldVectorsWriter = flatFieldVectorsWriter;
+      this.randomAccessVectorValues =
+          new RandomAccessVectorValuesOverFlatFields(flatFieldVectorsWriter, fieldInfo);
+      this.mergedFloatVector = null;
+      this.fieldInfo = fieldInfo;
+      this.buildScoreProvider =
+          BuildScoreProvider.randomAccessScoreProvider(
+              randomAccessVectorValues, getVectorSimilarityFunction(fieldInfo));
+      this.graphIndexBuilder =
+          new GraphIndexBuilder(
+              buildScoreProvider,
+              randomAccessVectorValues.dimension(),
+              maxConn,
+              beamWidth,
+              degreeOverflow,
+              alpha,
+              true);
+    }
+
+    /**
+     * Buffers one float vector for {@code docID} in the flat field writer.
+     *
+     * @throws IllegalArgumentException if {@code docID} is not greater than the last added doc id,
+     *     or if the value is not a {@code float[]}
+     * @throws IllegalStateException if this writer was created for merging and therefore has no
+     *     flat field writer to buffer into
+     */
+    @Override
+    public void addValue(int docID, T vectorValue) throws IOException {
+      if (docID <= lastDocID) {
+        throw new IllegalArgumentException(
+            "VectorValuesField \""
+                + fieldInfo.name
+                + "\" appears more than once in this document (only one value is allowed per field)");
+      }
+      if (flatFieldVectorsWriter == null) {
+        // The merge constructor leaves flatFieldVectorsWriter null; fail with a clear message
+        // instead of a NullPointerException.
+        throw new IllegalStateException(
+            "addValue is not supported for field \""
+                + fieldInfo.name
+                + "\": this writer was created for merging and has no flat vector writer");
+      }
+      if (vectorValue instanceof float[]) {
+        flatFieldVectorsWriter.addValue(docID, vectorValue);
+      } else {
+        throw new IllegalArgumentException("Unsupported vector type: " + vectorValue.getClass());
+      }
+
+      lastDocID = docID;
+    }
+
+    @Override
+    public T copyValue(T vectorValue) {
+      throw new UnsupportedOperationException("copyValue not supported");
+    }
+
+    /**
+     * Returns the shallow size of this writer plus the memory held by the flat field writer, when
+     * there is one (merge-path writers have none).
+     */
+    @Override
+    public long ramBytesUsed() {
+      final long flatBytes =
+          flatFieldVectorsWriter == null ? 0L : flatFieldVectorsWriter.ramBytesUsed();
+      return SHALLOW_SIZE + flatBytes;
+    }
+
+    /** Maps the field's Lucene similarity function to the equivalent jvector one. */
+    io.github.jbellis.jvector.vector.VectorSimilarityFunction getVectorSimilarityFunction(
+        FieldInfo fieldInfo) {
+      switch (fieldInfo.getVectorSimilarityFunction()) {
+        case EUCLIDEAN:
+          return
io.github.jbellis.jvector.vector.VectorSimilarityFunction.EUCLIDEAN; + case COSINE: + return io.github.jbellis.jvector.vector.VectorSimilarityFunction.COSINE; + case DOT_PRODUCT: + return io.github.jbellis.jvector.vector.VectorSimilarityFunction.DOT_PRODUCT; + // $CASES-OMITTED$ + default: + throw new IllegalArgumentException( + "Unsupported similarity function: " + fieldInfo.getVectorSimilarityFunction()); + } + } + + /** + * Builds and returns the {@link OnHeapGraphIndex} for the current field by adding all vector + * entries to the graph index builder. If a merged vector view is available, it uses the + * associated iterator to add only the live documents. Otherwise, it adds all vectors in the + * {@code randomAccessVectorValues}. After populating the graph, it performs any necessary + * cleanup and returns the final in-memory graph index. + * + * @return the constructed {@link OnHeapGraphIndex} + * @throws IOException if reading vector data fails + */ + public OnHeapGraphIndex getGraph() throws IOException { + + if (mergedFloatVector != null) { + var itr = mergedFloatVector.iterator(); + for (int doc = itr.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = itr.nextDoc()) { + graphIndexBuilder.addGraphNode(doc, randomAccessVectorValues.getVector(doc)); + } + } else { + for (int i = 0; i < randomAccessVectorValues.size(); i++) { + graphIndexBuilder.addGraphNode(i, randomAccessVectorValues.getVector(i)); + } + } + + graphIndexBuilder.cleanup(); + return graphIndexBuilder.getGraph(); + } + } + + static class RandomAccessMergedFloatVectorValues implements RandomAccessVectorValues { + private static final int READER_ID = 0; + private static final int READER_ORD = 1; + + private final VectorTypeSupport VECTOR_TYPE_SUPPORT = + VectorizationProvider.getInstance().getVectorTypeSupport(); + + private final KnnVectorsReader[] readers; + private final FloatVectorValues[] perReaderFloatVectorValues; + + private final int totalDocsCount; + private final int size; + + 
private final int[][] ordMapping;
+
+    private final int dimension;
+
+    private String fieldName;
+
+    /**
+     * Builds a random-access view over the float vectors of {@code fieldInfo} across all segments
+     * taking part in the merge. For every live source document that has a vector, {@code
+     * ordMapping[mergedDocId]} records which reader holds the vector and the vector's ordinal
+     * inside that reader.
+     *
+     * @param fieldInfo the field being merged
+     * @param mergeState the merge state providing the per-segment readers and doc maps
+     * @param scorerSupplier not used by this constructor
+     * @throws IOException if the per-segment vector values cannot be read
+     * @throws IllegalStateException if fewer documents were iterated than vectors were counted
+     */
+    public RandomAccessMergedFloatVectorValues(
+        FieldInfo fieldInfo,
+        MergeState mergeState,
+        CloseableRandomVectorScorerSupplier scorerSupplier)
+        throws IOException {
+      this.fieldName = fieldInfo.name;
+      this.totalDocsCount = Math.toIntExact(Arrays.stream(mergeState.maxDocs).asLongStream().sum());
+
+      int totalVectorsCount = 0;
+      int dimension = 0;
+
+      List<KnnVectorsReader> allReaders = new ArrayList<>();
+      // segmentOfReader[k] is the index into the mergeState arrays of the k-th retained reader.
+      // Segments without vectors for this field are skipped, so positions in `readers` and in
+      // mergeState.docMaps do not line up on their own.
+      final int[] segmentOfReader = new int[mergeState.knnVectorsReaders.length];
+
+      for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) {
+        FieldInfos fieldInfos = mergeState.fieldInfos[i];
+        if (MergedVectorValues.hasVectorValues(fieldInfos, fieldName)) {
+          KnnVectorsReader reader = mergeState.knnVectorsReaders[i];
+          if (reader != null) {
+            FloatVectorValues values = reader.getFloatVectorValues(fieldName);
+            if (values != null) {
+              segmentOfReader[allReaders.size()] = i;
+              allReaders.add(reader);
+              totalVectorsCount += values.size();
+              dimension = Math.max(dimension, values.dimension());
+            }
+          }
+        }
+      }
+
+      assert (totalVectorsCount <= totalDocsCount)
+          : "Total number of vectors exceeds the total number of documents";
+      assert (dimension > 0) : "No vectors found for field " + fieldName;
+
+      this.size = totalVectorsCount;
+      this.readers = allReaders.toArray(new KnnVectorsReader[0]);
+      this.perReaderFloatVectorValues = new FloatVectorValues[readers.length];
+      this.dimension = dimension;
+
+      // Indexed by merged doc id. Entries for merged doc ids that have no vector keep the
+      // zero-initialized value {0, 0}.
+      this.ordMapping = new int[totalDocsCount][2];
+
+      int documentsIterated = 0;
+
+      for (int readerIdx = 0; readerIdx < readers.length; readerIdx++) {
+        final FloatVectorValues values = readers[readerIdx].getFloatVectorValues(fieldName);
+        perReaderFloatVectorValues[readerIdx] = values;
+        // Use the doc map of the segment this reader came from, not the one at readerIdx.
+        final MergeState.DocMap docMap = mergeState.docMaps[segmentOfReader[readerIdx]];
+
+        KnnVectorValues.DocIndexIterator it = values.iterator();
+        for (int docId = it.nextDoc();
+            docId != DocIdSetIterator.NO_MORE_DOCS;
+            docId = it.nextDoc()) {
+          final int globalOrd = docMap.get(docId);
+          if (globalOrd != -1) {
+            ordMapping[globalOrd][READER_ID] = readerIdx;
+            // Store the vector ordinal, which is what vectorValue(int) expects in getVector;
+            // the doc id only matches it when every document has a vector.
+            ordMapping[globalOrd][READER_ORD] = it.index();
+          }
+
+          documentsIterated++;
+        }
+      }
+
+      if (documentsIterated < totalVectorsCount) {
+        throw new IllegalStateException(
+            "More documents were expected than what was found in the readers. "
+                + " Expected at least number of total Vectors: "
+                + totalVectorsCount
+                + " but found only "
+                + documentsIterated
+                + " documents");
+      }
+    }
+
+    @Override
+    public int size() {
+      return size;
+    }
+
+    @Override
+    public int dimension() {
+      return dimension;
+    }
+
+    /**
+     * Returns a copy of the vector mapped to {@code ord}.
+     *
+     * @param ord index into {@code ordMapping}; valid range is {@code [0, totalDocsCount)}
+     * @return a newly allocated vector holding a copy of the stored values
+     * @throws IllegalArgumentException if {@code ord} is out of range
+     * @throws RuntimeException wrapping any {@link IOException} raised while reading
+     */
+    @Override
+    public VectorFloat getVector(int ord) {
+      if (ord < 0 || ord >= totalDocsCount) {
+        throw new IllegalArgumentException("Ordinal out of bounds " + ord);
+      }
+
+      try {
+        final int readerIdx = ordMapping[ord][READER_ID];
+        final int readerOrd = ordMapping[ord][READER_ORD];
+
+        // Reads are serialized on this instance and the array is copied before it is handed out.
+        synchronized (this) {
+          final FloatVectorValues values = perReaderFloatVectorValues[readerIdx];
+          final float[] vector = values.vectorValue(readerOrd);
+          final float[] copy = new float[vector.length];
+          System.arraycopy(vector, 0, copy, 0, vector.length);
+          return VECTOR_TYPE_SUPPORT.createFloatVector(copy);
+        }
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    @Override
+    public boolean isValueShared() {
+      return false;
+    }
+
+    @Override
+    public RandomAccessVectorValues copy() {
+      throw new UnsupportedOperationException("Copy not supported");
+    }
+  }
+
+  static class RandomAccessVectorValuesOverFlatFields implements RandomAccessVectorValues {
+    private final VectorTypeSupport VECTOR_TYPE_SUPPORT =
+        VectorizationProvider.getInstance().getVectorTypeSupport();
+
+    private final FlatFieldVectorsWriter flatFieldVectorsWriter;
+    private final int dimension;
+
+    RandomAccessVectorValuesOverFlatFields(
+        FlatFieldVectorsWriter flatFieldVectorsWriter, FieldInfo fieldInfo) {
+      this.flatFieldVectorsWriter = flatFieldVectorsWriter;
+
this.dimension = fieldInfo.getVectorDimension(); + } + + @Override + public int size() { + return flatFieldVectorsWriter.getVectors().size(); + } + + @Override + public int dimension() { + return dimension; + } + + @Override + public VectorFloat getVector(int nodeId) { + final float[] vector = (float[]) flatFieldVectorsWriter.getVectors().get(nodeId); + return VECTOR_TYPE_SUPPORT.createFloatVector(vector); + } + + @Override + public boolean isValueShared() { + return false; + } + + @Override + public RandomAccessVectorValues copy() { + throw new UnsupportedOperationException("Copy not supported"); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/package-info.java new file mode 100644 index 000000000000..5f05b040c88a --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/jvector/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This package contains the implementation of the JVector codec, a Lucene codec for approximate + * nearest neighbor search using vector quantization and HNSW graph indexing. 
It is based on the + * OpenSearch JVector codec and optimized for Lucene. + */ +package org.apache.lucene.sandbox.codecs.jvector; diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec new file mode 100644 index 000000000000..f34eae907f96 --- /dev/null +++ b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.lucene.sandbox.codecs.jvector.JVectorCodec diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat index 29a44d2ecfa8..84f11e50fd0a 100644 --- a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat +++ b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat @@ -14,3 +14,4 @@ # limitations under the License. 
org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat +org.apache.lucene.sandbox.codecs.jvector.JVectorFormat diff --git a/versions.lock b/versions.lock index 2a47276b55ac..a064ffaf5bb6 100644 --- a/versions.lock +++ b/versions.lock @@ -6,14 +6,16 @@ "com.ibm.icu:icu4j:77.1" : "47ea4550,refs=6", "commons-codec:commons-codec:1.18.0" : "e6288df0,refs=6", "commons-io:commons-io:2.16.1" : "5ce8cdc6,refs=2", + "io.github.jbellis:jvector:4.0.0-beta.6" : "9f877bb0,refs=7", "io.sgr:s2-geometry-library-java:1.0.0" : "cbc357ab,refs=4", "junit:junit:4.13.2" : "fa9ef26b,refs=4", "net.sf.jopt-simple:jopt-simple:5.0.4" : "85a1e4c6,refs=2", "net.sourceforge.nekohtml:nekohtml:1.9.22" : "5ce8cdc6,refs=2", + "org.agrona:agrona:1.20.0" : "9f877bb0,refs=7", "org.antlr:antlr4-runtime:4.13.2" : "d9953130,refs=4", "org.apache.commons:commons-compress:1.27.1" : "5ce8cdc6,refs=2", "org.apache.commons:commons-lang3:3.16.0" : "5ce8cdc6,refs=2", - "org.apache.commons:commons-math3:3.6.1" : "85a1e4c6,refs=2", + "org.apache.commons:commons-math3:3.6.1" : "dd26014b,refs=8", "org.apache.opennlp:opennlp-tools:2.5.4" : "2f760bab,refs=4", "org.carrot2:morfologik-fsa:2.1.9" : "79af844b,refs=4", "org.carrot2:morfologik-polish:2.1.9" : "fe494320,refs=3", @@ -21,7 +23,8 @@ "org.hamcrest:hamcrest:3.0" : "fa9ef26b,refs=4", "org.locationtech.spatial4j:spatial4j:0.8" : "cbc357ab,refs=4", "org.openjdk.jmh:jmh-core:1.37" : "85a1e4c6,refs=2", - "org.slf4j:slf4j-api:2.0.17" : "2f760bab,refs=4", + "org.slf4j:slf4j-api:2.0.17" : "07f0efc6,refs=10", + "org.yaml:snakeyaml:2.4" : "9f877bb0,refs=7", "ua.net.nlp:morfologik-ukrainian-search:4.9.1" : "fe494320,refs=3", "xerces:xercesImpl:2.12.2" : "5ce8cdc6,refs=2" }, @@ -48,16 +51,18 @@ "commons-io:commons-io:2.16.1" : "6f16ff86,refs=2", "io.github.eisop:dataflow-errorprone:3.41.0-eisop1" : "7d2143da,refs=39", "io.github.java-diff-utils:java-diff-utils:4.12" : "7d2143da,refs=39", + "io.github.jbellis:jvector:4.0.0-beta.6" : "43dd284b,refs=10", 
"io.sgr:s2-geometry-library-java:1.0.0" : "1d5a4b2b,refs=4", "javax.inject:javax.inject:1" : "7d2143da,refs=39", "junit:junit:4.13.2" : "b35e5d7a,refs=74", "net.bytebuddy:byte-buddy:1.15.11" : "b7ba1646,refs=2", "net.sf.jopt-simple:jopt-simple:5.0.4" : "152d9f78,refs=3", "net.sourceforge.nekohtml:nekohtml:1.9.22" : "6f16ff86,refs=2", + "org.agrona:agrona:1.20.0" : "43dd284b,refs=10", "org.antlr:antlr4-runtime:4.13.2" : "6fbc4021,refs=5", "org.apache.commons:commons-compress:1.27.1" : "6f16ff86,refs=2", "org.apache.commons:commons-lang3:3.16.0" : "6f16ff86,refs=2", - "org.apache.commons:commons-math3:3.6.1" : "152d9f78,refs=3", + "org.apache.commons:commons-math3:3.6.1" : "f0656784,refs=12", "org.apache.opennlp:opennlp-tools:2.5.4" : "b91715f0,refs=6", "org.assertj:assertj-core:3.27.3" : "b7ba1646,refs=2", "org.carrot2:morfologik-fsa:2.1.9" : "e077a675,refs=8", @@ -71,12 +76,55 @@ "org.openjdk.jmh:jmh-core:1.37" : "152d9f78,refs=3", "org.openjdk.jmh:jmh-generator-annprocess:1.37" : "ecaf1d73,refs=1", "org.pcollections:pcollections:4.0.1" : "7d2143da,refs=39", - "org.slf4j:slf4j-api:2.0.17" : "b91715f0,refs=6", + "org.slf4j:slf4j-api:2.0.17" : "736bb8da,refs=15", + "org.yaml:snakeyaml:2.4" : "43dd284b,refs=10", "ua.net.nlp:morfologik-ukrainian-search:4.9.1" : "cb00cecf,refs=5", "xerces:xercesImpl:2.12.2" : "6f16ff86,refs=2" } }, "because" : { + "07f0efc6" : [ + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" 
: "runtimeClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:analysis:opennlp" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:analysis:opennlp" + } + ], "152d9f78" : [ { "configuration" : "annotationProcessor", @@ -137,6 +185,48 @@ "projectPath" : ":lucene:analysis:opennlp" } ], + "43dd284b" : [ + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:highlighter" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:memory" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:monitor" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "47ea4550" : [ { "configuration" : "compileClasspath", @@ -205,6 +295,68 @@ "projectPath" : ":lucene:queries" } ], + "736bb8da" : [ + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis.tests" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis.tests" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:highlighter" + }, + { + 
"configuration" : "testCompileClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:memory" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:monitor" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:opennlp" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:opennlp" + } + ], "79af844b" : [ { "configuration" : "compileClasspath", @@ -425,6 +577,36 @@ "projectPath" : ":lucene:analysis:phonetic" } ], + "9f877bb0" : [ + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "b35e5d7a" : [ { "configuration" : "testCompileClasspath", @@ -817,6 +999,40 @@ "projectPath" : ":lucene:expressions" } ], + "dd26014b" : [ + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : 
"runtimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "e077a675" : [ { "configuration" : "testCompileClasspath", @@ -883,6 +1099,56 @@ "projectPath" : ":lucene:benchmark-jmh" } ], + "f0656784" : [ + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:highlighter" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:memory" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:monitor" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "fa9ef26b" : [ { "configuration" : "compileClasspath",