Skip to content

Commit

Permalink
Merge pull request #2490 from opencb/TASK-6596
Browse files Browse the repository at this point in the history
TASK-6596 - Error loading long variants: DocValuesField "variantId" is too large, must be <=32766
  • Loading branch information
j-coll authored Jul 25, 2024
2 parents f35ed46 + f7b4b52 commit 2dcd910
Show file tree
Hide file tree
Showing 19 changed files with 166 additions and 53 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -573,11 +573,15 @@ public ProjectMetadata getProjectMetadata() {

public ProjectMetadata getAndUpdateProjectMetadata(ObjectMap options) throws StorageEngineException {
ProjectMetadata projectMetadata = getProjectMetadata();

checkSameSpeciesAndAssembly(options, projectMetadata);
if (options != null && (projectMetadata == null
|| StringUtils.isEmpty(projectMetadata.getSpecies()) && options.containsKey(SPECIES.key())
|| StringUtils.isEmpty(projectMetadata.getAssembly()) && options.containsKey(ASSEMBLY.key()))) {

projectMetadata = updateProjectMetadata(pm -> {
// Check again, in case it was updated by another thread
checkSameSpeciesAndAssembly(options, pm);
if (pm == null) {
pm = new ProjectMetadata();
}
Expand All @@ -598,6 +602,25 @@ public ProjectMetadata getAndUpdateProjectMetadata(ObjectMap options) throws Sto
return projectMetadata;
}

private static void checkSameSpeciesAndAssembly(ObjectMap options, ProjectMetadata projectMetadata) throws StorageEngineException {
if (options != null && projectMetadata != null) {
if (options.containsKey(ASSEMBLY.key())) {
if (StringUtils.isNotEmpty(projectMetadata.getAssembly()) && !projectMetadata.getAssembly()
.equalsIgnoreCase(options.getString(ASSEMBLY.key()))) {
throw new StorageEngineException("Incompatible assembly change from '" + projectMetadata.getAssembly() + "' to '"
+ options.getString(ASSEMBLY.key()) + "'");
}
}
if (options.containsKey(SPECIES.key())) {
if (StringUtils.isNotEmpty(projectMetadata.getSpecies()) && !projectMetadata.getSpecies()
.equalsIgnoreCase(toCellBaseSpeciesName(options.getString(SPECIES.key())))) {
throw new StorageEngineException("Incompatible species change from '" + projectMetadata.getSpecies() + "' to '"
+ options.getString(SPECIES.key()) + "'");
}
}
}
}

public DataResult<VariantFileMetadata> getVariantFileMetadata(int studyId, int fileId, QueryOptions options)
throws StorageEngineException {
return fileDBAdaptor.getVariantFileMetadata(studyId, fileId, options);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1362,6 +1362,19 @@ public VariantQueryExecutor getVariantQueryExecutor(ParsedVariantQuery variantQu
throw new VariantQueryException("No VariantQueryExecutor found to run the query!");
}

public final VariantQueryExecutor getVariantQueryExecutor(Class<? extends VariantQueryExecutor> clazz)
throws StorageEngineException {
Optional<VariantQueryExecutor> first = getVariantQueryExecutors()
.stream()
.filter(e -> e instanceof SearchIndexVariantQueryExecutor)
.findFirst();
if (first.isPresent()) {
return first.get();
} else {
throw new StorageEngineException("VariantQueryExecutor " + clazz + " not found");
}
}

public Query preProcessQuery(Query originalQuery, QueryOptions options) {
try {
return getVariantQueryParser().preProcessQuery(originalQuery, options);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,7 @@
import org.opencb.opencga.storage.core.io.plain.StringDataReader;
import org.opencb.opencga.storage.core.io.plain.StringDataWriter;
import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager;
import org.opencb.opencga.storage.core.metadata.models.CohortMetadata;
import org.opencb.opencga.storage.core.metadata.models.FileMetadata;
import org.opencb.opencga.storage.core.metadata.models.StudyMetadata;
import org.opencb.opencga.storage.core.metadata.models.TaskMetadata;
import org.opencb.opencga.storage.core.metadata.models.*;
import org.opencb.opencga.storage.core.variant.adaptors.GenotypeClass;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor;
import org.opencb.opencga.storage.core.variant.io.VariantReaderUtils;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,9 @@ public VariantQueryResult<Long> approximateCount(ParsedVariantQuery variantQuery

DataResult<VariantSearchModel> nativeResult = searchManager
.nativeQuery(dbName, searchEngineQuery, queryOptions);
List<String> variantIds = nativeResult.getResults().stream().map(VariantSearchModel::getId).collect(Collectors.toList());
List<Variant> variantIds = nativeResult.getResults().stream()
.map(VariantSearchModel::toVariantSimple)
.collect(Collectors.toList());
// Adjust numSamples if the results from SearchManager is smaller than numSamples
// If this happens, the count is not approximated
if (variantIds.size() < sampling) {
Expand Down Expand Up @@ -283,12 +285,12 @@ public boolean doIntersectWithSearch(Query query, QueryOptions options) {
return intersect;
}

protected Iterator<String> variantIdIteratorFromSearch(Query query) {
protected Iterator<Variant> variantIdIteratorFromSearch(Query query) {
return variantIdIteratorFromSearch(query, Integer.MAX_VALUE, 0, null);
}

protected Iterator<String> variantIdIteratorFromSearch(Query query, int limit, int skip, AtomicLong numTotalResults) {
Iterator<String> variantsIterator;
protected Iterator<Variant> variantIdIteratorFromSearch(Query query, int limit, int skip, AtomicLong numTotalResults) {
Iterator<Variant> variantsIterator;
QueryOptions queryOptions = new QueryOptions()
.append(QueryOptions.LIMIT, limit)
.append(QueryOptions.SKIP, skip)
Expand All @@ -302,14 +304,14 @@ protected Iterator<String> variantIdIteratorFromSearch(Query query, int limit, i
}
variantsIterator = nativeResult.getResults()
.stream()
.map(VariantSearchModel::getId)
.map(VariantSearchModel::toVariantSimple)
.iterator();
} else {
SolrNativeIterator nativeIterator = searchManager.nativeIterator(dbName, query, queryOptions);
if (numTotalResults != null) {
numTotalResults.set(nativeIterator.getNumFound());
}
variantsIterator = Iterators.transform(nativeIterator, VariantSearchModel::getId);
variantsIterator = Iterators.transform(nativeIterator, VariantSearchModel::toVariantSimple);
}
} catch (VariantSearchException | IOException e) {
throw new VariantQueryException("Error querying " + VariantSearchManager.SEARCH_ENGINE_ID, e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,15 @@
package org.opencb.opencga.storage.core.variant.search;

import org.apache.solr.client.solrj.beans.Field;
import org.opencb.biodata.models.variant.Variant;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static org.opencb.opencga.storage.core.variant.search.VariantSearchToVariantConverter.HASH_PREFIX;

/**
* Created by wasim on 09/11/16.
*/
Expand Down Expand Up @@ -140,6 +143,9 @@ public class VariantSearchModel {
@Field("fileInfo_*")
private Map<String, String> fileInfo;

@Field("attr_*")
private Map<String, Object> attr;


public static final double MISSING_VALUE = -100.0;

Expand Down Expand Up @@ -171,6 +177,7 @@ public VariantSearchModel() {
this.qual = new HashMap<>();
this.filter = new HashMap<>();
this.fileInfo = new HashMap<>();
this.attr = new HashMap<>();
}

public VariantSearchModel(VariantSearchModel init) {
Expand Down Expand Up @@ -210,6 +217,7 @@ public VariantSearchModel(VariantSearchModel init) {
this.qual = init.getQual();
this.filter = init.getFilter();
this.fileInfo = init.getFileInfo();
this.attr = init.getAttr();
}

@Override
Expand Down Expand Up @@ -251,6 +259,7 @@ public String toString() {
sb.append(", qual=").append(qual);
sb.append(", filter=").append(filter);
sb.append(", fileInfo=").append(fileInfo);
sb.append(", attr=").append(attr);
sb.append('}');
return sb.toString();
}
Expand All @@ -259,6 +268,17 @@ public String getId() {
return id;
}

public Variant toVariantSimple() {
String variantId = getId();
if (variantId.startsWith(HASH_PREFIX)) {
Object o = getAttr().get("attr_id");
variantId = o instanceof String ? (String) o : ((List<String>) o).get(0);
}
Variant variant = new Variant(variantId);
variant.setId(variantId);
return variant;
}

public VariantSearchModel setId(String id) {
this.id = id;
return this;
Expand Down Expand Up @@ -579,4 +599,12 @@ public VariantSearchModel setFileInfo(Map<String, String> fileInfo) {
return this;
}

public Map<String, Object> getAttr() {
return attr;
}

public VariantSearchModel setAttr(Map<String, Object> attr) {
this.attr = attr;
return this;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ public class VariantSearchToVariantConverter implements ComplexTypeConverter<Var
public static final double MISSING_VALUE = -100.0;
private static final String LIST_SEP = "___";
private static final String FIELD_SEP = " -- ";
static final String HASH_PREFIX = "#";

private final Logger logger = LoggerFactory.getLogger(VariantSearchToVariantConverter.class);
private final Set<VariantField> includeFields;
Expand All @@ -79,10 +80,9 @@ public VariantSearchToVariantConverter(Set<VariantField> includeFields) {
@Override
public Variant convertToDataModelType(VariantSearchModel variantSearchModel) {
// set chromosome, start, end, ref, alt from ID
Variant variant = new Variant(variantSearchModel.getId());
Variant variant = variantSearchModel.toVariantSimple();

// set ID, chromosome, start, end, ref, alt, type
variant.setId(variantSearchModel.getVariantId());
// set chromosome, start, end, ref, alt, type

// set variant type
if (StringUtils.isNotEmpty(variantSearchModel.getType())) {
Expand Down Expand Up @@ -662,8 +662,10 @@ public VariantSearchModel convertToStorageType(Variant variant) {
List<String> other = new ArrayList<>();

// Set general Variant attributes: id, dbSNP, chromosome, start, end, type
variantSearchModel.setId(variant.toString()); // Internal unique ID e.g. 3:1000:AT:-
variantSearchModel.setVariantId(variant.getId());
String variantId = getVariantId(variant);
variantSearchModel.setId(variantId); // Internal unique ID e.g. 3:1000:AT:-
variantSearchModel.setVariantId(variantId);
variantSearchModel.getAttr().put("attr_id", variant.toString());
variantSearchModel.setChromosome(variant.getChromosome());
variantSearchModel.setStart(variant.getStart());
variantSearchModel.setEnd(variant.getEnd());
Expand Down Expand Up @@ -1019,8 +1021,7 @@ public VariantSearchModel convertToStorageType(Variant variant) {
// This field contains all possible IDs: id, dbSNP, names, genes, transcripts, protein, clinvar, hpo, ...
// This will help when searching by variant id. This is added at the end of the method after collecting all IDs
Set<String> xrefs = variantAnnotationModelUtils.extractXRefs(variant.getAnnotation());
xrefs.add(variantSearchModel.getId());
xrefs.add(variantSearchModel.getVariantId());
xrefs.add(variantId);
if (variant.getNames() != null && !variant.getNames().isEmpty()) {
variant.getNames().forEach(name -> {
if (name != null) {
Expand All @@ -1032,6 +1033,20 @@ public VariantSearchModel convertToStorageType(Variant variant) {
return variantSearchModel;
}

public static String getVariantId(Variant variant) {
String variantString = variant.toString();
if (variantString.length() > 32766) {
// variantString.length() >= Short.MAX_VALUE
return hashVariantId(variant, variantString);
} else {
return variantString;
}
}

public static String hashVariantId(Variant variant, String variantString) {
return HASH_PREFIX + variant.getChromosome() + ":" + variant.getStart() + ":" + Integer.toString(variantString.hashCode());
}

private void convertStudies(Variant variant, VariantSearchModel variantSearchModel, List<String> other) {
// Sanity check
if (CollectionUtils.isEmpty(variant.getStudies())) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.common.SolrException;
import org.opencb.biodata.models.core.Region;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.solr.FacetQueryParser;
Expand All @@ -35,7 +34,10 @@
import org.opencb.opencga.storage.core.variant.adaptors.VariantField;
import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryException;
import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam;
import org.opencb.opencga.storage.core.variant.query.*;
import org.opencb.opencga.storage.core.variant.query.KeyOpValue;
import org.opencb.opencga.storage.core.variant.query.ParsedVariantQuery;
import org.opencb.opencga.storage.core.variant.query.Values;
import org.opencb.opencga.storage.core.variant.query.VariantQueryParser;
import org.opencb.opencga.storage.core.variant.query.projection.VariantQueryProjectionParser;
import org.opencb.opencga.storage.core.variant.search.VariantSearchToVariantConverter;
import org.slf4j.Logger;
Expand Down Expand Up @@ -79,7 +81,7 @@ public class SolrQueryParser {
static {
includeMap = new HashMap<>();

includeMap.put("id", "id,variantId");
includeMap.put("id", "id,variantId,attr_id");
includeMap.put("chromosome", "chromosome");
includeMap.put("start", "start");
includeMap.put("end", "end");
Expand Down Expand Up @@ -477,7 +479,9 @@ private String parseGenomicFilter(Query query) {
genes.addAll(variantQueryXref.getGenes());
xrefs.addAll(variantQueryXref.getIds());
xrefs.addAll(variantQueryXref.getOtherXrefs());
xrefs.addAll(variantQueryXref.getVariants().stream().map(Variant::toString).collect(Collectors.toList()));
xrefs.addAll(variantQueryXref.getVariants().stream()
.map(VariantSearchToVariantConverter::getVariantId)
.collect(Collectors.toList()));

// Regions
if (StringUtils.isNotEmpty(query.getString(REGION.key()))) {
Expand Down Expand Up @@ -1616,15 +1620,12 @@ private String[] includeFieldsWithMandatory(String[] includes) {
return new String[0];
}

String[] mandatoryIncludeFields = new String[]{"id", "chromosome", "start", "end", "type"};
String[] includeWithMandatory = new String[includes.length + mandatoryIncludeFields.length];
for (int i = 0; i < includes.length; i++) {
includeWithMandatory[i] = includes[i];
}
for (int i = 0; i < mandatoryIncludeFields.length; i++) {
includeWithMandatory[includes.length + i] = mandatoryIncludeFields[i];
}
return includeWithMandatory;
Set<String> mandatoryIncludeFields = new HashSet<>(Arrays.asList("id", "attr_id", "chromosome", "start", "end", "type"));
Set<String> includeWithMandatory = new LinkedHashSet<>(includes.length + mandatoryIncludeFields.size());

includeWithMandatory.addAll(Arrays.asList(includes));
includeWithMandatory.addAll(mandatoryIncludeFields);
return includeWithMandatory.toArray(new String[0]);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import org.junit.experimental.categories.Category;
import org.opencb.biodata.models.variant.VariantFileMetadata;
import org.opencb.biodata.models.variant.metadata.VariantMetadata;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.opencga.core.testclassification.duration.ShortTests;
import org.opencb.opencga.storage.core.io.managers.IOConnectorProvider;
import org.opencb.opencga.storage.core.io.managers.LocalIOConnector;
Expand Down Expand Up @@ -45,6 +46,7 @@ public class VariantMetadataConverterTest {
@Before
public void setUp() throws Exception {
metadataManager = new VariantStorageMetadataManager(new DummyVariantStorageMetadataDBAdaptorFactory());
projectMetadata = metadataManager.getAndUpdateProjectMetadata(new ObjectMap());

URI uri = VariantStorageBaseTest.getResourceUri("platinum/1K.end.platinum-genomes-vcf-NA12877_S1.genome.vcf.gz");
variantReaderUtils = new VariantReaderUtils(new IOConnectorProvider(LocalIOConnector.class));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ public void before() throws Exception {
variantStorageEngine.getConfiguration().getCellbase().setUrl(ParamConstants.CELLBASE_URL);
variantStorageEngine.getConfiguration().getCellbase().setVersion(ParamConstants.CELLBASE_VERSION);
variantStorageEngine.getConfiguration().getCellbase().setDataRelease(ParamConstants.CELLBASE_DATA_RELEASE);
variantStorageEngine.getOptions().put(VariantStorageOptions.ASSEMBLY.key(), "grch38");
if (!loaded) {
clearDB(DB_NAME);
loadFiles();
Expand All @@ -59,6 +60,7 @@ protected void loadFiles() throws Exception {
variantStorageEngine.getConfiguration().getCellbase().setUrl(ParamConstants.CELLBASE_URL);
variantStorageEngine.getConfiguration().getCellbase().setVersion(ParamConstants.CELLBASE_VERSION);
variantStorageEngine.getConfiguration().getCellbase().setDataRelease(ParamConstants.CELLBASE_DATA_RELEASE);
variantStorageEngine.getOptions().put(VariantStorageOptions.ASSEMBLY.key(), "grch38");
studyMetadata = new StudyMetadata(1, "s1");
// variantStorageEngine.getOptions().append(VariantStorageOptions.ANNOTATOR_CELLBASE_EXCLUDE.key(), "expression,clinical");
input1 = getResourceUri("variant-test-bnd.vcf");
Expand Down
Loading

0 comments on commit 2dcd910

Please sign in to comment.