
TASK-4642 - Incorrect results when running Exomiser on INDELs and Multiallelic #2306

Merged · 7 commits · Jul 7, 2023
@@ -130,7 +130,7 @@ public void run() throws ToolException {

VariantQuery query = new VariantQuery()
.study(studyId)
- .sample(sampleId + ":0/1,1/1")
+ .sample(sampleId)
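+ // No genotype restriction any more: multiallelic genotypes such as 1/2 are now returned as well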
.includeSample(samples)
.includeSampleData("GT")
.unknownGenotype("./.");
@@ -211,6 +211,25 @@ public enum GenotypeClass implements Predicate<String> {
return true;
}),

/**
* Genotypes containing any secondary alternate.
* <p>
* 1/2, 2/3, ./2, 0/2, ...
*/
SEC(str -> {
Genotype gt = parseGenotype(str);
if (gt == null) {
// Skip invalid genotypes
return false;
}
for (int allele : gt.getAllelesIdx()) {
if (allele > 1) {
return true;
}
}
return false;
}),
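For orientation, a JUnit-style sketch of what the new SEC class accepts, following the Javadoc examples above (this assumes GenotypeClass is on the classpath; the expected outcomes derive from the allele-index check in the predicate):

import static org.junit.Assert.*;
import org.junit.Test;

public class GenotypeClassSECTest {
    @Test
    public void secMatchesSecondaryAlternatesOnly() {
        assertTrue(GenotypeClass.SEC.test("1/2"));   // secondary alternate present
        assertTrue(GenotypeClass.SEC.test("0/2"));
        assertTrue(GenotypeClass.SEC.test("./2"));
        assertFalse(GenotypeClass.SEC.test("0/1"));  // main alternate only
        assertFalse(GenotypeClass.SEC.test("0/0"));  // hom-ref
        assertFalse(GenotypeClass.SEC.test("---"));  // invalid genotype, predicate returns false
    }
}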

/**
* Genotypes containing reference and secondary alternates only.
* <p>
@@ -233,6 +252,7 @@ public enum GenotypeClass implements Predicate<String> {
return hasSecondaryAlternate;
}),


/**
* Contains the main alternate.
* <p>
@@ -9,6 +9,7 @@
import org.opencb.opencga.storage.core.variant.query.VariantQueryUtils;

import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@@ -193,6 +194,12 @@ public VariantQuery includeSampleData(String value) {
put(VariantQueryParam.INCLUDE_SAMPLE_DATA.key(), value);
return this;
}

public VariantQuery includeSampleData(Collection<String> value) {
put(VariantQueryParam.INCLUDE_SAMPLE_DATA.key(), value);
return this;
}

public String includeSampleData() {
return getString(VariantQueryParam.INCLUDE_SAMPLE_DATA.key());
}
@@ -257,6 +264,12 @@ public VariantQuery includeFile(String value) {
put(VariantQueryParam.INCLUDE_FILE.key(), value);
return this;
}

public VariantQuery includeFile(Collection<String> value) {
put(VariantQueryParam.INCLUDE_FILE.key(), value);
return this;
}

public VariantQuery includeFileAll() {
return includeFile(ParamConstants.ALL);
}
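A hedged usage sketch of the two new Collection overloads (the sample and file names are illustrative only, and includeSample is assumed to accept a collection, as suggested by the walker change above):

VariantQuery query = new VariantQuery()
        .includeSample(Arrays.asList("NA12877", "NA12878"))    // illustrative sample IDs
        .includeSampleData(Arrays.asList("GT", "DP"))          // new Collection<String> overload
        .includeFile(Arrays.asList("platinum-trio.vcf.gz"));   // new Collection<String> overload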
@@ -2285,7 +2285,12 @@ public void testReturnNoneFiles() {
queryResult = query(new Query(INCLUDE_FILE.key(), VariantQueryUtils.NONE).append(INCLUDE_SAMPLE.key(), ALL), new QueryOptions());
assertEquals(allVariants.getResults().size(), queryResult.getResults().size());
for (Variant variant : queryResult.getResults()) {
- assertThat(variant.getStudies().get(0).getFiles(), is(Collections.emptyList()));
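+ // Indels (empty reference or alternate) now keep a file entry exposing the original call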
+ if (variant.getLengthReference() == 0 || variant.getLengthAlternate() == 0) {
+ assertThat(variant.getStudies().get(0).getFiles(), is(not(Collections.emptyList())));
+ assertThat(variant.getStudies().get(0).getFiles().get(0).getCall(), is(not(nullValue())));
+ } else {
+ assertThat(variant.getStudies().get(0).getFiles(), is(Collections.emptyList()));
+ }
assertThat(new HashSet<>(variant.getStudies().get(0).getSampleDataKeys()), is(FORMAT));
}
}
@@ -95,6 +95,8 @@ public enum HadoopVariantStorageOptions implements ConfigurationOption {
SAMPLE_INDEX_BUILD_MAX_SAMPLES_PER_MR("storage.hadoop.sampleIndex.build.maxSamplesPerMR", 2000),
SAMPLE_INDEX_ANNOTATION_MAX_SAMPLES_PER_MR("storage.hadoop.sampleIndex.annotation.maxSamplesPerMR", 2000),
SAMPLE_INDEX_FAMILY_MAX_TRIOS_PER_MR("storage.hadoop.sampleIndex.family.maxTriosPerMR", 1000),
SAMPLE_INDEX_QUERY_SAMPLE_INDEX_ONLY_PD_BUFFER("storage.hadoop.sampleIndex.query.sampleIndexOnly.partialData.buffer", 10000),
SAMPLE_INDEX_QUERY_SAMPLE_INDEX_ONLY_PD_BATCH("storage.hadoop.sampleIndex.query.sampleIndexOnly.partialData.batch", 250),
SAMPLE_INDEX_QUERY_EXTENDED_REGION_FILTER("storage.hadoop.sampleIndex.query.extendedRegionFilter.default", 5_000_000),

/////////////////////////
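For reference, a sketch of overriding the two new partial-data tuning options through an ObjectMap of storage options (values here are arbitrary, and ConfigurationOption.key() is assumed to expose the string key shown above):

import org.opencb.commons.datastore.core.ObjectMap;

ObjectMap storageOptions = new ObjectMap()
        .append(HadoopVariantStorageOptions.SAMPLE_INDEX_QUERY_SAMPLE_INDEX_ONLY_PD_BUFFER.key(), 20000)
        .append(HadoopVariantStorageOptions.SAMPLE_INDEX_QUERY_SAMPLE_INDEX_ONLY_PD_BATCH.key(), 500);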
@@ -225,7 +225,8 @@ public List<Scan> parseQueryMultiRegion(VariantQueryProjection selectElements, Q
}

List<Scan> scans;
- if ((regions.isEmpty() || regions.size() == 1) && variants.isEmpty() && idIntersect.isEmpty()) {
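+ // At most one locus filter (region, variant id or idIntersect) can be served by a single scan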
+ int numLocusFilters = regions.size() + variants.size() + idIntersect.size();
+ if (numLocusFilters <= 1) {
scans = Collections.singletonList(parseQuery(selectElements, query, options));
} else {
scans = new ArrayList<>(regions.size() + variants.size() + idIntersect.size());
@@ -236,7 +237,7 @@ public List<Scan> parseQueryMultiRegion(VariantQueryProjection selectElements, Q
subQuery.remove(ID.key());
subQuery.remove(ID_INTERSECT.key());

- subQuery.put(REGION.key(), "MULTI_REGION");
+ subQuery.put(REGION.key(), "MULTI_REGION (#" + numLocusFilters + ")");
Scan templateScan = parseQuery(selectElements, subQuery, options);

for (Region region : regions) {
@@ -254,7 +255,7 @@ public List<Scan> parseQueryMultiRegion(VariantQueryProjection selectElements, Q
subQuery.put(ID.key(), variant);
try {
Scan scan = new Scan(templateScan);
- scan.setSmall(true);
+ scan.setOneRowLimit();
addVariantIdFilter(scan, variant);
scans.add(scan);
} catch (IOException e) {
@@ -301,6 +302,17 @@ public Scan parseQuery(VariantQueryProjection selectElements, Query query, Query
Variant variant = VariantQueryUtils.toVariant(ids.get(0));
addVariantIdFilter(scan, variant);
regionOrVariant = variant;
scan.setOneRowLimit();
}
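// ID_INTERSECT now gets the same single-variant, one-row-limit handling as ID above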
if (isValidParam(query, ID_INTERSECT)) {
List<String> ids = query.getAsStringList(ID_INTERSECT.key());
if (ids.size() != 1) {
throw VariantQueryException.malformedParam(ID_INTERSECT, ids.toString(), "Unsupported multiple variant ids filter");
}
Variant variant = VariantQueryUtils.toVariant(ids.get(0));
addVariantIdFilter(scan, variant);
regionOrVariant = variant;
scan.setOneRowLimit();
}

// if (isValidParam(query, ID)) {
@@ -587,21 +599,26 @@
}
scan.setReversed(options.getString(QueryOptions.ORDER, QueryOptions.ASCENDING).equals(QueryOptions.DESCENDING));

logger.info("----------------------------");
logger.info("StartRow = " + Bytes.toStringBinary(scan.getStartRow()));
logger.info("StopRow = " + Bytes.toStringBinary(scan.getStopRow()));
if (regionOrVariant != null) {
logger.info("\tRegion = " + regionOrVariant);
}
logger.info("columns (" + scan.getFamilyMap().getOrDefault(family, Collections.emptyNavigableSet()).size() + ") = "
+ scan.getFamilyMap().getOrDefault(family, Collections.emptyNavigableSet())
.stream().map(Bytes::toString).collect(Collectors.joining(",")));
logger.info("MaxResultSize = " + scan.getMaxResultSize());
logger.info("Filters = " + scan.getFilter());
if (!scan.getTimeRange().isAllTime()) {
logger.info("TimeRange = " + scan.getTimeRange());
}
logger.info("Batch = " + scan.getBatch());
+ if (!options.getBoolean(VariantHadoopDBAdaptor.QUIET)) {
+ logger.info("----------------------------");
+ String startRow = Bytes.toStringBinary(scan.getStartRow());
+ if (!startRow.startsWith("MULTI_REGION")) {
+ logger.info("StartRow = " + startRow);
+ logger.info("StopRow = " + Bytes.toStringBinary(scan.getStopRow()));
+ }
+ if (regionOrVariant != null) {
+ logger.info("\tRegion = " + regionOrVariant);
+ }
+ logger.info("columns (" + scan.getFamilyMap().getOrDefault(family, Collections.emptyNavigableSet()).size() + ") = "
+ + scan.getFamilyMap().getOrDefault(family, Collections.emptyNavigableSet())
+ .stream().map(Bytes::toString).collect(Collectors.joining(",")));
+ logger.info("MaxResultSize = " + scan.getMaxResultSize());
+ logger.info("Filters = " + scan.getFilter());
+ if (!scan.getTimeRange().isAllTime()) {
+ logger.info("TimeRange = " + scan.getTimeRange());
+ }
+ logger.info("Batch = " + scan.getBatch());
+ }
return scan;
}
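A minimal sketch of silencing this scan summary through the new QUIET flag (scanParser is a hypothetical instance of this parser class; the flag itself is defined in VariantHadoopDBAdaptor below):

QueryOptions quietOptions = new QueryOptions(VariantHadoopDBAdaptor.QUIET, true);
Scan scan = scanParser.parseQuery(selectElements, query, quietOptions); // skips the scan log lines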

@@ -90,6 +90,7 @@

public class VariantHadoopDBAdaptor implements VariantDBAdaptor {
public static final String NATIVE = "native";
public static final String QUIET = "quiet";
public static final QueryParam ANNOT_NAME = QueryParam.create("annotName", "", Type.TEXT);

protected static Logger logger = LoggerFactory.getLogger(VariantHadoopDBAdaptor.class);
@@ -240,11 +240,7 @@ protected StudyEntry convert(List<VariantRow.SampleColumn> sampleDataMap,
}

Map<String, List<String>> alternateFileMap = new HashMap<>();
- for (Pair<String, PhoenixArray> pair : filesMap) {
- String fileId = pair.getKey();
- PhoenixArray fileColumn = pair.getValue();
- addFileEntry(studyMetadata, variant, studyEntry, fileId, fileColumn, alternateFileMap);
- }
+ addFileEntries(filesMap, variant, studyMetadata, studyEntry, alternateFileMap);
addSecondaryAlternates(variant, studyEntry, studyMetadata, alternateFileMap);

fillEmptySamplesData(studyEntry, studyMetadata, fillMissingColumnValue);
@@ -386,8 +382,36 @@ private List<String> remapSamplesData(List<String> sampleData, int[] formatsMap) {
}
}

- private void addFileEntry(StudyMetadata studyMetadata, Variant variant, StudyEntry studyEntry, String fileIdStr,
- PhoenixArray fileColumn, Map<String, List<String>> alternateFileMap) {
+ private void addFileEntries(List<Pair<String, PhoenixArray>> filesMap, Variant variant, StudyMetadata studyMetadata,
+ StudyEntry studyEntry, Map<String, List<String>> alternateFileMap) {
+ // Some file entries might be added only for their "OriginalCall" info.
+ // These would be added at the end, but only if the original call is not already present.
+ ArrayList<FileEntry> filesOnlyCall = new ArrayList<>();
+ for (Pair<String, PhoenixArray> pair : filesMap) {
+ String fileId = pair.getKey();
+ PhoenixArray fileColumn = pair.getValue();
+ addFileEntry(studyMetadata, variant, fileId, fileColumn, alternateFileMap, studyEntry.getFiles(), filesOnlyCall);
+ }
+ if (!filesOnlyCall.isEmpty()) {
+ // Create a set of original calls to avoid duplicates
+ Set<String> variantIds = new HashSet<>();
+ for (FileEntry fileEntry : studyEntry.getFiles()) {
+ if (fileEntry.getCall() != null) {
+ variantIds.add(fileEntry.getCall().getVariantId());
+ }
+ }
+ for (FileEntry fileEntry : filesOnlyCall) {
+ if (variantIds.add(fileEntry.getCall().getVariantId())) {
+ // Not seen, so add to the list of file entries
+ studyEntry.getFiles().add(fileEntry);
+ }
+ }
+ }
+ }

+ private void addFileEntry(StudyMetadata studyMetadata, Variant variant, String fileIdStr,
+ PhoenixArray fileColumn, Map<String, List<String>> alternateFileMap,
+ List<FileEntry> files, List<FileEntry> filesOnlyCall) {
int fileId = Integer.parseInt(fileIdStr);
String alternateRaw = (String) (fileColumn.getElement(FILE_SEC_ALTS_IDX));
String alternate = normalizeNonRefAlternateCoordinate(variant, alternateRaw);
@@ -399,10 +423,10 @@ private void addFileEntry(StudyMetadata studyMetadata, Variant variant, StudyEnt

if (configuration.getProjection() != null
&& !configuration.getProjection().getStudy(studyMetadata.getId()).getFiles().contains(fileId)) {
- // TODO: Should we return the original CALL?
- // if (call != null && !call.isEmpty()) {
- // studyEntry.getFiles().add(new FileEntry(fileName, call, Collections.emptyMap()));
- // }
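+ // Resolves the TODO above: files outside the projection still contribute their original call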
+ if (call != null && !call.isEmpty()) {
+ OriginalCall originalCall = parseOriginalCall(call);
+ filesOnlyCall.add(new FileEntry(fileName, originalCall, Collections.emptyMap()));
+ }
return;
}

@@ -412,8 +436,7 @@ private void addFileEntry(StudyMetadata studyMetadata, Variant variant, StudyEnt
VariantOverlappingStatus overlappingStatus =
VariantOverlappingStatus.valueFromShortString((String) (fileColumn.getElement(FILE_VARIANT_OVERLAPPING_STATUS_IDX)));
if (call != null && !call.isEmpty()) {
- int i = call.lastIndexOf(':');
- originalCall = new OriginalCall(call.substring(0, i), Integer.valueOf(call.substring(i + 1)));
+ originalCall = parseOriginalCall(call);
} else if (overlappingStatus.equals(VariantOverlappingStatus.MULTI)) {
attributes.put(StudyEntry.FILTER, "SiteConflict");
AlternateCoordinate alternateCoordinate = getAlternateCoordinate(alternateRaw);
@@ -424,7 +447,13 @@ private void addFileEntry(StudyMetadata studyMetadata, Variant variant, StudyEnt
alternateCoordinate.getReference(),
alternateCoordinate.getAlternate()).toString(), 0);
}
- studyEntry.getFiles().add(new FileEntry(fileName, originalCall, attributes));
+ files.add(new FileEntry(fileName, originalCall, attributes));
}

+ private OriginalCall parseOriginalCall(String call) {
+ int i = call.lastIndexOf(':');
+ OriginalCall originalCall = new OriginalCall(call.substring(0, i), Integer.valueOf(call.substring(i + 1)));
+ return originalCall;
+ }
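For orientation, the serialized call splits on its last ':' into a variant id and an allele index, so multiallelic ids keep their internal separators. A small example (the getAlleleIndex accessor name is assumed; getVariantId is used in addFileEntries above):

OriginalCall call = parseOriginalCall("1:1000:A:T,C:0");
// call.getVariantId()   -> "1:1000:A:T,C"
// call.getAlleleIndex() -> 0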

public static HashMap<String, String> convertFileAttributes(PhoenixArray fileColumn, List<String> fixedAttributes) {