Skip to content

Commit

Permalink
storage: Add sampleIndexConfiguration.fileDataConfiguration. #TASK-6765
Browse files Browse the repository at this point in the history
  • Loading branch information
j-coll committed Sep 9, 2024
1 parent 6fc18ee commit 717beac
Show file tree
Hide file tree
Showing 8 changed files with 109 additions and 165 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,147 +15,12 @@ public class SampleIndexConfiguration {

// Default number of bits used for the file position; defaultConfiguration(boolean) passes it to setFilePositionBits.
public static final int DEFAULT_FILE_POSITION_SIZE_BITS = 3;
// Thresholds handed to the QUAL file index field in backwardCompatibleConfiguration().
private static final double[] QUAL_THRESHOLDS = new double[]{10, 20, 30};
// Thresholds handed to the DP (depth) sample field in backwardCompatibleConfiguration().
private static final double[] DP_THRESHOLDS = new double[]{5, 10, 15, 20, 30, 40, 50};
// Same list as DP_THRESHOLDS minus the 40 cut-off.
// NOTE(review): presumably one range is dropped to leave room for a "missing" value — confirm against usages.
private static final double[] DP_THRESHOLDS_NULLABLE = new double[]{5, 10, 15, 20, 30, 50};

// The three sub-configurations are created eagerly and never reassigned; callers mutate them through the getters.
private final FileIndexConfiguration fileIndexConfiguration = new FileIndexConfiguration();
private final FileDataConfiguration fileDataConfiguration = new FileDataConfiguration();
private final AnnotationIndexConfiguration annotationIndexConfiguration = new AnnotationIndexConfiguration();

/**
 * Builds a fresh {@link SampleIndexConfiguration} populated with a fixed, hard-coded set of
 * file index fields, population frequency settings and annotation index fields.
 * <p>
 * The order of the {@code addFileIndexField} calls and the extra "padding" field are part of
 * the configuration being reproduced, so statements here should not be reordered.
 *
 * @return a new configuration instance; nothing is shared with other instances.
 */
public static SampleIndexConfiguration backwardCompatibleConfiguration() {
// Population frequency cut-offs used only by this configuration.
double[] backwardCompatibleThresholds = new double[]{0.001, 0.005, 0.01};
// File index fields, in registration order: FILTER (categorical, single value), QUAL and DP.
// QUAL and DP are built from the threshold arrays and explicitly marked non-nullable.
SampleIndexConfiguration sampleIndexConfiguration = new SampleIndexConfiguration()
.addFileIndexField(new IndexFieldConfiguration(
IndexFieldConfiguration.Source.FILE,
StudyEntry.FILTER,
IndexFieldConfiguration.Type.CATEGORICAL,
VCFConstants.PASSES_FILTERS_v4))
.addFileIndexField(new IndexFieldConfiguration(
IndexFieldConfiguration.Source.FILE, StudyEntry.QUAL, QUAL_THRESHOLDS).setNullable(false))
.addFileIndexField(new IndexFieldConfiguration(
IndexFieldConfiguration.Source.SAMPLE, VCFConstants.DEPTH_KEY, DP_THRESHOLDS).setNullable(false));
// Two populations, both using the "ALL" cohort, sharing the thresholds declared above.
sampleIndexConfiguration.getAnnotationIndexConfiguration().getPopulationFrequency()
.addPopulation(new Population(ParamConstants.POP_FREQ_1000G_CB_V4, "ALL"))
.addPopulation(new Population(ParamConstants.POP_FREQ_GNOMAD_GENOMES, "ALL"))
.setThresholds(backwardCompatibleThresholds);

// Uses 4 file-position bits here, whereas DEFAULT_FILE_POSITION_SIZE_BITS is 3.
sampleIndexConfiguration.getFileIndexConfiguration().setFilePositionBits(4);

// Ensure backward compatibility with these two params:
// a dummy categorical "padding" field (its three values only exist to occupy space)
// and fixedFieldsFirst switched off.
sampleIndexConfiguration.addFileIndexField(new IndexFieldConfiguration(
IndexFieldConfiguration.Source.SAMPLE, "padding", IndexFieldConfiguration.Type.CATEGORICAL,
"add_two_extra_bits", "to_allow_backward", "compatibility"));
sampleIndexConfiguration.getFileIndexConfiguration().setFixedFieldsFirst(false);

// Biotype index: multi-value categorical field. The values mapping starts empty and is
// filled below for the two values that group several biotypes.
IndexFieldConfiguration biotypeConfiguration = new IndexFieldConfiguration(IndexFieldConfiguration.Source.ANNOTATION,
"biotype",
IndexFieldConfiguration.Type.CATEGORICAL_MULTI_VALUE)
.setValues(
NONSENSE_MEDIATED_DECAY,
LINCRNA,
MIRNA,
RETAINED_INTRON,
SNRNA,
SNORNA,
"other_non_pseudo_gene",
// "other",
PROTEIN_CODING
).setValuesMapping(new HashMap<>());
// Biotypes grouped under the LINCRNA value.
biotypeConfiguration.getValuesMapping().put(LINCRNA, Arrays.asList(
"lncRNA",
NON_CODING,
LINCRNA,
"macro_lncRNA",
ANTISENSE,
SENSE_INTRONIC,
SENSE_OVERLAPPING,
THREEPRIME_OVERLAPPING_NCRNA,
"bidirectional_promoter_lncRNA"));
// Biotypes grouped under the synthetic "other_non_pseudo_gene" value.
biotypeConfiguration.getValuesMapping().put("other_non_pseudo_gene", Arrays.asList(
PROCESSED_TRANSCRIPT,
NON_STOP_DECAY,
MISC_RNA,
RRNA,
MT_RRNA,
MT_TRNA,
IG_C_GENE,
IG_D_GENE,
IG_J_GENE,
IG_V_GENE,
TR_C_GENE,
TR_D_GENE,
TR_J_GENE,
TR_V_GENE,
NMD_TRANSCRIPT_VARIANT,
TRANSCRIBED_UNPROCESSED_PSEUDGENE,
AMBIGUOUS_ORF,
KNOWN_NCRNA,
RETROTRANSPOSED,
LRG_GENE
));
biotypeConfiguration.setNullable(false);

sampleIndexConfiguration.getAnnotationIndexConfiguration().setBiotype(biotypeConfiguration);
// Consequence type index: multi-value categorical field. "utr" and "mirna_tfbs" are
// synthetic values that each group two consequence types (mapped below).
IndexFieldConfiguration consequenceType = new IndexFieldConfiguration(
IndexFieldConfiguration.Source.ANNOTATION,
"consequenceType",
IndexFieldConfiguration.Type.CATEGORICAL_MULTI_VALUE)
.setValues(
SPLICE_DONOR_VARIANT,
TRANSCRIPT_ABLATION,
TRANSCRIPT_AMPLIFICATION,
INITIATOR_CODON_VARIANT,
SPLICE_REGION_VARIANT,
INCOMPLETE_TERMINAL_CODON_VARIANT,
"utr",
"mirna_tfbs",
MISSENSE_VARIANT,
FRAMESHIFT_VARIANT,
INFRAME_DELETION,
INFRAME_INSERTION,
START_LOST,
STOP_GAINED,
STOP_LOST,
SPLICE_ACCEPTOR_VARIANT
).setValuesMapping(new HashMap<>());
consequenceType.getValuesMapping().put("mirna_tfbs", Arrays.asList(
TF_BINDING_SITE_VARIANT,
MATURE_MIRNA_VARIANT));
consequenceType.getValuesMapping().put("utr", Arrays.asList(
THREE_PRIME_UTR_VARIANT,
FIVE_PRIME_UTR_VARIANT));
consequenceType.setNullable(false);

sampleIndexConfiguration.getAnnotationIndexConfiguration().setConsequenceType(consequenceType);

// Transcript flag index is registered with the single value "do_not_use",
// and transcript combination is switched off.
sampleIndexConfiguration.getAnnotationIndexConfiguration().setTranscriptFlagIndexConfiguration(
new IndexFieldConfiguration(
IndexFieldConfiguration.Source.ANNOTATION,
"transcriptFlag",
IndexFieldConfiguration.Type.CATEGORICAL_MULTI_VALUE,
"do_not_use"
).setNullable(false));
sampleIndexConfiguration.getAnnotationIndexConfiguration().setTranscriptCombination(false);

// Clinical source index: only "cosmic" is registered.
sampleIndexConfiguration.getAnnotationIndexConfiguration().setClinicalSource(
new IndexFieldConfiguration(
IndexFieldConfiguration.Source.ANNOTATION, "clinicalSource",
IndexFieldConfiguration.Type.CATEGORICAL_MULTI_VALUE, "cosmic")
.setNullable(false));
// Clinical significance index: four real significance values followed by three "unused_*" entries.
// NOTE(review): the "unused_*" values look like placeholders that keep the field width fixed — confirm.
sampleIndexConfiguration.getAnnotationIndexConfiguration().setClinicalSignificance(
new IndexFieldConfiguration(
IndexFieldConfiguration.Source.ANNOTATION, "clinicalSignificance",
IndexFieldConfiguration.Type.CATEGORICAL_MULTI_VALUE,
ClinicalSignificance.likely_benign.toString(),
ClinicalSignificance.uncertain_significance.toString(),
ClinicalSignificance.likely_pathogenic.toString(),
ClinicalSignificance.pathogenic.toString(),
"unused_target_drug",
"unused_pgx",
"unused_bit8"
).setNullable(false));

return sampleIndexConfiguration;
}

/**
 * Convenience overload: same as {@code defaultConfiguration(false)}, i.e. with the
 * {@code cellbaseV4} flag switched off.
 *
 * @return the configuration produced by {@link #defaultConfiguration(boolean)}.
 */
public static SampleIndexConfiguration defaultConfiguration() {
return defaultConfiguration(false);
}
Expand All @@ -178,6 +43,9 @@ public static SampleIndexConfiguration defaultConfiguration(boolean cellbaseV4)

sampleIndexConfiguration.getFileIndexConfiguration()
.setFilePositionBits(DEFAULT_FILE_POSITION_SIZE_BITS);
sampleIndexConfiguration.getFileDataConfiguration()
.setIncludeOriginalCall(true)
.setIncludeSecondaryAlternates(true);

IndexFieldConfiguration biotypeConfiguration = new IndexFieldConfiguration(IndexFieldConfiguration.Source.ANNOTATION,
"biotype",
Expand Down Expand Up @@ -312,10 +180,6 @@ public static SampleIndexConfiguration defaultConfiguration(boolean cellbaseV4)
return sampleIndexConfiguration;
}

/**
 * Validates this configuration without a CellBase version.
 * Delegates to {@link #validate(String)} with {@code null}; since {@code "v4".equalsIgnoreCase(null)}
 * is false, missing values are filled from {@code defaultConfiguration(false)}.
 */
public void validate() {
validate(null);
}

public void validate(String cellbaseVersion) {
addMissingValues(defaultConfiguration("v4".equalsIgnoreCase(cellbaseVersion)));

Expand All @@ -336,6 +200,13 @@ public void addMissingValues(SampleIndexConfiguration defaultConfiguration) {
if (fileIndexConfiguration.getCustomFields().isEmpty()) {
fileIndexConfiguration.getCustomFields().addAll(defaultConfiguration.fileIndexConfiguration.customFields);
}
if (fileDataConfiguration.includeOriginalCall == null) {
fileDataConfiguration.includeOriginalCall = defaultConfiguration.fileDataConfiguration.includeOriginalCall;
}
if (fileDataConfiguration.includeSecondaryAlternates == null) {
fileDataConfiguration.includeSecondaryAlternates = defaultConfiguration.fileDataConfiguration.includeSecondaryAlternates;
}

if (annotationIndexConfiguration.getPopulationFrequency() == null) {
annotationIndexConfiguration.setPopulationFrequency(defaultConfiguration.annotationIndexConfiguration.populationFrequency);
}
Expand Down Expand Up @@ -368,6 +239,53 @@ public void addMissingValues(SampleIndexConfiguration defaultConfiguration) {
}
}

/**
 * Switches controlling which optional pieces of file data are kept: the original call
 * and the secondary alternates.
 * <p>
 * Both switches are tri-state. {@code null} means "not configured"; the enclosing class
 * replaces {@code null} with the value from the default configuration in
 * {@code addMissingValues}, and {@code defaultConfiguration} sets both to {@code true}.
 */
public static class FileDataConfiguration {
    private Boolean includeOriginalCall;
    private Boolean includeSecondaryAlternates;

    /**
     * Creates a configuration with both switches unset ({@code null}).
     */
    public FileDataConfiguration() {
        // Nothing to initialise: reference fields start out as null, which is the
        // "unset" state expected by addMissingValues.
    }

    /**
     * @return the raw tri-state value; may be {@code null} when unset.
     */
    public Boolean getIncludeOriginalCall() {
        return this.includeOriginalCall;
    }

    /**
     * @param includeOriginalCall new value, or {@code null} to mark it as unset.
     * @return this instance, for chaining.
     */
    public FileDataConfiguration setIncludeOriginalCall(Boolean includeOriginalCall) {
        this.includeOriginalCall = includeOriginalCall;
        return this;
    }

    /**
     * @return {@code true} only when the switch is explicitly {@code true}; unset counts as {@code false}.
     */
    public boolean isIncludeOriginalCall() {
        return Boolean.TRUE.equals(this.includeOriginalCall);
    }

    /**
     * @return the raw tri-state value; may be {@code null} when unset.
     */
    public Boolean getIncludeSecondaryAlternates() {
        return this.includeSecondaryAlternates;
    }

    /**
     * @param includeSecondaryAlternates new value, or {@code null} to mark it as unset.
     * @return this instance, for chaining.
     */
    public FileDataConfiguration setIncludeSecondaryAlternates(Boolean includeSecondaryAlternates) {
        this.includeSecondaryAlternates = includeSecondaryAlternates;
        return this;
    }

    /**
     * @return {@code true} only when the switch is explicitly {@code true}; unset counts as {@code false}.
     */
    public boolean isIncludeSecondaryAlternates() {
        return Boolean.TRUE.equals(this.includeSecondaryAlternates);
    }

    @Override
    public String toString() {
        return "FileDataConfiguration{"
                + "includeOriginalCall=" + includeOriginalCall
                + ", includeSecondaryAlternates=" + includeSecondaryAlternates
                + '}';
    }
}

public static class FileIndexConfiguration {

private final List<IndexFieldConfiguration> customFields = new ArrayList<>();
Expand Down Expand Up @@ -687,6 +605,9 @@ public FileIndexConfiguration getFileIndexConfiguration() {
return fileIndexConfiguration;
}

/**
 * @return the file data configuration owned by this object. The same instance is returned on
 *         every call, so changes made through its setters are visible to this configuration.
 */
public FileDataConfiguration getFileDataConfiguration() {
return fileDataConfiguration;
}

public SampleIndexConfiguration addFileIndexField(IndexFieldConfiguration fileIndex) {
if (fileIndexConfiguration.getCustomFields().contains(fileIndex)) {
Expand Down Expand Up @@ -719,6 +640,7 @@ public int hashCode() {
public String toString() {
// Renders the three sub-configurations as "name=value" pairs separated by ", ".
final StringBuilder sb = new StringBuilder("SampleIndexConfiguration{");
sb.append("fileIndexConfiguration=").append(fileIndexConfiguration);
// Every field after the first needs the ", " separator; without it this entry
// runs straight into the previous value.
sb.append(", fileDataConfiguration=").append(fileDataConfiguration);
sb.append(", annotationIndexConfiguration=").append(annotationIndexConfiguration);
sb.append('}');
return sb.toString();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,19 @@ public class FileDataIndexSchema extends DataSchema {

private final DataFieldWithContext<Variant, OriginalCall> originalCallField;
private final DataFieldWithContext<Variant, List<AlternateCoordinate>> secondaryAlternatesField;
private boolean includeOriginalCall = true;
private boolean includeSecondaryAlternates = true;
private final SampleIndexConfiguration.FileDataConfiguration fileDataConfiguration;

public FileDataIndexSchema(SampleIndexConfiguration.FileIndexConfiguration fileIndexConfiguration) {
if (includeOriginalCall) {
public FileDataIndexSchema(SampleIndexConfiguration.FileDataConfiguration fileDataConfiguration) {
this.fileDataConfiguration = fileDataConfiguration;
if (fileDataConfiguration.isIncludeOriginalCall()) {
originalCallField = new VarBinaryDataField(
new IndexFieldConfiguration(IndexFieldConfiguration.Source.FILE, "ORIGINAL_CALL", null))
.fromWithContext(new VariantOriginalCallToBytesConverter());
addField(originalCallField);
} else {
originalCallField = null;
}
if (includeSecondaryAlternates) {
// The secondary alternates field must be gated on its own flag, not on the original-call flag.
// Otherwise isIncludeSecondaryAlternates() can report true while secondaryAlternatesField was never created.
if (fileDataConfiguration.isIncludeSecondaryAlternates()) {
secondaryAlternatesField = new VarBinaryDataField(
new IndexFieldConfiguration(IndexFieldConfiguration.Source.STUDY, "SECONDARY_ALTERNATES", null))
.fromWithContext(new AlternateCoordinateToBytesConverter());
Expand All @@ -44,25 +44,21 @@ public FileDataIndexSchema(SampleIndexConfiguration.FileIndexConfiguration fileI
}

public boolean isIncludeOriginalCall() {
return includeOriginalCall;
return fileDataConfiguration.isIncludeOriginalCall();
}

/**
 * @return the data field for the original call, or {@code null} when the constructor
 *         did not create it because the original call is not included.
 */
public DataFieldWithContext<Variant, OriginalCall> getOriginalCallField() {
return originalCallField;
}

public boolean isIncludeSecondaryAlternates() {
return includeSecondaryAlternates;
return fileDataConfiguration.isIncludeSecondaryAlternates();
}

/**
 * @return the data field for the secondary alternates, as assigned by the constructor.
 *         NOTE(review): presumably {@code null} when the field is not included, mirroring
 *         the original call field — confirm against the constructor's else branch.
 */
public DataFieldWithContext<Variant, List<AlternateCoordinate>> getSecondaryAlternatesField() {
return secondaryAlternatesField;
}

/**
 * Writes the given original call into the buffer through the original call field.
 * There is no null check: {@link #getOriginalCallField()} returns {@code null} when the
 * original call is not included, in which case this call fails with a NullPointerException.
 *
 * @param variant variant passed to the field as context.
 * @param call    original call to write.
 * @param bb      destination buffer.
 */
public void writeOriginalCall(Variant variant, OriginalCall call, ByteBuffer bb) {
getOriginalCallField().write(variant, call, bb);
}

/**
 * Reads the original call from a file data entry by delegating to {@code readFieldAndDecode}
 * with the original call field and the variant as context.
 *
 * @param fileDataByteBuffer buffer holding the file data entry.
 * @param variant            variant passed to the decoder as context.
 * @return whatever {@code readFieldAndDecode} yields for the original call field.
 *         NOTE(review): behaviour when the original call field is {@code null} depends on
 *         {@code readFieldAndDecode} — confirm.
 */
public OriginalCall readOriginalCall(ByteBuffer fileDataByteBuffer, Variant variant) {
return readFieldAndDecode(fileDataByteBuffer, originalCallField, variant);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,12 @@ public Map<String, TreeSet<SampleVariantIndexEntry>> convertToMapSampleVariantIn
BitBuffer fileIndexEntry;
do {
fileIndexEntry = fileIndexSchema.readEntry(fileIndexStream);
ByteBuffer fileDataEntry = fileDataSchema.readNextEntry(fileDataBuffer);
ByteBuffer fileDataEntry;
if (fileDataBuffer == null) {
fileDataEntry = null;
} else {
fileDataEntry = fileDataSchema.readNextEntry(fileDataBuffer);
}
values.add(new SampleVariantIndexEntry(variant, fileIndexEntry, fileDataEntry));
} while (this.fileIndexSchema.isMultiFile(fileIndexEntry));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -636,6 +636,7 @@ private Scan parse(SingleSampleIndexQuery query, LocusQuery locusQuery, boolean
scan.setCaching(hBaseManager.getConf().getInt("hbase.client.scanner.caching", 100));

logger.info("---------");
logger.info("Study = \"" + query.getStudy() + "\" (id=" + studyId + ")");
logger.info("Sample = \"" + query.getSample() + "\" (id=" + sampleId + ") , schema version = " + query.getSchema().getVersion());
logger.info("Table = " + getSampleIndexTableName(query));
printScan(scan);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,12 @@ public void build(Put put) {
put.addColumn(COLUMN_FAMILY, SampleIndexSchema.toGenotypeCountColumn(gt), Bytes.toBytes(variants.size()));
put.addColumn(COLUMN_FAMILY, SampleIndexSchema.toFileIndexColumn(gt), fileIndexBuffer.getBuffer());
int position = fileDataIndexBuffer.position();
fileDataIndexBuffer.rewind();
fileDataIndexBuffer.limit(position);
put.addColumn(COLUMN_FAMILY, ByteBuffer.wrap(SampleIndexSchema.toFileDataColumn(gt)), put.getTimestamp(),
fileDataIndexBuffer);
if (position > 0) {
fileDataIndexBuffer.rewind();
fileDataIndexBuffer.limit(position);
put.addColumn(COLUMN_FAMILY, ByteBuffer.wrap(SampleIndexSchema.toFileDataColumn(gt)), put.getTimestamp(),
fileDataIndexBuffer);
}
}
}

Expand Down Expand Up @@ -404,7 +406,9 @@ private void partialBuild(boolean flush) {
fileIndexBuffer.setBitBuffer(gtEntry.getFilesIndex().get(0), offset);
offset += fileIndexSchema.getBitsLength();
prev = gtEntry;
fileDataIndexSchema.writeEntry(fileDataBuffer, gtEntry.getFileData().get(0));
if (!gtEntry.getFileData().isEmpty()) {
fileDataIndexSchema.writeEntry(fileDataBuffer, gtEntry.getFileData().get(0));
}
}

// Do not write the whole buffer, but only the corresponding to the processed entries.
Expand All @@ -422,8 +426,10 @@ public void build(Put put) {
put.addColumn(COLUMN_FAMILY, SampleIndexSchema.toGenotypeColumn(gt), variantsBuffer);
put.addColumn(COLUMN_FAMILY, SampleIndexSchema.toGenotypeCountColumn(gt), Bytes.toBytes(variantsCount));
put.addColumn(COLUMN_FAMILY, SampleIndexSchema.toFileIndexColumn(gt), fileIndexBuffer.toByteArray());
put.addColumn(COLUMN_FAMILY, ByteBuffer.wrap(SampleIndexSchema.toFileDataColumn(gt)),
put.getTimestamp(), fileDataBuffer.toByteByffer());
if (fileDataBuffer.size() > 0) {
put.addColumn(COLUMN_FAMILY, ByteBuffer.wrap(SampleIndexSchema.toFileDataColumn(gt)),
put.getTimestamp(), fileDataBuffer.toByteByffer());
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ public SampleIndexSchema(SampleIndexConfiguration configuration, int version) {
this.version = version;
this.configuration = configuration;
fileIndex = new FileIndexSchema(configuration.getFileIndexConfiguration());
fileData = new FileDataIndexSchema(configuration.getFileIndexConfiguration());
fileData = new FileDataIndexSchema(configuration.getFileDataConfiguration());
// annotationSummaryIndexSchema = new AnnotationSummaryIndexSchema();
ctIndex = new ConsequenceTypeIndexSchema(configuration.getAnnotationIndexConfiguration().getConsequenceType());
biotypeIndex = new BiotypeIndexSchema(configuration.getAnnotationIndexConfiguration().getBiotype());
Expand Down Expand Up @@ -218,6 +218,7 @@ public String toString() {
sb.append("version=").append(version);
sb.append(", configuration=").append(configuration);
sb.append(", fileIndex=").append(fileIndex);
sb.append(", fileData=").append(fileData);
sb.append(", popFreqIndex=").append(popFreqIndex);
sb.append(", ctIndex=").append(ctIndex);
sb.append(", biotypeIndex=").append(biotypeIndex);
Expand Down
Loading

0 comments on commit 717beac

Please sign in to comment.