Skip to content

Commit

Permalink
Addressing CR
Browse files Browse the repository at this point in the history
  • Loading branch information
fuzhaoyuan committed Aug 9, 2024
1 parent 892faf5 commit df0eac6
Show file tree
Hide file tree
Showing 4 changed files with 142 additions and 134 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import org.cbioportal.web.parameter.StudyViewFilter;

import java.util.List;
import java.util.Map;


public interface StudyViewMapper {
Expand Down
27 changes: 11 additions & 16 deletions src/main/resources/db-scripts/clickhouse/clickhouse.sql
Original file line number Diff line number Diff line change
Expand Up @@ -277,34 +277,33 @@ FROM clinical_event ce
INNER JOIN patient p ON ce.patient_id = p.internal_id
INNER JOIN cancer_study cs ON p.cancer_study_id = cs.cancer_study_id;

CREATE TABLE IF NOT EXISTS genetic_alteration_derived_cna
CREATE TABLE IF NOT EXISTS genetic_alteration_cna_derived
(
sample_unique_id String,
cancer_study_identifier LowCardinality(String),
hugo_gene_symbol String,
cna_value Nullable(Int8),
gistic_value Nullable(Int8),
log2CNA_value Nullable(Float32)
profile_type LowCardinality(String),
alteration_value Nullable(Float32)
)
ENGINE = MergeTree()
PARTITION BY (profile_type, cancer_study_identifier)
ORDER BY (sample_unique_id, hugo_gene_symbol);

INSERT INTO TABLE genetic_alteration_derived_cna
INSERT INTO TABLE genetic_alteration_cna_derived
SELECT
sample_unique_id,
cancer_study_identifier,
hugo_gene_symbol,
any(if(profile_type = 'cna', toInt8(alteration_value), null)) as cna_value,
any(if(profile_type = 'gistic', toInt8(alteration_value), null)) as gistic_value,
any(if(profile_type = 'log2CNA', toFloat32(alteration_value), null)) as log2CNA_value
profile_type,
alteration_value
FROM
(SELECT
sample_id,
hugo_gene_symbol,
profile_type,
alteration_value,
cancer_study_id
alteration_value
FROM
(SELECT
gp.cancer_study_id AS cancer_study_id,
g.hugo_gene_symbol AS hugo_gene_symbol,
arrayElement(splitByString('_', assumeNotNull(gp.stable_id)), -1) AS profile_type,
arrayMap(x -> (x = '' ? NULL : x), splitByString(',', assumeNotNull(trim(trailing ',' from ga.values)))) AS alteration_value,
Expand All @@ -318,11 +317,7 @@ FROM
gp.genetic_alteration_type = 'COPY_NUMBER_ALTERATION')
ARRAY JOIN alteration_value, sample_id
WHERE alteration_value != 'NA') AS subquery
JOIN cancer_study cs ON cs.cancer_study_id = subquery.cancer_study_id
JOIN sample_derived sd ON sd.internal_id = subquery.sample_id
GROUP BY
sample_unique_id,
hugo_gene_symbol;
JOIN sample_derived sd ON sd.internal_id = subquery.sample_id;

OPTIMIZE TABLE sample_to_gene_panel_derived;
OPTIMIZE TABLE gene_panel_to_gene_derived;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,124 +107,22 @@
</if>
<if test="studyViewFilter.mutationDataFilters != null and !studyViewFilter.mutationDataFilters.isEmpty()">
<foreach item="mutationDataFilter" collection="studyViewFilter.mutationDataFilters" open="INTERSECT" separator="INTERSECT">
<!-- if the categorization is 'MUTATED' (pie chart) -->
<if test="mutationDataFilter.categorization == @org.cbioportal.web.parameter.MutationOption@MUTATED">
WITH total_samples AS (
SELECT sample_unique_id
FROM sample_derived
WHERE cancer_study_identifier IN
<foreach item="studyId" collection="studyViewFilter.studyIds" open="(" separator="," close=")">
#{studyId}
</foreach>
),
profiled_samples AS (
SELECT DISTINCT sgp.sample_unique_id
FROM sample_to_gene_panel_derived sgp
JOIN gene_panel_to_gene_derived gpg ON sgp.gene_panel_id = gpg.gene_panel_id
WHERE cancer_study_identifier IN
<foreach item="studyId" collection="studyViewFilter.studyIds" open="(" separator="," close=")">
#{studyId}
</foreach>
AND gpg.gene = #{mutationDataFilter.hugoGeneSymbol}
),
mutated_samples AS (
SELECT DISTINCT sample_unique_id
FROM genomic_event_derived
WHERE cancer_study_identifier IN
<foreach item="studyId" collection="studyViewFilter.studyIds" open="(" separator="," close=")">
#{studyId}
</foreach>
AND hugo_gene_symbol = #{mutationDataFilter.hugoGeneSymbol}
AND variant_type = 'mutation'
)
SELECT DISTINCT sample_unique_id
FROM (
<foreach item="dataFilterValue" collection="mutationDataFilter.values[0]" separator="UNION ALL">
<if test="dataFilterValue.value == 'MUTATED'">
SELECT sample_unique_id FROM mutated_samples
</if>
<if test="dataFilterValue.value == 'NOT_MUTATED'">
SELECT sample_unique_id FROM profiled_samples
WHERE sample_unique_id NOT IN (SELECT sample_unique_id FROM mutated_samples)
</if>
<if test="dataFilterValue.value == 'NOT_PROFILED'">
SELECT sample_unique_id FROM total_samples
WHERE sample_unique_id NOT IN (SELECT sample_unique_id FROM profiled_samples)
</if>
</foreach>
)
</if>
<!-- if the categorization is 'MUTATION_TYPE' (table) -->
<if test="mutationDataFilter.categorization == @org.cbioportal.web.parameter.MutationOption@MUTATION_TYPE">
<foreach item="dataFilterValues" collection="mutationDataFilter.values" separator="INTERSECT">
SELECT DISTINCT sample_unique_id
FROM genomic_event_derived
WHERE hugo_gene_symbol = #{mutationDataFilter.hugoGeneSymbol}
AND variant_type = 'mutation'
AND mutation_type IN (
<foreach item="dataFilterValue" collection="dataFilterValues" separator=",">
#{dataFilterValue.value}
</foreach>
)
</foreach>
</if>
</foreach>
<include refid="applyMutationDataFilter"/>
</foreach>
</if>
<if test="studyViewFilter.genomicDataFilters != null and !studyViewFilter.genomicDataFilters.isEmpty()">
<foreach item="genomicDataFilter" collection="studyViewFilter.genomicDataFilters" open="INTERSECT" separator="INTERSECT">
<bind name="valueColumn" value="genomicDataFilter.profileType + '_value'" />
<!-- 1. check if 'NA' is selected -->
<bind name="containsNA" value="false" />
<foreach item="dataFilterValue" collection="genomicDataFilter.values">
<if test="dataFilterValue.value == 'NA'">
<bind name="containsNA" value="true" />
</if>
</foreach>
WITH cna_query AS (
SELECT sample_unique_id, ${valueColumn}
FROM genetic_alteration_derived_cna
WHERE hugo_gene_symbol = #{genomicDataFilter.hugoGeneSymbol}
AND cancer_study_identifier IN
<foreach item="studyId" collection="studyViewFilter.studyIds" open="(" separator="," close=")">
#{studyId}
</foreach>
)
SELECT DISTINCT sd.sample_unique_id
FROM sample_derived sd
LEFT JOIN cna_query ON sd.sample_unique_id = cna_query.sample_unique_id
WHERE cancer_study_identifier IN
<foreach item="studyId" collection="studyViewFilter.studyIds" open="(" separator="," close=")">
#{studyId}
</foreach>
<choose>
<!-- 2. if no 'NA' is selected -->
<when test="!containsNA">
AND ${valueColumn} IN
<foreach item="dataFilterValue" collection="genomicDataFilter.values" open="(" separator="," close=")">
#{dataFilterValue.value}
</foreach>
</when>
<!-- 3. else if only 'NA' is selected -->
<when test="genomicDataFilter.values.size() == 1 and genomicDataFilter.values[0].value == 'NA'">
AND ${valueColumn} IS NULL
</when>
<!-- 4. else both 'NA' and other values are selected -->
<otherwise>
AND (${valueColumn} IS NULL
OR ${valueColumn} IN
<foreach item="dataFilterValue" collection="genomicDataFilter.values" open="(" separator="," close=")">
<if test="dataFilterValue.value != 'NA'">#{dataFilterValue.value}</if>
</foreach>
)
</otherwise>
</choose>
<!-- This filter only serves the CNA data type as of now. More data types can be added with a similar approach. -->
<if test="genomicDataFilter.profileType == 'cna' or genomicDataFilter.profileType == 'gistic'">
<include refid="applyGenomicDataFilterForCNA"/>
</if>
</foreach>
</if>
<!-- ... extend for other elements of the StudyViewFilter object -->
<!-- Apply Sample Treatment Filter -->
<if test="studyViewFilter.sampleTreatmentFilters != null and !studyViewFilter.sampleTreatmentFilters.getFilters().isEmpty()">
<include refid="applySampleTreatmentFilter"/>
</if>
<!-- ... extend for other elements of the StudyViewFilter object -->
</trim>
</sql>

Expand Down Expand Up @@ -377,4 +275,118 @@
</trim>
</foreach>
</sql>

<sql id="applyMutationDataFilter">
    <!--
      Sample-id sub-query for one mutation data filter.
      Expects `mutationDataFilter` (categorization, hugoGeneSymbol, values) and
      `studyViewFilter` (studyIds) to be in scope where this fragment is included.
      Exactly one of the two branches below applies, chosen by the categorization.
    -->

    <!-- Categorization MUTATED (pie chart): classify samples as mutated / not mutated / not profiled -->
    <if test="mutationDataFilter.categorization == @org.cbioportal.web.parameter.MutationOption@MUTATED">
        WITH
        <!-- every sample that belongs to the selected studies -->
        samples_in_studies AS (
            SELECT sample_unique_id
            FROM sample_derived
            WHERE cancer_study_identifier IN
            <foreach item="selectedStudy" collection="studyViewFilter.studyIds" open="(" separator="," close=")">
                #{selectedStudy}
            </foreach>
        ),
        <!-- samples whose gene panel contains the filtered gene -->
        samples_profiled_for_gene AS (
            SELECT DISTINCT sgp.sample_unique_id
            FROM sample_to_gene_panel_derived AS sgp
            INNER JOIN gene_panel_to_gene_derived AS gpg
                ON sgp.gene_panel_id = gpg.gene_panel_id
            WHERE cancer_study_identifier IN
            <foreach item="selectedStudy" collection="studyViewFilter.studyIds" open="(" separator="," close=")">
                #{selectedStudy}
            </foreach>
            AND gpg.gene = #{mutationDataFilter.hugoGeneSymbol}
        ),
        <!-- samples that carry at least one mutation event in the filtered gene -->
        samples_mutated_in_gene AS (
            SELECT DISTINCT sample_unique_id
            FROM genomic_event_derived
            WHERE cancer_study_identifier IN
            <foreach item="selectedStudy" collection="studyViewFilter.studyIds" open="(" separator="," close=")">
                #{selectedStudy}
            </foreach>
            AND hugo_gene_symbol = #{mutationDataFilter.hugoGeneSymbol}
            AND variant_type = 'mutation'
        )
        SELECT DISTINCT sample_unique_id
        FROM
        <!-- one SELECT per requested category, glued together with UNION ALL -->
        <foreach item="category" collection="mutationDataFilter.values[0]" open="(" separator="UNION ALL" close=")">
            <choose>
                <when test="category.value == 'MUTATED'">
                    SELECT sample_unique_id
                    FROM samples_mutated_in_gene
                </when>
                <when test="category.value == 'NOT_MUTATED'">
                    SELECT sample_unique_id
                    FROM samples_profiled_for_gene
                    WHERE sample_unique_id NOT IN (SELECT sample_unique_id FROM samples_mutated_in_gene)
                </when>
                <when test="category.value == 'NOT_PROFILED'">
                    SELECT sample_unique_id
                    FROM samples_in_studies
                    WHERE sample_unique_id NOT IN (SELECT sample_unique_id FROM samples_profiled_for_gene)
                </when>
            </choose>
        </foreach>
    </if>

    <!-- Categorization MUTATION_TYPE (table): one sub-query per value group, combined with INTERSECT -->
    <if test="mutationDataFilter.categorization == @org.cbioportal.web.parameter.MutationOption@MUTATION_TYPE">
        <foreach item="mutationTypeGroup" collection="mutationDataFilter.values" separator="INTERSECT">
            SELECT DISTINCT sample_unique_id
            FROM genomic_event_derived
            WHERE hugo_gene_symbol = #{mutationDataFilter.hugoGeneSymbol}
            AND variant_type = 'mutation'
            AND mutation_type IN
            <foreach item="mutationType" collection="mutationTypeGroup" open="(" separator="," close=")">
                #{mutationType.value}
            </foreach>
        </foreach>
    </if>
</sql>

<sql id="applyGenomicDataFilterForCNA">
    <!--
      Sample-id sub-query for one CNA genomic data filter.
      Expects `genomicDataFilter` (profileType, hugoGeneSymbol, values) and
      `studyViewFilter` (studyIds) to be in scope where this fragment is included.
      The filter value 'NA' selects samples that have no alteration_value row for
      the gene/profile, i.e. alteration_value IS NULL after the LEFT JOIN below.
    -->

    <!-- 1. check if 'NA' is selected -->
    <bind name="containsNA" value="false" />
    <foreach item="dataFilterValue" collection="genomicDataFilter.values">
        <if test="dataFilterValue.value == 'NA'">
            <bind name="containsNA" value="true" />
        </if>
    </foreach>
    <!-- filter on study to reduce query size in preparation of the following LEFT JOIN -->
    WITH cna_query AS (
        SELECT sample_unique_id, alteration_value
        FROM genetic_alteration_cna_derived
        WHERE profile_type = #{genomicDataFilter.profileType}
        AND hugo_gene_symbol = #{genomicDataFilter.hugoGeneSymbol}
        AND cancer_study_identifier IN
        <foreach item="studyId" collection="studyViewFilter.studyIds" open="(" separator="," close=")">
            #{studyId}
        </foreach>
    )
    SELECT DISTINCT sd.sample_unique_id
    <!-- join with sample table to get all 'NA' samples -->
    FROM sample_derived sd
    LEFT JOIN cna_query ON sd.sample_unique_id = cna_query.sample_unique_id
    WHERE cancer_study_identifier IN
    <foreach item="studyId" collection="studyViewFilter.studyIds" open="(" separator="," close=")">
        #{studyId}
    </foreach>
    <choose>
        <!-- 2. if no 'NA' is selected -->
        <when test="!containsNA">
            AND alteration_value IN
            <foreach item="dataFilterValue" collection="genomicDataFilter.values" open="(" separator="," close=")">
                #{dataFilterValue.value}
            </foreach>
        </when>
        <!-- 3. else if only 'NA' is selected -->
        <when test="genomicDataFilter.values.size() == 1 and genomicDataFilter.values[0].value == 'NA'">
            AND alteration_value IS NULL
        </when>
        <!-- 4. else both 'NA' and other values are selected -->
        <otherwise>
            <!--
              The parentheses are required: AND binds tighter than OR, so without them
              the OR arm would not be combined with the study filter of the WHERE clause.
            -->
            AND (
                alteration_value IS NULL
                OR alteration_value IN
                <foreach item="dataFilterValue" collection="genomicDataFilter.values" open="(" separator="," close=")">
                    <!-- 'NA' is covered by the IS NULL arm, so it is left out of the IN list -->
                    <if test="dataFilterValue.value != 'NA'">#{dataFilterValue.value}</if>
                </foreach>
            )
        </otherwise>
    </choose>
</sql>
</mapper>
Original file line number Diff line number Diff line change
Expand Up @@ -232,25 +232,25 @@
<!-- for /genomic-data-counts/fetch - (returns GenomicDataCountItem objects) -->
<select id="getCNACounts" resultMap="GenomicDataCountItemResultMap">
<bind name="profileType" value="genomicDataFilters[0].profileType" />
<bind name="valueColumn" value="genomicDataFilters[0].profileType + '_value'" />
WITH cna_query as (
SELECT
hugo_gene_symbol as hugoGeneSymbol,
#{profileType} as profileType,
multiIf(${valueColumn} = 2, 'Amplified', ${valueColumn} = 1, 'Gained', ${valueColumn} = 0, 'Diploid', ${valueColumn} = -1,
'Heterozygously deleted', ${valueColumn} = -2, 'Homozygously deleted', 'NA') as label,
toString(${valueColumn}) as value,
multiIf(alteration_value = 2, 'Amplified', alteration_value = 1, 'Gained', alteration_value = 0, 'Diploid', alteration_value = -1,
'Heterozygously deleted', alteration_value = -2, 'Homozygously deleted', 'NA') as label,
toString(alteration_value) as value,
cast(count(*) as INTEGER) as count
FROM genetic_alteration_derived_cna
FROM genetic_alteration_cna_derived
<where>
sample_unique_id IN (<include refid="sampleUniqueIdsFromStudyViewFilter"/>)
profile_type = #{profileType}
AND sample_unique_id IN (<include refid="sampleUniqueIdsFromStudyViewFilter"/>)
<foreach item="genomicDataFilter" collection="genomicDataFilters" open=" AND (" separator=" OR " close=")">
hugo_gene_symbol = #{genomicDataFilter.hugoGeneSymbol}
</foreach>
</where>
GROUP BY
hugo_gene_symbol,
${valueColumn}
alteration_value
),
cna_sum AS (
SELECT
Expand Down Expand Up @@ -310,8 +310,8 @@
</foreach>
</where>
GROUP BY
hugo_gene_symbol,
mutation_type
mutation_type,
hugo_gene_symbol
</select>

<resultMap id="GenomicDataCountItemResultMap" type="org.cbioportal.model.GenomicDataCountItem">
Expand Down

0 comments on commit df0eac6

Please sign in to comment.