Skip to content

Commit

Permalink
Normalize data counts in a generic case-insensitive way
Browse files Browse the repository at this point in the history
  • Loading branch information
onursumer committed Oct 2, 2024
1 parent fb8eaac commit 23be72c
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,8 @@
import org.springframework.cache.annotation.Cacheable;
import org.springframework.stereotype.Service;

import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.*;
import java.util.stream.Collectors;
import java.util.Map;

@Service
public class StudyViewColumnarServiceImpl implements StudyViewColumnarService {
Expand Down Expand Up @@ -175,15 +172,66 @@ private StudyViewFilterContext createContext(StudyViewFilter studyViewFilter) {
}

/**
 * Builds one {@link ClinicalDataCountItem} per attribute from a flat list of data counts.
 * The counts are grouped by {@code ClinicalDataCount::getAttributeId}; each group becomes an item whose
 * attribute id is the group key.
 *
 * @param dataCounts flat list of data counts, possibly spanning several attributes
 *
 * @return one item per distinct attribute id
 */
private List<ClinicalDataCountItem> generateDataCountItemsFromDataCounts(List<ClinicalDataCount> dataCounts) {
// NOTE(review): the two identical statements below look like the removed/added sides of a rendered diff
// (probably a whitespace-only change) -- confirm that only one of them exists in the real file
return dataCounts.stream().collect(Collectors.groupingBy(ClinicalDataCount::getAttributeId))
return dataCounts.stream().collect(Collectors.groupingBy(ClinicalDataCount::getAttributeId))
// each map entry is (attribute id -> counts for that attribute); entries are mapped on a parallel stream
.entrySet().parallelStream().map(e -> {
ClinicalDataCountItem item = new ClinicalDataCountItem();
item.setAttributeId(e.getKey());
// NOTE(review): presumably the raw setCounts call is the removed side and the normalizeDataCounts call
// (case-insensitive merge of the group's values) is the current one -- confirm against the real file
item.setCounts(e.getValue());
item.setCounts(normalizeDataCounts(e.getValue()));
return item;
}).toList();
}

/**
 * Normalizes data counts by merging attribute values in a case-insensitive way.
 * For example, the attribute values "TRUE", "True", and "true" are merged into a single aggregated count.
 * This method assumes that all the counts in the given dataCounts list have the same attributeId.
 *
 * <p>Values are lower-cased with {@link Locale#ROOT} to build the grouping key, so the result does not
 * depend on the JVM default locale (under a Turkish locale, for instance, "TITLE" lower-cases to a
 * dotless-i form and would not be merged with "title").
 *
 * @param dataCounts list of data counts for a single attribute; a null value in any element
 *                   causes a NullPointerException while building the grouping key
 *
 * @return normalized list of data counts, one newly created entry per distinct case-insensitive value,
 *         in no guaranteed order
 */
private List<ClinicalDataCount> normalizeDataCounts(List<ClinicalDataCount> dataCounts) {
    Collection<ClinicalDataCount> normalizedDataCounts = dataCounts
        .stream()
        .collect(
            Collectors.groupingBy(
                c -> c.getValue().toLowerCase(Locale.ROOT),
                // the empty instance only seeds the reduction; the merge treats its null fields as absent
                Collectors.reducing(new ClinicalDataCount(), StudyViewColumnarServiceImpl::mergeCaseVariantCounts)
            )
        )
        .values();

    return new ArrayList<>(normalizedDataCounts);
}

/**
 * Merges two data counts whose values differ only by letter case into a new aggregated data count.
 * Neither argument is modified. Null fields (as found on the empty seed instance) are treated as absent.
 *
 * @param count1 first data count (may be the empty seed instance)
 * @param count2 second data count
 *
 * @return a new data count carrying the picked attribute id, the picked value, and the summed count
 */
private static ClinicalDataCount mergeCaseVariantCounts(ClinicalDataCount count1, ClinicalDataCount count2) {
    // assuming attribute ids are the same for all data counts, just pick the first non-null one
    String attributeId = count1.getAttributeId() != null
        ? count1.getAttributeId()
        : count2.getAttributeId();

    // pick the value in a deterministic way: the lexicographically greater one wins, which for ASCII
    // letters prioritizes lower case over upper case. for example, 'True' is picked out of 'TRUE' and
    // 'True', and 'true' is picked out of 'TRUE', 'True', and 'true'
    String value1 = count1.getValue();
    String value2 = count2.getValue();
    String value;
    if (value1 == null) {
        value = value2;
    } else if (value2 == null) {
        value = value1;
    } else {
        value = value1.compareTo(value2) > 0 ? value1 : value2;
    }

    // aggregate counts for the merged values; a missing count contributes 0
    int count = (count1.getCount() != null ? count1.getCount() : 0)
        + (count2.getCount() != null ? count2.getCount() : 0);

    ClinicalDataCount aggregated = new ClinicalDataCount();
    aggregated.setAttributeId(attributeId);
    aggregated.setValue(value);
    aggregated.setCount(count);
    return aggregated;
}

public static List<ClinicalDataCountItem> calculateMissingNaCountsForClinicalDataCountItems(
List<ClinicalDataCountItem> clinicalDataCountItems,
List<String> filteredAttributes,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,7 @@
<include refid="normalizeAttributeValue">
<property name="attribute_value" value="attribute_value"/>
</include>
) = '${dataFilterValue.value}'
) ILIKE '${dataFilterValue.value}'
</trim>
</foreach>
</sql>
Expand Down Expand Up @@ -578,7 +578,7 @@
<include refid="normalizeAttributeValue">
<property name="attribute_value" value="value"/>
</include>
) = '${dataFilterValue.value}'
) ILIKE '${dataFilterValue.value}'
</trim>
</foreach>
</sql>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -755,28 +755,12 @@
OR upperUTF8(${attribute_value})='N/A'
</sql>

<!-- This is to match boolean values ignoring the case -->
<sql id="isAttributeValueTrue">
upperUTF8(${attribute_value})='TRUE'
</sql>
<sql id="isAttributeValueFalse">
upperUTF8(${attribute_value})='FALSE'
</sql>

<!--
  Normalizes an attribute value with a multiIf expression. Each included condition fragment is followed
  by the literal that is returned when that condition matches; the last argument returns the attribute
  value unchanged when no condition matches.
  Expects the caller to supply the "attribute_value" property (the column or expression to normalize).
  NOTE(review): this listing looks like a rendered diff. The isAttributeValueTrue / isAttributeValueFalse
  branches may belong to the removed side, leaving only the 'NA' mapping. Confirm against the real file.
-->
<sql id="normalizeAttributeValue">
multiIf(
<include refid="isAttributeValueNA">
<property name="attribute_value" value="${attribute_value}"/>
</include>,
'NA',
<include refid="isAttributeValueTrue">
<property name="attribute_value" value="${attribute_value}"/>
</include>,
'True',
<include refid="isAttributeValueFalse">
<property name="attribute_value" value="${attribute_value}"/>
</include>,
'False',
${attribute_value}
)
</sql>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,13 @@ public void getDeadCounts() {
Collections.emptyList()
);

assertEquals(6, categoricalClinicalDataCounts.size());
assertEquals(3, findClinicaDataCount(categoricalClinicalDataCounts, "True"));
assertEquals(4, findClinicaDataCount(categoricalClinicalDataCounts, "False"));
assertEquals(10, categoricalClinicalDataCounts.size());
assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "True"));
assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "TRUE"));
assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "true"));
assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "False"));
assertEquals(2, findClinicaDataCount(categoricalClinicalDataCounts, "FALSE"));
assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "false"));
assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "Not Released"));
assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "Not Collected"));
assertEquals(1, findClinicaDataCount(categoricalClinicalDataCounts, "Unknown"));
Expand Down

0 comments on commit 23be72c

Please sign in to comment.