1818 */
1919package org .apache .pinot .segment .local .segment .creator .impl .stats ;
2020
21+ import com .fasterxml .jackson .core .JsonProcessingException ;
2122import it .unimi .dsi .fastutil .objects .Object2ObjectOpenHashMap ;
2223import java .util .Arrays ;
2324import java .util .Map ;
2829import org .apache .pinot .segment .spi .index .StandardIndexes ;
2930import org .apache .pinot .spi .config .table .TableConfig ;
3031import org .apache .pinot .spi .config .table .TableType ;
31- import org .apache .pinot .spi .data .ComplexFieldSpec ;
3232import org .apache .pinot .spi .data .DimensionFieldSpec ;
3333import org .apache .pinot .spi .data .FieldSpec ;
3434import org .apache .pinot .spi .data .Schema ;
35+ import org .apache .pinot .spi .utils .JsonUtils ;
3536import org .apache .pinot .spi .utils .MapUtils ;
3637import org .apache .pinot .spi .utils .builder .TableConfigBuilder ;
38+ import org .slf4j .Logger ;
39+ import org .slf4j .LoggerFactory ;
3740
3841
3942/**
4043 * Extension of {@link AbstractColumnStatisticsCollector} for Map column type.
4144 *
42- * The Map column type is different than other columns in that it is essentially recursive. It contains keys
45+ * The Map column type is different from other columns in that it is essentially recursive. It contains keys
4346 * and those keys are analogous to columns and, as such, have Key level statistics. So, this class keeps track of
4447 * Map column level statistics _and_ Key level statistics. The Key Level statistics can then be used during
4548 * the creation of the Immutable Segment to make decisions about how keys will be stored or what Map data structure
5154 * heterogeneous value types for a key are encountered will construct the Map statistics it can be raised as a fault.
5255 */
5356public class MapColumnPreIndexStatsCollector extends AbstractColumnStatisticsCollector {
57+ private static final Logger LOGGER = LoggerFactory .getLogger (MapColumnPreIndexStatsCollector .class );
5458 private final Object2ObjectOpenHashMap <String , AbstractColumnStatisticsCollector > _keyStats =
5559 new Object2ObjectOpenHashMap <>(INITIAL_HASH_SET_SIZE );
5660 private final Map <String , Integer > _keyFrequencies = new Object2ObjectOpenHashMap <>(INITIAL_HASH_SET_SIZE );
5761 private String [] _sortedKeys ;
5862 private int _minLength = Integer .MAX_VALUE ;
5963 private int _maxLength = 0 ;
6064 private boolean _sealed = false ;
61- private ComplexFieldSpec _colFieldSpec ;
6265 private boolean _createNoDictCollectorsForKeys = false ;
6366
6467 public MapColumnPreIndexStatsCollector (String column , StatsCollectorConfig statsCollectorConfig ) {
6568 super (column , statsCollectorConfig );
6669 _sorted = false ;
67- _colFieldSpec = (ComplexFieldSpec ) statsCollectorConfig .getFieldSpecForColumn (column );
6870 Map <String , FieldIndexConfigs > indexConfigsByCol = FieldIndexConfigsUtil .createIndexConfigsByColName (
6971 statsCollectorConfig .getTableConfig (), statsCollectorConfig .getSchema ());
7072 boolean isDictionaryEnabled = indexConfigsByCol .get (column ).getConfig (StandardIndexes .dictionary ()).isEnabled ();
@@ -96,6 +98,9 @@ public void collect(Object entry) {
9698 for (Map .Entry <String , Object > mapValueEntry : mapValue .entrySet ()) {
9799 String key = mapValueEntry .getKey ();
98100 Object value = mapValueEntry .getValue ();
101+ if (value == null ) {
102+ continue ;
103+ }
99104 _keyFrequencies .merge (key , 1 , Integer ::sum );
100105 AbstractColumnStatisticsCollector keyStats = _keyStats .get (key );
101106 if (keyStats == null ) {
@@ -105,6 +110,62 @@ public void collect(Object entry) {
105110 updatePartition (key );
106111 }
107112 }
113+ if (keyStats instanceof NoDictColumnStatisticsCollector ) {
114+ keyStats .collect (value );
115+ continue ;
116+ }
117+ if (keyStats instanceof StringColumnPreIndexStatsCollector ) {
118+ if (value instanceof String || value instanceof Number || value instanceof Boolean ) {
119+ keyStats .collect (String .valueOf (value ));
120+ continue ;
121+ }
122+ try {
123+ keyStats .collect (JsonUtils .objectToString (value ));
124+ continue ;
125+ } catch (JsonProcessingException e ) {
126+ throw new RuntimeException ("Failed to serialize value for key '" + key + "': " + value , e );
127+ }
128+ }
129+ if (keyStats instanceof BigDecimalColumnPreIndexStatsCollector ) {
130+ keyStats .collect (PinotDataType .STRING .toBigDecimal (value .toString ()));
131+ continue ;
132+ }
133+ if (value instanceof Number ) {
134+ Number valueNumber = (Number ) value ;
135+ if (keyStats instanceof IntColumnPreIndexStatsCollector ) {
136+ keyStats .collect (valueNumber .intValue ());
137+ continue ;
138+ }
139+ if (keyStats instanceof LongColumnPreIndexStatsCollector ) {
140+ keyStats .collect (valueNumber .longValue ());
141+ continue ;
142+ }
143+ if (keyStats instanceof FloatColumnPreIndexStatsCollector ) {
144+ keyStats .collect (valueNumber .floatValue ());
145+ continue ;
146+ }
147+ if (keyStats instanceof DoubleColumnPreIndexStatsCollector ) {
148+ keyStats .collect (valueNumber .doubleValue ());
149+ continue ;
150+ }
151+ }
152+ if (keyStats instanceof IntColumnPreIndexStatsCollector ) {
153+ keyStats .collect (PinotDataType .STRING .toInt (value .toString ()));
154+ continue ;
155+ }
156+ if (keyStats instanceof LongColumnPreIndexStatsCollector ) {
157+ keyStats .collect (PinotDataType .STRING .toLong (value .toString ()));
158+ continue ;
159+ }
160+ if (keyStats instanceof FloatColumnPreIndexStatsCollector ) {
161+ keyStats .collect (PinotDataType .STRING .toFloat (value .toString ()));
162+ continue ;
163+ }
164+ if (keyStats instanceof DoubleColumnPreIndexStatsCollector ) {
165+ keyStats .collect (PinotDataType .STRING .toDouble (value .toString ()));
166+ continue ;
167+ }
168+ // Catch all
108169 keyStats .collect (value );
109170 }
110171 _totalNumberOfEntries ++;
@@ -161,7 +222,6 @@ public int getCardinality() {
161222 public void seal () {
162223 if (!_sealed ) {
163224 // For every key that appeared in fewer entries than the total, collect the default null value into that key's stats
164- FieldSpec valueFieldSpec = _colFieldSpec .getChildFieldSpec ("value" );
165225 for (Map .Entry <String , Integer > entry : _keyFrequencies .entrySet ()) {
166226 if (entry .getValue () < _totalNumberOfEntries ) {
167227 _keyStats .get (entry .getKey ()).collect (_keyStats .get (entry .getKey ())._fieldSpec .getDefaultNullValue ());
@@ -196,7 +256,6 @@ private AbstractColumnStatisticsCollector createKeyStatsCollector(String key, Ob
196256 if (_createNoDictCollectorsForKeys ) {
197257 return new NoDictColumnStatisticsCollector (key , config );
198258 }
199-
200259 switch (type ) {
201260 case INTEGER :
202261 return new IntColumnPreIndexStatsCollector (key , config );
@@ -208,18 +267,23 @@ private AbstractColumnStatisticsCollector createKeyStatsCollector(String key, Ob
208267 return new DoubleColumnPreIndexStatsCollector (key , config );
209268 case BIG_DECIMAL :
210269 return new BigDecimalColumnPreIndexStatsCollector (key , config );
270+ case BOOLEAN :
211271 case STRING :
272+ case MAP :
273+ case OBJECT :
212274 return new StringColumnPreIndexStatsCollector (key , config );
213275 default :
214- throw new UnsupportedOperationException (String .format ("MAP column does not yet support '%s'" , type ));
276+ LOGGER .warn ("Unknown data type {} for key {} and value {}" , type , key , value );
277+ return new StringColumnPreIndexStatsCollector (key , config );
215278 }
216279 }
217280
281+ /**
282+ * Converts the data type of a Map value to the stored field data type.
283+ * Note that BOOLEAN, STRING, OBJECT, MAP and all unknown types are converted to the STRING data type.
284+ */
218285 private static FieldSpec .DataType convertToDataType (PinotDataType ty ) {
219- // TODO: I've been told that we already have a function to do this, so find that function and replace this
220286 switch (ty ) {
221- case BOOLEAN :
222- return FieldSpec .DataType .BOOLEAN ;
223287 case SHORT :
224288 case INTEGER :
225289 return FieldSpec .DataType .INT ;
@@ -233,10 +297,12 @@ private static FieldSpec.DataType convertToDataType(PinotDataType ty) {
233297 return FieldSpec .DataType .BIG_DECIMAL ;
234298 case TIMESTAMP :
235299 return FieldSpec .DataType .TIMESTAMP ;
300+ case BOOLEAN :
236301 case STRING :
237- return FieldSpec .DataType .STRING ;
302+ case OBJECT :
303+ case MAP :
238304 default :
239- throw new UnsupportedOperationException () ;
305+ return FieldSpec . DataType . STRING ;
240306 }
241307 }
242308}
0 commit comments