1818 */
1919package org .apache .pinot .segment .local .segment .creator .impl .stats ;
2020
21+ import com .fasterxml .jackson .core .JsonProcessingException ;
2122import it .unimi .dsi .fastutil .objects .Object2ObjectOpenHashMap ;
23+ import java .math .BigDecimal ;
24+ import java .text .NumberFormat ;
2225import java .util .Arrays ;
26+ import java .util .Locale ;
2327import java .util .Map ;
2428import org .apache .pinot .common .utils .PinotDataType ;
2529import org .apache .pinot .segment .spi .creator .StatsCollectorConfig ;
3236import org .apache .pinot .spi .data .DimensionFieldSpec ;
3337import org .apache .pinot .spi .data .FieldSpec ;
3438import org .apache .pinot .spi .data .Schema ;
39+ import org .apache .pinot .spi .utils .JsonUtils ;
3540import org .apache .pinot .spi .utils .MapUtils ;
3641import org .apache .pinot .spi .utils .builder .TableConfigBuilder ;
42+ import org .slf4j .Logger ;
43+ import org .slf4j .LoggerFactory ;
3744
3845
3946/**
5158 * heterogeneous value types for a key are encountered will construct the Map statistics it can be raised as a fault.
5259 */
5360public class MapColumnPreIndexStatsCollector extends AbstractColumnStatisticsCollector {
61+ private static final Logger LOGGER = LoggerFactory .getLogger (MapColumnPreIndexStatsCollector .class );
5462 private final Object2ObjectOpenHashMap <String , AbstractColumnStatisticsCollector > _keyStats =
5563 new Object2ObjectOpenHashMap <>(INITIAL_HASH_SET_SIZE );
5664 private final Map <String , Integer > _keyFrequencies = new Object2ObjectOpenHashMap <>(INITIAL_HASH_SET_SIZE );
5765 private String [] _sortedKeys ;
5866 private int _minLength = Integer .MAX_VALUE ;
5967 private int _maxLength = 0 ;
6068 private boolean _sealed = false ;
61- private ComplexFieldSpec _colFieldSpec ;
69+ private final ComplexFieldSpec _colFieldSpec ;
6270 private boolean _createNoDictCollectorsForKeys = false ;
6371
6472 public MapColumnPreIndexStatsCollector (String column , StatsCollectorConfig statsCollectorConfig ) {
@@ -96,6 +104,9 @@ public void collect(Object entry) {
96104 for (Map .Entry <String , Object > mapValueEntry : mapValue .entrySet ()) {
97105 String key = mapValueEntry .getKey ();
98106 Object value = mapValueEntry .getValue ();
107+ if (value == null ) {
108+ continue ;
109+ }
99110 _keyFrequencies .merge (key , 1 , Integer ::sum );
100111 AbstractColumnStatisticsCollector keyStats = _keyStats .get (key );
101112 if (keyStats == null ) {
@@ -105,6 +116,48 @@ public void collect(Object entry) {
105116 updatePartition (key );
106117 }
107118 }
119+ if (keyStats instanceof StringColumnPreIndexStatsCollector ) {
120+ if (value instanceof String || value instanceof Number || value instanceof Boolean ) {
121+ keyStats .collect (String .valueOf (value ));
122+ continue ;
123+ }
124+ try {
125+ keyStats .collect (JsonUtils .objectToString (value ));
126+ continue ;
127+ } catch (JsonProcessingException e ) {
128+ throw new RuntimeException ("Failed to serialize value for key '" + key + "': " + value , e );
129+ }
130+ }
131+
132+ Number valueNumber ;
133+ if (value instanceof Number ) {
134+ valueNumber = (Number ) value ;
135+ } else {
136+ valueNumber = parseFlexibleNumber (value .toString ());
137+ }
138+ if (valueNumber == null ) {
139+ continue ;
140+ }
141+ if (keyStats instanceof IntColumnPreIndexStatsCollector ) {
142+ keyStats .collect (valueNumber .intValue ());
143+ continue ;
144+ }
145+ if (keyStats instanceof LongColumnPreIndexStatsCollector ) {
146+ keyStats .collect (valueNumber .longValue ());
147+ continue ;
148+ }
149+ if (keyStats instanceof FloatColumnPreIndexStatsCollector ) {
150+ keyStats .collect (valueNumber .floatValue ());
151+ continue ;
152+ }
153+ if (keyStats instanceof DoubleColumnPreIndexStatsCollector ) {
154+ keyStats .collect (valueNumber .doubleValue ());
155+ continue ;
156+ }
157+ if (keyStats instanceof BigDecimalColumnPreIndexStatsCollector ) {
158+ keyStats .collect (new BigDecimal (valueNumber .toString ()));
159+ continue ;
160+ }
108161 keyStats .collect (value );
109162 }
110163 _totalNumberOfEntries ++;
@@ -113,6 +166,28 @@ public void collect(Object entry) {
113166 }
114167 }
115168
169+ private Number parseFlexibleNumber (String input ) {
170+ if (input == null ) {
171+ return null ;
172+ }
173+ String s = input .trim ();
174+ if (s .isEmpty ()) {
175+ return null ;
176+ }
177+ try {
178+ // Try BigDecimal first — it supports everything cleanly
179+ return new BigDecimal (s );
180+ } catch (NumberFormatException e ) {
181+ try {
182+ // Try locale parsing fallback
183+ NumberFormat nf = NumberFormat .getInstance (Locale .US );
184+ return nf .parse (s );
185+ } catch (Exception ignored ) {
186+ return null ;
187+ }
188+ }
189+ }
190+
116191 @ Override
117192 public String getMinValue () {
118193 if (_sealed ) {
@@ -196,7 +271,6 @@ private AbstractColumnStatisticsCollector createKeyStatsCollector(String key, Ob
196271 if (_createNoDictCollectorsForKeys ) {
197272 return new NoDictColumnStatisticsCollector (key , config );
198273 }
199-
200274 switch (type ) {
201275 case INTEGER :
202276 return new IntColumnPreIndexStatsCollector (key , config );
@@ -208,18 +282,23 @@ private AbstractColumnStatisticsCollector createKeyStatsCollector(String key, Ob
208282 return new DoubleColumnPreIndexStatsCollector (key , config );
209283 case BIG_DECIMAL :
210284 return new BigDecimalColumnPreIndexStatsCollector (key , config );
285+ case BOOLEAN :
211286 case STRING :
287+ case MAP :
288+ case OBJECT :
212289 return new StringColumnPreIndexStatsCollector (key , config );
213290 default :
214- throw new UnsupportedOperationException (String .format ("MAP column does not yet support '%s'" , type ));
291+ LOGGER .warn ("Unknown data type {} for key {} and value {}" , type , key , value );
292+ return new StringColumnPreIndexStatsCollector (key , config );
215293 }
216294 }
217295
296+ /**
297+ * Convert Map value data type to stored field type.
298+ * Note that all unknown types are automatically converted to String type.
299+ */
218300 private static FieldSpec .DataType convertToDataType (PinotDataType ty ) {
219- // TODO: I've been told that we already have a function to do this, so find that function and replace this
220301 switch (ty ) {
221- case BOOLEAN :
222- return FieldSpec .DataType .BOOLEAN ;
223302 case SHORT :
224303 case INTEGER :
225304 return FieldSpec .DataType .INT ;
@@ -233,10 +312,12 @@ private static FieldSpec.DataType convertToDataType(PinotDataType ty) {
233312 return FieldSpec .DataType .BIG_DECIMAL ;
234313 case TIMESTAMP :
235314 return FieldSpec .DataType .TIMESTAMP ;
315+ case BOOLEAN :
236316 case STRING :
237- return FieldSpec .DataType .STRING ;
317+ case OBJECT :
318+ case MAP :
238319 default :
239- throw new UnsupportedOperationException () ;
320+ return FieldSpec . DataType . STRING ;
240321 }
241322 }
242323}
0 commit comments