
Commit 35c4222

fix: Improve parsing of complex table keys (#1226)
Closes HDX-2528

# Summary

This PR fixes errors caused by naive parsing of complex partition and primary keys. With this change, we parse primary and partition keys using node-sql-parser to extract the lists of referenced columns, rather than splitting on commas.

## Testing

Beyond the unit tests, I also tested with the following table in ClickHouse:

<details>
<summary>Table Schema</summary>

```sql
CREATE TABLE default.otel_logs_complex_pk
(
    `Timestamp` DateTime64(9) CODEC(Delta(8), ZSTD(1)),
    `TimestampTime` DateTime DEFAULT toDateTime(Timestamp),
    `TraceId` String CODEC(ZSTD(1)),
    `SpanId` String CODEC(ZSTD(1)),
    `TraceFlags` UInt8,
    `SeverityText` LowCardinality(String) CODEC(ZSTD(1)),
    `SeverityNumber` UInt8,
    `ServiceName` LowCardinality(String) CODEC(ZSTD(1)),
    `Body` String CODEC(ZSTD(1)),
    `ResourceSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)),
    `ResourceAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)),
    `ScopeSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)),
    `ScopeName` String CODEC(ZSTD(1)),
    `ScopeVersion` LowCardinality(String) CODEC(ZSTD(1)),
    `ScopeAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)),
    `LogAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)),
    `__hdx_materialized_k8s.cluster.name` LowCardinality(String) MATERIALIZED ResourceAttributes['k8s.cluster.name'] CODEC(ZSTD(1)),
    `__hdx_materialized_k8s.container.name` LowCardinality(String) MATERIALIZED ResourceAttributes['k8s.container.name'] CODEC(ZSTD(1)),
    `__hdx_materialized_k8s.deployment.name` LowCardinality(String) MATERIALIZED ResourceAttributes['k8s.deployment.name'] CODEC(ZSTD(1)),
    `__hdx_materialized_k8s.namespace.name` LowCardinality(String) MATERIALIZED ResourceAttributes['k8s.namespace.name'] CODEC(ZSTD(1)),
    `__hdx_materialized_k8s.node.name` LowCardinality(String) MATERIALIZED ResourceAttributes['k8s.node.name'] CODEC(ZSTD(1)),
    `__hdx_materialized_k8s.pod.name` LowCardinality(String) MATERIALIZED ResourceAttributes['k8s.pod.name'] CODEC(ZSTD(1)),
    `__hdx_materialized_k8s.pod.uid` LowCardinality(String) MATERIALIZED ResourceAttributes['k8s.pod.uid'] CODEC(ZSTD(1)),
    `__hdx_materialized_deployment.environment.name` LowCardinality(String) MATERIALIZED ResourceAttributes['deployment.environment.name'] CODEC(ZSTD(1)),
    INDEX idx_trace_id TraceId TYPE bloom_filter(0.001) GRANULARITY 1,
    INDEX idx_res_attr_key mapKeys(ResourceAttributes) TYPE bloom_filter(0.01) GRANULARITY 1,
    INDEX idx_res_attr_value mapValues(ResourceAttributes) TYPE bloom_filter(0.01) GRANULARITY 1,
    INDEX idx_scope_attr_key mapKeys(ScopeAttributes) TYPE bloom_filter(0.01) GRANULARITY 1,
    INDEX idx_scope_attr_value mapValues(ScopeAttributes) TYPE bloom_filter(0.01) GRANULARITY 1,
    INDEX idx_log_attr_key mapKeys(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1,
    INDEX idx_log_attr_value mapValues(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1,
    INDEX idx_lower_body lower(Body) TYPE tokenbf_v1(32768, 3, 0) GRANULARITY 8
)
ENGINE = MergeTree
PARTITION BY toStartOfInterval(Timestamp, toIntervalDay(3))
PRIMARY KEY (toStartOfInterval(Timestamp, toIntervalDay(3)), TimestampTime, dateDiff('day', Timestamp, Timestamp + toIntervalDay(1)))
ORDER BY (toStartOfInterval(Timestamp, toIntervalDay(3)), TimestampTime, dateDiff('day', Timestamp, Timestamp + toIntervalDay(1)))
SETTINGS index_granularity = 8192;
```

</details>

✅ The search page loads
✅ The timestamp column is inferred when adding the source
✅ rowWhere / row selection works and is persisted in the URL
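As a minimal illustration (not code from this PR), here is the failure mode that naive comma-splitting hits on keys like the `PRIMARY KEY` above:

```ts
// Commas inside nested function calls are not separators between keys,
// so a naive split produces fragments that are not valid column expressions.
const sortingKey =
  'toStartOfInterval(Timestamp, toIntervalDay(3)), TimestampTime';

sortingKey.split(',').map(k => k.trim());
// → ['toStartOfInterval(Timestamp', 'toIntervalDay(3))', 'TimestampTime']
```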

7 files changed: +225 −35 lines
New changeset file

Lines changed: 6 additions & 0 deletions

```diff
@@ -0,0 +1,6 @@
+---
+"@hyperdx/common-utils": patch
+"@hyperdx/app": patch
+---
+
+fix: Improve table key parsing
```

packages/app/src/DBSearchPage.tsx

Lines changed: 1 addition & 1 deletion
```diff
@@ -543,7 +543,7 @@ function optimizeDefaultOrderBy(
   if (!sortingKey) return fallbackOrderBy;
 
   const orderByArr = [];
-  const sortKeys = sortingKey.split(',').map(key => key.trim());
+  const sortKeys = splitAndTrimWithBracket(sortingKey);
   for (let i = 0; i < sortKeys.length; i++) {
     const sortKey = sortKeys[i];
     if (sortKey.includes('toStartOf') && sortKey.includes(timestampExpr)) {
```
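`splitAndTrimWithBracket` is an existing helper in `@hyperdx/common-utils`; a minimal sketch of the idea (not the actual implementation) is a split on top-level commas only, ignoring commas nested inside brackets or quotes:

```ts
// Sketch only: split on commas that are not inside (), [], or single quotes.
function splitAndTrimWithBracketSketch(input: string): string[] {
  const parts: string[] = [];
  let depth = 0;
  let inQuote = false;
  let current = '';
  for (const ch of input) {
    if (ch === "'") inQuote = !inQuote;
    if (!inQuote && (ch === '(' || ch === '[')) depth++;
    if (!inQuote && (ch === ')' || ch === ']')) depth--;
    if (ch === ',' && depth === 0 && !inQuote) {
      parts.push(current.trim());
      current = '';
    } else {
      current += ch;
    }
  }
  if (current.trim().length > 0) parts.push(current.trim());
  return parts;
}

splitAndTrimWithBracketSketch(
  'toStartOfInterval(Timestamp, toIntervalDay(3)), TimestampTime',
);
// → ['toStartOfInterval(Timestamp, toIntervalDay(3))', 'TimestampTime']
```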

packages/app/src/components/DBRowTable.tsx

Lines changed: 7 additions & 11 deletions
```diff
@@ -25,7 +25,7 @@ import {
   ClickHouseQueryError,
   ColumnMetaType,
   convertCHDataTypeToJSType,
-  extractColumnReference,
+  extractColumnReferencesFromKey,
   isJSDataTypeJSONStringifiable,
   JSDataType,
 } from '@hyperdx/common-utils/dist/clickhouse';
@@ -1039,28 +1039,24 @@ export const RawLogTable = memo(
   },
 );
 
-function appendSelectWithPrimaryAndPartitionKey(
+export function appendSelectWithPrimaryAndPartitionKey(
   select: SelectList,
   primaryKeys: string,
   partitionKey: string,
 ): { select: SelectList; additionalKeysLength: number } {
-  const partitionKeyArr = partitionKey
-    .split(',')
-    .map(k => extractColumnReference(k.trim()))
-    .filter((k): k is string => k != null && k.length > 0);
-  const primaryKeyArr =
-    primaryKeys.trim() !== '' ? splitAndTrimWithBracket(primaryKeys) : [];
-  const allKeys = [...partitionKeyArr, ...primaryKeyArr];
+  const partitionKeyArr = extractColumnReferencesFromKey(partitionKey);
+  const primaryKeyArr = extractColumnReferencesFromKey(primaryKeys);
+  const allKeys = new Set([...partitionKeyArr, ...primaryKeyArr]);
   if (typeof select === 'string') {
     const selectSplit = splitAndTrimWithBracket(select);
     const selectColumns = new Set(selectSplit);
-    const additionalKeys = allKeys.filter(k => !selectColumns.has(k));
+    const additionalKeys = [...allKeys].filter(k => !selectColumns.has(k));
     return {
       select: [...selectColumns, ...additionalKeys].join(','),
       additionalKeysLength: additionalKeys.length,
     };
   } else {
-    const additionalKeys = allKeys.map(k => ({ valueExpression: k }));
+    const additionalKeys = [...allKeys].map(k => ({ valueExpression: k }));
     return {
       select: [...select, ...additionalKeys],
       additionalKeysLength: additionalKeys.length,
```
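The string-select branch is exercised by the new tests below; for the array form of `SelectList`, the deduplicated keys are appended as `valueExpression` entries. A hedged usage sketch, with hypothetical column names (partition-key columns come first, per the `Set` construction above):

```ts
const { select, additionalKeysLength } = appendSelectWithPrimaryAndPartitionKey(
  [{ valueExpression: 'Body' }],
  'ServiceName', // primary key
  'toStartOfDay(Timestamp)', // partition key
);
// select → [
//   { valueExpression: 'Body' },
//   { valueExpression: 'Timestamp' },
//   { valueExpression: 'ServiceName' },
// ]
// additionalKeysLength → 2
```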
New test file

Lines changed: 119 additions & 0 deletions

```diff
@@ -0,0 +1,119 @@
+import { appendSelectWithPrimaryAndPartitionKey } from '@/components/DBRowTable';
+
+describe('appendSelectWithPrimaryAndPartitionKey', () => {
+  it('should extract columns from partition key with nested function call', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey(
+      'col1, col2',
+      'id, created_at',
+      ' toStartOfInterval(timestamp, toIntervalDay(3))',
+    );
+    expect(result).toEqual({
+      additionalKeysLength: 3,
+      select: 'col1,col2,timestamp,id,created_at',
+    });
+  });
+
+  it('should extract no columns from empty primary key and partition key', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey('col1, col2', '', '');
+    expect(result).toEqual({
+      additionalKeysLength: 0,
+      select: 'col1,col2',
+    });
+  });
+
+  it('should extract columns from complex primary key', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey(
+      'col1, col2',
+      'id, timestamp, toStartOfInterval(timestamp2, toIntervalDay(3))',
+      "toStartOfInterval(timestamp, toIntervalDay(3)), date_diff('DAY', col3, col4), now(), toDate(col5 + INTERVAL 1 DAY)",
+    );
+    expect(result).toEqual({
+      additionalKeysLength: 6,
+      select: 'col1,col2,timestamp,col3,col4,col5,id,timestamp2',
+    });
+  });
+
+  it('should extract map columns', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey(
+      'col1, col2',
+      `map['key']`,
+      `map2['key'], map1['key3 ']`,
+    );
+    expect(result).toEqual({
+      additionalKeysLength: 3,
+      select: `col1,col2,map2['key'],map1['key3 '],map['key']`,
+    });
+  });
+
+  it('should extract map columns', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey(
+      'col1, col2',
+      ``,
+      `map2['key.2']`,
+    );
+    expect(result).toEqual({
+      additionalKeysLength: 1,
+      select: `col1,col2,map2['key.2']`,
+    });
+  });
+
+  it('should extract array columns', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey(
+      'col1, col2',
+      `array[1]`,
+      `array[2], array[3]`,
+    );
+    expect(result).toEqual({
+      additionalKeysLength: 3,
+      select: `col1,col2,array[2],array[3],array[1]`,
+    });
+  });
+
+  it('should extract json columns', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey(
+      'col1, col2',
+      `json.b`,
+      `json.a, json.b.c, toStartOfDay(timestamp, json_2.d)`,
+    );
+    expect(result).toEqual({
+      additionalKeysLength: 5,
+      select: `col1,col2,json.a,json.b.c,timestamp,json_2.d,json.b`,
+    });
+  });
+
+  it('should extract json columns with type specifiers', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey(
+      'col1, col2',
+      `json.b.:Int64`,
+      `toStartOfDay(json.a.b.:DateTime)`,
+    );
+    expect(result).toEqual({
+      additionalKeysLength: 2,
+      select: `col1,col2,json.a.b,json.b`,
+    });
+  });
+
+  it('should skip json columns with hard-to-parse type specifiers', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey(
+      'col1, col2',
+      `json.b.:Array(String), col3`,
+      ``,
+    );
+    expect(result).toEqual({
+      additionalKeysLength: 1,
+      select: `col1,col2,col3`,
+    });
+  });
+
+  it('should skip nested map references', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey(
+      'col1, col2',
+      `map['key']['key2'], col3`,
+      ``,
+    );
+    expect(result).toEqual({
+      additionalKeysLength: 1,
+      select: `col1,col2,col3`,
+    });
+  });
+});
```

packages/app/src/source.ts

Lines changed: 5 additions & 8 deletions
```diff
@@ -5,7 +5,7 @@ import objectHash from 'object-hash';
 import store from 'store2';
 import {
   ColumnMeta,
-  extractColumnReference,
+  extractColumnReferencesFromKey,
   filterColumnMetaByType,
   JSDataType,
 } from '@hyperdx/common-utils/dist/clickhouse';
@@ -252,7 +252,9 @@ export async function inferTableSourceConfig({
       connectionId,
     })
   ).primary_key;
-  const keys = splitAndTrimWithBracket(primaryKeys);
+  const primaryKeyColumns = new Set(
+    extractColumnReferencesFromKey(primaryKeys),
+  );
 
   const isOtelLogSchema = hasAllColumns(columns, [
     'Timestamp',
@@ -285,12 +287,7 @@ export async function inferTableSourceConfig({
 
   const timestampColumns = filterColumnMetaByType(columns, [JSDataType.Date]);
   const primaryKeyTimestampColumn = timestampColumns?.find(c =>
-    keys.find(
-      k =>
-        // If the key is a fn call like toUnixTimestamp(Timestamp), we need to strip it
-        // We can't use substr match since "Timestamp" would match "TimestampTime"
-        extractColumnReference(k) === c.name,
-    ),
+    primaryKeyColumns.has(c.name),
   );
 
   return {
```
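The exact-match behavior the removed comment was guarding against (substring matching would let `Timestamp` match `TimestampTime`) now falls out of the `Set` lookup. A quick illustration using the example schema from the PR description, assuming the key parses as in the unit tests below:

```ts
const primaryKeyColumns = new Set(
  extractColumnReferencesFromKey(
    "toStartOfInterval(Timestamp, toIntervalDay(3)), TimestampTime, dateDiff('day', Timestamp, Timestamp + toIntervalDay(1))",
  ),
);
// → Set { 'Timestamp', 'TimestampTime' }
primaryKeyColumns.has('TimestampTime'); // true — exact name match only
```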
Lines changed: 39 additions & 0 deletions
```diff
@@ -0,0 +1,39 @@
+import { extractColumnReferencesFromKey } from '..';
+
+describe('extractColumnReferencesFromKey', () => {
+  it('should extract column references from simple column names', () => {
+    expect(extractColumnReferencesFromKey('col1, col2, col3')).toEqual([
+      'col1',
+      'col2',
+      'col3',
+    ]);
+  });
+
+  it('should extract column references from function calls', () => {
+    expect(
+      extractColumnReferencesFromKey(
+        "toStartOfInterval(timestamp, toIntervalDay(3)), col2, date_diff('DAY', col3, col4), now(), toDate(col5 + INTERVAL 1 DAY)",
+      ),
+    ).toEqual(['timestamp', 'col2', 'col3', 'col4', 'col5']);
+  });
+
+  it('should handle an empty expression', () => {
+    expect(extractColumnReferencesFromKey('')).toEqual([]);
+  });
+
+  it('should handle map / json access expression', () => {
+    // This is imperfect due to lack of full ClickHouse SQL parsing - we don't pick up the nested map access here.
+    // It is not expected to be a common case that there are nested map accesses in a primary or partition key,
+    // so we just want to make sure we don't error out in this case.
+    expect(
+      extractColumnReferencesFromKey("mapCol[otherMap['key']], col2"),
+    ).toEqual(['col2']);
+  });
+
+  it('should handle array accesses', () => {
+    expect(extractColumnReferencesFromKey('arrayCol[1], col2')).toEqual([
+      'arrayCol[1]',
+      'col2',
+    ]);
+  });
+});
```

packages/common-utils/src/clickhouse/index.ts

Lines changed: 48 additions & 15 deletions
```diff
@@ -19,7 +19,7 @@ import {
   splitChartConfigs,
 } from '@/renderChartConfig';
 import { ChartConfigWithOptDateRange, SQLInterval } from '@/types';
-import { hashCode } from '@/utils';
+import { hashCode, splitAndTrimWithBracket } from '@/utils';
 
 // export @clickhouse/client-common types
 export type {
@@ -273,22 +273,55 @@ export class ClickHouseQueryError extends Error {
   }
 }
 
-export function extractColumnReference(
-  sql: string,
-  maxIterations = 10,
-): string | null {
-  let iterations = 0;
-
-  // Loop until we remove all function calls and get just the column, with a maximum limit
-  while (/\w+\([^()]*\)/.test(sql) && iterations < maxIterations) {
-    // Replace the outermost function with its content
-    sql = sql.replace(/\w+\(([^()]*)\)/, '$1');
-    iterations++;
+/**
+ * Returns the columns referenced in the given expression, where the expression is a
+ * comma-separated list of SQL expressions,
+ * e.g. "id, toStartOfInterval(timestamp, toIntervalDay(3)), user_id, json.a.b".
+ */
+export const extractColumnReferencesFromKey = (expr: string): string[] => {
+  const parser = new SQLParser.Parser();
+
+  const exprs = splitAndTrimWithBracket(expr);
+  if (!exprs?.length) {
+    return [];
   }
 
-  // If we reached the max iterations without resolving, return null to indicate an issue
-  return iterations < maxIterations ? sql.trim() : null;
-}
+  return exprs.flatMap(expr => {
+    try {
+      // Extract map or array access expressions, e.g. map['key'] or array[1], since node-sql-parser does not support them.
+      const mapAccessRegex = /\b[a-zA-Z0-9_]+\[([0-9]+|'[^']*')\]/g;
+      const mapAccesses = expr.match(mapAccessRegex) || [];
+
+      // Replace map/array accesses with a literal string ('') so that node-sql-parser ignores them
+      const exprWithoutMaps = expr.replace(mapAccessRegex, "''");
+
+      // Strip out any JSON type specifiers, e.g. in json.a.:Int64, remove the .:Int64 part
+      const exprWithoutMapsOrJsonType = exprWithoutMaps.replace(
+        /\.:[a-zA-Z0-9]+/g,
+        '',
+      );
+
+      // Extract any JSON path expressions, since node-sql-parser does not support them.
+      const jsonPathRegex = /\b[a-zA-Z0-9_]+\.[a-zA-Z0-9_.]+/g;
+      const jsonPaths = exprWithoutMapsOrJsonType.match(jsonPathRegex) || [];
+
+      // Replace JSON paths with a literal string ('') so that node-sql-parser ignores them
+      const exprWithoutMapsOrJson = exprWithoutMapsOrJsonType.replace(
+        jsonPathRegex,
+        "''",
+      );
+
+      // Parse remaining column references with node-sql-parser
+      const parsedColumnList = parser
+        .columnList(`select ${exprWithoutMapsOrJson}`)
+        .map(col => col.split('::')[2]);
+
+      return [...new Set([...parsedColumnList, ...jsonPaths, ...mapAccesses])];
+    } catch (e) {
+      console.error('Error parsing column references from key', e, expr);
+      return [];
+    }
+  });
+};
 
 const castToNumber = (value: string | number) => {
   if (typeof value === 'string') {
```
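The `split('::')[2]` works because node-sql-parser's `columnList` encodes each referenced column as a `clause::table::column` string. A small example of the expected shape (output per node-sql-parser's documented format; the exact strings are an assumption, not taken from this PR):

```ts
import { Parser } from 'node-sql-parser';

const parser = new Parser();
// columnList flattens referenced columns into "clause::table::column" strings.
const cols = parser.columnList(
  'select toStartOfInterval(timestamp, toIntervalDay(3)), col2',
);
// cols → ['select::null::timestamp', 'select::null::col2']
const names = cols.map(c => c.split('::')[2]);
// names → ['timestamp', 'col2']
```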
