
Commit 35c4222

fix: Improve parsing of complex table keys (#1226)
Closes HDX-2528

# Summary

This PR fixes errors caused by naive parsing of complex partition and primary keys. With this change, we parse primary and partition keys using node-sql-parser to extract the lists of referenced columns, rather than splitting on commas.

## Testing

Beyond the unit tests, I also tested with the following table in ClickHouse:

<details>
<summary>Table Schema</summary>

```sql
CREATE TABLE default.otel_logs_complex_pk
(
    `Timestamp` DateTime64(9) CODEC(Delta(8), ZSTD(1)),
    `TimestampTime` DateTime DEFAULT toDateTime(Timestamp),
    `TraceId` String CODEC(ZSTD(1)),
    `SpanId` String CODEC(ZSTD(1)),
    `TraceFlags` UInt8,
    `SeverityText` LowCardinality(String) CODEC(ZSTD(1)),
    `SeverityNumber` UInt8,
    `ServiceName` LowCardinality(String) CODEC(ZSTD(1)),
    `Body` String CODEC(ZSTD(1)),
    `ResourceSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)),
    `ResourceAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)),
    `ScopeSchemaUrl` LowCardinality(String) CODEC(ZSTD(1)),
    `ScopeName` String CODEC(ZSTD(1)),
    `ScopeVersion` LowCardinality(String) CODEC(ZSTD(1)),
    `ScopeAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)),
    `LogAttributes` Map(LowCardinality(String), String) CODEC(ZSTD(1)),
    `__hdx_materialized_k8s.cluster.name` LowCardinality(String) MATERIALIZED ResourceAttributes['k8s.cluster.name'] CODEC(ZSTD(1)),
    `__hdx_materialized_k8s.container.name` LowCardinality(String) MATERIALIZED ResourceAttributes['k8s.container.name'] CODEC(ZSTD(1)),
    `__hdx_materialized_k8s.deployment.name` LowCardinality(String) MATERIALIZED ResourceAttributes['k8s.deployment.name'] CODEC(ZSTD(1)),
    `__hdx_materialized_k8s.namespace.name` LowCardinality(String) MATERIALIZED ResourceAttributes['k8s.namespace.name'] CODEC(ZSTD(1)),
    `__hdx_materialized_k8s.node.name` LowCardinality(String) MATERIALIZED ResourceAttributes['k8s.node.name'] CODEC(ZSTD(1)),
    `__hdx_materialized_k8s.pod.name` LowCardinality(String) MATERIALIZED ResourceAttributes['k8s.pod.name'] CODEC(ZSTD(1)),
    `__hdx_materialized_k8s.pod.uid` LowCardinality(String) MATERIALIZED ResourceAttributes['k8s.pod.uid'] CODEC(ZSTD(1)),
    `__hdx_materialized_deployment.environment.name` LowCardinality(String) MATERIALIZED ResourceAttributes['deployment.environment.name'] CODEC(ZSTD(1)),
    INDEX idx_trace_id TraceId TYPE bloom_filter(0.001) GRANULARITY 1,
    INDEX idx_res_attr_key mapKeys(ResourceAttributes) TYPE bloom_filter(0.01) GRANULARITY 1,
    INDEX idx_res_attr_value mapValues(ResourceAttributes) TYPE bloom_filter(0.01) GRANULARITY 1,
    INDEX idx_scope_attr_key mapKeys(ScopeAttributes) TYPE bloom_filter(0.01) GRANULARITY 1,
    INDEX idx_scope_attr_value mapValues(ScopeAttributes) TYPE bloom_filter(0.01) GRANULARITY 1,
    INDEX idx_log_attr_key mapKeys(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1,
    INDEX idx_log_attr_value mapValues(LogAttributes) TYPE bloom_filter(0.01) GRANULARITY 1,
    INDEX idx_lower_body lower(Body) TYPE tokenbf_v1(32768, 3, 0) GRANULARITY 8
)
ENGINE = MergeTree
PARTITION BY toStartOfInterval(Timestamp, toIntervalDay(3))
PRIMARY KEY (toStartOfInterval(Timestamp, toIntervalDay(3)), TimestampTime, dateDiff('day', Timestamp, Timestamp + toIntervalDay(1)))
ORDER BY (toStartOfInterval(Timestamp, toIntervalDay(3)), TimestampTime, dateDiff('day', Timestamp, Timestamp + toIntervalDay(1)))
SETTINGS index_granularity = 8192;
```

</details>

✅ The search page loads
✅ The timestamp column is inferred when adding the source
✅ rowWhere / row selection works and is persisted in the URL
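As a minimal illustration (not code from this PR), here is the failure mode that naive comma-splitting hits on keys like the `PRIMARY KEY` above:

```ts
// Commas inside nested function calls are not separators between keys,
// so a naive split produces fragments that are not valid column expressions.
const sortingKey =
  'toStartOfInterval(Timestamp, toIntervalDay(3)), TimestampTime';

sortingKey.split(',').map(k => k.trim());
// → ['toStartOfInterval(Timestamp', 'toIntervalDay(3))', 'TimestampTime']
```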

7 files changed: +225 −35 lines
New changeset file

Lines changed: 6 additions & 0 deletions

```diff
@@ -0,0 +1,6 @@
+---
+"@hyperdx/common-utils": patch
+"@hyperdx/app": patch
+---
+
+fix: Improve table key parsing
```

packages/app/src/DBSearchPage.tsx

Lines changed: 1 addition & 1 deletion
```diff
@@ -543,7 +543,7 @@ function optimizeDefaultOrderBy(
   if (!sortingKey) return fallbackOrderBy;
 
   const orderByArr = [];
-  const sortKeys = sortingKey.split(',').map(key => key.trim());
+  const sortKeys = splitAndTrimWithBracket(sortingKey);
   for (let i = 0; i < sortKeys.length; i++) {
     const sortKey = sortKeys[i];
     if (sortKey.includes('toStartOf') && sortKey.includes(timestampExpr)) {
```
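`splitAndTrimWithBracket` is an existing helper in `@hyperdx/common-utils`; a minimal sketch of the idea (not the actual implementation) is a split on top-level commas only, ignoring commas nested inside brackets or quotes:

```ts
// Sketch only: split on commas that are not inside (), [], or single quotes.
function splitAndTrimWithBracketSketch(input: string): string[] {
  const parts: string[] = [];
  let depth = 0;
  let inQuote = false;
  let current = '';
  for (const ch of input) {
    if (ch === "'") inQuote = !inQuote;
    if (!inQuote && (ch === '(' || ch === '[')) depth++;
    if (!inQuote && (ch === ')' || ch === ']')) depth--;
    if (ch === ',' && depth === 0 && !inQuote) {
      parts.push(current.trim());
      current = '';
    } else {
      current += ch;
    }
  }
  if (current.trim().length > 0) parts.push(current.trim());
  return parts;
}

splitAndTrimWithBracketSketch(
  'toStartOfInterval(Timestamp, toIntervalDay(3)), TimestampTime',
);
// → ['toStartOfInterval(Timestamp, toIntervalDay(3))', 'TimestampTime']
```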

packages/app/src/components/DBRowTable.tsx

Lines changed: 7 additions & 11 deletions
```diff
@@ -25,7 +25,7 @@ import {
   ClickHouseQueryError,
   ColumnMetaType,
   convertCHDataTypeToJSType,
-  extractColumnReference,
+  extractColumnReferencesFromKey,
   isJSDataTypeJSONStringifiable,
   JSDataType,
 } from '@hyperdx/common-utils/dist/clickhouse';
@@ -1039,28 +1039,24 @@ export const RawLogTable = memo(
   },
 );
 
-function appendSelectWithPrimaryAndPartitionKey(
+export function appendSelectWithPrimaryAndPartitionKey(
   select: SelectList,
   primaryKeys: string,
   partitionKey: string,
 ): { select: SelectList; additionalKeysLength: number } {
-  const partitionKeyArr = partitionKey
-    .split(',')
-    .map(k => extractColumnReference(k.trim()))
-    .filter((k): k is string => k != null && k.length > 0);
-  const primaryKeyArr =
-    primaryKeys.trim() !== '' ? splitAndTrimWithBracket(primaryKeys) : [];
-  const allKeys = [...partitionKeyArr, ...primaryKeyArr];
+  const partitionKeyArr = extractColumnReferencesFromKey(partitionKey);
+  const primaryKeyArr = extractColumnReferencesFromKey(primaryKeys);
+  const allKeys = new Set([...partitionKeyArr, ...primaryKeyArr]);
   if (typeof select === 'string') {
     const selectSplit = splitAndTrimWithBracket(select);
     const selectColumns = new Set(selectSplit);
-    const additionalKeys = allKeys.filter(k => !selectColumns.has(k));
+    const additionalKeys = [...allKeys].filter(k => !selectColumns.has(k));
     return {
       select: [...selectColumns, ...additionalKeys].join(','),
       additionalKeysLength: additionalKeys.length,
     };
   } else {
-    const additionalKeys = allKeys.map(k => ({ valueExpression: k }));
+    const additionalKeys = [...allKeys].map(k => ({ valueExpression: k }));
     return {
       select: [...select, ...additionalKeys],
       additionalKeysLength: additionalKeys.length,
```
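The string-select branch is exercised by the new tests below; for the array form of `SelectList`, the deduplicated keys are appended as `valueExpression` entries. A hedged usage sketch, with hypothetical column names (partition-key columns come first, per the `Set` construction above):

```ts
const { select, additionalKeysLength } = appendSelectWithPrimaryAndPartitionKey(
  [{ valueExpression: 'Body' }],
  'ServiceName', // primary key
  'toStartOfDay(Timestamp)', // partition key
);
// select → [
//   { valueExpression: 'Body' },
//   { valueExpression: 'Timestamp' },
//   { valueExpression: 'ServiceName' },
// ]
// additionalKeysLength → 2
```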
New test file

Lines changed: 119 additions & 0 deletions

```diff
@@ -0,0 +1,119 @@
+import { appendSelectWithPrimaryAndPartitionKey } from '@/components/DBRowTable';
+
+describe('appendSelectWithPrimaryAndPartitionKey', () => {
+  it('should extract columns from partition key with nested function call', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey(
+      'col1, col2',
+      'id, created_at',
+      ' toStartOfInterval(timestamp, toIntervalDay(3))',
+    );
+    expect(result).toEqual({
+      additionalKeysLength: 3,
+      select: 'col1,col2,timestamp,id,created_at',
+    });
+  });
+
+  it('should extract no columns from empty primary key and partition key', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey('col1, col2', '', '');
+    expect(result).toEqual({
+      additionalKeysLength: 0,
+      select: 'col1,col2',
+    });
+  });
+
+  it('should extract columns from complex primary key', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey(
+      'col1, col2',
+      'id, timestamp, toStartOfInterval(timestamp2, toIntervalDay(3))',
+      "toStartOfInterval(timestamp, toIntervalDay(3)), date_diff('DAY', col3, col4), now(), toDate(col5 + INTERVAL 1 DAY)",
+    );
+    expect(result).toEqual({
+      additionalKeysLength: 6,
+      select: 'col1,col2,timestamp,col3,col4,col5,id,timestamp2',
+    });
+  });
+
+  it('should extract map columns', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey(
+      'col1, col2',
+      `map['key']`,
+      `map2['key'], map1['key3 ']`,
+    );
+    expect(result).toEqual({
+      additionalKeysLength: 3,
+      select: `col1,col2,map2['key'],map1['key3 '],map['key']`,
+    });
+  });
+
+  it('should extract map columns', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey(
+      'col1, col2',
+      ``,
+      `map2['key.2']`,
+    );
+    expect(result).toEqual({
+      additionalKeysLength: 1,
+      select: `col1,col2,map2['key.2']`,
+    });
+  });
+
+  it('should extract array columns', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey(
+      'col1, col2',
+      `array[1]`,
+      `array[2], array[3]`,
+    );
+    expect(result).toEqual({
+      additionalKeysLength: 3,
+      select: `col1,col2,array[2],array[3],array[1]`,
+    });
+  });
+
+  it('should extract json columns', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey(
+      'col1, col2',
+      `json.b`,
+      `json.a, json.b.c, toStartOfDay(timestamp, json_2.d)`,
+    );
+    expect(result).toEqual({
+      additionalKeysLength: 5,
+      select: `col1,col2,json.a,json.b.c,timestamp,json_2.d,json.b`,
+    });
+  });
+
+  it('should extract json columns with type specifiers', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey(
+      'col1, col2',
+      `json.b.:Int64`,
+      `toStartOfDay(json.a.b.:DateTime)`,
+    );
+    expect(result).toEqual({
+      additionalKeysLength: 2,
+      select: `col1,col2,json.a.b,json.b`,
+    });
+  });
+
+  it('should skip json columns with hard-to-parse type specifiers', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey(
+      'col1, col2',
+      `json.b.:Array(String), col3`,
+      ``,
+    );
+    expect(result).toEqual({
+      additionalKeysLength: 1,
+      select: `col1,col2,col3`,
+    });
+  });
+
+  it('should skip nested map references', () => {
+    const result = appendSelectWithPrimaryAndPartitionKey(
+      'col1, col2',
+      `map['key']['key2'], col3`,
+      ``,
+    );
+    expect(result).toEqual({
+      additionalKeysLength: 1,
+      select: `col1,col2,col3`,
+    });
+  });
+});
```

packages/app/src/source.ts

Lines changed: 5 additions & 8 deletions
```diff
@@ -5,7 +5,7 @@ import objectHash from 'object-hash';
 import store from 'store2';
 import {
   ColumnMeta,
-  extractColumnReference,
+  extractColumnReferencesFromKey,
   filterColumnMetaByType,
   JSDataType,
 } from '@hyperdx/common-utils/dist/clickhouse';
@@ -252,7 +252,9 @@ export async function inferTableSourceConfig({
       connectionId,
     })
   ).primary_key;
-  const keys = splitAndTrimWithBracket(primaryKeys);
+  const primaryKeyColumns = new Set(
+    extractColumnReferencesFromKey(primaryKeys),
+  );
 
   const isOtelLogSchema = hasAllColumns(columns, [
     'Timestamp',
@@ -285,12 +287,7 @@ export async function inferTableSourceConfig({
 
   const timestampColumns = filterColumnMetaByType(columns, [JSDataType.Date]);
   const primaryKeyTimestampColumn = timestampColumns?.find(c =>
-    keys.find(
-      k =>
-        // If the key is a fn call like toUnixTimestamp(Timestamp), we need to strip it
-        // We can't use substr match since "Timestamp" would match "TimestampTime"
-        extractColumnReference(k) === c.name,
-    ),
+    primaryKeyColumns.has(c.name),
   );
 
   return {
```
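The exact-match behavior the removed comment was guarding against (substring matching would let `Timestamp` match `TimestampTime`) now falls out of the `Set` lookup. A quick illustration using the example schema from the PR description, assuming the key parses as in the unit tests below:

```ts
const primaryKeyColumns = new Set(
  extractColumnReferencesFromKey(
    "toStartOfInterval(Timestamp, toIntervalDay(3)), TimestampTime, dateDiff('day', Timestamp, Timestamp + toIntervalDay(1))",
  ),
);
// → Set { 'Timestamp', 'TimestampTime' }
primaryKeyColumns.has('TimestampTime'); // true — exact name match only
```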
Lines changed: 39 additions & 0 deletions
```diff
@@ -0,0 +1,39 @@
+import { extractColumnReferencesFromKey } from '..';
+
+describe('extractColumnReferencesFromKey', () => {
+  it('should extract column references from simple column names', () => {
+    expect(extractColumnReferencesFromKey('col1, col2, col3')).toEqual([
+      'col1',
+      'col2',
+      'col3',
+    ]);
+  });
+
+  it('should extract column references from function calls', () => {
+    expect(
+      extractColumnReferencesFromKey(
+        "toStartOfInterval(timestamp, toIntervalDay(3)), col2, date_diff('DAY', col3, col4), now(), toDate(col5 + INTERVAL 1 DAY)",
+      ),
+    ).toEqual(['timestamp', 'col2', 'col3', 'col4', 'col5']);
+  });
+
+  it('should handle an empty expression', () => {
+    expect(extractColumnReferencesFromKey('')).toEqual([]);
+  });
+
+  it('should handle map / json access expression', () => {
+    // This is imperfect due to lack of full ClickHouse SQL parsing - we don't pick up the nested map access here.
+    // It is not expected to be a common case that there are nested map accesses in a primary or partition key,
+    // so we just want to make sure we don't error out in this case.
+    expect(
+      extractColumnReferencesFromKey("mapCol[otherMap['key']], col2"),
+    ).toEqual(['col2']);
+  });
+
+  it('should handle array accesses', () => {
+    expect(extractColumnReferencesFromKey('arrayCol[1], col2')).toEqual([
+      'arrayCol[1]',
+      'col2',
+    ]);
+  });
+});
```

packages/common-utils/src/clickhouse/index.ts

Lines changed: 48 additions & 15 deletions
```diff
@@ -19,7 +19,7 @@ import {
   splitChartConfigs,
 } from '@/renderChartConfig';
 import { ChartConfigWithOptDateRange, SQLInterval } from '@/types';
-import { hashCode } from '@/utils';
+import { hashCode, splitAndTrimWithBracket } from '@/utils';
 
 // export @clickhouse/client-common types
 export type {
@@ -273,22 +273,55 @@ export class ClickHouseQueryError extends Error {
   }
 }
 
-export function extractColumnReference(
-  sql: string,
-  maxIterations = 10,
-): string | null {
-  let iterations = 0;
-
-  // Loop until we remove all function calls and get just the column, with a maximum limit
-  while (/\w+\([^()]*\)/.test(sql) && iterations < maxIterations) {
-    // Replace the outermost function with its content
-    sql = sql.replace(/\w+\(([^()]*)\)/, '$1');
-    iterations++;
+/**
+ * Returns the columns referenced in the given expression, where the expression is a
+ * comma-separated list of SQL expressions,
+ * e.g. "id, toStartOfInterval(timestamp, toIntervalDay(3)), user_id, json.a.b".
+ */
+export const extractColumnReferencesFromKey = (expr: string): string[] => {
+  const parser = new SQLParser.Parser();
+
+  const exprs = splitAndTrimWithBracket(expr);
+  if (!exprs?.length) {
+    return [];
   }
 
-  // If we reached the max iterations without resolving, return null to indicate an issue
-  return iterations < maxIterations ? sql.trim() : null;
-}
+  return exprs.flatMap(expr => {
+    try {
+      // Extract map or array access expressions, e.g. map['key'] or array[1], since node-sql-parser does not support them.
+      const mapAccessRegex = /\b[a-zA-Z0-9_]+\[([0-9]+|'[^']*')\]/g;
+      const mapAccesses = expr.match(mapAccessRegex) || [];
+
+      // Replace map/array accesses with a literal string ('') so that node-sql-parser ignores them
+      const exprWithoutMaps = expr.replace(mapAccessRegex, "''");
+
+      // Strip out any JSON type specifiers, e.g. in json.a.:Int64, remove the .:Int64 part
+      const exprWithoutMapsOrJsonType = exprWithoutMaps.replace(
+        /\.:[a-zA-Z0-9]+/g,
+        '',
+      );
+
+      // Extract any JSON path expressions, since node-sql-parser does not support them.
+      const jsonPathRegex = /\b[a-zA-Z0-9_]+\.[a-zA-Z0-9_.]+/g;
+      const jsonPaths = exprWithoutMapsOrJsonType.match(jsonPathRegex) || [];
+
+      // Replace JSON paths with a literal string ('') so that node-sql-parser ignores them
+      const exprWithoutMapsOrJson = exprWithoutMapsOrJsonType.replace(
+        jsonPathRegex,
+        "''",
+      );
+
+      // Parse remaining column references with node-sql-parser
+      const parsedColumnList = parser
+        .columnList(`select ${exprWithoutMapsOrJson}`)
+        .map(col => col.split('::')[2]);
+
+      return [...new Set([...parsedColumnList, ...jsonPaths, ...mapAccesses])];
+    } catch (e) {
+      console.error('Error parsing column references from key', e, expr);
+      return [];
+    }
+  });
+};
 
 const castToNumber = (value: string | number) => {
   if (typeof value === 'string') {
```
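The `split('::')[2]` works because node-sql-parser's `columnList` encodes each referenced column as a `clause::table::column` string. A small example of the expected shape (output per node-sql-parser's documented format; the exact strings are an assumption, not taken from this PR):

```ts
import { Parser } from 'node-sql-parser';

const parser = new Parser();
// columnList flattens referenced columns into "clause::table::column" strings.
const cols = parser.columnList(
  'select toStartOfInterval(timestamp, toIntervalDay(3)), col2',
);
// cols → ['select::null::timestamp', 'select::null::col2']
const names = cols.map(c => c.split('::')[2]);
// names → ['timestamp', 'col2']
```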
