@@ -6,10 +6,13 @@ import { BatchConsumer, startBatchConsumer } from '../kafka/batch-consumer'
6
6
import { createRdConnectionConfigFromEnvVars } from '../kafka/config'
7
7
import { addSentryBreadcrumbsEventListeners } from '../main/ingestion-queues/kafka-metrics'
8
8
import { runInstrumentedFunction } from '../main/utils'
9
+ import { PostgresRouter } from '../utils/db/postgres'
9
10
import {
10
11
ClickHouseEvent ,
12
+ EventDefinitionType ,
11
13
EventPropertyType ,
12
14
Hub ,
15
+ PluginsServerConfig ,
13
16
PluginServerService ,
14
17
PropertyDefinitionType ,
15
18
PropertyDefinitionTypeEnum ,
@@ -18,21 +21,41 @@ import {
18
21
} from '../types'
19
22
import { parseRawClickHouseEvent } from '../utils/event'
20
23
import { status } from '../utils/status'
24
+ import { castTimestampToClickhouseFormat } from '../utils/utils'
21
25
22
26
// Must require as `tsc` strips unused `import` statements and just requiring this seems to init some globals
23
27
require ( '@sentry/tracing' )
24
28
29
+ // TODO(eli): wire up LOTS more metrics ASAP!
30
+
25
31
export const propertyDefTypesCounter = new Counter ( {
26
32
name : 'property_defs_types_total' ,
27
33
help : 'Count of derived property types.' ,
28
34
labelNames : [ 'type' ] ,
29
35
} )
30
36
37
+ export const eventDefTypesCounter = new Counter ( {
38
+ name : 'event_defs_types_total' ,
39
+ help : 'Count of new event definitions.' ,
40
+ } )
41
+
42
+ export const eventPropTypesCounter = new Counter ( {
43
+ name : 'event_props_types_total' ,
44
+ help : 'Count of derived event properties.' ,
45
+ } )
46
+
31
47
export type CollectedPropertyDefinitions = {
48
+ teamIdsInBatch : Set < number >
49
+ teamIdsWithGroupUpdatesInBatch : Set < number >
50
+ eventDefinitionsById : Record < string , EventDefinitionType >
32
51
propertyDefinitionsById : Record < string , PropertyDefinitionType >
33
- eventPropertiesByEventById : Record < string , EventPropertyType >
52
+ eventPropertiesById : Record < string , EventPropertyType >
34
53
}
35
54
55
+ // lifted from here:
56
+ // https://github.com/PostHog/posthog/blob/021aaab04b4acd96cf8121c033ac3b0042492598/rust/property-defs-rs/src/types.rs#L457-L461
57
+ const DJANGO_MAX_CHARFIELD_LENGTH = 200
58
+
36
59
// These properties have special meaning, and are ignored
37
60
const SKIP_PROPERTIES : string [ ] = [
38
61
'$set' ,
@@ -46,7 +69,56 @@ const SKIP_PROPERTIES: string[] = [
46
69
'$groups' ,
47
70
]
48
71
49
- export const getPropertyType = ( key : string , value : any ) : PropertyType | null => {
72
+ const DATE_PROP_KEYWORDS : string [ ] = [
73
+ "time" ,
74
+ "timestamp" ,
75
+ "date" ,
76
+ "_at" ,
77
+ "-at" ,
78
+ "createdat" ,
79
+ "updatedat" ,
80
+ ]
81
+
82
+ //
83
+ // SQL queries
84
+ //
85
+
86
+ const WRITE_EVENT_PROPERTY = `
87
+ INSERT INTO posthog_eventproperty (event, property, team_id, project_id)
88
+ VALUES ($1, $2, $3, $4)
89
+ ON CONFLICT DO NOTHING
90
+ `
91
+
92
+ const WRITE_PROPERTY_DEFINITION = `
93
+ INSERT INTO posthog_propertydefinition (id, name, type, group_type_index, is_numerical, volume_30_day, query_usage_30_day, team_id, project_id, property_type)
94
+ VALUES ($1, $2, $3, $4, $5, NULL, NULL, $6, $7, $8)
95
+ ON CONFLICT (coalesce(project_id, team_id::bigint), name, type, coalesce(group_type_index, -1))
96
+ DO UPDATE SET property_type=EXCLUDED.property_type WHERE posthog_propertydefinition.property_type IS NULL
97
+ `
98
+
99
+ const WRITE_EVENT_DEFINITION = `
100
+ INSERT INTO posthog_eventdefinition (id, name, volume_30_day, query_usage_30_day, team_id, project_id, last_seen_at, created_at)
101
+ VALUES ($1, $2, NULL, NULL, $3, $4, $5, NOW())
102
+ ON CONFLICT (coalesce(project_id, team_id::bigint), name)
103
+ DO UPDATE SET last_seen_at = $5
104
+ `
105
+
106
+ // TODO(eli): TBD - replace VALUES with array of integer team IDs, maybe something like this?
107
+ // https://github.com/PostHog/posthog/blob/master/plugin-server/src/utils/db/postgres.ts#L90-L110
108
+ const FETCH_TEAM_IDS = `
109
+ SELECT id AS team_id FROM posthog_team WHERE id = ANY (ARRAY[{VALUES}])
110
+ `
111
+
112
+ // TODO(eli): same here...
113
+ const FETCH_GROUP_TYPES_BY_TEAM_IDS = `
114
+ SELECT pt.id AS team_id, pgtm.group_type, pgtm.group_type_index FROM posthog_team AS pt
115
+ JOIN posthog_grouptypemapping AS pgtm ON pt.id = pgtm.team_id
116
+ WHERE pt.id = ANY (ARRAY[{VALUES}])
117
+ `
118
+
119
+ export const getPropertyType = ( rawKey : string , value : any ) : PropertyType | null => {
120
+ const key = rawKey . trim ( ) . toLowerCase ( )
121
+
50
122
// Special cases for certain property prefixes
51
123
if ( key . startsWith ( 'utm_' ) ) {
52
124
// utm_ prefixed properties should always be detected as strings.
@@ -80,36 +152,54 @@ export const getPropertyType = (key: string, value: any): PropertyType | null =>
80
152
81
153
if ( typeof value === 'string' ) {
82
154
const s = value . trim ( )
83
- if ( s === 'true' || s === 'false' || s === 'TRUE' || s === 'FALSE' ) {
155
+ if ( s === 'true' || s === 'false' ) {
84
156
return PropertyType . Boolean
85
157
}
86
158
// Try to parse this as an ISO 8601 date
87
159
try {
160
+ if ( DATE_PROP_KEYWORDS . some ( kw => key . includes ( kw ) ) ) {
161
+ return PropertyType . DateTime
162
+ }
88
163
const date = DateTime . fromISO ( s )
89
164
if ( date . isValid ) {
90
165
return PropertyType . DateTime
91
166
}
167
+ // TODO(eli): add speculative date string matching?
168
+
92
169
} catch {
93
170
// Not a valid date, continue to string type
94
171
}
95
172
return PropertyType . String
96
173
}
97
174
175
+ if ( typeof value === 'boolean' ) {
176
+ return PropertyType . Boolean
177
+ }
178
+
98
179
if ( typeof value === 'number' ) {
99
- // Check if the key contains timestamp-related keywords
100
- if ( key . includes ( 'timestamp' ) || key . includes ( 'TIMESTAMP' ) || key . includes ( 'time' ) || key . includes ( 'TIME' ) ) {
180
+ if ( value >= sixMonthsAgoUnixSeconds ( ) ) {
101
181
return PropertyType . DateTime
102
182
}
103
183
return PropertyType . Numeric
104
184
}
105
185
106
- if ( typeof value === 'boolean' ) {
107
- return PropertyType . Boolean
108
- }
109
-
110
186
return null
111
187
}
112
188
189
+ function willFitInPostgres ( s : string ) {
190
+ return s . length < DJANGO_MAX_CHARFIELD_LENGTH
191
+ }
192
+
193
+ function sanitizeEventName ( eventName : string ) {
194
+ return eventName . replace ( '\u0000' , '\uFFFD' ) ;
195
+ }
196
+
197
+ function sixMonthsAgoUnixSeconds ( ) {
198
+ const now = new Date ( ) ;
199
+ now . setMonth ( now . getMonth ( ) - 6 ) ;
200
+ return Math . floor ( now . getTime ( ) / 1000 ) ;
201
+ }
202
+
113
203
/**
114
204
* NOTE: This is currently experimental and only used to do some testing on performance and comparisons.
115
205
*/
@@ -119,14 +209,18 @@ export class PropertyDefsConsumer {
119
209
protected topic : string
120
210
121
211
batchConsumer ?: BatchConsumer
212
+ db : PostgresRouter
213
+ config : PluginsServerConfig
122
214
isStopping = false
123
215
protected heartbeat = ( ) => { }
124
216
protected promises : Set < Promise < any > > = new Set ( )
125
217
126
- constructor ( private hub : Hub ) {
218
+ constructor ( private hub : Hub , config : PluginsServerConfig ) {
127
219
// The group and topic are configurable allowing for multiple ingestion consumers to be run in parallel
128
220
this . groupId = hub . PROPERTY_DEFS_CONSUMER_GROUP_ID
129
221
this . topic = hub . PROPERTY_DEFS_CONSUMER_CONSUME_TOPIC
222
+ this . config = config ,
223
+ this . db = hub ?. postgres ?? new PostgresRouter ( this . config )
130
224
}
131
225
132
226
public get service ( ) : PluginServerService {
@@ -175,34 +269,89 @@ export class PropertyDefsConsumer {
175
269
public async handleKafkaBatch ( messages : Message [ ] ) {
176
270
const parsedMessages = await this . runInstrumented ( 'parseKafkaMessages' , ( ) => this . parseKafkaBatch ( messages ) )
177
271
const collected = await this . runInstrumented ( 'derivePropDefs' , ( ) =>
178
- Promise . resolve ( this . derivePropDefs ( parsedMessages ) )
272
+ Promise . resolve ( this . extractPropertyDefinitions ( parsedMessages ) )
179
273
)
180
274
275
+ for ( const eventDef of Object . values ( collected . eventDefinitionsById ) ) {
276
+ eventDefTypesCounter . inc ( )
277
+ console . log ( eventDef ) // TODO(eli): temp: make linter happy
278
+ // TODO(eli): write it!
279
+ }
280
+
181
281
for ( const propDef of Object . values ( collected . propertyDefinitionsById ) ) {
182
282
propertyDefTypesCounter . inc ( { type : propDef . property_type ?? 'null' } )
283
+ // TODO(eli): write it!
183
284
}
184
285
185
- // TODO: Get all the related property defs from the DB and compare what we would have written for all those that don't exist
186
- // TODO: Write prop defs to DB
286
+ for ( const eventProp of Object . values ( collected . eventPropertiesById ) ) {
287
+ eventPropTypesCounter . inc ( )
288
+ console . log ( eventProp ) // TODO(eli): temp: make linter happy
289
+ // TODO(eli): write it!
290
+ }
187
291
188
292
status . debug ( '🔁' , `Waiting for promises` , { promises : this . promises . size } )
189
293
await this . runInstrumented ( 'awaitScheduledWork' , ( ) => Promise . all ( this . promises ) )
190
294
status . debug ( '🔁' , `Processed batch` )
191
295
}
192
296
193
- private derivePropDefs ( events : ClickHouseEvent [ ] ) : CollectedPropertyDefinitions {
297
+ private extractPropertyDefinitions ( events : ClickHouseEvent [ ] ) : CollectedPropertyDefinitions {
194
298
const collected : CollectedPropertyDefinitions = {
299
+ // TODO(eli): look these up in batches as pre-write step
300
+ teamIdsInBatch : new Set < number > ,
301
+ // TODO(eli): look these up in batches to resolve group types as pre-write step
302
+ teamIdsWithGroupUpdatesInBatch : new Set < number > ,
303
+ // deduped from batch, written to posthog_eventdefinition
304
+ eventDefinitionsById : { } ,
305
+ // deduped from batch, written to posthog_propertydefinition
195
306
propertyDefinitionsById : { } ,
196
- eventPropertiesByEventById : { } ,
307
+ // deduped from batch, written to posthog_eventproperty
308
+ eventPropertiesById : { } ,
197
309
}
198
310
199
311
for ( const event of events ) {
312
+ // these will be looked up later to trim write batches if team doesn't exist
313
+ if ( ! collected . teamIdsInBatch . has ( event . team_id ) ) {
314
+ collected . teamIdsInBatch . add ( event . team_id ) ;
315
+ }
316
+
317
+ event . event = sanitizeEventName ( event . event )
318
+
319
+ if ( ! willFitInPostgres ( event . event ) ) {
320
+ continue
321
+ }
322
+
323
+ const eventDefIdKey : string = `${ event . team_id } :${ event . event } `
324
+
325
+ if ( ! collected . eventDefinitionsById [ eventDefIdKey ] ) {
326
+ collected . eventDefinitionsById [ eventDefIdKey ] = {
327
+ id : eventDefIdKey ,
328
+ name : event . event ,
329
+ team_id : event . team_id ,
330
+ project_id : event . team_id , // TODO: add project_id
331
+ created_at : event . created_at . toISO ( ) || DateTime . now ( ) . toString ( ) ,
332
+ volume_30_day : 0 , // deprecated
333
+ query_usage_30_day : 0 , // deprecated
334
+ }
335
+ }
336
+
200
337
// Detect group identify events
201
338
if ( event . event === '$groupidentify' ) {
339
+ if ( ! collected . teamIdsWithGroupUpdatesInBatch . has ( event . team_id ) ) {
340
+ collected . teamIdsWithGroupUpdatesInBatch . add ( event . team_id ) ;
341
+ }
342
+
343
+ // bail on this event if there's no group type assigned
202
344
const groupType : string | undefined = event . properties [ '$group_type' ] // e.g. "organization"
203
- const groupProperties : Record < string , any > | undefined = event . properties [ '$group_set' ] // { name: 'value', id: 'id', foo: "bar" }
345
+ if ( typeof groupType === 'undefined' ) {
346
+ continue
347
+ }
204
348
349
+ const groupProperties : Record < string , any > | undefined = event . properties [ '$group_set' ] // { name: 'value', id: 'id', foo: "bar" }
205
350
for ( const [ property , value ] of Object . entries ( groupProperties ?? { } ) ) {
351
+ if ( ! willFitInPostgres ( property ) ) {
352
+ continue
353
+ }
354
+
206
355
const propDefId = `${ event . team_id } :${ groupType } :${ property } `
207
356
208
357
if ( collected . propertyDefinitionsById [ propDefId ] ) {
@@ -219,7 +368,8 @@ export class PropertyDefsConsumer {
219
368
project_id : event . team_id , // TODO: Add project_id
220
369
property_type : propType ,
221
370
type : PropertyDefinitionTypeEnum . Event ,
222
- group_type_index : 0 , // TODO: This!
371
+ group_type_name : groupType ,
372
+ group_type_index : 0 , // TODO(eli): resolve these w/DB query on team_id using "groupType"
223
373
}
224
374
}
225
375
}
@@ -229,6 +379,10 @@ export class PropertyDefsConsumer {
229
379
230
380
// Detect person properties
231
381
for ( const [ property , value ] of Object . entries ( event . person_properties ?? { } ) ) {
382
+ if ( ! willFitInPostgres ( property ) ) {
383
+ continue
384
+ }
385
+
232
386
const propDefPersonId = `${ event . team_id } :person:${ property } `
233
387
234
388
if ( ! collected . propertyDefinitionsById [ propDefPersonId ] ) {
@@ -249,7 +403,7 @@ export class PropertyDefsConsumer {
249
403
250
404
// Detect event properties
251
405
for ( const [ property , value ] of Object . entries ( event . properties ) ) {
252
- if ( SKIP_PROPERTIES . includes ( property ) ) {
406
+ if ( ! willFitInPostgres ( property ) || SKIP_PROPERTIES . includes ( property ) ) {
253
407
continue
254
408
}
255
409
@@ -270,11 +424,11 @@ export class PropertyDefsConsumer {
270
424
}
271
425
}
272
426
273
- const eventDefId = `${ event . team_id } :${ event . event } :${ property } `
427
+ const eventPropId = `${ event . team_id } :${ event . event } :${ property } `
274
428
275
- if ( ! collected . eventPropertiesByEventById [ eventDefId ] ) {
276
- collected . eventPropertiesByEventById [ eventDefId ] = {
277
- id : eventDefId ,
429
+ if ( ! collected . eventPropertiesById [ eventPropId ] ) {
430
+ collected . eventPropertiesById [ eventPropId ] = {
431
+ id : eventPropId ,
278
432
event : event . event ,
279
433
property,
280
434
team_id : event . team_id ,
0 commit comments