forked from ultraworkers/claw-code
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfirstPartyEventLogger.ts
More file actions
449 lines (401 loc) · 14.2 KB
/
Copy pathfirstPartyEventLogger.ts
File metadata and controls
449 lines (401 loc) · 14.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
import type { AnyValueMap, Logger, logs } from '@opentelemetry/api-logs'
import { resourceFromAttributes } from '@opentelemetry/resources'
import {
BatchLogRecordProcessor,
LoggerProvider,
} from '@opentelemetry/sdk-logs'
import {
ATTR_SERVICE_NAME,
ATTR_SERVICE_VERSION,
} from '@opentelemetry/semantic-conventions'
import { randomUUID } from 'crypto'
import { isEqual } from 'lodash-es'
import { getOrCreateUserID } from '../../utils/config.js'
import { logForDebugging } from '../../utils/debug.js'
import { logError } from '../../utils/log.js'
import { getPlatform, getWslVersion } from '../../utils/platform.js'
import { jsonStringify } from '../../utils/slowOperations.js'
import { profileCheckpoint } from '../../utils/startupProfiler.js'
import { getCoreUserData } from '../../utils/user.js'
import { isAnalyticsDisabled } from './config.js'
import { FirstPartyEventLoggingExporter } from './firstPartyEventLoggingExporter.js'
import type { GrowthBookUserAttributes } from './growthbook.js'
import { getDynamicConfig_CACHED_MAY_BE_STALE } from './growthbook.js'
import { getEventMetadata } from './metadata.js'
import { isSinkKilled } from './sinkKillswitch.js'
/**
* Configuration for sampling individual event types.
* Each event name maps to an object containing sample_rate (0-1).
* Events not in the config are logged at 100% rate.
*/
export type EventSamplingConfig = {
[eventName: string]: {
sample_rate: number
}
}
const EVENT_SAMPLING_CONFIG_NAME = 'tengu_event_sampling_config'
/**
* Get the event sampling configuration from GrowthBook.
* Uses cached value if available, updates cache in background.
*/
export function getEventSamplingConfig(): EventSamplingConfig {
return getDynamicConfig_CACHED_MAY_BE_STALE<EventSamplingConfig>(
EVENT_SAMPLING_CONFIG_NAME,
{},
)
}
/**
* Determine if an event should be sampled based on its sample rate.
* Returns the sample rate if sampled, null if not sampled.
*
* @param eventName - Name of the event to check
* @returns The sample_rate if event should be logged, null if it should be dropped
*/
export function shouldSampleEvent(eventName: string): number | null {
const config = getEventSamplingConfig()
const eventConfig = config[eventName]
// If no config for this event, log at 100% rate (no sampling)
if (!eventConfig) {
return null
}
const sampleRate = eventConfig.sample_rate
// Validate sample rate is in valid range
if (typeof sampleRate !== 'number' || sampleRate < 0 || sampleRate > 1) {
return null
}
// Sample rate of 1 means log everything (no need to add metadata)
if (sampleRate >= 1) {
return null
}
// Sample rate of 0 means drop everything
if (sampleRate <= 0) {
return 0
}
// Randomly decide whether to sample this event
return Math.random() < sampleRate ? sampleRate : 0
}
const BATCH_CONFIG_NAME = 'tengu_1p_event_batch_config'
type BatchConfig = {
scheduledDelayMillis?: number
maxExportBatchSize?: number
maxQueueSize?: number
skipAuth?: boolean
maxAttempts?: number
path?: string
baseUrl?: string
}
function getBatchConfig(): BatchConfig {
return getDynamicConfig_CACHED_MAY_BE_STALE<BatchConfig>(
BATCH_CONFIG_NAME,
{},
)
}
// Module-local state for event logging (not exposed globally)
let firstPartyEventLogger: ReturnType<typeof logs.getLogger> | null = null
let firstPartyEventLoggerProvider: LoggerProvider | null = null
// Last batch config used to construct the provider — used by
// reinitialize1PEventLoggingIfConfigChanged to decide whether a rebuild is
// needed when GrowthBook refreshes.
let lastBatchConfig: BatchConfig | null = null
/**
* Flush and shutdown the 1P event logger.
* This should be called as the final step before process exit to ensure
* all events (including late ones from API responses) are exported.
*/
export async function shutdown1PEventLogging(): Promise<void> {
if (!firstPartyEventLoggerProvider) {
return
}
try {
await firstPartyEventLoggerProvider.shutdown()
if (process.env.USER_TYPE === 'ant') {
logForDebugging('1P event logging: final shutdown complete')
}
} catch {
// Ignore shutdown errors
}
}
/**
* Check if 1P event logging is enabled.
* Respects the same opt-outs as other analytics sinks:
* - Test environment
* - Third-party cloud providers (Bedrock/Vertex)
* - Global telemetry opt-outs
* - Non-essential traffic disabled
*
* Note: Unlike BigQuery metrics, event logging does NOT check organization-level
* metrics opt-out via API. It follows the same pattern as Statsig event logging.
*/
export function is1PEventLoggingEnabled(): boolean {
// Respect standard analytics opt-outs
return !isAnalyticsDisabled()
}
/**
* Log a 1st-party event for internal analytics (async version).
* Events are batched and exported to /api/event_logging/batch
*
* This enriches the event with core metadata (model, session, env context, etc.)
* at log time, similar to logEventToStatsig.
*
* @param eventName - Name of the event (e.g., 'tengu_api_query')
* @param metadata - Additional metadata for the event (intentionally no strings, to avoid accidentally logging code/filepaths)
*/
async function logEventTo1PAsync(
firstPartyEventLogger: Logger,
eventName: string,
metadata: Record<string, number | boolean | undefined> = {},
): Promise<void> {
try {
// Enrich with core metadata at log time (similar to Statsig pattern)
const coreMetadata = await getEventMetadata({
model: metadata.model,
betas: metadata.betas,
})
// Build attributes - OTel supports nested objects natively via AnyValueMap
// Cast through unknown since our nested objects are structurally compatible
// with AnyValue but TS doesn't recognize it due to missing index signatures
const attributes = {
event_name: eventName,
event_id: randomUUID(),
// Pass objects directly - no JSON serialization needed
core_metadata: coreMetadata,
user_metadata: getCoreUserData(true),
event_metadata: metadata,
} as unknown as AnyValueMap
// Add user_id if available
const userId = getOrCreateUserID()
if (userId) {
attributes.user_id = userId
}
// Debug logging when debug mode is enabled
if (process.env.USER_TYPE === 'ant') {
logForDebugging(
`[ANT-ONLY] 1P event: ${eventName} ${jsonStringify(metadata, null, 0)}`,
)
}
// Emit log record
firstPartyEventLogger.emit({
body: eventName,
attributes,
})
} catch (e) {
if (process.env.NODE_ENV === 'development') {
throw e
}
if (process.env.USER_TYPE === 'ant') {
logError(e as Error)
}
// swallow
}
}
/**
* Log a 1st-party event for internal analytics.
* Events are batched and exported to /api/event_logging/batch
*
* @param eventName - Name of the event (e.g., 'tengu_api_query')
* @param metadata - Additional metadata for the event (intentionally no strings, to avoid accidentally logging code/filepaths)
*/
export function logEventTo1P(
eventName: string,
metadata: Record<string, number | boolean | undefined> = {},
): void {
if (!is1PEventLoggingEnabled()) {
return
}
if (!firstPartyEventLogger || isSinkKilled('firstParty')) {
return
}
// Fire and forget - don't block on metadata enrichment
void logEventTo1PAsync(firstPartyEventLogger, eventName, metadata)
}
/**
* GrowthBook experiment event data for logging
*/
export type GrowthBookExperimentData = {
experimentId: string
variationId: number
userAttributes?: GrowthBookUserAttributes
experimentMetadata?: Record<string, unknown>
}
// api.anthropic.com only serves the "production" GrowthBook environment
// (see starling/starling/cli/cli.py DEFAULT_ENVIRONMENTS). Staging and
// development environments are not exported to the prod API.
function getEnvironmentForGrowthBook(): string {
return 'production'
}
/**
* Log a GrowthBook experiment assignment event to 1P.
* Events are batched and exported to /api/event_logging/batch
*
* @param data - GrowthBook experiment assignment data
*/
export function logGrowthBookExperimentTo1P(
data: GrowthBookExperimentData,
): void {
if (!is1PEventLoggingEnabled()) {
return
}
if (!firstPartyEventLogger || isSinkKilled('firstParty')) {
return
}
const userId = getOrCreateUserID()
const { accountUuid, organizationUuid } = getCoreUserData(true)
// Build attributes for GrowthbookExperimentEvent
const attributes = {
event_type: 'GrowthbookExperimentEvent',
event_id: randomUUID(),
experiment_id: data.experimentId,
variation_id: data.variationId,
...(userId && { device_id: userId }),
...(accountUuid && { account_uuid: accountUuid }),
...(organizationUuid && { organization_uuid: organizationUuid }),
...(data.userAttributes && {
session_id: data.userAttributes.sessionId,
user_attributes: jsonStringify(data.userAttributes),
}),
...(data.experimentMetadata && {
experiment_metadata: jsonStringify(data.experimentMetadata),
}),
environment: getEnvironmentForGrowthBook(),
}
if (process.env.USER_TYPE === 'ant') {
logForDebugging(
`[ANT-ONLY] 1P GrowthBook experiment: ${data.experimentId} variation=${data.variationId}`,
)
}
firstPartyEventLogger.emit({
body: 'growthbook_experiment',
attributes,
})
}
const DEFAULT_LOGS_EXPORT_INTERVAL_MS = 10000
const DEFAULT_MAX_EXPORT_BATCH_SIZE = 200
const DEFAULT_MAX_QUEUE_SIZE = 8192
/**
* Initialize 1P event logging infrastructure.
* This creates a separate LoggerProvider for internal event logging,
* independent of customer OTLP telemetry.
*
* This uses its own minimal resource configuration with just the attributes
* we need for internal analytics (service name, version, platform info).
*/
export function initialize1PEventLogging(): void {
profileCheckpoint('1p_event_logging_start')
const enabled = is1PEventLoggingEnabled()
if (!enabled) {
if (process.env.USER_TYPE === 'ant') {
logForDebugging('1P event logging not enabled')
}
return
}
// Fetch batch processor configuration from GrowthBook dynamic config
// Uses cached value if available, refreshes in background
const batchConfig = getBatchConfig()
lastBatchConfig = batchConfig
profileCheckpoint('1p_event_after_growthbook_config')
const scheduledDelayMillis =
batchConfig.scheduledDelayMillis ||
parseInt(
process.env.OTEL_LOGS_EXPORT_INTERVAL ||
DEFAULT_LOGS_EXPORT_INTERVAL_MS.toString(),
)
const maxExportBatchSize =
batchConfig.maxExportBatchSize || DEFAULT_MAX_EXPORT_BATCH_SIZE
const maxQueueSize = batchConfig.maxQueueSize || DEFAULT_MAX_QUEUE_SIZE
// Build our own resource for 1P event logging with minimal attributes
const platform = getPlatform()
const attributes: Record<string, string> = {
[ATTR_SERVICE_NAME]: 'claude-code',
[ATTR_SERVICE_VERSION]: MACRO.VERSION,
}
// Add WSL-specific attributes if running on WSL
if (platform === 'wsl') {
const wslVersion = getWslVersion()
if (wslVersion) {
attributes['wsl.version'] = wslVersion
}
}
const resource = resourceFromAttributes(attributes)
// Create a new LoggerProvider with the EventLoggingExporter
// NOTE: This is kept separate from customer telemetry logs to ensure
// internal events don't leak to customer endpoints and vice versa.
// We don't register this globally - it's only used for internal event logging.
const eventLoggingExporter = new FirstPartyEventLoggingExporter({
maxBatchSize: maxExportBatchSize,
skipAuth: batchConfig.skipAuth,
maxAttempts: batchConfig.maxAttempts,
path: batchConfig.path,
baseUrl: batchConfig.baseUrl,
isKilled: () => isSinkKilled('firstParty'),
})
firstPartyEventLoggerProvider = new LoggerProvider({
resource,
processors: [
new BatchLogRecordProcessor(eventLoggingExporter, {
scheduledDelayMillis,
maxExportBatchSize,
maxQueueSize,
}),
],
})
// Initialize event logger from our internal provider (NOT from global API)
// IMPORTANT: We must get the logger from our local provider, not logs.getLogger()
// because logs.getLogger() returns a logger from the global provider, which is
// separate and used for customer telemetry.
firstPartyEventLogger = firstPartyEventLoggerProvider.getLogger(
'com.anthropic.claude_code.events',
MACRO.VERSION,
)
}
/**
* Rebuild the 1P event logging pipeline if the batch config changed.
* Register this with onGrowthBookRefresh so long-running sessions pick up
* changes to batch size, delay, endpoint, etc.
*
* Event-loss safety:
* 1. Null the logger first — concurrent logEventTo1P() calls hit the
* !firstPartyEventLogger guard and bail during the swap window. This drops
* a handful of events but prevents emitting to a draining provider.
* 2. forceFlush() drains the old BatchLogRecordProcessor buffer to the
* exporter. Export failures go to disk at getCurrentBatchFilePath() which
* is keyed by module-level BATCH_UUID + sessionId — unchanged across
* reinit — so the NEW exporter's disk-backed retry picks them up.
* 3. Swap to new provider/logger; old provider shutdown runs in background
* (buffer already drained, just cleanup).
*/
export async function reinitialize1PEventLoggingIfConfigChanged(): Promise<void> {
if (!is1PEventLoggingEnabled() || !firstPartyEventLoggerProvider) {
return
}
const newConfig = getBatchConfig()
if (isEqual(newConfig, lastBatchConfig)) {
return
}
if (process.env.USER_TYPE === 'ant') {
logForDebugging(
`1P event logging: ${BATCH_CONFIG_NAME} changed, reinitializing`,
)
}
const oldProvider = firstPartyEventLoggerProvider
const oldLogger = firstPartyEventLogger
firstPartyEventLogger = null
try {
await oldProvider.forceFlush()
} catch {
// Export failures are already on disk; new exporter will retry them.
}
firstPartyEventLoggerProvider = null
try {
initialize1PEventLogging()
} catch (e) {
// Restore so the next GrowthBook refresh can retry. oldProvider was
// only forceFlush()'d, not shut down — it's still functional. Without
// this, both stay null and the !firstPartyEventLoggerProvider gate at
// the top makes recovery impossible.
firstPartyEventLoggerProvider = oldProvider
firstPartyEventLogger = oldLogger
logError(e)
return
}
void oldProvider.shutdown().catch(() => {})
}