services/analytics/datadog.ts at main

oppi.li / claude-code
fork atom
source dump of claude code
fork atom
claude-code / services / analytics / datadog.ts
at main 307 lines 9.1 kB view raw
wrap content
oppi.li dump from zip 9d ago
63aada3f
  1import axios from 'axios'
  2import { createHash } from 'crypto'
  3import memoize from 'lodash-es/memoize.js'
  4import { getOrCreateUserID } from '../../utils/config.js'
  5import { logError } from '../../utils/log.js'
  6import { getCanonicalName } from '../../utils/model/model.js'
  7import { getAPIProvider } from '../../utils/model/providers.js'
  8import { MODEL_COSTS } from '../../utils/modelCost.js'
  9import { isAnalyticsDisabled } from './config.js'
 10import { getEventMetadata } from './metadata.js'
 11
 12const DATADOG_LOGS_ENDPOINT =
 13  'https://http-intake.logs.us5.datadoghq.com/api/v2/logs'
 14const DATADOG_CLIENT_TOKEN = 'pubbbf48e6d78dae54bceaa4acf463299bf'
 15const DEFAULT_FLUSH_INTERVAL_MS = 15000
 16const MAX_BATCH_SIZE = 100
 17const NETWORK_TIMEOUT_MS = 5000
 18
 19const DATADOG_ALLOWED_EVENTS = new Set([
 20  'chrome_bridge_connection_succeeded',
 21  'chrome_bridge_connection_failed',
 22  'chrome_bridge_disconnected',
 23  'chrome_bridge_tool_call_completed',
 24  'chrome_bridge_tool_call_error',
 25  'chrome_bridge_tool_call_started',
 26  'chrome_bridge_tool_call_timeout',
 27  'tengu_api_error',
 28  'tengu_api_success',
 29  'tengu_brief_mode_enabled',
 30  'tengu_brief_mode_toggled',
 31  'tengu_brief_send',
 32  'tengu_cancel',
 33  'tengu_compact_failed',
 34  'tengu_exit',
 35  'tengu_flicker',
 36  'tengu_init',
 37  'tengu_model_fallback_triggered',
 38  'tengu_oauth_error',
 39  'tengu_oauth_success',
 40  'tengu_oauth_token_refresh_failure',
 41  'tengu_oauth_token_refresh_success',
 42  'tengu_oauth_token_refresh_lock_acquiring',
 43  'tengu_oauth_token_refresh_lock_acquired',
 44  'tengu_oauth_token_refresh_starting',
 45  'tengu_oauth_token_refresh_completed',
 46  'tengu_oauth_token_refresh_lock_releasing',
 47  'tengu_oauth_token_refresh_lock_released',
 48  'tengu_query_error',
 49  'tengu_session_file_read',
 50  'tengu_started',
 51  'tengu_tool_use_error',
 52  'tengu_tool_use_granted_in_prompt_permanent',
 53  'tengu_tool_use_granted_in_prompt_temporary',
 54  'tengu_tool_use_rejected_in_prompt',
 55  'tengu_tool_use_success',
 56  'tengu_uncaught_exception',
 57  'tengu_unhandled_rejection',
 58  'tengu_voice_recording_started',
 59  'tengu_voice_toggled',
 60  'tengu_team_mem_sync_pull',
 61  'tengu_team_mem_sync_push',
 62  'tengu_team_mem_sync_started',
 63  'tengu_team_mem_entries_capped',
 64])
 65
 66const TAG_FIELDS = [
 67  'arch',
 68  'clientType',
 69  'errorType',
 70  'http_status_range',
 71  'http_status',
 72  'kairosActive',
 73  'model',
 74  'platform',
 75  'provider',
 76  'skillMode',
 77  'subscriptionType',
 78  'toolName',
 79  'userBucket',
 80  'userType',
 81  'version',
 82  'versionBase',
 83]
 84
 85function camelToSnakeCase(str: string): string {
 86  return str.replace(/[A-Z]/g, letter => `_${letter.toLowerCase()}`)
 87}
 88
 89type DatadogLog = {
 90  ddsource: string
 91  ddtags: string
 92  message: string
 93  service: string
 94  hostname: string
 95  [key: string]: unknown
 96}
 97
 98let logBatch: DatadogLog[] = []
 99let flushTimer: NodeJS.Timeout | null = null
100let datadogInitialized: boolean | null = null
101
/**
 * Sends every queued log entry to the Datadog intake endpoint in a single
 * POST request.
 *
 * Delivery is best-effort: a failed request is reported through logError and
 * the entries from that batch are not re-queued.
 */
async function flushLogs(): Promise<void> {
  const pending = logBatch
  if (pending.length === 0) {
    return
  }

  // Swap in a fresh queue before awaiting, so events tracked while the
  // request is in flight go into the next batch.
  logBatch = []

  const requestConfig = {
    headers: {
      'Content-Type': 'application/json',
      'DD-API-KEY': DATADOG_CLIENT_TOKEN,
    },
    timeout: NETWORK_TIMEOUT_MS,
  }

  try {
    await axios.post(DATADOG_LOGS_ENDPOINT, pending, requestConfig)
  } catch (sendError) {
    logError(sendError)
  }
}
120
/**
 * Arms a one-shot timer that flushes the queue after the configured flush
 * interval. Does nothing when a flush is already scheduled.
 */
function scheduleFlush(): void {
  if (flushTimer !== null) {
    return
  }

  const timer = setTimeout(() => {
    // Clear the handle first so the next tracked event can schedule again.
    flushTimer = null
    void flushLogs()
  }, getFlushIntervalMs())

  // unref() so a pending flush never keeps the process alive on its own.
  timer.unref()
  flushTimer = timer
}
129
/**
 * Decides whether Datadog reporting is enabled and records the answer in
 * `datadogInitialized` so trackDatadogEvent can read it without awaiting.
 *
 * The function is wrapped in memoize, so the analytics check runs on the
 * first call and later calls reuse that result.
 *
 * @returns Resolves to false when isAnalyticsDisabled() reports analytics
 *   are off, and to true otherwise.
 */
export const initializeDatadog = memoize(async (): Promise<boolean> => {
  // The previous try/catch wrapped only an assignment and a return, neither
  // of which can throw, so its catch branch was unreachable.
  const enabled = !isAnalyticsDisabled()
  datadogInitialized = enabled
  return enabled
})
145
/**
 * Cancels any scheduled flush and sends whatever is still queued.
 *
 * Intended to be awaited from gracefulShutdown() ahead of process.exit():
 * forceExit() keeps the beforeExit handler from firing, so remaining logs
 * have to be flushed explicitly.
 */
export async function shutdownDatadog(): Promise<void> {
  const pendingTimer = flushTimer
  if (pendingTimer !== null) {
    flushTimer = null
    clearTimeout(pendingTimer)
  }

  await flushLogs()
}
158
/**
 * Queues an allow-listed analytics event for delivery to Datadog.
 *
 * NOTE: use via src/services/analytics/index.ts > logEvent
 *
 * Nothing is queued unless NODE_ENV is 'production', the API provider is
 * 'firstParty', Datadog initialization reported success, and `eventName` is
 * in DATADOG_ALLOWED_EVENTS. Errors raised while assembling or queuing the
 * entry are reported through logError instead of propagating.
 *
 * @param eventName - Event identifier; stored as the log `message` and as
 *   the `event:` tag.
 * @param properties - Event-specific attributes, layered on top of the
 *   shared event metadata.
 */
export async function trackDatadogEvent(
  eventName: string,
  properties: { [key: string]: boolean | number | undefined },
): Promise<void> {
  // Only production builds report, and never for 3P providers
  // (Bedrock, Vertex, Foundry).
  if (
    process.env.NODE_ENV !== 'production' ||
    getAPIProvider() !== 'firstParty'
  ) {
    return
  }

  // Fast path: reuse the cached initialization result when there is one so
  // the common case does not pay for an await.
  const ready = datadogInitialized ?? (await initializeDatadog())
  if (!(ready && DATADOG_ALLOWED_EVENTS.has(eventName))) {
    return
  }

  try {
    // envContext is split off so its fields appear once (flattened) rather
    // than a second time as a nested object.
    const { envContext, ...baseMetadata } = await getEventMetadata({
      model: properties.model,
      betas: properties.betas,
    })
    const fields: Record<string, unknown> = {
      ...baseMetadata,
      ...envContext,
      ...properties,
      userBucket: getUserBucket(),
    }

    // Cardinality reduction: every MCP tool is reported as "mcp"
    const toolName = fields.toolName
    if (typeof toolName === 'string' && toolName.startsWith('mcp__')) {
      fields.toolName = 'mcp'
    }

    // Cardinality reduction (external users only): report the canonical
    // model name, or "other" when it has no MODEL_COSTS entry
    const model = fields.model
    if (process.env.USER_TYPE !== 'ant' && typeof model === 'string') {
      const canonical = getCanonicalName(model.replace(/\[1m]$/i, ''))
      fields.model = canonical in MODEL_COSTS ? canonical : 'other'
    }

    // Cardinality reduction: dev versions keep base + date and lose the
    // timestamp and sha,
    // e.g. "2.0.53-dev.20251124.t173302.sha526cc6a" -> "2.0.53-dev.20251124"
    const version = fields.version
    if (typeof version === 'string') {
      fields.version = version.replace(
        /^(\d+\.\d+\.\d+-dev\.\d{8})\.t\d+\.sha[a-f0-9]+$/,
        '$1',
      )
    }

    // `status` is a Datadog reserved field, so it is re-homed under
    // http_status / http_status_range and the original key is removed
    const rawStatus = fields.status
    if (rawStatus !== undefined && rawStatus !== null) {
      const statusCode = String(rawStatus)
      fields.http_status = statusCode

      // Range bucket (1xx, 2xx, 3xx, 4xx, 5xx) taken from the leading digit
      const leadingDigit = statusCode.charAt(0)
      if (leadingDigit >= '1' && leadingDigit <= '5') {
        fields.http_status_range = `${leadingDigit}xx`
      }

      delete fields.status
    }

    // Build ddtags from the TAG_FIELDS that have a value, for filtering.
    // event:<name> goes first so the event name is searchable via the log
    // search API: the `message` field (which also carries eventName) is a
    // DD reserved field and is NOT queryable from dashboard widget queries
    // or the aggregation API. See scripts/release/MONITORING.md.
    const tags = [`event:${eventName}`]
    for (const field of TAG_FIELDS) {
      const value = fields[field]
      if (value !== undefined && value !== null) {
        tags.push(`${camelToSnakeCase(field)}:${value}`)
      }
    }

    const entry: DatadogLog = {
      ddsource: 'nodejs',
      ddtags: tags.join(','),
      message: eventName,
      service: 'claude-code',
      hostname: 'claude-code',
      env: process.env.USER_TYPE,
    }

    // Attach every field that has a value as a searchable attribute,
    // keyed by its snake_case name
    for (const key of Object.keys(fields)) {
      const value = fields[key]
      if (value === undefined || value === null) {
        continue
      }
      entry[camelToSnakeCase(key)] = value
    }

    logBatch.push(entry)

    // Below the batch limit the flush timer is enough
    if (logBatch.length < MAX_BATCH_SIZE) {
      scheduleFlush()
      return
    }

    // Batch is full: cancel the pending timer and flush right away
    if (flushTimer) {
      clearTimeout(flushTimer)
      flushTimer = null
    }
    void flushLogs()
  } catch (error) {
    logError(error)
  }
}
280
/** Number of buckets that user IDs are hashed into. */
const NUM_USER_BUCKETS = 30

/**
 * Returns the bucket the current user ID hashes into.
 *
 * Alerts should reflect how many users an issue affects, not how many
 * events it produced: a handful of users can emit a large number of events
 * (for example through retries). Counting user IDs directly would ruin
 * cardinality, so the user ID is hashed and mapped onto a fixed number of
 * buckets instead.
 *
 * Counting distinct buckets then gives an estimate of the number of unique
 * users while preserving user privacy and keeping cardinality low.
 *
 * The function is wrapped in memoize, so the bucket is computed on the
 * first call and reused afterwards.
 *
 * @returns A bucket index in the range 0 to NUM_USER_BUCKETS - 1.
 */
const getUserBucket = memoize((): number => {
  const digest = createHash('sha256').update(getOrCreateUserID()).digest()
  // The first four digest bytes read as an unsigned big-endian integer are
  // the same number as the first eight hex characters parsed in base 16.
  return digest.readUInt32BE(0) % NUM_USER_BUCKETS
})
300
/**
 * Resolves the delay used for scheduled flushes.
 *
 * CLAUDE_CODE_DATADOG_FLUSH_INTERVAL_MS lets tests shorten the wait so they
 * do not block on the default flush interval. The default applies when the
 * variable is unset, does not start with an integer, or parses to 0.
 *
 * @returns The flush interval in milliseconds.
 */
function getFlushIntervalMs(): number {
  const override = Number.parseInt(
    process.env.CLAUDE_CODE_DATADOG_FLUSH_INTERVAL_MS ?? '',
    10,
  )
  if (Number.isNaN(override) || override === 0) {
    return DEFAULT_FLUSH_INTERVAL_MS
  }
  return override
}