forked from tangled.org/core
this repo has no description

spindle: add more debug logs

Signed-off-by: oppiliappan <me@oppi.li>

oppi.li b7fa3865 fe0f3329

verified
Changed files
+96 -32
nix
spindle
engines
nixery
+6 -6
nix/vm.nix
··· 43 43 guest.port = 6000; 44 44 } 45 45 # spindle 46 - { 47 - from = "host"; 48 - host.port = 6555; 49 - guest.port = 6555; 50 - } 46 + # { 47 + # from = "host"; 48 + # host.port = 6555; 49 + # guest.port = 6555; 50 + # } 51 51 ]; 52 52 sharedDirectories = { 53 53 # We can't use the 9p mounts directly for most of these ··· 83 83 }; 84 84 }; 85 85 services.tangled-spindle = { 86 - enable = true; 86 + enable = false; 87 87 server = { 88 88 owner = envVar "TANGLED_VM_SPINDLE_OWNER"; 89 89 hostname = "localhost:6555";
+90 -26
spindle/engines/nixery/engine.go
··· 173 173 func (e *Engine) SetupWorkflow(ctx context.Context, wid models.WorkflowId, wf *models.Workflow) error { 174 174 e.l.Info("setting up workflow", "workflow", wid) 175 175 176 - _, err := e.docker.NetworkCreate(ctx, networkName(wid), network.CreateOptions{ 177 - Driver: "bridge", 178 - }) 179 - if err != nil { 180 - return err 181 - } 182 - e.registerCleanup(wid, func(ctx context.Context) error { 183 - return e.docker.NetworkRemove(ctx, networkName(wid)) 184 - }) 185 - 186 176 addl := wf.Data.(addlFields) 187 177 188 178 reader, err := e.docker.ImagePull(ctx, addl.image, image.PullOptions{}) ··· 193 183 } 194 184 defer reader.Close() 195 185 io.Copy(os.Stdout, reader) 186 + 187 + _, err = e.docker.NetworkCreate(ctx, networkName(wid), network.CreateOptions{ 188 + Driver: "bridge", 189 + }) 190 + if err != nil { 191 + return err 192 + } 193 + e.registerCleanup(wid, func(ctx context.Context) error { 194 + return e.docker.NetworkRemove(ctx, networkName(wid)) 195 + }) 196 196 197 197 resp, err := e.docker.ContainerCreate(ctx, &container.Config{ 198 198 Image: addl.image, ··· 294 294 for _, s := range secrets { 295 295 workflowEnvs.AddEnv(s.Key, s.Value) 296 296 } 297 - 298 297 step := w.Steps[idx].(Step) 299 - 300 298 select { 301 299 case <-ctx.Done(): 302 300 return ctx.Err() 303 301 default: 304 302 } 305 - 306 303 envs := append(EnvVars(nil), workflowEnvs...) 
307 304 for k, v := range step.environment { 308 305 envs.AddEnv(k, v) 309 306 } 310 307 envs.AddEnv("HOME", homeDir) 308 + 309 + e.l.Info("executing step", 310 + "workflow_id", wid.String(), 311 + "step_index", idx, 312 + "step_name", step.Name, 313 + "command", step.command, 314 + ) 311 315 312 316 mkExecResp, err := e.docker.ContainerExecCreate(ctx, addl.container, container.ExecOptions{ 313 317 Cmd: []string{"bash", "-c", step.command}, ··· 327 331 328 332 select { 329 333 case <-tailDone: 330 - 331 334 case <-ctx.Done(): 332 335 // cleanup will be handled by DestroyWorkflow, since 333 336 // Docker doesn't provide an API to kill an exec run 334 337 // (sure, we could grab the PID and kill it ourselves, 335 338 // but that's wasted effort) 336 339 e.l.Warn("step timed out", "step", step.Name) 337 - 338 340 <-tailDone 339 - 340 341 return engine.ErrTimedOut 341 342 } 342 343 ··· 346 347 default: 347 348 } 348 349 349 - execInspectResp, err := e.docker.ContainerExecInspect(ctx, mkExecResp.ID) 350 + if err = e.handleStepFailure(ctx, wid, w, idx, mkExecResp.ID); err != nil { 351 + return err 352 + } 353 + 354 + e.l.Info("step completed successfully", 355 + "workflow_id", wid.String(), 356 + "step_index", idx, 357 + "step_name", step.Name, 358 + ) 359 + 360 + return nil 361 + } 362 + 363 + // handleStepFailure checks the exit code of a finished step and, on failure, logs detailed container state and returns an error 364 + func (e *Engine) handleStepFailure( 365 + ctx context.Context, 366 + wid models.WorkflowId, 367 + w *models.Workflow, 368 + idx int, 369 + execID string, 370 + ) error { 371 + addl := w.Data.(addlFields) 372 + step := w.Steps[idx].(Step) 373 + 374 + inspectResp, err := e.docker.ContainerInspect(ctx, addl.container) 350 375 if err != nil { 351 376 return err 352 377 } 353 378 354 - if execInspectResp.ExitCode != 0 { 355 - inspectResp, err := e.docker.ContainerInspect(ctx, addl.container) 356 - if err != nil { 357 - return err 379 + execInspectResp, err := e.docker.ContainerExecInspect(ctx, execID) 380 + 
if err != nil { 381 + return err 382 + } 383 + 384 + // no error 385 + if execInspectResp.ExitCode == 0 { 386 + return nil 387 + } 388 + 389 + logFields := []any{ 390 + "workflow_id", wid.String(), 391 + "step_index", idx, 392 + "step_name", step.Name, 393 + "command", step.command, 394 + "container_exit_code", inspectResp.State.ExitCode, 395 + "container_oom_killed", inspectResp.State.OOMKilled, 396 + "exec_exit_code", execInspectResp.ExitCode, 397 + } 398 + 399 + // Add container state information 400 + if inspectResp.State != nil { 401 + logFields = append(logFields, 402 + "container_status", inspectResp.State.Status, 403 + "container_running", inspectResp.State.Running, 404 + "container_paused", inspectResp.State.Paused, 405 + "container_restarting", inspectResp.State.Restarting, 406 + "container_dead", inspectResp.State.Dead, 407 + ) 408 + 409 + if inspectResp.State.Error != "" { 410 + logFields = append(logFields, "container_error", inspectResp.State.Error) 358 411 } 359 412 360 - e.l.Error("workflow failed!", "workflow_id", wid.String(), "exit_code", execInspectResp.ExitCode, "oom_killed", inspectResp.State.OOMKilled) 413 + if inspectResp.State.StartedAt != "" { 414 + logFields = append(logFields, "container_started_at", inspectResp.State.StartedAt) 415 + } 361 416 362 - if inspectResp.State.OOMKilled { 363 - return ErrOOMKilled 417 + if inspectResp.State.FinishedAt != "" { 418 + logFields = append(logFields, "container_finished_at", inspectResp.State.FinishedAt) 364 419 } 365 - return engine.ErrWorkflowFailed 366 420 } 367 421 368 - return nil 422 + // Add resource usage if available 423 + if inspectResp.HostConfig != nil && inspectResp.HostConfig.Memory > 0 { 424 + logFields = append(logFields, "memory_limit", inspectResp.HostConfig.Memory) 425 + } 426 + 427 + e.l.Error("workflow step failed!", logFields...) 
428 + 429 + if inspectResp.State.OOMKilled { 430 + return ErrOOMKilled 431 + } 432 + return engine.ErrWorkflowFailed 369 433 } 370 434 371 435 func (e *Engine) tailStep(ctx context.Context, wfLogger *models.WorkflowLogger, execID string, wid models.WorkflowId, stepIdx int, step models.Step) error {