Monorepo for Tangled tangled.org

spindle/engine: timeout the entire workflow instead

Signed-off-by: Anirudh Oppiliappan <anirudh@tangled.sh>

anirudh.fi 87686f39 5f8231c0

verified
Changed files
+35 -22
spindle
config
engine
+3 -3
spindle/config/config.go
··· 16 16 } 17 17 18 18 type Pipelines struct { 19 - Nixery string `env:"NIXERY, default=nixery.tangled.sh"` 20 - StepTimeout string `env:"STEP_TIMEOUT, default=5m"` 21 - LogDir string `env:"LOG_DIR, default=/var/log/spindle"` 19 + Nixery string `env:"NIXERY, default=nixery.tangled.sh"` 20 + WorkflowTimeout string `env:"WORKFLOW_TIMEOUT, default=5m"` 21 + LogDir string `env:"LOG_DIR, default=/var/log/spindle"` 22 22 } 23 23 24 24 type Config struct {
+32 -19
spindle/engine/engine.go
··· 102 102 defer reader.Close() 103 103 io.Copy(os.Stdout, reader) 104 104 105 + workflowTimeoutStr := e.cfg.Pipelines.WorkflowTimeout 106 + workflowTimeout, err := time.ParseDuration(workflowTimeoutStr) 107 + if err != nil { 108 + e.l.Error("failed to parse workflow timeout", "error", err, "timeout", workflowTimeoutStr) 109 + workflowTimeout = 5 * time.Minute 110 + } 111 + e.l.Info("using workflow timeout", "timeout", workflowTimeout) 112 + ctx, cancel := context.WithTimeout(ctx, workflowTimeout) 113 + defer cancel() 114 + 105 115 err = e.StartSteps(ctx, w.Steps, wid, w.Image) 106 116 if err != nil { 107 117 if errors.Is(err, ErrTimedOut) { ··· 177 187 // All other errors are bubbled up. 178 188 // Fixed version of the step execution logic 179 189 func (e *Engine) StartSteps(ctx context.Context, steps []models.Step, wid models.WorkflowId, image string) error { 180 - stepTimeoutStr := e.cfg.Pipelines.StepTimeout 181 - stepTimeout, err := time.ParseDuration(stepTimeoutStr) 182 - if err != nil { 183 - e.l.Error("failed to parse step timeout", "error", err, "timeout", stepTimeoutStr) 184 - stepTimeout = 5 * time.Minute 185 - } 186 - e.l.Info("using step timeout", "timeout", stepTimeout) 187 190 188 191 for stepIdx, step := range steps { 192 + select { 193 + case <-ctx.Done(): 194 + return ctx.Err() 195 + default: 196 + } 197 + 189 198 envs := ConstructEnvs(step.Environment) 190 199 envs.AddEnv("HOME", workspaceDir) 191 200 e.l.Debug("envs for step", "step", step.Name, "envs", envs.Slice()) ··· 199 208 Hostname: "spindle", 200 209 Env: envs.Slice(), 201 210 }, hostConfig, nil, nil, "") 211 + defer e.DestroyStep(ctx, resp.ID) 202 212 if err != nil { 203 213 return fmt.Errorf("creating container: %w", err) 204 214 } ··· 208 218 return fmt.Errorf("connecting network: %w", err) 209 219 } 210 220 211 - stepCtx, stepCancel := context.WithTimeout(ctx, stepTimeout) 212 - 213 - err = e.docker.ContainerStart(stepCtx, resp.ID, container.StartOptions{}) 221 + err = e.docker.ContainerStart(ctx, resp.ID, container.StartOptions{}) 214 222 if err != nil { 215 - stepCancel() 216 223 return err 217 224 } 218 225 e.l.Info("started container", "name", resp.ID, "step", step.Name) ··· 220 227 // start tailing logs in background 221 228 tailDone := make(chan error, 1) 222 229 go func() { 223 - tailDone <- e.TailStep(stepCtx, resp.ID, wid, stepIdx) 230 + tailDone <- e.TailStep(ctx, resp.ID, wid, stepIdx) 224 231 }() 225 232 226 233 // wait for container completion or timeout ··· 230 237 231 238 go func() { 232 239 defer close(waitDone) 233 - state, waitErr = e.WaitStep(stepCtx, resp.ID) 240 + state, waitErr = e.WaitStep(ctx, resp.ID) 234 241 }() 235 242 236 243 select { ··· 238 245 239 246 // wait for tailing to complete 240 247 <-tailDone 241 - stepCancel() 242 248 243 - case <-stepCtx.Done(): 244 - e.l.Warn("step timed out; killing container", "container", resp.ID, "timeout", stepTimeout) 245 - 246 - _ = e.DestroyStep(ctx, resp.ID) 249 + case <-ctx.Done(): 250 + e.l.Warn("step timed out; killing container", "container", resp.ID, "step", step.Name) 251 + err = e.DestroyStep(context.Background(), resp.ID) 252 + if err != nil { 253 + e.l.Error("failed to destroy step", "container", resp.ID, "error", err) 254 + } 247 255 248 256 // wait for both goroutines to finish 249 257 <-waitDone 250 258 <-tailDone 251 259 252 - stepCancel() 253 260 return ErrTimedOut 261 + } 262 + 263 + select { 264 + case <-ctx.Done(): 265 + return ctx.Err() 266 + default: 254 267 } 255 268 256 269 if waitErr != nil {