Kubernetes Operator for Tangled Spindles
13
fork

Configure Feed

Select the types of activity you want to include in your feed.

add jobbuilder, code cleanup, try to use upstream spindle models

evan.jarrett.net 6ea91538 7f3173ca

verified
+190 -1322
-1066
Kubernetes Operator for Github Actions Runner.md
··· 1 - # Kubernetes Operator for Github Actions Runner 2 - Created at 2025-11-03 05:24:53 UTC 3 - 4 - > @ 2025-11-03 05:24:53 UTC 5 - > 6 - > I want to implement a kubernetes operator similar to github actions runner controller. I want to primarily focus on the logic for spawning a “runner” container via jobs and polling logs as it’s processing. I also want to make sure i can build docker containers within these runners. And the jobs should have logic to target nodes based on os architecture. This is supposed to be a rough poc, i already have go logic that works for reading a yaml file and figuring out what workflow to run. It just needs to send that job/step to a job in kubes 7 - 8 - <details><summary>Thinking</summary> 9 - 10 - The user is asking for help implementing a Kubernetes operator similar to GitHub Actions runner controller. They want to: 11 - 1. Spawn runner containers via Jobs 12 - 2. Poll logs as it's processing 13 - 3. Build docker containers within runners 14 - 4. Target nodes based on OS architecture 15 - 5. This is a rough POC and they already have Go logic to read YAML and figure out what workflow to run 16 - 17 - I'll provide a comprehensive answer covering the implementation of these requirements. 18 - 19 - </details> 20 - 21 - I'll help you implement a Kubernetes operator for spawning runner containers similar to GitHub Actions Runner Controller. 
Here's a comprehensive approach for your POC: 22 - 23 - ## **Kubernetes Job Spawning Logic** 24 - 25 - Here's the core Go code structure for creating and managing Jobs: 26 - 27 - ```go 28 - package operator 29 - 30 - import ( 31 - "context" 32 - "fmt" 33 - batchv1 "k8s.io/api/batch/v1" 34 - corev1 "k8s.io/api/core/v1" 35 - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 36 - "k8s.io/client-go/kubernetes" 37 - ) 38 - 39 - type RunnerController struct { 40 - clientset *kubernetes.Clientset 41 - namespace string 42 - } 43 - 44 - func (rc *RunnerController) CreateRunnerJob(workflowID, stepName, stepCommand string, arch string) (*batchv1.Job, error) { 45 - job := &batchv1.Job{ 46 - ObjectMeta: metav1.ObjectMeta{ 47 - Name: fmt.Sprintf("runner-%s-%s", workflowID, stepName), 48 - Namespace: rc.namespace, 49 - Labels: map[string]string{ 50 - "workflow-id": workflowID, 51 - "step-name": stepName, 52 - "type": "runner", 53 - }, 54 - }, 55 - Spec: batchv1.JobSpec{ 56 - Template: corev1.PodTemplateSpec{ 57 - ObjectMeta: metav1.ObjectMeta{ 58 - Labels: map[string]string{ 59 - "workflow-id": workflowID, 60 - "step-name": stepName, 61 - }, 62 - }, 63 - Spec: corev1.PodSpec{ 64 - RestartPolicy: corev1.RestartPolicyNever, 65 - NodeSelector: map[string]string{ 66 - "kubernetes.io/arch": arch, // amd64, arm64, etc. 
67 - }, 68 - Containers: []corev1.Container{ 69 - { 70 - Name: "runner", 71 - Image: "docker:dind", // Docker in Docker image 72 - Command: []string{"/bin/sh", "-c"}, 73 - Args: []string{stepCommand}, 74 - Env: []corev1.EnvVar{ 75 - { 76 - Name: "DOCKER_HOST", 77 - Value: "tcp://localhost:2375", 78 - }, 79 - }, 80 - VolumeMounts: []corev1.VolumeMount{ 81 - { 82 - Name: "docker-graph-storage", 83 - MountPath: "/var/lib/docker", 84 - }, 85 - }, 86 - }, 87 - { 88 - Name: "dind-daemon", 89 - Image: "docker:dind", 90 - Command: []string{"dockerd"}, 91 - Args: []string{ 92 - "--host=tcp://0.0.0.0:2375", 93 - "--storage-driver=overlay2", 94 - }, 95 - SecurityContext: &corev1.SecurityContext{ 96 - Privileged: &[]bool{true}[0], 97 - }, 98 - VolumeMounts: []corev1.VolumeMount{ 99 - { 100 - Name: "docker-graph-storage", 101 - MountPath: "/var/lib/docker", 102 - }, 103 - }, 104 - }, 105 - }, 106 - Volumes: []corev1.Volume{ 107 - { 108 - Name: "docker-graph-storage", 109 - VolumeSource: corev1.VolumeSource{ 110 - EmptyDir: &corev1.EmptyDirVolumeSource{}, 111 - }, 112 - }, 113 - }, 114 - }, 115 - }, 116 - }, 117 - } 118 - 119 - return rc.clientset.BatchV1().Jobs(rc.namespace).Create(context.TODO(), job, metav1.CreateOptions{}) 120 - } 121 - ``` 122 - 123 - ## **Docker-in-Docker Configuration** 124 - 125 - For building Docker containers within your runners, you have two main approaches [^6][^2]: 126 - 127 - ### **Option 1: Docker-in-Docker (DinD)** 128 - The above example uses a sidecar container approach with the `docker:dind` image. This provides full Docker functionality but requires **privileged mode**. 
129 - 130 - ### **Option 2: Kaniko (Rootless Alternative)** 131 - For better security, consider using Kaniko for building images without Docker daemon: 132 - 133 - ```go 134 - // Alternative container spec for Kaniko 135 - { 136 - Name: "runner", 137 - Image: "gcr.io/kaniko-project/executor:latest", 138 - Args: []string{ 139 - "--dockerfile=Dockerfile", 140 - "--context=git://github.com/your-repo.git", 141 - "--destination=your-registry/image:tag", 142 - }, 143 - } 144 - ``` 145 - 146 - ## **Log Polling Implementation** 147 - 148 - Here's how to poll and stream logs from your runner Jobs [^1][^3]: 149 - 150 - ```go 151 - import ( 152 - "bufio" 153 - "io" 154 - "time" 155 - "k8s.io/client-go/kubernetes" 156 - ) 157 - 158 - func (rc *RunnerController) PollJobLogs(jobName string) error { 159 - // Wait for pod to be created 160 - time.Sleep(2 * time.Second) 161 - 162 - // Get pods for the job 163 - labelSelector := fmt.Sprintf("job-name=%s", jobName) 164 - pods, err := rc.clientset.CoreV1().Pods(rc.namespace).List( 165 - context.TODO(), 166 - metav1.ListOptions{LabelSelector: labelSelector}, 167 - ) 168 - if err != nil || len(pods.Items) == 0 { 169 - return fmt.Errorf("no pods found for job %s", jobName) 170 - } 171 - 172 - podName := pods.Items[0].Name 173 - 174 - // Stream logs 175 - req := rc.clientset.CoreV1().Pods(rc.namespace).GetLogs(podName, &corev1.PodLogOptions{ 176 - Follow: true, 177 - Container: "runner", 178 - Timestamps: true, 179 - }) 180 - 181 - stream, err := req.Stream(context.TODO()) 182 - if err != nil { 183 - return err 184 - } 185 - defer stream.Close() 186 - 187 - // Read and process logs 188 - reader := bufio.NewReader(stream) 189 - for { 190 - line, err := reader.ReadString('\n') 191 - if err != nil { 192 - if err == io.EOF { 193 - break 194 - } 195 - return err 196 - } 197 - // Process log line (send to channel, write to file, etc.) 
198 - fmt.Print(line) 199 - } 200 - 201 - return nil 202 - } 203 - ``` 204 - 205 - ## **Architecture-Based Node Targeting** 206 - 207 - To target specific node architectures, use **nodeSelector** or **nodeAffinity** [^1]: 208 - 209 - ```go 210 - // Enhanced node selection with affinity 211 - Spec: corev1.PodSpec{ 212 - Affinity: &corev1.Affinity{ 213 - NodeAffinity: &corev1.NodeAffinity{ 214 - RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ 215 - NodeSelectorTerms: []corev1.NodeSelectorTerm{ 216 - { 217 - MatchExpressions: []corev1.NodeSelectorRequirement{ 218 - { 219 - Key: "kubernetes.io/arch", 220 - Operator: corev1.NodeSelectorOpIn, 221 - Values: []string{arch}, // "amd64", "arm64" 222 - }, 223 - { 224 - Key: "kubernetes.io/os", 225 - Operator: corev1.NodeSelectorOpIn, 226 - Values: []string{"linux"}, 227 - }, 228 - }, 229 - }, 230 - }, 231 - }, 232 - }, 233 - }, 234 - // ... rest of pod spec 235 - } 236 - ``` 237 - 238 - ## **Job Monitoring and Cleanup** 239 - 240 - Add logic to monitor job completion and cleanup: 241 - 242 - ```go 243 - func (rc *RunnerController) WaitForJobCompletion(jobName string, timeout time.Duration) (*batchv1.Job, error) { 244 - ctx, cancel := context.WithTimeout(context.Background(), timeout) 245 - defer cancel() 246 - 247 - ticker := time.NewTicker(5 * time.Second) 248 - defer ticker.Stop() 249 - 250 - for { 251 - select { 252 - case <-ctx.Done(): 253 - return nil, fmt.Errorf("job %s timed out", jobName) 254 - case <-ticker.C: 255 - job, err := rc.clientset.BatchV1().Jobs(rc.namespace).Get( 256 - context.TODO(), jobName, metav1.GetOptions{}, 257 - ) 258 - if err != nil { 259 - return nil, err 260 - } 261 - 262 - if job.Status.Succeeded > 0 { 263 - return job, nil 264 - } 265 - if job.Status.Failed > 0 { 266 - return job, fmt.Errorf("job %s failed", jobName) 267 - } 268 - } 269 - } 270 - } 271 - 272 - func (rc *RunnerController) CleanupJob(jobName string) error { 273 - propagationPolicy := 
metav1.DeletePropagationBackground 274 - return rc.clientset.BatchV1().Jobs(rc.namespace).Delete( 275 - context.TODO(), 276 - jobName, 277 - metav1.DeleteOptions{ 278 - PropagationPolicy: &propagationPolicy, 279 - }, 280 - ) 281 - } 282 - ``` 283 - 284 - ## **Integration with Your Workflow Parser** 285 - 286 - Here's how to tie it all together with your existing YAML parser: 287 - 288 - ```go 289 - type WorkflowExecutor struct { 290 - controller *RunnerController 291 - } 292 - 293 - func (we *WorkflowExecutor) ExecuteWorkflow(workflow *WorkflowSpec) error { 294 - for _, step := range workflow.Steps { 295 - // Create job for step 296 - job, err := we.controller.CreateRunnerJob( 297 - workflow.ID, 298 - step.Name, 299 - step.Command, 300 - step.Architecture, // from your YAML 301 - ) 302 - if err != nil { 303 - return err 304 - } 305 - 306 - // Start log polling in goroutine 307 - go we.controller.PollJobLogs(job.Name) 308 - 309 - // Wait for completion 310 - completedJob, err := we.controller.WaitForJobCompletion(job.Name, 30*time.Minute) 311 - if err != nil { 312 - return err 313 - } 314 - 315 - // Optional: cleanup 316 - if workflow.CleanupOnComplete { 317 - we.controller.CleanupJob(job.Name) 318 - } 319 - } 320 - return nil 321 - } 322 - ``` 323 - 324 - ## **Key Considerations** 325 - 326 - - **Security**: Running Docker-in-Docker requires privileged containers. Consider using Kaniko or Buildah for rootless builds 327 - - **Resource Management**: Set appropriate resource limits on Jobs to prevent resource exhaustion 328 - - **Persistent Storage**: For caching Docker layers between builds, consider using PersistentVolumes instead of EmptyDir 329 - - **RBAC**: Ensure your operator has proper ServiceAccount permissions to create/delete Jobs and read logs 330 - 331 - This POC gives you a solid foundation similar to the GitHub Actions Runner Controller [^4][^5], with the core functionality you need for spawning runners, building containers, and monitoring execution. 
332 - 333 - 334 - _References_: 335 - [^1]: [Jobs | Kubernetes](https://kubernetes.io/docs/concepts/workloads/controllers/job/) 336 - [^2]: [Jenkins: running workers in Kubernetes and Docker images build](https://dev.to/setevoy/jenkins-running-workers-in-kubernetes-and-docker-images-build-1lef) 337 - [^3]: [kubectl logs - Kubernetes](https://kubernetes.io/docs/reference/kubectl/generated/kubectl_logs/) 338 - [^4]: [Kubernetes controller for GitHub Actions self-hosted runners](https://github.com/actions/actions-runner-controller) 339 - [^5]: [GitHub - evryfs/github-actions-runner-operator: K8S operator for...](https://github.com/evryfs/github-actions-runner-operator) 340 - [^6]: [Automating Docker Builds in Kubernetes | by Sebastien Goasguen](https://sebgoa.medium.com/automating-docker-builds-in-kubernetes-15ef047795ab) 341 - 342 - > @ 2025-11-03 05:37:24 UTC 343 - > 344 - > Yeah i think i want to support buildah primarily. Not to say we can’t have ways to run alternative images later. Why would i want to choose node affinity over nodeselector? 345 - 346 - <details><summary>Thinking</summary> 347 - 348 - The user is asking about why they would choose node affinity over nodeSelector, and they want to use Buildah primarily for container builds. Let me provide a comprehensive answer about the differences between nodeSelector and nodeAffinity, and then update the code examples to use Buildah instead of Docker-in-Docker. 349 - 350 - Based on the research provided: 351 - 1. NodeSelector is simpler but less flexible 352 - 2. NodeAffinity provides more advanced scheduling options 353 - 3. Buildah can run rootless, which is better for security 354 - 355 - Let me structure a response that addresses both points clearly. 356 - 357 - </details> 358 - 359 - Great choice on **Buildah**! It's much more secure since it can run rootless without requiring privileged containers. Let me explain the nodeAffinity vs nodeSelector decision and provide updated code for Buildah. 
360 - 361 - ## **NodeSelector vs NodeAffinity: When to Choose Which** 362 - 363 - ### **NodeSelector** 364 - - **Simple equality-based selection** [^4][^2] 365 - - Only supports "AND" logic between labels 366 - - More concise for basic requirements 367 - - Example: `kubernetes.io/arch: amd64` 368 - 369 - ### **NodeAffinity** 370 - NodeAffinity offers **significantly more flexibility** that you'll likely want for a runner controller [^2][^1]: 371 - 372 - 1. **Complex expressions**: Use operators like `In`, `NotIn`, `Exists`, `DoesNotExist`, `Gt`, `Lt` 373 - 2. **Soft preferences**: Use `preferredDuringSchedulingIgnoredDuringExecution` for optional requirements 374 - 3. **Multiple criteria**: Combine multiple conditions with OR logic 375 - 4. **Weighted preferences**: Assign weights to preferred nodes 376 - 377 - For your runner controller, **nodeAffinity is better** because: 378 - - You can handle fallback scenarios (prefer arm64 but accept amd64) 379 - - You can exclude certain nodes (avoid nodes with specific taints) 380 - - You can combine architecture with other requirements (GPU nodes, specific OS versions) 381 - 382 - ## **Buildah-Based Runner Implementation** 383 - 384 - Here's the updated implementation using Buildah: 385 - 386 - ```go 387 - package operator 388 - 389 - import ( 390 - "context" 391 - "fmt" 392 - batchv1 "k8s.io/api/batch/v1" 393 - corev1 "k8s.io/api/core/v1" 394 - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 395 - "k8s.io/client-go/kubernetes" 396 - ) 397 - 398 - type BuildahRunnerConfig struct { 399 - Registry string 400 - RegistryAuth string // base64 encoded auth 401 - StorageDriver string // overlay, vfs 402 - Architecture string // amd64, arm64 403 - } 404 - 405 - func (rc *RunnerController) CreateBuildahRunnerJob( 406 - workflowID, stepName string, 407 - buildContext string, 408 - config BuildahRunnerConfig, 409 - ) (*batchv1.Job, error) { 410 - 411 - job := &batchv1.Job{ 412 - ObjectMeta: metav1.ObjectMeta{ 413 - Name: 
fmt.Sprintf("runner-%s-%s", workflowID, stepName), 414 - Namespace: rc.namespace, 415 - Labels: map[string]string{ 416 - "workflow-id": workflowID, 417 - "step-name": stepName, 418 - "builder": "buildah", 419 - }, 420 - }, 421 - Spec: batchv1.JobSpec{ 422 - BackoffLimit: &[]int32{2}[0], 423 - Template: corev1.PodTemplateSpec{ 424 - ObjectMeta: metav1.ObjectMeta{ 425 - Labels: map[string]string{ 426 - "workflow-id": workflowID, 427 - "step-name": stepName, 428 - }, 429 - Annotations: map[string]string{ 430 - "container.apparmor.security.beta.kubernetes.io/buildah": "unconfined", 431 - }, 432 - }, 433 - Spec: corev1.PodSpec{ 434 - RestartPolicy: corev1.RestartPolicyNever, 435 - 436 - // Using NodeAffinity for flexible architecture targeting 437 - Affinity: &corev1.Affinity{ 438 - NodeAffinity: &corev1.NodeAffinity{ 439 - RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ 440 - NodeSelectorTerms: []corev1.NodeSelectorTerm{ 441 - { 442 - MatchExpressions: []corev1.NodeSelectorRequirement{ 443 - { 444 - Key: "kubernetes.io/arch", 445 - Operator: corev1.NodeSelectorOpIn, 446 - Values: []string{config.Architecture}, 447 - }, 448 - }, 449 - }, 450 - }, 451 - }, 452 - // Optional: Prefer nodes with more resources 453 - PreferredDuringSchedulingIgnoredDuringExecution: []corev1.PreferredSchedulingTerm{ 454 - { 455 - Weight: 100, 456 - Preference: corev1.NodeSelectorTerm{ 457 - MatchExpressions: []corev1.NodeSelectorRequirement{ 458 - { 459 - Key: "node-role.kubernetes.io/runner", 460 - Operator: corev1.NodeSelectorOpExists, 461 - }, 462 - }, 463 - }, 464 - }, 465 - }, 466 - }, 467 - }, 468 - 469 - Containers: []corev1.Container{ 470 - { 471 - Name: "buildah", 472 - Image: "quay.io/buildah/stable:latest", 473 - Command: []string{"/bin/bash", "-c"}, 474 - Args: []string{fmt.Sprintf(` 475 - set -e 476 - echo "Starting build process..." 
477 - 478 - # Clone or prepare build context 479 - %s 480 - 481 - # Build the image 482 - buildah bud \ 483 - --storage-driver=%s \ 484 - --format=docker \ 485 - --tls-verify=false \ 486 - -t %s/%s:%s \ 487 - -f Dockerfile . 488 - 489 - # Push to registry (if configured) 490 - if [ -n "$REGISTRY_AUTH" ]; then 491 - echo "$REGISTRY_AUTH" | base64 -d > /tmp/auth.json 492 - buildah push \ 493 - --authfile /tmp/auth.json \ 494 - --tls-verify=false \ 495 - %s/%s:%s 496 - rm /tmp/auth.json 497 - fi 498 - 499 - echo "Build completed successfully" 500 - `, 501 - buildContext, 502 - config.StorageDriver, 503 - config.Registry, workflowID, stepName, 504 - config.Registry, workflowID, stepName, 505 - )}, 506 - 507 - SecurityContext: &corev1.SecurityContext{ 508 - // Buildah can run rootless! 509 - RunAsUser: &[]int64{1000}[0], 510 - RunAsGroup: &[]int64{1000}[0], 511 - Capabilities: &corev1.Capabilities{ 512 - Add: []corev1.Capability{ 513 - "SETUID", 514 - "SETGID", 515 - }, 516 - }, 517 - }, 518 - 519 - Env: []corev1.EnvVar{ 520 - { 521 - Name: "STORAGE_DRIVER", 522 - Value: config.StorageDriver, 523 - }, 524 - { 525 - Name: "REGISTRY_AUTH", 526 - Value: config.RegistryAuth, 527 - }, 528 - { 529 - Name: "BUILDAH_ISOLATION", 530 - Value: "chroot", 531 - }, 532 - }, 533 - 534 - Resources: corev1.ResourceRequirements{ 535 - Requests: corev1.ResourceList{ 536 - corev1.ResourceCPU: resource.MustParse("500m"), 537 - corev1.ResourceMemory: resource.MustParse("1Gi"), 538 - }, 539 - Limits: corev1.ResourceList{ 540 - corev1.ResourceCPU: resource.MustParse("2"), 541 - corev1.ResourceMemory: resource.MustParse("4Gi"), 542 - }, 543 - }, 544 - 545 - VolumeMounts: []corev1.VolumeMount{ 546 - { 547 - Name: "buildah-storage", 548 - MountPath: "/var/lib/containers", 549 - }, 550 - }, 551 - }, 552 - }, 553 - 554 - Volumes: []corev1.Volume{ 555 - { 556 - Name: "buildah-storage", 557 - VolumeSource: corev1.VolumeSource{ 558 - EmptyDir: &corev1.EmptyDirVolumeSource{ 559 - Medium: 
corev1.StorageMediumMemory, // Use RAM for speed 560 - }, 561 - }, 562 - }, 563 - }, 564 - }, 565 - }, 566 - }, 567 - } 568 - 569 - return rc.clientset.BatchV1().Jobs(rc.namespace).Create(context.TODO(), job, metav1.CreateOptions{}) 570 - } 571 - ``` 572 - 573 - ## **Supporting Multiple Build Tools with Strategy Pattern** 574 - 575 - Since you mentioned supporting alternative images later, here's an extensible approach: 576 - 577 - ```go 578 - type BuilderStrategy interface { 579 - CreateJob(workflowID, stepName string, config BuildConfig) *batchv1.Job 580 - GetImage() string 581 - GetSecurityContext() *corev1.SecurityContext 582 - } 583 - 584 - type BuildahStrategy struct{} 585 - type KanikoStrategy struct{} 586 - type DockerStrategy struct{} 587 - 588 - func (b *BuildahStrategy) GetSecurityContext() *corev1.SecurityContext { 589 - return &corev1.SecurityContext{ 590 - RunAsUser: &[]int64{1000}[0], 591 - RunAsGroup: &[]int64{1000}[0], 592 - Capabilities: &corev1.Capabilities{ 593 - Add: []corev1.Capability{"SETUID", "SETGID"}, 594 - }, 595 - } 596 - } 597 - 598 - func (k *KanikoStrategy) GetSecurityContext() *corev1.SecurityContext { 599 - return &corev1.SecurityContext{ 600 - RunAsUser: &[]int64{0}[0], // Kaniko needs root but doesn't need privileged 601 - } 602 - } 603 - 604 - // Factory pattern for builder selection 605 - func GetBuilderStrategy(builderType string) BuilderStrategy { 606 - switch builderType { 607 - case "kaniko": 608 - return &KanikoStrategy{} 609 - case "docker": 610 - return &DockerStrategy{} 611 - default: 612 - return &BuildahStrategy{} // Default to Buildah 613 - } 614 - } 615 - ``` 616 - 617 - ## **Advanced NodeAffinity Example** 618 - 619 - Here's a more sophisticated nodeAffinity setup that showcases why it's superior for complex requirements: 620 - 621 - ```go 622 - Affinity: &corev1.Affinity{ 623 - NodeAffinity: &corev1.NodeAffinity{ 624 - RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ 625 - NodeSelectorTerms: 
[]corev1.NodeSelectorTerm{ 626 - { 627 - MatchExpressions: []corev1.NodeSelectorRequirement{ 628 - { 629 - Key: "kubernetes.io/arch", 630 - Operator: corev1.NodeSelectorOpIn, 631 - Values: []string{"amd64", "arm64"}, // Either architecture 632 - }, 633 - { 634 - Key: "node.kubernetes.io/instance-type", 635 - Operator: corev1.NodeSelectorOpNotIn, 636 - Values: []string{"t2.micro", "t2.small"}, // Exclude small instances 637 - }, 638 - }, 639 - }, 640 - }, 641 - }, 642 - PreferredDuringSchedulingIgnoredDuringExecution: []corev1.PreferredSchedulingTerm{ 643 - { 644 - Weight: 100, 645 - Preference: corev1.NodeSelectorTerm{ 646 - MatchExpressions: []corev1.NodeSelectorRequirement{ 647 - { 648 - Key: "kubernetes.io/arch", 649 - Operator: corev1.NodeSelectorOpIn, 650 - Values: []string{config.PreferredArch}, // Prefer specific arch 651 - }, 652 - }, 653 - }, 654 - }, 655 - { 656 - Weight: 50, 657 - Preference: corev1.NodeSelectorTerm{ 658 - MatchExpressions: []corev1.NodeSelectorRequirement{ 659 - { 660 - Key: "workload-type", 661 - Operator: corev1.NodeSelectorOpIn, 662 - Values: []string{"build-optimized"}, 663 - }, 664 - }, 665 - }, 666 - }, 667 - }, 668 - }, 669 - } 670 - ``` 671 - 672 - **Buildah** is an excellent choice for your runner controller because it: 673 - - Runs without root privileges (better security) [^5][^3] 674 - - Doesn't require a daemon 675 - - Has lower resource overhead 676 - - Supports multiple storage drivers 677 - - Works well in Kubernetes environments 678 - 679 - The combination of Buildah with nodeAffinity gives you a secure, flexible runner system that can intelligently place workloads across your cluster. 680 - 681 - 682 - _References_: 683 - [^1]: [Mastering Node Placement in Kubernetes: Node Selectors vs. 
Node Affinity](https://medium.com/@rahulbansod519/mastering-node-placement-in-kubernetes-node-selectors-vs-node-affinity-f6a24cda0ddd) 684 - [^2]: [Node Selector vs Node Affinity in Kubernetes - Medium](https://medium.com/@reach2shristi.81/node-selector-vs-node-affinity-in-kubernetes-3292e9552aa0) 685 - [^3]: [Buildah and Podman Relationship | buildah.io](https://buildah.io/blogs/2018/10/31/podman-buildah-relationship.html) 686 - [^4]: [Kubernetes: Node Selector and Node Affinity | by Claire Lee | Medium](https://yuminlee2.medium.com/kubernetes-node-selector-and-node-affinity-ecb3a4d69165) 687 - [^5]: [Building a Buildah container image for Kubernetes](https://buildah.io/blogs/2018/03/01/building-buildah-container-image-for-kubernetes.html) 688 - 689 - > @ 2025-11-03 05:43:40 UTC 690 - > 691 - > How does GitHub handle isolation? I believe each step in a job is the same runner, but different jobs are isolated? This code appears to make a single step into a job? Is there shared context between steps? Ie environment variables, cached files etc? 692 - 693 - <details><summary>Thinking</summary> 694 - 695 - The user is asking about GitHub's isolation model and noticing that my code creates a separate Kubernetes Job for each step, which is different from how GitHub Actions works. They're right - GitHub Actions runs all steps in a job on the same runner with shared context, while different jobs are isolated. Let me explain this architecture and provide updated code that better mimics GitHub's model. 696 - 697 - </details> 698 - 699 - Excellent observation! You're absolutely right - **GitHub Actions runs all steps within a job on the same runner**, sharing context between steps. Your current code creates separate Kubernetes Jobs for each step, which breaks this model. Let me explain GitHub's architecture and provide a better implementation. 
700 - 701 - ## **GitHub Actions Isolation Model** 702 - 703 - ### **How GitHub Actually Works:** 704 - - **Job Level**: Each job gets its own runner (isolated environment) [^1] 705 - - **Step Level**: All steps within a job execute sequentially on the same runner [^1] 706 - - **Shared Context**: Steps share: 707 - - Filesystem/workspace 708 - - Environment variables 709 - - Docker volumes [^4] 710 - - Network context 711 - - Installed tools/dependencies 712 - 713 - ### **Isolation Boundaries:** 714 - - **Between Jobs**: Complete isolation (different runners/pods) 715 - - **Between Steps**: No isolation (same container/environment) 716 - - **Between Workflows**: Complete isolation 717 - 718 - ## **Corrected Implementation: Job = Pod, Steps = Sequential Commands** 719 - 720 - Here's a redesigned approach that properly mimics GitHub's model: 721 - 722 - ```go 723 - package operator 724 - 725 - import ( 726 - "context" 727 - "encoding/json" 728 - "fmt" 729 - "strings" 730 - batchv1 "k8s.io/api/batch/v1" 731 - corev1 "k8s.io/api/core/v1" 732 - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 733 - ) 734 - 735 - type WorkflowJob struct { 736 - ID string 737 - Name string 738 - Steps []Step 739 - Architecture string 740 - Environment map[string]string 741 - } 742 - 743 - type Step struct { 744 - Name string 745 - Command string 746 - Env map[string]string 747 - Uses string // For action references 748 - } 749 - 750 - func (rc *RunnerController) CreateJobRunner(job WorkflowJob) (*batchv1.Job, error) { 751 - // Build a single script that executes all steps sequentially 752 - runnerScript := rc.buildRunnerScript(job.Steps, job.Environment) 753 - 754 - k8sJob := &batchv1.Job{ 755 - ObjectMeta: metav1.ObjectMeta{ 756 - Name: fmt.Sprintf("runner-job-%s", job.ID), 757 - Namespace: rc.namespace, 758 - Labels: map[string]string{ 759 - "workflow-id": job.ID, 760 - "job-name": job.Name, 761 - "type": "github-runner", 762 - }, 763 - }, 764 - Spec: batchv1.JobSpec{ 765 - BackoffLimit: 
&[]int32{0}[0], // No retries for workflow jobs 766 - Template: corev1.PodTemplateSpec{ 767 - Spec: corev1.PodSpec{ 768 - RestartPolicy: corev1.RestartPolicyNever, 769 - 770 - // Single init container to set up workspace 771 - InitContainers: []corev1.Container{ 772 - { 773 - Name: "setup-workspace", 774 - Image: "alpine/git:latest", 775 - Command: []string{"/bin/sh", "-c"}, 776 - Args: []string{` 777 - # Clone repository or set up workspace 778 - git clone ${REPO_URL} /workspace || true 779 - cd /workspace 780 - git checkout ${GITHUB_SHA:-main} 781 - `}, 782 - VolumeMounts: []corev1.VolumeMount{ 783 - { 784 - Name: "workspace", 785 - MountPath: "/workspace", 786 - }, 787 - }, 788 - }, 789 - }, 790 - 791 - // Main runner container executes ALL steps 792 - Containers: []corev1.Container{ 793 - { 794 - Name: "runner", 795 - Image: "quay.io/buildah/stable:latest", 796 - Command: []string{"/bin/bash", "-c"}, 797 - Args: []string{runnerScript}, 798 - 799 - WorkingDir: "/workspace", 800 - 801 - Env: rc.buildEnvironment(job), 802 - 803 - VolumeMounts: []corev1.VolumeMount{ 804 - { 805 - Name: "workspace", 806 - MountPath: "/workspace", 807 - }, 808 - { 809 - Name: "step-outputs", 810 - MountPath: "/tmp/step-outputs", 811 - }, 812 - { 813 - Name: "github-env", 814 - MountPath: "/tmp/github", 815 - }, 816 - { 817 - Name: "buildah-storage", 818 - MountPath: "/var/lib/containers", 819 - }, 820 - }, 821 - 822 - SecurityContext: &corev1.SecurityContext{ 823 - RunAsUser: &[]int64{1000}[0], 824 - RunAsGroup: &[]int64{1000}[0], 825 - }, 826 - }, 827 - }, 828 - 829 - Volumes: []corev1.Volume{ 830 - { 831 - Name: "workspace", 832 - VolumeSource: corev1.VolumeSource{ 833 - EmptyDir: &corev1.EmptyDirVolumeSource{}, 834 - }, 835 - }, 836 - { 837 - Name: "step-outputs", 838 - VolumeSource: corev1.VolumeSource{ 839 - EmptyDir: &corev1.EmptyDirVolumeSource{}, 840 - }, 841 - }, 842 - { 843 - Name: "github-env", 844 - VolumeSource: corev1.VolumeSource{ 845 - EmptyDir: 
&corev1.EmptyDirVolumeSource{}, 846 - }, 847 - }, 848 - { 849 - Name: "buildah-storage", 850 - VolumeSource: corev1.VolumeSource{ 851 - EmptyDir: &corev1.EmptyDirVolumeSource{ 852 - Medium: corev1.StorageMediumMemory, 853 - }, 854 - }, 855 - }, 856 - }, 857 - 858 - NodeSelector: map[string]string{ 859 - "kubernetes.io/arch": job.Architecture, 860 - }, 861 - }, 862 - }, 863 - }, 864 - } 865 - 866 - return rc.clientset.BatchV1().Jobs(rc.namespace).Create(context.TODO(), k8sJob, metav1.CreateOptions{}) 867 - } 868 - ``` 869 - 870 - ## **Step Execution Script Builder** 871 - 872 - This critical function creates a bash script that maintains context between steps: 873 - 874 - ```go 875 - func (rc *RunnerController) buildRunnerScript(steps []Step, jobEnv map[string]string) string { 876 - var script strings.Builder 877 - 878 - script.WriteString(`#!/bin/bash 879 - set -e 880 - 881 - # GitHub Actions compatible environment setup 882 - export GITHUB_ENV=/tmp/github/env 883 - export GITHUB_PATH=/tmp/github/path 884 - export GITHUB_OUTPUT=/tmp/github/output 885 - export GITHUB_STEP_SUMMARY=/tmp/github/step-summary.md 886 - 887 - # Initialize files 888 - touch $GITHUB_ENV $GITHUB_PATH $GITHUB_OUTPUT $GITHUB_STEP_SUMMARY 889 - 890 - # Helper functions matching GitHub Actions 891 - add_to_env() { 892 - echo "$1=$2" >> $GITHUB_ENV 893 - export $1="$2" 894 - } 895 - 896 - add_to_path() { 897 - echo "$1" >> $GITHUB_PATH 898 - export PATH="$1:$PATH" 899 - } 900 - 901 - set_output() { 902 - echo "$1=$2" >> $GITHUB_OUTPUT 903 - } 904 - 905 - # Function to load environment between steps 906 - load_github_env() { 907 - if [ -f "$GITHUB_ENV" ]; then 908 - set -a 909 - source $GITHUB_ENV 910 - set +a 911 - fi 912 - if [ -f "$GITHUB_PATH" ]; then 913 - while IFS= read -r line; do 914 - export PATH="$line:$PATH" 915 - done < $GITHUB_PATH 916 - fi 917 - } 918 - 919 - `) 920 - 921 - // Add each step 922 - for i, step := range steps { 923 - script.WriteString(fmt.Sprintf(` 924 - echo "==== 
Starting Step: %s ====" 925 - load_github_env 926 - 927 - # Step %d environment variables 928 - `, step.Name, i+1)) 929 - 930 - // Add step-specific environment variables 931 - for k, v := range step.Env { 932 - script.WriteString(fmt.Sprintf("export %s='%s'\n", k, v)) 933 - } 934 - 935 - // Execute step command 936 - script.WriteString(fmt.Sprintf(` 937 - # Execute step 938 - ( 939 - %s 940 - ) || { 941 - echo "Step '%s' failed with exit code $?" 942 - exit 1 943 - } 944 - 945 - echo "==== Completed Step: %s ====" 946 - `, step.Command, step.Name, step.Name)) 947 - } 948 - 949 - return script.String() 950 - } 951 - ``` 952 - 953 - ## **Shared Context Implementation** 954 - 955 - Here's how to properly share context between steps like GitHub Actions does [^2][^3]: 956 - 957 - ```go 958 - type StepOutput struct { 959 - StepName string 960 - Outputs map[string]string 961 - } 962 - 963 - // Enhanced step structure with outputs 964 - type EnhancedStep struct { 965 - Step 966 - ID string // For referencing outputs 967 - Outputs []string // Expected output keys 968 - } 969 - 970 - func (rc *RunnerController) buildEnhancedRunnerScript(steps []EnhancedStep) string { 971 - var script strings.Builder 972 - 973 - // ... (previous setup code) ... 
974 - 975 - for i, step := range steps { 976 - script.WriteString(fmt.Sprintf(` 977 - # Step: %s (id: %s) 978 - STEP_OUTPUT_FILE="/tmp/step-outputs/%s.json" 979 - 980 - # Run step and capture outputs 981 - ( 982 - %s 983 - 984 - # Save step outputs if any were set 985 - if [ -f "$GITHUB_OUTPUT" ]; then 986 - # Parse and save outputs for this step 987 - python3 -c " 988 - import json 989 - outputs = {} 990 - with open('$GITHUB_OUTPUT', 'r') as f: 991 - for line in f: 992 - if '=' in line: 993 - key, value = line.strip().split('=', 1) 994 - outputs[key] = value 995 - with open('$STEP_OUTPUT_FILE', 'w') as f: 996 - json.dump(outputs, f) 997 - " 998 - fi 999 - ) 1000 - 1001 - # Make outputs available as environment variables for next steps 1002 - if [ -f "$STEP_OUTPUT_FILE" ]; then 1003 - export STEP_%s_OUTPUTS=$(cat $STEP_OUTPUT_FILE) 1004 - fi 1005 - `, step.Name, step.ID, step.ID, step.Command, strings.ToUpper(step.ID))) 1006 - } 1007 - 1008 - return script.String() 1009 - } 1010 - ``` 1011 - 1012 - ## **Multi-Container Pattern for Complex Workflows** 1013 - 1014 - For workflows that need services (like databases), use a multi-container pod: 1015 - 1016 - ```go 1017 - // For jobs that need service containers (like GitHub Actions services) 1018 - func (rc *RunnerController) CreateJobWithServices(job WorkflowJob, services []ServiceContainer) (*batchv1.Job, error) { 1019 - containers := []corev1.Container{ 1020 - // Main runner container 1021 - { 1022 - Name: "runner", 1023 - // ... (as before) 1024 - }, 1025 - } 1026 - 1027 - // Add service containers that run alongside the main runner 1028 - for _, svc := range services { 1029 - containers = append(containers, corev1.Container{ 1030 - Name: svc.Name, 1031 - Image: svc.Image, 1032 - Env: convertToEnvVar(svc.Environment), 1033 - Ports: svc.Ports, 1034 - // Services run for the lifetime of the job 1035 - Command: svc.Command, 1036 - }) 1037 - } 1038 - 1039 - // Update pod spec with all containers 1040 - // ... 
1041 - } 1042 - ``` 1043 - 1044 - ## **Key Differences from Original Implementation** 1045 - 1046 - | Aspect | GitHub Actions Model | Your Original Code | Corrected Approach | 1047 - |--------|---------------------|-------------------|-------------------| 1048 - | **Job Mapping** | Job = Runner Instance | Step = K8s Job | Job = K8s Job/Pod | 1049 - | **Step Execution** | Sequential in same environment | Separate pods | Sequential in same container | 1050 - | **Filesystem** | Shared across steps | Isolated per step | Shared via volumes | 1051 - | **Environment Variables** | Persistent across steps | Isolated | Persistent via GITHUB_ENV | 1052 - | **Docker/Buildah Cache** | Shared in job | Lost between steps | Shared via volume | 1053 - 1054 - This corrected approach properly mimics GitHub Actions' execution model while maintaining the isolation boundaries that make sense in Kubernetes. 1055 - 1056 - 1057 - _References_: 1058 - [^1]: [Understanding GitHub Actions](https://docs.github.com/articles/getting-started-with-github-actions) 1059 - [^2]: [Store information in variables - GitHub Docs](https://docs.github.com/en/actions/how-tos/write-workflows/choose-what-workflows-do/use-variables) 1060 - [^3]: [Contexts - GitHub Docs](https://docs.github.com/en/actions/concepts/workflows-and-actions/contexts) 1061 - [^4]: [Workflow syntax for GitHub Actions](https://docs.github.com/actions/using-workflows/workflow-syntax-for-github-actions) 1062 - 1063 - 1064 - --- 1065 - 1066 - _Generated by [Kagi Assistant](https://kagi.com/assistant)_
+9
api/v1alpha1/spindleset_types.go
··· 57 57 // +kubebuilder:validation:Required 58 58 CommitSHA string `json:"commitSHA"` 59 59 60 + // CloneCommands are the git commands to run in the clone init container. 61 + // Generated by tangled.org/core/spindle/steps.BuildCloneCommands(). 62 + // +optional 63 + CloneCommands []string `json:"cloneCommands,omitempty"` 64 + 65 + // SkipClone indicates whether to skip the clone init container entirely. 66 + // +optional 67 + SkipClone bool `json:"skipClone,omitempty"` 68 + 60 69 // Workflows is the list of workflows to execute in this pipeline. 61 70 // +kubebuilder:validation:MinItems=1 62 71 Workflows []WorkflowSpec `json:"workflows"`
+5
api/v1alpha1/zz_generated.deepcopy.go
··· 29 29 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 30 30 func (in *PipelineRunSpec) DeepCopyInto(out *PipelineRunSpec) { 31 31 *out = *in 32 + if in.CloneCommands != nil { 33 + in, out := &in.CloneCommands, &out.CloneCommands 34 + *out = make([]string, len(*in)) 35 + copy(*out, *in) 36 + } 32 37 if in.Workflows != nil { 33 38 in, out := &in.Workflows, &out.Workflows 34 39 *out = make([]WorkflowSpec, len(*in))
+40 -45
cmd/runner/main.go
··· 8 8 "io" 9 9 "os" 10 10 "os/exec" 11 - "time" 12 11 12 + "tangled.org/core/spindle/models" 13 13 loomv1alpha1 "tangled.org/evan.jarrett.net/loom/api/v1alpha1" 14 14 ) 15 15 16 - // LogEvent represents a structured log event emitted by the runner. 17 - type LogEvent struct { 18 - Kind string `json:"kind"` // "control" or "data" 19 - Event string `json:"event,omitempty"` // "start", "end" (for control events) 20 - StepID int `json:"step_id"` // 0-based step index 21 - StepName string `json:"step_name,omitempty"` // Step name 22 - WorkflowName string `json:"workflow_name"` // Workflow name for log separation 23 - Stream string `json:"stream,omitempty"` // "stdout" or "stderr" (for data events) 24 - Content string `json:"content,omitempty"` // Log line content (for data events) 25 - ExitCode *int `json:"exit_code,omitempty"` // Exit code (for control/end events) 26 - Timestamp string `json:"timestamp"` // ISO 8601 timestamp 16 + // simpleStep implements the models.Step interface 17 + type simpleStep struct { 18 + name string 19 + command string 20 + } 21 + 22 + func (s *simpleStep) Name() string { 23 + return s.name 24 + } 25 + 26 + func (s *simpleStep) Command() string { 27 + return s.command 28 + } 29 + 30 + func (s *simpleStep) Kind() models.StepKind { 31 + return models.StepKindUser 27 32 } 28 33 29 34 func main() { ··· 57 62 // Execute each step 58 63 ctx := context.Background() 59 64 for i, step := range workflow.Steps { 60 - if err := executeStep(ctx, i, step, workflow.Name); err != nil { 65 + if err := executeStep(ctx, i, step); err != nil { 61 66 return fmt.Errorf("step %d (%s) failed: %w", i, step.Name, err) 62 67 } 63 68 } ··· 65 70 return nil 66 71 } 67 72 68 - func executeStep(ctx context.Context, stepID int, step loomv1alpha1.WorkflowStep, workflowName string) error { 73 + func executeStep(ctx context.Context, stepID int, step loomv1alpha1.WorkflowStep) error { 74 + // Create a simple step for logging 75 + simpleStep := &simpleStep{ 76 + name: 
step.Name, 77 + command: step.Command, 78 + } 79 + 69 80 // Emit step start event 70 - emitControlEvent(stepID, step.Name, workflowName, "start", nil) 81 + emitControlEvent(stepID, simpleStep, models.StepStatusStart) 71 82 72 83 // Set step-specific environment variables 73 84 if step.Environment != nil { ··· 95 106 96 107 // Start the command 97 108 if err := cmd.Start(); err != nil { 98 - exitCode := 1 99 - emitControlEvent(stepID, step.Name, workflowName, "end", &exitCode) 109 + emitControlEvent(stepID, simpleStep, models.StepStatusEnd) 100 110 return fmt.Errorf("failed to start command: %w", err) 101 111 } 102 112 103 113 // Stream stdout and stderr concurrently 104 114 done := make(chan error, 2) 105 - go streamOutput(stdout, stepID, workflowName, "stdout", done) 106 - go streamOutput(stderr, stepID, workflowName, "stderr", done) 115 + go streamOutput(stdout, stepID, "stdout", done) 116 + go streamOutput(stderr, stepID, "stderr", done) 107 117 108 118 // Wait for both streams to complete 109 119 for i := 0; i < 2; i++ { ··· 125 135 } 126 136 127 137 // Emit step end event 128 - emitControlEvent(stepID, step.Name, workflowName, "end", &exitCode) 138 + emitControlEvent(stepID, simpleStep, models.StepStatusEnd) 129 139 130 140 if exitCode != 0 { 131 141 return fmt.Errorf("command exited with code %d", exitCode) ··· 134 144 return nil 135 145 } 136 146 137 - func streamOutput(reader io.Reader, stepID int, workflowName, stream string, done chan<- error) { 147 + func streamOutput(reader io.Reader, stepID int, stream string, done chan<- error) { 138 148 scanner := bufio.NewScanner(reader) 139 149 // Increase buffer size for long lines 140 150 buf := make([]byte, 0, 64*1024) ··· 142 152 143 153 for scanner.Scan() { 144 154 line := scanner.Text() 145 - emitDataEvent(stepID, workflowName, stream, line) 155 + emitDataEvent(stepID, stream, line) 146 156 } 147 157 148 158 done <- scanner.Err() 149 159 } 150 160 151 - func emitControlEvent(stepID int, stepName, 
workflowName, event string, exitCode *int) { 152 - ev := LogEvent{ 153 - Kind: "control", 154 - Event: event, 155 - StepID: stepID, 156 - StepName: stepName, 157 - WorkflowName: workflowName, 158 - ExitCode: exitCode, 159 - Timestamp: time.Now().UTC().Format(time.RFC3339Nano), 160 - } 161 - emitJSON(ev) 161 + func emitControlEvent(stepID int, step models.Step, status models.StepStatus) { 162 + logLine := models.NewControlLogLine(stepID, step, status) 163 + emitJSON(logLine) 162 164 } 163 165 164 - func emitDataEvent(stepID int, workflowName, stream, content string) { 165 - ev := LogEvent{ 166 - Kind: "data", 167 - StepID: stepID, 168 - WorkflowName: workflowName, 169 - Stream: stream, 170 - Content: content, 171 - Timestamp: time.Now().UTC().Format(time.RFC3339Nano), 172 - } 173 - emitJSON(ev) 166 + func emitDataEvent(stepID int, stream, content string) { 167 + logLine := models.NewDataLogLine(stepID, content, stream) 168 + emitJSON(logLine) 174 169 } 175 170 176 - func emitJSON(event LogEvent) { 177 - data, err := json.Marshal(event) 171 + func emitJSON(logLine models.LogLine) { 172 + data, err := json.Marshal(logLine) 178 173 if err != nil { 179 174 fmt.Fprintf(os.Stderr, "ERROR: failed to marshal JSON: %v\n", err) 180 175 return
+11
config/crd/bases/loom.j5t.io_spindlesets.yaml
··· 69 69 PipelineRun contains pipeline-specific information for this pipeline execution. 70 70 This SpindleSet is ephemeral and represents a single pipeline run. 71 71 properties: 72 + cloneCommands: 73 + description: |- 74 + CloneCommands are the git commands to run in the clone init container. 75 + Generated by tangled.org/core/spindle/steps.BuildCloneCommands(). 76 + items: 77 + type: string 78 + type: array 72 79 commitSHA: 73 80 description: CommitSHA is the Git commit to checkout. 74 81 type: string ··· 83 90 repoURL: 84 91 description: RepoURL is the Git repository URL to clone. 85 92 type: string 93 + skipClone: 94 + description: SkipClone indicates whether to skip the clone init 95 + container entirely. 96 + type: boolean 86 97 workflows: 87 98 description: Workflows is the list of workflows to execute in 88 99 this pipeline.
+1 -1
go.mod
··· 197 197 ) 198 198 199 199 // Use our custom version of tangled until it's upstreamed 200 - replace tangled.org/core => tangled.org/evan.jarrett.net/core v1.9.1-alpha.0.20251109203816-d73c73e8728b 200 + replace tangled.org/core => tangled.org/evan.jarrett.net/core v1.9.1-alpha.0.20251111023958-287f7ea82ba6
+2 -2
go.sum
··· 687 687 sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= 688 688 sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= 689 689 sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= 690 - tangled.org/evan.jarrett.net/core v1.9.1-alpha.0.20251109203816-d73c73e8728b h1:m30BNzXLTO/V7tyh9levCuavrjgxH0KD5Y6lvX3S69s= 691 - tangled.org/evan.jarrett.net/core v1.9.1-alpha.0.20251109203816-d73c73e8728b/go.mod h1:CUO6beA36K/Cwt0u2yrO5CG+L7+LzAc6zi6WudwO7qs= 690 + tangled.org/evan.jarrett.net/core v1.9.1-alpha.0.20251111023958-287f7ea82ba6 h1:0vac5SqnzXzZi2HBZskjdUFdJflXTMMS9TM7mwttXeA= 691 + tangled.org/evan.jarrett.net/core v1.9.1-alpha.0.20251111023958-287f7ea82ba6/go.mod h1:CUO6beA36K/Cwt0u2yrO5CG+L7+LzAc6zi6WudwO7qs=
+3 -1
internal/controller/spindleset_controller.go
··· 35 35 "sigs.k8s.io/controller-runtime/pkg/log" 36 36 37 37 loomv1alpha1 "tangled.org/evan.jarrett.net/loom/api/v1alpha1" 38 - "tangled.org/evan.jarrett.net/loom/pkg/jobbuilder" 38 + "tangled.org/evan.jarrett.net/loom/internal/jobbuilder" 39 39 ) 40 40 41 41 // SpindleSetReconciler reconciles a SpindleSet object ··· 341 341 WorkflowSpec: workflowSpec, // Pass full workflow spec to runner 342 342 RepoURL: pipelineRun.RepoURL, 343 343 CommitSHA: pipelineRun.CommitSHA, 344 + CloneCommands: pipelineRun.CloneCommands, 345 + SkipClone: pipelineRun.SkipClone, 344 346 Secrets: nil, // TODO: Handle secrets 345 347 Template: spindleSet.Spec.Template, 346 348 Namespace: spindleSet.Namespace,
+22 -1
internal/controller/spindleset_controller_test.go
··· 51 51 Name: resourceName, 52 52 Namespace: "default", 53 53 }, 54 - // TODO(user): Specify other spec details if needed. 54 + Spec: loomv1alpha1.SpindleSetSpec{ 55 + Template: loomv1alpha1.SpindleTemplate{}, 56 + PipelineRun: &loomv1alpha1.PipelineRunSpec{ 57 + PipelineID: "test-pipeline-123", 58 + Knot: "knot.example.com", 59 + RepoURL: "https://knot.example.com/did:plc:test/repo", 60 + CommitSHA: "abc123", 61 + Workflows: []loomv1alpha1.WorkflowSpec{ 62 + { 63 + Name: "test-workflow", 64 + Image: "alpine:latest", 65 + Architecture: "amd64", 66 + Steps: []loomv1alpha1.WorkflowStep{ 67 + { 68 + Name: "test-step", 69 + Command: "echo 'hello'", 70 + }, 71 + }, 72 + }, 73 + }, 74 + }, 75 + }, 55 76 } 56 77 Expect(k8sClient.Create(ctx, resource)).To(Succeed()) 57 78 }
+50 -43
internal/engine/kubernetes_engine.go
··· 22 22 "tangled.org/core/api/tangled" 23 23 "tangled.org/core/spindle/models" 24 24 "tangled.org/core/spindle/secrets" 25 + "tangled.org/core/spindle/steps" 25 26 26 27 loomv1alpha1 "tangled.org/evan.jarrett.net/loom/api/v1alpha1" 27 28 ) ··· 115 116 116 117 // Store the parsed spec in Data field for later use 117 118 workflowData := map[string]interface{}{ 118 - "spec": spec, 119 - "triggerRepo": tpl.TriggerMetadata.Repo, 120 - "cloneOpts": twf.Clone, 119 + "spec": spec, 120 + "triggerMetadata": tpl.TriggerMetadata, 121 + "cloneOpts": twf.Clone, 121 122 } 122 123 123 124 workflow := &models.Workflow{ ··· 144 145 return fmt.Errorf("workflow spec not found in data") 145 146 } 146 147 147 - triggerRepo, ok := workflowData["triggerRepo"].(*tangled.Pipeline_TriggerRepo) 148 + triggerMetadata, ok := workflowData["triggerMetadata"].(*tangled.Pipeline_TriggerMetadata) 149 + if !ok { 150 + return fmt.Errorf("trigger metadata not found in data") 151 + } 152 + 153 + cloneOpts, ok := workflowData["cloneOpts"].(*tangled.Pipeline_CloneOpts) 148 154 if !ok { 149 - return fmt.Errorf("trigger repo not found in data") 155 + return fmt.Errorf("clone options not found in data") 156 + } 157 + 158 + // Use shared clone command builder to extract commit SHA and build repo URL 159 + // Note: We're not using the actual commands here, just extracting the metadata 160 + // The Job builder will regenerate the commands for the init container 161 + cloneConfig := steps.CloneConfig{ 162 + Workflow: tangled.Pipeline_Workflow{ 163 + Clone: cloneOpts, 164 + }, 165 + TriggerMetadata: *triggerMetadata, 166 + DevMode: false, // TODO: Make this configurable 167 + WorkspaceDir: "/tangled/workspace", 150 168 } 151 169 152 - // Determine commit SHA based on trigger type 153 - // For now, assume we're handling push events with NewSha 154 - // TODO: Handle other trigger types (PR, manual) 155 - commitSHA := "" // Need to extract from trigger metadata 170 + cloneCommands, err := 
steps.BuildCloneCommands(cloneConfig) 171 + if err != nil { 172 + return fmt.Errorf("failed to build clone commands: %w", err) 173 + } 156 174 157 - // Build repository clone URL 158 - repoURL := fmt.Sprintf("https://%s/%s/%s", triggerRepo.Knot, triggerRepo.Did, triggerRepo.Repo) 175 + commitSHA := cloneCommands.CommitSHA 176 + repoURL := cloneCommands.RepoURL 159 177 160 178 // Store knot for status reporting 161 - e.currentKnot = triggerRepo.Knot 179 + e.currentKnot = triggerMetadata.Repo.Knot 162 180 163 181 // Generate SpindleSet name: spindle-{workflowName}-{pipelineID} 164 182 // Sanitize workflow name (remove .yaml/.yml, replace dots) ··· 186 204 Spec: loomv1alpha1.SpindleSetSpec{ 187 205 Template: e.template, 188 206 PipelineRun: &loomv1alpha1.PipelineRunSpec{ 189 - PipelineID: wid.PipelineId.Rkey, 190 - Knot: e.currentKnot, 191 - RepoURL: repoURL, 192 - CommitSHA: commitSHA, 193 - Workflows: []loomv1alpha1.WorkflowSpec{spec}, // Single workflow per SpindleSet 207 + PipelineID: wid.PipelineId.Rkey, 208 + Knot: e.currentKnot, 209 + RepoURL: repoURL, 210 + CommitSHA: commitSHA, 211 + CloneCommands: cloneCommands.All, 212 + SkipClone: cloneCommands.Skip, 213 + Workflows: []loomv1alpha1.WorkflowSpec{spec}, // Single workflow per SpindleSet 194 214 }, 195 215 }, 196 216 } ··· 321 341 return nil 322 342 } 323 343 324 - // LogEvent represents a structured log event from the runner binary 325 - type LogEvent struct { 326 - Kind string `json:"kind"` // "control" or "data" 327 - Event string `json:"event,omitempty"` // "start", "end" (for control events) 328 - StepID int `json:"step_id"` // 0-based step index 329 - StepName string `json:"step_name,omitempty"` // Step name 330 - WorkflowName string `json:"workflow_name"` // Workflow name for log separation 331 - Stream string `json:"stream,omitempty"` // "stdout" or "stderr" (for data events) 332 - Content string `json:"content,omitempty"` // Log line content (for data events) 333 - ExitCode *int 
`json:"exit_code,omitempty"` // Exit code (for control/end events) 334 - Timestamp string `json:"timestamp"` // ISO 8601 timestamp 335 - } 336 - 337 344 // getOrCreateLogStream gets an existing log stream or creates a new one for step 0 338 345 func (e *KubernetesEngine) getOrCreateLogStream(ctx context.Context, wid models.WorkflowId, job *batchv1.Job) (*workflowLogStream, error) { 339 346 widKey := wid.String() ··· 456 463 for scanner.Scan() { 457 464 line := scanner.Text() 458 465 459 - // Try to parse as JSON event 460 - var event LogEvent 461 - if err := json.Unmarshal([]byte(line), &event); err != nil { 466 + // Try to parse as models.LogLine from the runner binary 467 + var logLine models.LogLine 468 + if err := json.Unmarshal([]byte(line), &logLine); err != nil { 462 469 // Not JSON or parse error - skip 463 470 continue 464 471 } 465 472 466 473 // Validate step index 467 - if event.StepID < 0 || event.StepID >= len(workflow.Steps) { 474 + if logLine.StepId < 0 || logLine.StepId >= len(workflow.Steps) { 468 475 continue 469 476 } 470 477 471 478 // Only process events for the current step 472 - if event.StepID != stepID { 479 + if logLine.StepId != stepID { 473 480 // Got event for a different step - this shouldn't happen in sequential execution 474 481 // but log it and continue 475 482 continue 476 483 } 477 484 478 - switch event.Kind { 479 - case "control": 485 + switch logLine.Kind { 486 + case models.LogKindControl: 480 487 // Use control events from runner for flow control only 481 488 // Don't write them - the core spindle engine writes control events 482 - if event.Event == "end" { 489 + if logLine.StepStatus == models.StepStatusEnd { 483 490 // Step is done, return 484 491 return nil 485 492 } 486 493 // For "start" events, just continue reading 487 494 488 - case "data": 495 + case models.LogKindData: 489 496 // Log output from step 490 - if event.Stream == "" { 491 - event.Stream = "stdout" // Default to stdout 497 + if logLine.Stream == "" { 
498 + logLine.Stream = "stdout" // Default to stdout 492 499 } 493 - dataWriter := wfLogger.DataWriter(event.StepID, event.Stream) 494 - dataWriter.Write([]byte(event.Content + "\n")) 500 + dataWriter := wfLogger.DataWriter(logLine.StepId, logLine.Stream) 501 + dataWriter.Write([]byte(logLine.Content + "\n")) 495 502 } 496 503 } 497 504
pkg/jobbuilder/affinity.go internal/jobbuilder/affinity.go
+47 -34
pkg/jobbuilder/job_template.go internal/jobbuilder/job_template.go
··· 13 13 loomv1alpha1 "tangled.org/evan.jarrett.net/loom/api/v1alpha1" 14 14 ) 15 15 16 + // WorkflowStep represents a single step in a workflow. 17 + type WorkflowStep struct { 18 + Name string 19 + Command string 20 + Env map[string]string 21 + } 22 + 16 23 // WorkflowConfig contains all the information needed to create a spindle Job. 17 24 type WorkflowConfig struct { 18 25 // WorkflowName is the name of the workflow (used in Job name and labels) ··· 41 48 42 49 // CommitSHA is the git commit to checkout 43 50 CommitSHA string 51 + 52 + // CloneCommands are the git commands to run in the init container 53 + // Generated by steps.BuildCloneCommands() 54 + CloneCommands []string 55 + 56 + // SkipClone indicates whether to skip the clone init container entirely 57 + SkipClone bool 44 58 45 59 // Secrets contains auth tokens or other secrets needed 46 60 Secrets []string ··· 169 183 }, 170 184 }, 171 185 }, 172 - { 173 - Name: "clone-repo", 174 - Image: "alpine/git:latest", 175 - Command: []string{"/bin/sh", "-c"}, 176 - SecurityContext: &corev1.SecurityContext{ 177 - AllowPrivilegeEscalation: &[]bool{false}[0], 178 - Capabilities: &corev1.Capabilities{ 179 - Drop: []corev1.Capability{"ALL"}, 180 - }, 181 - }, 182 - Args: []string{ 183 - fmt.Sprintf(` 184 - set -e 185 - echo "Cloning repository..." 186 - git clone %s /tangled/workspace || { 187 - echo "Failed to clone repository" 188 - exit 1 189 - } 190 - cd /tangled/workspace 191 - echo "Checking out commit %s..." 
192 - git checkout %s || { 193 - echo "Failed to checkout commit" 194 - exit 1 195 - } 196 - echo "Repository ready" 197 - `, config.RepoURL, config.CommitSHA, config.CommitSHA), 198 - }, 199 - VolumeMounts: []corev1.VolumeMount{ 200 - { 201 - Name: "workspace", 202 - MountPath: "/tangled/workspace", 203 - }, 204 - }, 205 - }, 186 + buildCloneInitContainer(config), 206 187 }, 207 188 208 189 // Main container: run loom-runner binary in user's image ··· 317 298 318 299 return env 319 300 } 301 + 302 + // buildCloneInitContainer creates the init container for cloning the git repository. 303 + // Uses the shared clone command builder from tangled.org/core/spindle/steps. 304 + func buildCloneInitContainer(config WorkflowConfig) corev1.Container { 305 + // If clone is skipped, return an empty container (will be filtered out) 306 + if config.SkipClone { 307 + return corev1.Container{} 308 + } 309 + 310 + // Build the shell script from clone commands 311 + // Add set -e for error handling and echo for visibility 312 + script := "set -e\n" + strings.Join(config.CloneCommands, "\n") + "\necho \"Repository ready\"" 313 + 314 + return corev1.Container{ 315 + Name: "clone-repo", 316 + Image: "alpine/git:latest", 317 + Command: []string{"/bin/sh", "-c"}, 318 + SecurityContext: &corev1.SecurityContext{ 319 + AllowPrivilegeEscalation: &[]bool{false}[0], 320 + Capabilities: &corev1.Capabilities{ 321 + Drop: []corev1.Capability{"ALL"}, 322 + }, 323 + }, 324 + Args: []string{script}, 325 + VolumeMounts: []corev1.VolumeMount{ 326 + { 327 + Name: "workspace", 328 + MountPath: "/tangled/workspace", 329 + }, 330 + }, 331 + } 332 + }
-129
pkg/jobbuilder/script_builder.go
··· 1 - package jobbuilder 2 - 3 - import ( 4 - "fmt" 5 - "strings" 6 - ) 7 - 8 - // WorkflowStep represents a single step in a workflow. 9 - type WorkflowStep struct { 10 - Name string 11 - Command string 12 - Env map[string]string 13 - } 14 - 15 - // BuildStepExecutionScript generates a bash script that executes all workflow steps sequentially. 16 - // The script implements GitHub Actions-compatible environment passing between steps. 17 - func BuildStepExecutionScript(steps []WorkflowStep) string { 18 - var script strings.Builder 19 - 20 - // Script header with error handling 21 - script.WriteString(`#!/bin/bash 22 - set -e 23 - set -o pipefail 24 - 25 - `) 26 - 27 - // GitHub Actions-compatible environment setup 28 - script.WriteString(`# GitHub Actions-compatible environment setup 29 - export GITHUB_ENV=/tmp/github/env 30 - export GITHUB_PATH=/tmp/github/path 31 - export GITHUB_OUTPUT=/tmp/github/output 32 - export GITHUB_STEP_SUMMARY=/tmp/github/step-summary.md 33 - 34 - # Initialize files 35 - mkdir -p /tmp/github 36 - touch $GITHUB_ENV $GITHUB_PATH $GITHUB_OUTPUT $GITHUB_STEP_SUMMARY 37 - 38 - # Helper functions for environment passing 39 - add_to_env() { 40 - echo "$1=$2" >> $GITHUB_ENV 41 - export "$1"="$2" 42 - } 43 - 44 - add_to_path() { 45 - echo "$1" >> $GITHUB_PATH 46 - export PATH="$1:$PATH" 47 - } 48 - 49 - set_output() { 50 - echo "$1=$2" >> $GITHUB_OUTPUT 51 - } 52 - 53 - # Load environment variables set by previous steps 54 - load_github_env() { 55 - if [ -f "$GITHUB_ENV" ]; then 56 - set -a 57 - source "$GITHUB_ENV" 2>/dev/null || true 58 - set +a 59 - fi 60 - if [ -f "$GITHUB_PATH" ]; then 61 - while IFS= read -r line; do 62 - if [ -n "$line" ]; then 63 - export PATH="$line:$PATH" 64 - fi 65 - done < "$GITHUB_PATH" 66 - fi 67 - } 68 - 69 - # Timestamp helper 70 - timestamp() { 71 - date -u +"%Y-%m-%dT%H:%M:%S.%3NZ" 72 - } 73 - 74 - `) 75 - 76 - // Generate code for each step 77 - for i, step := range steps { 78 - 
script.WriteString(fmt.Sprintf(` 79 - ############################################################################## 80 - # Step %d: %s 81 - ############################################################################## 82 - echo "" 83 - echo "$(timestamp) ===== Starting Step %d: %s =====" 84 - echo "" 85 - 86 - # Load environment from previous steps 87 - load_github_env 88 - 89 - `, i+1, step.Name, i+1, step.Name)) 90 - 91 - // Set step-specific environment variables 92 - if len(step.Env) > 0 { 93 - script.WriteString("# Step-specific environment variables\n") 94 - for k, v := range step.Env { 95 - // Escape single quotes in the value 96 - escapedValue := strings.ReplaceAll(v, "'", "'\\''") 97 - script.WriteString(fmt.Sprintf("export %s='%s'\n", k, escapedValue)) 98 - } 99 - script.WriteString("\n") 100 - } 101 - 102 - // Execute the step command 103 - script.WriteString("# Execute step command\n") 104 - script.WriteString("(\n") 105 - script.WriteString(" cd /tangled/workspace\n") 106 - script.WriteString(" " + step.Command + "\n") 107 - script.WriteString(") || {\n") 108 - script.WriteString(fmt.Sprintf(" echo \"$(timestamp) Step '%s' failed with exit code $?\"\n", step.Name)) 109 - script.WriteString(" exit 1\n") 110 - script.WriteString("}\n\n") 111 - 112 - script.WriteString(fmt.Sprintf("echo \"$(timestamp) ===== Completed Step %d: %s =====\"\n", i+1, step.Name)) 113 - } 114 - 115 - // Script footer 116 - script.WriteString(` 117 - echo "" 118 - echo "$(timestamp) ===== All steps completed successfully =====" 119 - exit 0 120 - `) 121 - 122 - return script.String() 123 - } 124 - 125 - // EscapeShellString escapes a string for safe use in shell scripts. 126 - func EscapeShellString(s string) string { 127 - // Replace single quotes with '\'' 128 - return strings.ReplaceAll(s, "'", "'\\''") 129 - }