Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changes/unreleased/Feature-20251210-091850.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
kind: Feature
body: Add a new --job-agent-mode option, for creating privileged pods capable of container-in-container management
time: 2025-12-10T09:18:50.207809-08:00
2 changes: 2 additions & 0 deletions src/cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ func init() {
rootCmd.PersistentFlags().String("job-pod-workdir", "/jobs", "The job pod working directory.")
rootCmd.PersistentFlags().Int("job-pod-log-max-interval", 30, "The max amount of time between when pod logs are shipped to OpsLevel. Works in tandem with 'job-pod-log-max-size'")
rootCmd.PersistentFlags().Int("job-pod-log-max-size", 1000000, "The max amount in bytes to buffer before pod logs are shipped to OpsLevel. Works in tandem with 'job-pod-log-max-interval'")
rootCmd.PersistentFlags().Bool("job-agent-mode", false, "Enable agent mode with privileged security context for Container-in-Container support. WARNING: This grants elevated privileges and should only be enabled for trusted workloads.")
Copy link
Contributor Author

@derek-etherton-opslevel derek-etherton-opslevel Dec 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that we're using a job arg, we can be very confident that nothing will use this mode until we explicitly add it to opslevel-kubernetes 👍

so sequencing will be:

  1. ship new image to ECR (done)
  2. ship this PR - no effect since flag isn't used
  3. Test on staging, then ship the OpsLevel MR
  4. Once changes are live, fast follow with an opslevel-kubernetes MR adding the --job-agent-mode flag
    a. since the feature isn't "on" yet we can get away with this teeny gap.


rootCmd.PersistentFlags().String("runner-pod-name", "", "overrides environment variable 'RUNNER_POD_NAME'")
rootCmd.PersistentFlags().String("runner-pod-namespace", "default", "The kubernetes namespace the runner pod is deployed in. Overrides environment variable 'RUNNER_POD_NAMESPACE'")
Expand All @@ -71,6 +72,7 @@ func init() {
viper.BindEnv("job-pod-workdir", "OPSLEVEL_JOB_POD_WORKDIR")
viper.BindEnv("job-pod-log-max-interval", "OPSLEVEL_JOB_POD_LOG_MAX_INTERVAL")
viper.BindEnv("job-pod-log-max-size", "OPSLEVEL_JOB_POD_LOG_MAX_SIZE")
viper.BindEnv("job-agent-mode", "OPSLEVEL_JOB_AGENT_MODE")

viper.BindEnv("runner-pod-name", "RUNNER_POD_NAME")
viper.BindEnv("runner-pod-namespace", "RUNNER_POD_NAMESPACE")
Expand Down
28 changes: 25 additions & 3 deletions src/pkg/k8s.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,27 @@ func executable() *int32 {
func (s *JobRunner) getPodObject(identifier string, labels map[string]string, job opslevel.RunnerJob) *corev1.Pod {
// TODO: Allow configuration of Labels
// TODO: Allow configuration of Pod Command

podSecurityContext := s.podConfig.SecurityContext
if s.podConfig.AgentMode {
// Agent mode jobs need root user for Docker daemon
runAsUser := int64(0)
fsGroup := int64(0)
podSecurityContext = corev1.PodSecurityContext{
RunAsUser: &runAsUser,
FSGroup: &fsGroup,
}
}

var containerSecurityContext *corev1.SecurityContext
if s.podConfig.AgentMode {
// Agent mode jobs need privileged mode for creating containers within container
privileged := true
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have an existing network policy for runner-jobs blocking egress to internal networks: https://gitlab.com/jklabsinc/opslevel-kubernetes/-/tree/main/clusters/new-prod-runners/runner-jobs?ref_type=heads

containerSecurityContext = &corev1.SecurityContext{
Privileged: &privileged,
}
}

return &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: identifier,
Expand All @@ -163,7 +184,7 @@ func (s *JobRunner) getPodObject(identifier string, labels map[string]string, jo
Spec: corev1.PodSpec{
TerminationGracePeriodSeconds: &s.podConfig.TerminationGracePeriodSeconds,
RestartPolicy: corev1.RestartPolicyNever,
SecurityContext: &s.podConfig.SecurityContext,
SecurityContext: &podSecurityContext,
ServiceAccountName: s.podConfig.ServiceAccountName,
NodeSelector: s.podConfig.NodeSelector,
InitContainers: []corev1.Container{
Expand Down Expand Up @@ -195,8 +216,9 @@ func (s *JobRunner) getPodObject(identifier string, labels map[string]string, jo
"-c",
fmt.Sprintf("sleep %d", s.podConfig.Lifetime),
},
Resources: s.podConfig.Resources,
Env: s.getPodEnv(job.Variables),
Resources: s.podConfig.Resources,
Env: s.getPodEnv(job.Variables),
SecurityContext: containerSecurityContext,
VolumeMounts: []corev1.VolumeMount{
{
Name: "scripts",
Expand Down
2 changes: 2 additions & 0 deletions src/pkg/k8s_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ type K8SPodConfig struct {
PullPolicy corev1.PullPolicy `yaml:"pullPolicy"`
SecurityContext corev1.PodSecurityContext `yaml:"securityContext"`
NodeSelector map[string]string `yaml:"nodeSelector"`
AgentMode bool `yaml:"agentMode"`
}

func ReadPodConfig(path string) (*K8SPodConfig, error) {
Expand All @@ -46,6 +47,7 @@ func ReadPodConfig(path string) (*K8SPodConfig, error) {
},
},
TerminationGracePeriodSeconds: 5,
AgentMode: viper.GetBool("job-agent-mode"),
},
}
// Early out with viper defaults if config file doesn't exist
Expand Down
52 changes: 52 additions & 0 deletions src/pkg/k8s_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@ package pkg
import (
"testing"

"github.com/opslevel/opslevel-go/v2024"
"github.com/rocktavious/autopilot/v2023"
"github.com/rs/zerolog"
corev1 "k8s.io/api/core/v1"
)

func TestCreateLabelSelector(t *testing.T) {
Expand All @@ -18,3 +21,52 @@ func TestCreateLabelSelector(t *testing.T) {
autopilot.Ok(t, err)
autopilot.Equals(t, labels, labelSelector.MatchLabels)
}

func TestGetPodObject_AgentModePrivileged(t *testing.T) {
// Arrange
runner := &JobRunner{
logger: zerolog.Nop(),
podConfig: &K8SPodConfig{
Namespace: "test",
SecurityContext: corev1.PodSecurityContext{},
TerminationGracePeriodSeconds: 30,
AgentMode: true,
},
}
job := opslevel.RunnerJob{
Image: "alpine:latest",
}
labels := map[string]string{"app": "test"}

// Act
pod := runner.getPodObject("test-pod", labels, job)

// Assert
autopilot.Assert(t, pod.Spec.Containers[0].SecurityContext != nil, "SecurityContext should be set for agent mode")
autopilot.Assert(t, pod.Spec.Containers[0].SecurityContext.Privileged != nil, "Privileged should be set for agent mode")
autopilot.Equals(t, true, *pod.Spec.Containers[0].SecurityContext.Privileged)
autopilot.Equals(t, int64(0), *pod.Spec.SecurityContext.RunAsUser)
autopilot.Equals(t, int64(0), *pod.Spec.SecurityContext.FSGroup)
}

func TestGetPodObject_RegularJobNotPrivileged(t *testing.T) {
// Arrange
runner := &JobRunner{
logger: zerolog.Nop(),
podConfig: &K8SPodConfig{
Namespace: "test",
SecurityContext: corev1.PodSecurityContext{},
TerminationGracePeriodSeconds: 30,
},
}
job := opslevel.RunnerJob{
Image: "alpine:latest",
}
labels := map[string]string{"app": "test"}

// Act
pod := runner.getPodObject("test-pod", labels, job)

// Assert
autopilot.Equals(t, (*corev1.SecurityContext)(nil), pod.Spec.Containers[0].SecurityContext)
}
Loading