diff --git a/.env.example b/.env.example index d4ac17d..6f2f184 100644 --- a/.env.example +++ b/.env.example @@ -8,6 +8,10 @@ APP_ENV=development GATEWAY_PORT=8000 LOG_LEVEL=INFO ENABLE_DOCS=false +# Set PUBLIC_BETA_MODE=true, or APP_ENV=public_beta, before exposing this +# to arbitrary untrusted beta users. Public beta mode rejects risky sandbox +# features and requires a stronger runtime such as gVisor/runsc or Kata. +PUBLIC_BETA_MODE=false # --- Authentication --- REQUIRE_AUTH=true @@ -32,6 +36,12 @@ CORS_ALLOW_CREDENTIALS=true # --- Docker and state --- USE_DOCKER_DEFAULT_SECCOMP=true +DOCKER_CLIENT_TIMEOUT=30 +# Optional stronger isolation runtime configured on the Docker daemon host. +# Examples: runsc for gVisor, kata-runtime for Kata Containers. +SANDBOX_RUNTIME= +STRONG_SANDBOX_RUNTIMES=runsc,kata,kata-runtime,io.containerd.runsc.v1,io.containerd.kata.v2 +REQUIRE_STRONG_SANDBOX_ISOLATION=false # If you disable Docker's RuntimeDefault seccomp policy, point this at a file # path that exists on the Docker daemon host. The checked-in profile under # ./security is only a source artifact; the daemon cannot read it from inside @@ -61,6 +71,9 @@ MAX_CONCURRENT_EXECUTIONS=10 MAX_ACTIVE_SESSIONS=100 MAX_CONTAINERS_PER_PRINCIPAL=3 CONTAINER_CREATE_GUARD_TIMEOUT=30 +SESSION_TIMEOUT_SECONDS=1200 +MAX_SESSION_LIFETIME_SECONDS=3600 +MAX_EXECUTIONS_PER_SESSION=100 DEFAULT_TIMEOUT=30 MAX_TIMEOUT=120 RATE_LIMIT_REQUESTS_PER_WINDOW=30 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5a84d2c..a0a8adc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,9 +32,9 @@ jobs: - name: Install gateway test dependencies run: python -m pip install -r gateway/requirements.txt - name: Compile Python sources - run: python -m compileall gateway sandbox tests/verification_client.py tests/verify_vm_flow.py tests/verify_playwright.py tests/verify_features.py tests/test_execution.py tests/test_gateway_unit.py .github/scripts + run: python -m compileall gateway sandbox tests/verification_client.py tests/verify_vm_flow.py tests/verify_playwright.py tests/verify_features.py tests/test_execution.py tests/test_gateway_unit.py tests/test_executor_unit.py .github/scripts - name: Run unit tests - run: python -m unittest -q tests/test_gateway_unit.py + run: python -m unittest -q tests/test_gateway_unit.py tests/test_executor_unit.py - name: Validate version metadata run: | python - <<'PY' @@ -53,7 +53,7 @@ jobs: print(f"Validated version {version}") PY - name: Run Bandit - run: bandit -q -r gateway sandbox + run: bandit -q -r gateway sandbox -s B102,B108,B404,B603 - name: Audit gateway dependencies run: pip-audit -r gateway/requirements.txt - name: Audit sandbox dependencies @@ -62,6 +62,8 @@ jobs: integration: runs-on: ubuntu-latest needs: validate + env: + CONTAINER_RATE_LIMIT_REQUESTS_PER_WINDOW: "30" steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Build images @@ -91,14 +93,14 @@ jobs: API_TOKEN: 0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef run: python3 tests/test_execution.py - name: Scan gateway image - uses: aquasecurity/trivy-action@6c175e9c4083a92bbca2f9724c8a5e33bc2d97a5 # 0.28.0 + uses: aquasecurity/trivy-action@57a97c7e7821a5776cebc9bb87c984fa69cba8f1 # v0.35.0 with: image-ref: code-gateway:latest severity: HIGH,CRITICAL ignore-unfixed: true exit-code: "1" - name: Scan sandbox image - uses: aquasecurity/trivy-action@6c175e9c4083a92bbca2f9724c8a5e33bc2d97a5 # 0.28.0 + uses: aquasecurity/trivy-action@57a97c7e7821a5776cebc9bb87c984fa69cba8f1 # v0.35.0 with: image-ref: code-sandbox:latest severity: HIGH,CRITICAL @@ -107,3 +109,49 @@ jobs: - name: Shutdown stack if: always() run: docker compose --profile local-docker down --remove-orphans --volumes + + hardened-config: + runs-on: ubuntu-latest + needs: validate + env: + APP_ENV: public_beta + PUBLIC_BETA_MODE: "true" + REQUIRE_AUTH: "true" + METRICS_AUTH_REQUIRED: "true" + API_KEYS: ci:0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef + CORS_ALLOW_ORIGINS: https://ci.example.invalid + USE_DOCKER_DEFAULT_SECCOMP: "true" + SANDBOX_NETWORK_MODE: none + ALLOW_PIP_INSTALLS: "false" + ALLOW_SANDBOX_ENV_INJECTION: "false" + SANDBOX_IMAGE: code-sandbox:ci + SANDBOX_RUNTIME: runsc + STRONG_SANDBOX_RUNTIMES: runsc,kata-runtime + REQUIRE_STRONG_SANDBOX_ISOLATION: "true" + REQUIRE_SHARED_STATE: "true" + REDIS_URL: redis://redis:6379/0 + DOCKER_HOST: tcp://remote-docker:2376 + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: "3.12" + - name: Install gateway dependencies + run: python -m pip install -r gateway/requirements.txt + - name: Validate public beta guardrail configuration + run: | + python - <<'PY' + import sys + from pathlib import Path + + sys.path.insert(0, str(Path.cwd() / "gateway")) + import app + + app.validate_runtime_configuration() + assert app.PUBLIC_BETA_MODE is True + assert app.SANDBOX_NETWORK_MODE == "none" + assert app.ALLOW_PIP_INSTALLS is False + assert app.ALLOW_SANDBOX_ENV_INJECTION is False + assert app.strong_sandbox_runtime_configured() is True + print("Public beta guardrails validated") + PY diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index dafabeb..da58669 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -58,9 +58,9 @@ jobs: - name: Validate bumped files run: | docker compose config -q - python -m compileall -q gateway sandbox tests/verification_client.py tests/verify_vm_flow.py tests/verify_playwright.py tests/verify_features.py tests/test_execution.py tests/test_gateway_unit.py .github/scripts - python -m unittest -q tests/test_gateway_unit.py - bandit -q -r gateway sandbox + python -m compileall -q gateway sandbox tests/verification_client.py tests/verify_vm_flow.py tests/verify_playwright.py tests/verify_features.py tests/test_execution.py tests/test_gateway_unit.py tests/test_executor_unit.py .github/scripts + python -m unittest -q tests/test_gateway_unit.py tests/test_executor_unit.py + bandit -q -r gateway sandbox -s B102,B108,B404,B603 pip-audit -r gateway/requirements.txt pip-audit -r sandbox/requirements.txt python - <<'PY' diff --git a/README.md b/README.md index a968eb0..41d62b6 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ This service uses Docker containers as the sandbox boundary. It does not create On Linux hosts, sandbox containers share the host kernel. On macOS and Windows Docker Desktop, containers usually run inside Docker Desktop's Linux VM, but this project still manages Docker containers, not VMs. Treat this as hardened container isolation, not VM-grade isolation. -Do not expose this service to arbitrary hostile users unless you add stronger isolation and operational controls. For high-risk workloads, use dedicated disposable hosts or a stronger boundary such as microVMs, VMs, gVisor, or Kata Containers. Protect Docker daemon access carefully; access to the Docker socket or an overly permissive Docker API proxy can be equivalent to host-level control. +Do not expose this service to arbitrary hostile users unless you add stronger isolation and operational controls. For public beta or high-risk workloads, use dedicated disposable hosts or a stronger boundary such as microVMs, VMs, gVisor, or Kata Containers, then enable `PUBLIC_BETA_MODE=true` so the gateway rejects risky configuration. Protect Docker daemon access carefully; access to the Docker socket or an overly permissive Docker API proxy can be equivalent to host-level control. ## What It Does @@ -503,15 +503,18 @@ Do not expose this service to the internet or a shared network until all of the - `REQUIRE_AUTH=true` and `API_KEYS` or JWT auth is configured with fresh, long, random secrets created for this deployment. - Traffic is protected by TLS at an upstream reverse proxy, load balancer, ingress, or service mesh. -- `APP_ENV=production`. +- `APP_ENV=production` for controlled live deployments, or `APP_ENV=public_beta` / `PUBLIC_BETA_MODE=true` for arbitrary untrusted beta users. - `ENABLE_DOCS=false`. - `REQUIRE_SHARED_STATE=true` and `REDIS_URL` points at a durable, access-controlled Redis deployment. - `GATEWAY_DOCKER_HOST` or `DOCKER_HOST` points at a dedicated remote Docker daemon over TLS (`tcp://...:2376`) or SSH (`ssh://...`), not the local socket proxy. - `CORS_ALLOW_ORIGINS` is restricted to the real ChatUI origin or origins. Do not use wildcard CORS with credentials. - `SANDBOX_NETWORK_MODE=none` unless network access is explicitly required and isolated. +- Public beta deployments configure `SANDBOX_RUNTIME` to a stronger runtime such as gVisor/runsc or Kata Containers and set `REQUIRE_STRONG_SANDBOX_ISOLATION=true`. +- `SANDBOX_IMAGE` uses an immutable tag or digest, not `latest`. - `SANDBOX_READ_ONLY_ROOTFS=true`. - `ALLOW_PIP_INSTALLS=false` for untrusted workloads. - `ALLOW_SANDBOX_ENV_INJECTION=false` unless submitted code is trusted. +- `SESSION_TIMEOUT_SECONDS`, `MAX_SESSION_LIFETIME_SECONDS`, and `MAX_EXECUTIONS_PER_SESSION` are set to realistic abuse budgets. - CPU, memory, PID, request-size, file-size, timeout, session, and rate limits are tuned for your host capacity. - Real secrets are stored outside source control and rotated if they were ever shared, logged, or used in another environment. @@ -540,7 +543,7 @@ Implemented controls include: - Redis-backed shared state for multi-replica coordination. - Prometheus metrics and health endpoints for operations. -These controls reduce risk but do not make Docker containers equivalent to VMs. +These controls reduce risk but do not make Docker's default container runtime equivalent to VMs. Use a stronger runtime or dedicated disposable worker hosts before accepting arbitrary public users. ### Vulnerability Disclosure @@ -554,7 +557,8 @@ See `.env.example` for source defaults. `setup.sh` and `setup.ps1` create `.env` | Variable | Default | Description | Best practices | | --- | --- | --- | --- | -| `APP_ENV` | `development` | Deployment environment. Production guardrails are enforced when this is `production` or `prod`. | Use `development` locally, `staging` before launch, and `production` for live deployments. | +| `APP_ENV` | `development` | Deployment environment. Production guardrails are enforced when this is `production` or `prod`; public beta guardrails are enabled when this is `public_beta`, `public-beta`, or `beta`. | Use `development` locally, `staging` before launch, `production` for controlled live deployments, and `public_beta` only with stronger sandbox isolation. | +| `PUBLIC_BETA_MODE` | `false` | Enables strict public beta validation regardless of `APP_ENV`. | Set `true` before exposing arbitrary untrusted beta users. This requires no sandbox network, no pip installs, no env injection, immutable images, and a stronger runtime. | | `GATEWAY_PORT` | `8000` | Host port mapped to the gateway container's port `8000`. | Keep `8000` locally unless it conflicts. In production, place the service behind TLS infrastructure and expose only required ports. | | `LOG_LEVEL` | `INFO` | Gateway Python logging level. | Use `INFO` normally. Use `DEBUG` only for temporary debugging because logs may contain operational details. | | `ENABLE_DOCS` | `false` | Enables FastAPI `/docs` and `/openapi.json`. | Keep `false` in production. Enable only for local debugging or restricted non-production environments. | @@ -590,6 +594,7 @@ See `.env.example` for source defaults. `setup.sh` and `setup.ps1` create `.env` | --- | --- | --- | --- | | `GATEWAY_DOCKER_HOST` | `tcp://docker-proxy:2375` locally | Compose variable passed into the gateway as `DOCKER_HOST`. | Use local docker-proxy only for development. In production, point at a dedicated remote daemon over TLS or SSH. | | `DOCKER_HOST` | empty in direct process runs | Docker daemon endpoint read by `docker.from_env()` inside the gateway. | For non-Compose deployments, set this directly to a safe remote daemon. | +| `DOCKER_CLIENT_TIMEOUT` | `30` | Docker API client timeout in seconds. | Keep bounded so Docker API hangs do not pin request workers indefinitely. | | `USE_DOCKER_DEFAULT_SECCOMP` | `true` | Uses Docker runtime default seccomp policy. | Keep `true` unless you have a tested daemon-visible profile. | | `SECCOMP_PROFILE_DAEMON_PATH` | empty | Absolute path to a seccomp profile on the Docker daemon host when default seccomp is disabled. `SECCOMP_PROFILE_PATH` is accepted as a legacy alias. | Set only if `USE_DOCKER_DEFAULT_SECCOMP=false`; the path must exist on the daemon host, not merely in this repository. | | `REDIS_URL` | `redis://redis:6379/0` | Redis URL for shared sessions, locks, and rate limits. | Use Redis in production and for multi-replica deployments. | @@ -623,12 +628,18 @@ See `.env.example` for source defaults. `setup.sh` and `setup.ps1` create `.env` | `MAX_ACTIVE_SESSIONS` | `100` | Maximum active sessions tracked by the gateway. | Size to host capacity and Redis/state expectations. | | `MAX_CONTAINERS_PER_PRINCIPAL` | `3` | Maximum active sessions per authenticated subject and tenant. | Keep small for shared deployments. | | `CONTAINER_CREATE_GUARD_TIMEOUT` | `30` | Timeout in seconds while waiting for the serialized container creation guard. | Increase only if Docker is slow during normal operation. | +| `SESSION_TIMEOUT_SECONDS` | `1200` | Idle timeout for sandbox sessions. | Keep short for public/shared deployments. | +| `MAX_SESSION_LIFETIME_SECONDS` | `3600` | Hard lifetime for a sandbox session, regardless of activity. | Prevents users from keeping containers alive forever. Lower for public beta. | +| `MAX_EXECUTIONS_PER_SESSION` | `100` | Maximum number of executions allowed in one session before it is removed. | Use this as a per-session abuse budget. Lower for public beta. | ### Sandbox Runtime | Variable | Default | Description | Best practices | | --- | --- | --- | --- | | `SANDBOX_IMAGE` | `code-sandbox:latest` | Docker image used for sandbox sessions. | Use immutable image tags or digests in production. | +| `SANDBOX_RUNTIME` | empty | Optional Docker runtime for sandbox containers, for example `runsc` for gVisor or `kata-runtime` for Kata Containers. | Required in public beta mode. Configure the runtime on the Docker daemon host first. | +| `STRONG_SANDBOX_RUNTIMES` | `runsc,kata,kata-runtime,io.containerd.runsc.v1,io.containerd.kata.v2` | Comma-separated runtime names that satisfy strong isolation checks. | Keep narrow and aligned with runtimes actually installed on workers. | +| `REQUIRE_STRONG_SANDBOX_ISOLATION` | public-beta-aware, `.env.example`: `false` | Requires `SANDBOX_RUNTIME` to match `STRONG_SANDBOX_RUNTIMES`. | Set `true` for any deployment that accepts arbitrary untrusted users. | | `SANDBOX_USER` | `sandbox` | User name recorded for sandbox behavior and defaults. | Keep aligned with the sandbox image. | | `SANDBOX_UID` | `10001` | Sandbox Linux user ID. | Keep non-root. | | `SANDBOX_GID` | `10001` | Sandbox Linux group ID. | Keep non-root. | @@ -765,10 +776,11 @@ Recommended production deployment pattern: 2. Run the gateway behind TLS infrastructure. 3. Run Redis as a managed or persistent service. 4. Run sandbox containers on dedicated worker hosts or a dedicated remote Docker daemon. -5. Keep Docker daemon credentials and API keys out of source control. -6. Monitor request rates, execution latency, error rates, `429` responses, active executions, active sessions, Redis health, Docker daemon health, container restarts, memory, CPU, and disk pressure. -7. Rotate API keys and JWT secrets on a schedule. -8. Keep base images, Python dependencies, Docker, and host kernels patched. +5. For public beta, configure gVisor/runsc, Kata Containers, or an equivalent stronger runtime and set `PUBLIC_BETA_MODE=true`. +6. Keep Docker daemon credentials and API keys out of source control. +7. Monitor request rates, execution latency, error rates, `429` responses, active executions, active sessions, session expirations, Redis health, Docker daemon health, container restarts, memory, CPU, and disk pressure. +8. Rotate API keys and JWT secrets on a schedule. +9. Keep base images, Python dependencies, Docker, runtimes, and host kernels patched. Development notes: diff --git a/SECURITY.md b/SECURITY.md index 99caba5..597b5d2 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -8,9 +8,13 @@ Please help us keep ChatUI secure by reporting suspected vulnerabilities respons **Please do not report security vulnerabilities through public GitHub issues, pull requests, discussions, or comments.** -Instead, please report vulnerabilities privately by contacting us at: +Instead, please report vulnerabilities privately through GitHub private vulnerability +reporting for this repository. If you are reviewing a self-hosted deployment, +use the monitored security contact published by that deployment operator. -**[TODO: Add security contact]** +Self-hosted operators should publish a dedicated security email or intake form +before any public beta, and route it to an on-call owner who can triage container +escape, data exposure, and denial-of-service reports quickly. Please include as much detail as possible, including: @@ -53,4 +57,4 @@ Security-related changes may be described more generally until disclosure is app We appreciate your help in making the ChatUI Project more secure for everyone. -Thank you for supporting responsible disclosure. \ No newline at end of file +Thank you for supporting responsible disclosure. diff --git a/docker-compose.yml b/docker-compose.yml index 7e1471c..fb25095 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -107,6 +107,9 @@ services: - MAX_ACTIVE_SESSIONS=${MAX_ACTIVE_SESSIONS:-100} - MAX_CONTAINERS_PER_PRINCIPAL=${MAX_CONTAINERS_PER_PRINCIPAL:-3} - CONTAINER_CREATE_GUARD_TIMEOUT=${CONTAINER_CREATE_GUARD_TIMEOUT:-30} + - SESSION_TIMEOUT_SECONDS=${SESSION_TIMEOUT_SECONDS:-1200} + - MAX_SESSION_LIFETIME_SECONDS=${MAX_SESSION_LIFETIME_SECONDS:-3600} + - MAX_EXECUTIONS_PER_SESSION=${MAX_EXECUTIONS_PER_SESSION:-100} - DEFAULT_TIMEOUT=${DEFAULT_TIMEOUT:-30} - MAX_TIMEOUT=${MAX_TIMEOUT:-120} - RATE_LIMIT_REQUESTS_PER_WINDOW=${RATE_LIMIT_REQUESTS_PER_WINDOW:-30} @@ -127,6 +130,10 @@ services: - ALLOW_PIP_INSTALLS=${ALLOW_PIP_INSTALLS:-false} - MAX_PIP_PACKAGES=${MAX_PIP_PACKAGES:-5} - MAX_PIP_PACKAGE_NAME_LENGTH=${MAX_PIP_PACKAGE_NAME_LENGTH:-64} + - DOCKER_CLIENT_TIMEOUT=${DOCKER_CLIENT_TIMEOUT:-30} + - SANDBOX_RUNTIME=${SANDBOX_RUNTIME:-} + - STRONG_SANDBOX_RUNTIMES=${STRONG_SANDBOX_RUNTIMES:-runsc,kata,kata-runtime,io.containerd.runsc.v1,io.containerd.kata.v2} + - REQUIRE_STRONG_SANDBOX_ISOLATION=${REQUIRE_STRONG_SANDBOX_ISOLATION:-false} - USE_DOCKER_DEFAULT_SECCOMP=${USE_DOCKER_DEFAULT_SECCOMP:-true} - SECCOMP_PROFILE_DAEMON_PATH=${SECCOMP_PROFILE_DAEMON_PATH:-} - ALLOW_SANDBOX_ENV_INJECTION=${ALLOW_SANDBOX_ENV_INJECTION:-false} @@ -136,6 +143,7 @@ services: - SANDBOX_ENV_SOURCE_PATH=/etc/code-execution/.env_sandbox - FILE_PROVISION_TIMEOUT=${FILE_PROVISION_TIMEOUT:-30} - LOG_LEVEL=${LOG_LEVEL:-INFO} + - PUBLIC_BETA_MODE=${PUBLIC_BETA_MODE:-false} - ENABLE_DOCS=${ENABLE_DOCS:-false} - ENABLE_CORS=${ENABLE_CORS:-true} - CORS_ALLOW_ORIGINS=${CORS_ALLOW_ORIGINS:-http://localhost:3000} diff --git a/gateway/Dockerfile b/gateway/Dockerfile index 88c8b2c..33a3b56 100644 --- a/gateway/Dockerfile +++ b/gateway/Dockerfile @@ -9,6 +9,7 @@ FROM ${PYTHON_BASE_IMAGE} AS base # --- System Setup --- RUN apt-get update && \ + apt-get upgrade -y && \ apt-get install -y --no-install-recommends curl && \ rm -rf /var/lib/apt/lists/* && \ apt-get clean diff --git a/gateway/app.py b/gateway/app.py index f37b3a4..d042ef4 100644 --- a/gateway/app.py +++ b/gateway/app.py @@ -54,8 +54,25 @@ def split_csv(value: Optional[str]) -> list[str]: APP_ENV = os.getenv("APP_ENV", "production").strip().lower() IS_PRODUCTION = APP_ENV in {"prod", "production"} +PUBLIC_BETA_MODE = str_to_bool( + os.getenv("PUBLIC_BETA_MODE"), + default=APP_ENV in {"beta", "public_beta", "public-beta"}, +) SANDBOX_IMAGE = os.getenv("SANDBOX_IMAGE", "code-sandbox:latest") +SANDBOX_RUNTIME = os.getenv("SANDBOX_RUNTIME", "").strip() +STRONG_SANDBOX_RUNTIMES = split_csv(os.getenv("STRONG_SANDBOX_RUNTIMES")) or [ + "runsc", + "kata", + "kata-runtime", + "io.containerd.runsc.v1", + "io.containerd.kata.v2", +] +REQUIRE_STRONG_SANDBOX_ISOLATION = str_to_bool( + os.getenv("REQUIRE_STRONG_SANDBOX_ISOLATION"), + default=PUBLIC_BETA_MODE, +) +DOCKER_CLIENT_TIMEOUT = int(os.getenv("DOCKER_CLIENT_TIMEOUT", "30")) MAX_CONCURRENT = int(os.getenv("MAX_CONCURRENT_EXECUTIONS", "10")) DEFAULT_TIMEOUT = int(os.getenv("DEFAULT_TIMEOUT", "30")) MAX_TIMEOUT = int(os.getenv("MAX_TIMEOUT", "120")) @@ -218,7 +235,9 @@ class RequestBodyTooLarge(ValueError): state_backend: StateBackend local_docker_daemon_id: Optional[str] = None local_docker_daemon_name: Optional[str] = None -SESSION_TIMEOUT_SECONDS = 20 * 60 +SESSION_TIMEOUT_SECONDS = int(os.getenv("SESSION_TIMEOUT_SECONDS", str(20 * 60))) +MAX_SESSION_LIFETIME_SECONDS = int(os.getenv("MAX_SESSION_LIFETIME_SECONDS", str(60 * 60))) +MAX_EXECUTIONS_PER_SESSION = int(os.getenv("MAX_EXECUTIONS_PER_SESSION", "100")) REQUEST_COUNTER = Counter( "gateway_http_requests_total", @@ -273,6 +292,47 @@ def docker_host_hostname(docker_host: str) -> Optional[str]: return parsed.hostname.lower() if parsed.hostname else None +def image_reference_is_immutable(image: str) -> bool: + """Return whether an image reference is pinned enough for production use.""" + if "@sha256:" in image: + return True + last_component = image.rsplit("/", 1)[-1] + if ":" not in last_component: + return False + tag = last_component.rsplit(":", 1)[-1].strip().lower() + return bool(tag and tag != "latest") + + +def strong_sandbox_runtime_configured() -> bool: + """Return whether the configured Docker runtime is a known stronger isolation runtime.""" + if not SANDBOX_RUNTIME: + return False + return SANDBOX_RUNTIME in set(STRONG_SANDBOX_RUNTIMES) + + +def parse_optional_float( + value: Optional[str], + default: Optional[float] = None, +) -> Optional[float]: + """Parse a string float, returning a default for missing or invalid values.""" + if not value: + return default + try: + return float(value) + except (TypeError, ValueError): + return default + + +def parse_optional_int(value: Optional[str], default: int = 0) -> int: + """Parse a string integer, returning a default for missing or invalid values.""" + if not value: + return default + try: + return int(value) + except (TypeError, ValueError): + return default + + def principal_scope(auth: AuthContext) -> str: """Generate a scope string for rate limiting based on subject and tenant.""" return f"{auth.subject}:{auth.tenant or '-'}" @@ -283,9 +343,23 @@ def validate_runtime_configuration() -> None: if DEFAULT_TIMEOUT > MAX_TIMEOUT: raise RuntimeError("DEFAULT_TIMEOUT must be less than or equal to MAX_TIMEOUT") + if DOCKER_CLIENT_TIMEOUT < 1: + raise RuntimeError("DOCKER_CLIENT_TIMEOUT must be at least 1 second") + if MAX_REQUEST_BODY_SIZE < 1: raise RuntimeError("MAX_REQUEST_BODY_SIZE must be at least 1 byte") + if SESSION_TIMEOUT_SECONDS < 1: + raise RuntimeError("SESSION_TIMEOUT_SECONDS must be at least 1 second") + + if MAX_SESSION_LIFETIME_SECONDS < SESSION_TIMEOUT_SECONDS: + raise RuntimeError( + "MAX_SESSION_LIFETIME_SECONDS must be greater than or equal to SESSION_TIMEOUT_SECONDS" + ) + + if MAX_EXECUTIONS_PER_SESSION < 1: + raise RuntimeError("MAX_EXECUTIONS_PER_SESSION must be at least 1") + if REQUIRE_AUTH and not (JWT_SECRET or STATIC_API_KEYS): raise RuntimeError( "Authentication is required, but neither JWT nor static API keys are configured." @@ -312,6 +386,13 @@ def validate_runtime_configuration() -> None: if SANDBOX_NETWORK_MODE not in {"bridge", "none"}: raise RuntimeError("SANDBOX_NETWORK_MODE must be either 'bridge' or 'none'") + if REQUIRE_STRONG_SANDBOX_ISOLATION and not strong_sandbox_runtime_configured(): + allowed = ", ".join(STRONG_SANDBOX_RUNTIMES) + raise RuntimeError( + "Strong sandbox isolation is required, but SANDBOX_RUNTIME is not configured " + f"with a recognized runtime. Set SANDBOX_RUNTIME to one of: {allowed}" + ) + if not USE_DOCKER_DEFAULT_SECCOMP: if not SECCOMP_PROFILE_DAEMON_PATH: raise RuntimeError( @@ -327,25 +408,31 @@ def validate_runtime_configuration() -> None: if REQUIRE_SHARED_STATE and not REDIS_URL: raise RuntimeError("REDIS_URL must be configured when shared state is required.") - if IS_PRODUCTION: + if IS_PRODUCTION or PUBLIC_BETA_MODE: + mode_name = "production/public beta" if ENABLE_DOCS: - raise RuntimeError("ENABLE_DOCS must be false in production.") + raise RuntimeError(f"ENABLE_DOCS must be false in {mode_name}.") + if not image_reference_is_immutable(SANDBOX_IMAGE): + raise RuntimeError( + f"SANDBOX_IMAGE must use an immutable tag or digest in {mode_name}; " + "floating references such as ':latest' are not allowed." + ) if not DOCKER_HOST: - raise RuntimeError("DOCKER_HOST must be configured explicitly in production.") + raise RuntimeError(f"DOCKER_HOST must be configured explicitly in {mode_name}.") if DOCKER_HOST.startswith("unix://"): raise RuntimeError( - "DOCKER_HOST must point at a restricted TCP proxy or remote daemon in production; " + f"DOCKER_HOST must point at a restricted TCP proxy or remote daemon in {mode_name}; " "raw Unix socket access is not allowed." ) parsed_docker = urlparse(DOCKER_HOST) if parsed_docker.scheme == "tcp" and parsed_docker.port == 2375: raise RuntimeError( - "Production DOCKER_HOST must use TLS (port 2376) or ssh://. " + "DOCKER_HOST must use TLS (port 2376) or ssh:// in production/public beta. " "Plain TCP on port 2375 is unencrypted and unsafe." ) if parsed_docker.scheme not in {"tcp", "ssh"}: raise RuntimeError( - "Production DOCKER_HOST must use tcp:// (with TLS) or ssh://." + "DOCKER_HOST must use tcp:// (with TLS) or ssh:// in production/public beta." ) if docker_host_hostname(DOCKER_HOST) in { "docker-proxy", @@ -355,10 +442,27 @@ def validate_runtime_configuration() -> None: "host.docker.internal", }: raise RuntimeError( - "Production gateways must use a dedicated remote Docker daemon. " + "Production/public beta gateways must use a dedicated remote Docker daemon. " "Local Docker socket proxies and loopback targets are not allowed." ) + if PUBLIC_BETA_MODE: + if not REQUIRE_AUTH: + raise RuntimeError("REQUIRE_AUTH must be true in public beta mode.") + if SANDBOX_NETWORK_MODE != "none": + raise RuntimeError("SANDBOX_NETWORK_MODE must be 'none' in public beta mode.") + if ALLOW_PIP_INSTALLS: + raise RuntimeError("ALLOW_PIP_INSTALLS must be false in public beta mode.") + if ALLOW_SANDBOX_ENV_INJECTION: + raise RuntimeError("ALLOW_SANDBOX_ENV_INJECTION must be false in public beta mode.") + if not image_reference_is_immutable(SANDBOX_IMAGE): + raise RuntimeError("SANDBOX_IMAGE must be immutable in public beta mode.") + if not strong_sandbox_runtime_configured(): + raise RuntimeError( + "Public beta mode requires a stronger Docker runtime such as gVisor/runsc or Kata. " + "Configure SANDBOX_RUNTIME and STRONG_SANDBOX_RUNTIMES." + ) + if MAX_CONTAINERS_PER_PRINCIPAL < 1: raise RuntimeError("MAX_CONTAINERS_PER_PRINCIPAL must be at least 1") @@ -476,6 +580,18 @@ def session_is_local(session: SessionInfo) -> bool: return session.docker_daemon_id == local_docker_daemon_id +def session_hard_expired(session: SessionInfo, *, now: Optional[float] = None) -> bool: + """Return whether a session has exceeded its hard lifetime.""" + if session.expires_at is None: + return False + return (now or time.time()) >= session.expires_at + + +def session_idle_expired(session: SessionInfo, *, now: Optional[float] = None) -> bool: + """Return whether a session has exceeded its idle timeout.""" + return (now or time.time()) - session.last_activity > SESSION_TIMEOUT_SECONDS + + def enforce_session_daemon_affinity(session: SessionInfo) -> None: """Raise HTTPException if session belongs to a different Docker daemon.""" if session_is_local(session): @@ -508,6 +624,11 @@ def recover_session_info(container: docker.models.containers.Container) -> Sessi owner_tenant=labels.get("owner-tenant") or None, docker_daemon_id=labels.get("docker-daemon-id") or local_docker_daemon_id, inject_sandbox_env=(labels.get("inject-sandbox-env") or "0") == "1", + expires_at=parse_optional_float( + labels.get("expires-at"), + created_at + MAX_SESSION_LIFETIME_SECONDS, + ), + execution_count=parse_optional_int(labels.get("execution-count"), 0), ) @@ -519,6 +640,33 @@ async def touch_session(container_id: str) -> Optional[SessionInfo]: ) +async def recover_or_remove_managed_container( + container: docker.models.containers.Container, + *, + missing_state_reason: str, +) -> Optional[SessionInfo]: + """Recover a managed container without resetting shared-state session budgets.""" + existing = await state_backend.get_session(container.id) + if existing is None: + if REQUIRE_SHARED_STATE: + await remove_container( + container.id, + reason=missing_state_reason, + container=container, + ) + return None + session = recover_session_info(container) + else: + session = existing + + await state_backend.save_session( + container.id, + session, + session_timeout_seconds=SESSION_TIMEOUT_SECONDS, + ) + return session + + async def ensure_session_access(container_id: str, auth: AuthContext) -> SessionInfo: """Ensure the authenticated user has access to the container session.""" session = await state_backend.get_session(container_id) @@ -538,6 +686,17 @@ async def ensure_session_access(container_id: str, auth: AuthContext) -> Session detail="Container session not found, or it was shut down due to inactivity.", ) + if REQUIRE_SHARED_STATE: + await remove_container( + container_id, + reason="missing-shared-session-state", + container=container, + ) + raise HTTPException( + status_code=404, + detail="Container session state is unavailable; the sandbox was removed.", + ) + session = recover_session_info(container) await state_backend.save_session( container_id, @@ -552,6 +711,14 @@ async def ensure_session_access(container_id: str, auth: AuthContext) -> Session raise HTTPException(status_code=403, detail="Container session belongs to another tenant") enforce_session_daemon_affinity(session) + + if session_hard_expired(session): + await remove_container(container_id, reason="max-session-lifetime") + raise HTTPException( + status_code=404, + detail="Container session expired after reaching its maximum lifetime.", + ) + return session @@ -656,7 +823,8 @@ async def cleanup_idle_containers() -> None: idle_ids = [ cid for cid, session in sessions.items() - if session_is_local(session) and now - session.last_activity > SESSION_TIMEOUT_SECONDS + if session_is_local(session) + and (session_idle_expired(session, now=now) or session_hard_expired(session, now=now)) ] for cid in idle_ids: @@ -671,12 +839,12 @@ async def cleanup_idle_containers() -> None: tracked_ids = set(sessions) for container in managed_containers: if container.id not in tracked_ids and container.name.startswith("sandbox-"): - recovered = recover_session_info(container) - await state_backend.save_session( - container.id, - recovered, - session_timeout_seconds=SESSION_TIMEOUT_SECONDS, + recovered = await recover_or_remove_managed_container( + container, + missing_state_reason="untracked-shared-session-state", ) + if recovered is None: + continue logger.info("Recovered untracked managed container %s during cleanup", container.id) except Exception as exc: logger.error("Error cleaning up untracked containers: %s", exc) @@ -693,7 +861,7 @@ async def lifespan(app: FastAPI): global docker_client, execution_semaphore, state_backend, local_docker_daemon_id, local_docker_daemon_name validate_runtime_configuration() - docker_client = docker.from_env() + docker_client = docker.from_env(timeout=DOCKER_CLIENT_TIMEOUT) execution_semaphore = asyncio.Semaphore(MAX_CONCURRENT) state_backend = RedisStateBackend(REDIS_URL) if REDIS_URL else InMemoryStateBackend() await state_backend.connect() @@ -722,12 +890,12 @@ async def lifespan(app: FastAPI): filters={"label": "managed-by=code-execution-gateway"}, ) for container in managed_containers: - session = recover_session_info(container) - await state_backend.save_session( - container.id, - session, - session_timeout_seconds=SESSION_TIMEOUT_SECONDS, + session = await recover_or_remove_managed_container( + container, + missing_state_reason="missing-shared-session-state", ) + if session is None: + continue logger.info( "Recovered container %s for subject=%s tenant=%s network=%s daemon=%s", container.id, @@ -740,13 +908,18 @@ async def lifespan(app: FastAPI): logger.warning("Failed to recover existing containers: %s", exc) logger.info( - "Gateway started env=%s auth=%s max_concurrent=%s default_timeout=%ss network=%s read_only_rootfs=%s docker_default_seccomp=%s state_backend=%s docker_daemon_id=%s", + "Gateway started env=%s public_beta=%s auth=%s max_concurrent=%s default_timeout=%ss session_ttl=%ss max_session_lifetime=%ss max_executions_per_session=%s network=%s read_only_rootfs=%s runtime=%s docker_default_seccomp=%s state_backend=%s docker_daemon_id=%s", APP_ENV, + PUBLIC_BETA_MODE, auth_mode_summary(), MAX_CONCURRENT, DEFAULT_TIMEOUT, + SESSION_TIMEOUT_SECONDS, + MAX_SESSION_LIFETIME_SECONDS, + MAX_EXECUTIONS_PER_SESSION, SANDBOX_NETWORK_MODE, SANDBOX_READ_ONLY_ROOTFS, + SANDBOX_RUNTIME or "default", USE_DOCKER_DEFAULT_SECCOMP, type(state_backend).__name__, local_docker_daemon_id or "-", @@ -989,6 +1162,9 @@ class ContainerResponse(BaseModel): status: str uptime_seconds: float last_activity: float + expires_at: Optional[float] = None + execution_count: int = 0 + max_executions: int = MAX_EXECUTIONS_PER_SESSION docker_daemon_id: Optional[str] = None @@ -1292,6 +1468,8 @@ async def create_container_session( execution_id = str(uuid.uuid4())[:12] network_mode = SANDBOX_NETWORK_MODE if enable_network else "none" network_enabled = network_mode != "none" + now = time.time() + expires_at = now + MAX_SESSION_LIFETIME_SECONDS security_opts = ["no-new-privileges:true"] if not USE_DOCKER_DEFAULT_SECCOMP: @@ -1324,6 +1502,8 @@ async def create_container_session( "owner-tenant": auth.tenant or "", "docker-daemon-id": local_docker_daemon_id or "", "inject-sandbox-env": "1" if inject_sandbox_env else "0", + "expires-at": str(expires_at), + "execution-count": "0", }, "name": f"sandbox-{execution_id}", "read_only": SANDBOX_READ_ONLY_ROOTFS, @@ -1331,6 +1511,8 @@ async def create_container_session( "user": f"{SANDBOX_UID}:{SANDBOX_GID}", "working_dir": "/home/sandbox", } + if SANDBOX_RUNTIME: + container_config["runtime"] = SANDBOX_RUNTIME container = await asyncio.to_thread( docker_client.containers.run, @@ -1352,7 +1534,6 @@ async def create_container_session( ) raise - now = time.time() session = SessionInfo( created_at=now, last_activity=now, @@ -1361,6 +1542,8 @@ async def create_container_session( owner_tenant=auth.tenant, docker_daemon_id=local_docker_daemon_id, inject_sandbox_env=inject_sandbox_env, + expires_at=expires_at, + execution_count=0, ) try: await state_backend.save_session( @@ -1433,6 +1616,38 @@ async def run_code_in_sandbox( await state_backend.delete_session(container_id) raise HTTPException(status_code=404, detail="Container session not found.") from exc + if session_hard_expired(session): + await remove_container( + container_id, + execution_id=execution_id, + reason="max-session-lifetime", + container=container, + ) + raise HTTPException( + status_code=404, + detail="Container session expired after reaching its maximum lifetime.", + ) + + if session.execution_count >= MAX_EXECUTIONS_PER_SESSION: + await remove_container( + container_id, + execution_id=execution_id, + reason="max-executions", + container=container, + ) + raise HTTPException( + status_code=429, + detail="Container session reached its maximum number of executions.", + ) + + session.execution_count += 1 + session.last_activity = time.time() + await state_backend.save_session( + container_id, + session, + session_timeout_seconds=SESSION_TIMEOUT_SECONDS, + ) + await touch_session(container_id) try: await ensure_sandbox_env_file( @@ -1580,6 +1795,8 @@ async def create_container( status="active", uptime_seconds=0.0, last_activity=session.last_activity, + expires_at=session.expires_at, + execution_count=session.execution_count, docker_daemon_id=session.docker_daemon_id, ) except HTTPException: @@ -1602,6 +1819,8 @@ async def get_container(container_id: str, auth: AuthContext = Depends(verify_au status="active", uptime_seconds=max(0.0, now - session.created_at), last_activity=session.last_activity, + expires_at=session.expires_at, + execution_count=session.execution_count, docker_daemon_id=session.docker_daemon_id, ) except docker.errors.NotFound as exc: @@ -1735,12 +1954,18 @@ async def build_health_payload() -> tuple[bool, dict]: "docker_daemon_name": local_docker_daemon_name, "sandbox_image_available": image_ok, "sandbox_image": SANDBOX_IMAGE, + "sandbox_runtime": SANDBOX_RUNTIME or "default", + "public_beta_mode": PUBLIC_BETA_MODE, + "strong_sandbox_isolation_required": REQUIRE_STRONG_SANDBOX_ISOLATION, "state_backend": type(state_backend).__name__, "state_backend_healthy": state_ok, "cors_enabled": ENABLE_CORS, "cors_origins_configured": CORS_ALLOW_ORIGINS, "max_concurrent_executions": MAX_CONCURRENT, "default_timeout": DEFAULT_TIMEOUT, + "session_timeout_seconds": SESSION_TIMEOUT_SECONDS, + "max_session_lifetime_seconds": MAX_SESSION_LIFETIME_SECONDS, + "max_executions_per_session": MAX_EXECUTIONS_PER_SESSION, "metrics": metrics, } return healthy, payload diff --git a/gateway/requirements.txt b/gateway/requirements.txt index 2190d99..6ff22dc 100644 --- a/gateway/requirements.txt +++ b/gateway/requirements.txt @@ -1,7 +1,7 @@ docker==7.1.0 -fastapi==0.115.12 +fastapi==0.136.1 prometheus-client==0.21.1 -PyJWT==2.10.1 +PyJWT==2.12.1 pydantic==2.11.3 redis==5.2.1 uvicorn[standard]==0.34.0 diff --git a/gateway/state.py b/gateway/state.py index 0b60afc..b1baadf 100644 --- a/gateway/state.py +++ b/gateway/state.py @@ -23,6 +23,8 @@ class SessionInfo: owner_tenant: Optional[str] docker_daemon_id: Optional[str] = None inject_sandbox_env: bool = False + expires_at: Optional[float] = None + execution_count: int = 0 class StateBackend: @@ -270,7 +272,7 @@ async def save_session( session_timeout_seconds: int, ) -> None: """Save session to Redis with TTL.""" - ttl = session_timeout_seconds + SESSION_TTL_GRACE_SECONDS + ttl = self._session_ttl(session, session_timeout_seconds) pipeline = self.client.pipeline() pipeline.hset(self._session_key(container_id), mapping=self._session_to_mapping(session)) pipeline.expire(self._session_key(container_id), ttl) @@ -289,7 +291,7 @@ async def touch_session( return None session.last_activity = time.time() - ttl = session_timeout_seconds + SESSION_TTL_GRACE_SECONDS + ttl = self._session_ttl(session, session_timeout_seconds) pipeline = self.client.pipeline() pipeline.hset(self._session_key(container_id), mapping=self._session_to_mapping(session)) pipeline.expire(self._session_key(container_id), ttl) @@ -402,6 +404,8 @@ def _session_to_mapping(session: SessionInfo) -> dict[str, str]: "owner_tenant": session.owner_tenant or "", "docker_daemon_id": session.docker_daemon_id or "", "inject_sandbox_env": "1" if session.inject_sandbox_env else "0", + "expires_at": str(session.expires_at or ""), + "execution_count": str(session.execution_count), } @staticmethod @@ -415,8 +419,18 @@ def _session_from_mapping(mapping: dict[str, str]) -> SessionInfo: owner_tenant=mapping.get("owner_tenant") or None, docker_daemon_id=mapping.get("docker_daemon_id") or None, inject_sandbox_env=mapping.get("inject_sandbox_env", "0") == "1", + expires_at=float(mapping["expires_at"]) if mapping.get("expires_at") else None, + execution_count=int(mapping.get("execution_count") or 0), ) + @staticmethod + def _session_ttl(session: SessionInfo, session_timeout_seconds: int) -> int: + idle_ttl = session_timeout_seconds + SESSION_TTL_GRACE_SECONDS + if session.expires_at is None: + return idle_ttl + lifetime_ttl = max(1, int(session.expires_at - time.time())) + SESSION_TTL_GRACE_SECONDS + return max(1, min(idle_ttl, lifetime_ttl)) + @staticmethod def _session_key(container_id: str) -> str: """Generate Redis key for session storage.""" diff --git a/gateway/version.json b/gateway/version.json index ddde6f0..c96b8fd 100644 --- a/gateway/version.json +++ b/gateway/version.json @@ -1,4 +1,4 @@ { - "version": "1.1.0", - "tag": "v1.1.0" + "version": "0.9.0", + "tag": "v0.9.0" } diff --git a/gateway/version.py b/gateway/version.py index d54cd5d..070ca34 100644 --- a/gateway/version.py +++ b/gateway/version.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +import os import re from pathlib import Path from typing import Any @@ -39,11 +40,15 @@ def load_app_version() -> tuple[str, str]: def get_version_payload() -> dict[str, Any]: version, tag = load_app_version() + app_env = os.getenv("APP_ENV", "").strip().lower() + public_beta = app_env in {"beta", "public_beta", "public-beta"} or os.getenv( + "PUBLIC_BETA_MODE", "" + ).strip().lower() in {"1", "true", "yes", "on"} return { "version": version, "tag": tag, "api_contract_version": 1, - "beta": False, + "beta": public_beta, "active_execution_version": "v1", "default_execution_version": "v1", "supported_execution_versions": ["v1"], diff --git a/sandbox/Dockerfile b/sandbox/Dockerfile index bec37c4..9a61ece 100644 --- a/sandbox/Dockerfile +++ b/sandbox/Dockerfile @@ -1,9 +1,9 @@ # ============================================================ # Sandbox Container — Executes untrusted Python code securely # ============================================================ -# This image is used as an ephemeral container by the gateway. -# Each code execution gets its own container instance that is -# destroyed after completion. +# This image is used for bounded sandbox sessions managed by the gateway. +# A session can run multiple serialized executions and is destroyed after +# idle timeout, hard lifetime, execution budget exhaustion, or client delete. # # Build: docker build -t code-sandbox -f Dockerfile . # ============================================================ @@ -15,6 +15,7 @@ ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright # --- System Setup --- RUN apt-get update && \ + apt-get upgrade -y && \ apt-get install -y --no-install-recommends \ # Required for some scientific packages libgomp1 \ diff --git a/sandbox/requirements.txt b/sandbox/requirements.txt index 815d6f5..aae57f9 100644 --- a/sandbox/requirements.txt +++ b/sandbox/requirements.txt @@ -6,7 +6,7 @@ scipy==1.14.1 # Visualization matplotlib==3.9.2 seaborn==0.13.2 -Pillow==10.4.0 +Pillow==12.2.0 plotly==5.24.1 # Machine learning @@ -16,7 +16,7 @@ scikit-learn==1.5.2 sympy==1.13.3 # HTTP requests (internet access enabled) -requests==2.32.3 +requests==2.33.0 # Browser automation playwright==1.52.0 diff --git a/tests/test_gateway_unit.py b/tests/test_gateway_unit.py index b34c9cf..4d38ae5 100644 --- a/tests/test_gateway_unit.py +++ b/tests/test_gateway_unit.py @@ -102,6 +102,70 @@ async def test_remove_container_keeps_state_on_removal_failure(self) -> None: session = await self.state_backend.get_session("ctr-fail") self.assertIsNotNone(session, "Session state must be preserved when container removal fails") + async def test_recover_or_remove_managed_container_preserves_existing_budget_state(self) -> None: + await self.state_backend.save_session( + "ctr-1", + SessionInfo( + created_at=1.0, + last_activity=2.0, + network_enabled=False, + owner_subject="subject-1", + owner_tenant=None, + docker_daemon_id="daemon-local", + expires_at=100.0, + execution_count=7, + ), + session_timeout_seconds=60, + ) + fake_container = SimpleNamespace( + id="ctr-1", + labels={ + "managed-by": "code-execution-gateway", + "owner-subject": "subject-1", + "execution-count": "0", + }, + attrs={ + "Created": "2025-01-15T10:00:00Z", + "HostConfig": {"NetworkMode": "none"}, + }, + ) + + session = await gateway_app.recover_or_remove_managed_container( + fake_container, + missing_state_reason="unit-test-missing-state", + ) + + self.assertIsNotNone(session) + self.assertEqual(session.execution_count, 7) + self.assertEqual(session.expires_at, 100.0) + saved = await self.state_backend.get_session("ctr-1") + self.assertIsNotNone(saved) + self.assertEqual(saved.execution_count, 7) + + async def test_ensure_session_access_removes_managed_container_when_shared_state_missing(self) -> None: + fake_container = SimpleNamespace( + id="ctr-missing-state", + labels={ + "managed-by": "code-execution-gateway", + "owner-subject": "subject-1", + }, + ) + gateway_app.docker_client = SimpleNamespace( + containers=SimpleNamespace(get=mock.Mock(return_value=fake_container)), + ) + + with mock.patch.object(gateway_app, "REQUIRE_SHARED_STATE", True): + with mock.patch.object(gateway_app, "remove_container", mock.AsyncMock()) as remove_mock: + with self.assertRaises(HTTPException) as ctx: + await gateway_app.ensure_session_access( + "ctr-missing-state", + gateway_app.AuthContext(subject="subject-1", tenant=None, auth_type="api_key"), + ) + + self.assertEqual(ctx.exception.status_code, 404) + remove_mock.assert_awaited_once() + self.assertEqual(remove_mock.await_args.kwargs["reason"], "missing-shared-session-state") + async def test_create_container_session_cleans_up_when_state_save_fails(self) -> None: fake_container = SimpleNamespace(id="ctr-save-fail") gateway_app.docker_client = SimpleNamespace( @@ -253,6 +317,41 @@ async def test_run_code_in_sandbox_provisions_input_files_via_exec(self) -> None self.assertEqual(provision_mock.await_args.kwargs["target_dir"], "/home/sandbox") self.assertEqual(provision_mock.await_args.kwargs["files"][0].name, "input.txt") + async def test_run_code_in_sandbox_removes_session_after_execution_budget(self) -> None: + await self.state_backend.save_session( + "ctr-1", + SessionInfo( + created_at=1.0, + last_activity=1.0, + network_enabled=False, + owner_subject="subject-1", + owner_tenant=None, + docker_daemon_id="daemon-local", + execution_count=1, + ), + session_timeout_seconds=60, + ) + + fake_container = SimpleNamespace(id="ctr-1") + gateway_app.docker_client = SimpleNamespace( + containers=SimpleNamespace(get=mock.Mock(return_value=fake_container)), + ) + + with mock.patch.object(gateway_app, "MAX_EXECUTIONS_PER_SESSION", 1): + with mock.patch.object(gateway_app, "remove_container", mock.AsyncMock()) as remove_mock: + with self.assertRaises(HTTPException) as ctx: + await gateway_app.run_code_in_sandbox( + container_id="ctr-1", + language="python", + code="print('ok')", + timeout=10, + execution_id="exec-123", + ) + + self.assertEqual(ctx.exception.status_code, 429) + remove_mock.assert_awaited_once() + self.assertEqual(remove_mock.await_args.kwargs["reason"], "max-executions") + async def test_ensure_sandbox_env_file_provisions_via_exec(self) -> None: fake_container = SimpleNamespace(id="ctr-1") @@ -280,16 +379,27 @@ def _base_overrides(self, **extra): overrides = { "DEFAULT_TIMEOUT": 30, "MAX_TIMEOUT": 120, + "DOCKER_CLIENT_TIMEOUT": 30, + "SESSION_TIMEOUT_SECONDS": 1200, + "MAX_SESSION_LIFETIME_SECONDS": 3600, + "MAX_EXECUTIONS_PER_SESSION": 100, "REQUIRE_AUTH": False, "JWT_SECRET": None, "STATIC_API_KEYS": [], "ENABLE_CORS": False, "ENABLE_DOCS": False, "IS_PRODUCTION": False, + "PUBLIC_BETA_MODE": False, "DOCKER_HOST": "", + "SANDBOX_IMAGE": "code-sandbox:1.1.0", + "SANDBOX_RUNTIME": "", + "STRONG_SANDBOX_RUNTIMES": ["runsc", "kata-runtime"], + "REQUIRE_STRONG_SANDBOX_ISOLATION": False, "USE_DOCKER_DEFAULT_SECCOMP": True, "SECCOMP_PROFILE_DAEMON_PATH": "", "SANDBOX_NETWORK_MODE": "bridge", + "ALLOW_PIP_INSTALLS": False, + "ALLOW_SANDBOX_ENV_INJECTION": False, "REQUIRE_SHARED_STATE": False, "MAX_CONTAINERS_PER_PRINCIPAL": 1, "MAX_ACTIVE_SESSIONS": 1, @@ -346,6 +456,63 @@ def test_validate_runtime_configuration_rejects_docs_in_production(self) -> None with self.assertRaisesRegex(RuntimeError, "ENABLE_DOCS"): gateway_app.validate_runtime_configuration() + def test_validate_runtime_configuration_rejects_latest_image_in_production(self) -> None: + overrides = self._base_overrides( + REQUIRE_AUTH=True, + JWT_SECRET="secret", + IS_PRODUCTION=True, + DOCKER_HOST="tcp://remote-docker:2376", + REQUIRE_SHARED_STATE=True, + REDIS_URL="redis://redis:6379/0", + SANDBOX_IMAGE="code-sandbox:latest", + ) + + with ExitStack() as stack: + for name, value in overrides.items(): + stack.enter_context(mock.patch.object(gateway_app, name, value)) + + with self.assertRaisesRegex(RuntimeError, "SANDBOX_IMAGE"): + gateway_app.validate_runtime_configuration() + + def test_validate_runtime_configuration_requires_strong_runtime_for_public_beta(self) -> None: + overrides = self._base_overrides( + REQUIRE_AUTH=True, + JWT_SECRET="secret", + PUBLIC_BETA_MODE=True, + SANDBOX_NETWORK_MODE="none", + SANDBOX_IMAGE="code-sandbox:1.1.0", + SANDBOX_RUNTIME="", + DOCKER_HOST="tcp://remote-docker:2376", + REQUIRE_SHARED_STATE=True, + REDIS_URL="redis://redis:6379/0", + ) + + with ExitStack() as stack: + for name, value in overrides.items(): + stack.enter_context(mock.patch.object(gateway_app, name, value)) + + with self.assertRaisesRegex(RuntimeError, "Public beta mode requires"): + gateway_app.validate_runtime_configuration() + + def test_validate_runtime_configuration_accepts_public_beta_with_runsc(self) -> None: + overrides = self._base_overrides( + REQUIRE_AUTH=True, + JWT_SECRET="secret", + PUBLIC_BETA_MODE=True, + SANDBOX_NETWORK_MODE="none", + SANDBOX_IMAGE="code-sandbox:1.1.0", + SANDBOX_RUNTIME="runsc", + DOCKER_HOST="tcp://remote-docker:2376", + REQUIRE_SHARED_STATE=True, + REDIS_URL="redis://redis:6379/0", + ) + + with ExitStack() as stack: + for name, value in overrides.items(): + stack.enter_context(mock.patch.object(gateway_app, name, value)) + + gateway_app.validate_runtime_configuration() + def test_validate_runtime_configuration_requires_daemon_visible_seccomp_path(self) -> None: overrides = self._base_overrides( USE_DOCKER_DEFAULT_SECCOMP=False,