diff --git a/docs/features/remote-mcp/index.md b/docs/features/remote-mcp/index.md index 70ee4083f..8729fbc95 100644 --- a/docs/features/remote-mcp/index.md +++ b/docs/features/remote-mcp/index.md @@ -114,19 +114,19 @@ When running `docker-agent serve api` (no local browser, no callback server), th | Key | Value | | ---------------------------- | ---------------------------------------------------------------- | - | `cagent/type` | `"oauth_flow"` | - | `cagent/server_url` | The MCP server URL (for display / favicon) | - | `cagent/authorize_url` | The full URL the client should open in the user's browser | - | `cagent/state` | The `state` value the client must echo back when replying | + | `docker-agent/type` | `"oauth_flow"` | + | `docker-agent/server_url` | The MCP server URL (for display / favicon) | + | `docker-agent/authorize_url` | The full URL the client should open in the user's browser | + | `docker-agent/state` | The `state` value the client must echo back when replying | | `auth_server` | Issuer of the authorization server | | `auth_server_metadata` | RFC 8414 authorization-server metadata document | | `resource_metadata` | RFC 9728 protected-resource metadata document | - The client opens the browser at `cagent/authorize_url`, receives the OAuth callback at whatever endpoint the configured `redirect_uri` resolves to (typically a host-controlled bouncer that 302s into a deeplink), and replies to the elicitation with `accept` and `Content = {"code": "...", "state": "..."}`. The runtime verifies the `state`, exchanges the `code` at the token endpoint (using the same `redirect_uri` for RFC 6749 §4.1.3 binding), stores the token, and replays the original MCP request with `Authorization: Bearer ...`. + The client opens the browser at `docker-agent/authorize_url`, receives the OAuth callback at whatever endpoint the configured `redirect_uri` resolves to (typically a host-controlled bouncer that 302s into a deeplink), and replies to the elicitation with `accept` and `Content = {"code": "...", "state": "..."}`. The runtime verifies the `state`, exchanges the `code` at the token endpoint (using the same `redirect_uri` for RFC 6749 §4.1.3 binding), stores the token, and replays the original MCP request with `Authorization: Bearer ...`. -- **Flag not set** (legacy): the runtime emits only `auth_server_metadata` + `resource_metadata`; the client is expected to drive the OAuth flow itself (PKCE, DCR, token exchange) and reply with `Content = {"access_token": "...", "refresh_token": "...", ...}`. +- **Flag not set** (client-driven): the runtime emits only `auth_server_metadata` + `resource_metadata`; the client is expected to drive the OAuth flow itself (PKCE, DCR, token exchange) and reply with `Content = {"access_token": "...", "refresh_token": "...", ...}`. -The legacy `{access_token, ...}` reply shape is still accepted on the `--mcp-oauth-redirect-uri` path too: a client that prefers to do the exchange itself can ignore the `cagent/authorize_url`/`cagent/state` keys. +The client-driven `{access_token, ...}` reply shape is still accepted on the `--mcp-oauth-redirect-uri` path too: a client that prefers to do the exchange itself can ignore the `docker-agent/authorize_url`/`docker-agent/state` keys. A per-toolset `callbackRedirectURL` (in the YAML) overrides the runtime-wide `--mcp-oauth-redirect-uri` for that toolset. diff --git a/pkg/server/server.go b/pkg/server/server.go index 72aa57a40..3cc2bbad9 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -418,10 +418,30 @@ func (s *Server) elicitation(c echo.Context) error { // The state value is opaque, high-entropy and was generated in-process by // docker-agent's unmanaged OAuth flow (see GenerateState in // pkg/tools/mcp). Looking it up in the pending-oauth registry IS the -// authentication: docker-agent only accepts callbacks for states it is -// currently awaiting. An unknown state returns 404 (which is the -// expected outcome for replays and any state value the agent did not -// itself generate). +// per-request authorization: docker-agent only accepts callbacks for +// states it is currently awaiting. An unknown state returns 404 (which +// is the expected outcome for replays and any state value the agent did +// not itself generate). +// +// Threat model: +// +// - The registry is the primary defence. State values are >=128-bit +// opaque tokens from GenerateState; an attacker that has not +// observed a live state cannot brute-force one in a useful window. +// - State values DO appear in transit: in the elicitation Meta on the +// session SSE stream (visible only to the connected client), and in +// the authorize URL the user opens (visible to the user's browser +// and the authorization server). They are also written to debug +// logs when --debug is on. +// - If an attacker DOES observe a live state (e.g. via leaked debug +// logs or a compromised host), they could POST here with an +// attacker-controlled code; the resulting token would be bound to +// the attacker's account, not the user's. Setting --auth-token +// blocks this regardless of state leakage, because the route then +// also requires bearer auth. +// - Operators running docker-agent on a network-reachable interface +// SHOULD configure --auth-token. Defaults to localhost-only via the +// existing socket binding when not overridden. // // The handler never blocks: it hands the callback to the buffered // channel of the waiting flow and returns immediately. The token diff --git a/pkg/tools/mcp/oauth.go b/pkg/tools/mcp/oauth.go index e73ce36a3..0358e6115 100644 --- a/pkg/tools/mcp/oauth.go +++ b/pkg/tools/mcp/oauth.go @@ -27,6 +27,14 @@ import ( // resourceMetadataFromWWWAuth extracts resource metadata URL from WWW-Authenticate header var re = regexp.MustCompile(`resource="([^"]+)"`) +// unmanagedOAuthWaitTimeout is the upper bound on how long the unmanaged +// OAuth flow blocks waiting for a reply (elicitation result or +// out-of-band callback). Generous enough to accommodate a user clicking +// through an IdP consent screen and any IdP-side prompts; small enough +// that a silently-disconnected MCP client can't hold the per-session +// streaming lock indefinitely. +var unmanagedOAuthWaitTimeout = 10 * time.Minute + // oauth is a simple struct for compatibility with existing code type oauth struct { metadataClient *http.Client @@ -997,12 +1005,22 @@ func (t *oauthTransport) handleUnmanagedOAuthFlow(ctx context.Context, authServe // `elicCtx` -- when the direct callback wins we cancel elicCtx to // release the goroutine, but the surrounding ctx (used for the token // exchange below) stays alive. + // + // elicCtx also carries an upper bound on how long the flow waits. + // `ctx` here is the detached ctx from clientConnector.Connect, whose + // Done channel never fires on its own. Without a deadline, a silent + // MCP-client disconnect (TCP RST, idle timeout, process kill) leaves + // `requestElicitation` blocked forever, holding the per-session + // streaming lock at the SessionManager level. Subsequent user + // messages would then all return 409 / ErrSessionBusy until a + // process restart. unmanagedOAuthWaitTimeout caps that window; + // user-initiated cancellation still wins instantly via userCancelCh. type elicResult struct { result tools.ElicitationResult err error } elicCh := make(chan elicResult, 1) - elicCtx, elicCancel := context.WithCancel(ctx) + elicCtx, elicCancel := context.WithTimeout(ctx, unmanagedOAuthWaitTimeout) defer elicCancel() go func() { r, e := t.client.requestElicitation(elicCtx, &mcpsdk.ElicitParams{ @@ -1038,7 +1056,10 @@ func (t *oauthTransport) handleUnmanagedOAuthFlow(ctx context.Context, authServe userCancelCh = parentCtx.Done() } - var content map[string]any + var ( + token *OAuthToken + consumeErr error + ) select { case <-ctx.Done(): return ctx.Err() @@ -1050,6 +1071,14 @@ func (t *oauthTransport) handleUnmanagedOAuthFlow(ctx context.Context, authServe // per-session streaming lock is released by the SessionManager // goroutine's deferred Unlock). return parentCtx.Err() + case <-elicCtx.Done(): + // Defensive timeout: if the MCP client disconnected silently + // (TCP RST, idle timeout, process kill) AND requestElicitation + // does not honor its ctx, this case prevents the streaming + // lock from being held indefinitely. In practice the elicCh + // case below usually fires first with a deadline-exceeded + // error wrapped from requestElicitation. + return fmt.Errorf("OAuth flow timed out waiting for a reply after %s", unmanagedOAuthWaitTimeout) case cb := <-callbackCh: // Direct deeplink callback won. Release the in-flight // elicitation goroutine; any UI the embedder showed for this @@ -1064,9 +1093,13 @@ func (t *oauthTransport) handleUnmanagedOAuthFlow(ctx context.Context, authServe return fmt.Errorf("OAuth flow declined by provider: %s", msg) } slog.DebugContext(ctx, "OAuth callback received via deeplink, exchanging code for token") - // Synthesize the {code, state} payload that - // consumeUnmanagedElicitationReply expects. - content = map[string]any{"code": cb.Code, "state": expectedState} + // State validation is performed by the registry lookup in + // mcpOAuthCallback: the only state keys present in the + // pending-OAuth registry are ones the runtime itself + // generated and is currently awaiting. Re-validating against + // `expectedState` here would be a tautology, so we go + // straight to the token exchange. + token, consumeErr = t.exchangeAuthorizationCode(ctx, cb.Code, authServerMetadata, pkceVerifier, clientID, clientSecret, redirectURI, resourceIndicator) case er := <-elicCh: if er.err != nil { return fmt.Errorf("failed to send elicitation request: %w", er.err) @@ -1078,24 +1111,24 @@ func (t *oauthTransport) handleUnmanagedOAuthFlow(ctx context.Context, authServe if er.result.Content == nil { return errors.New("no payload received from client") } - content = er.result.Content + // On the elicitation path the state arrives from the client + // and MUST be validated against expectedState; the registry + // did not see this delivery. + token, consumeErr = t.consumeUnmanagedElicitationReply( + ctx, + er.result.Content, + authServerMetadata, + driveFlow, + expectedState, + pkceVerifier, + clientID, + clientSecret, + redirectURI, + resourceIndicator, + ) } - - token, err := t.consumeUnmanagedElicitationReply( - ctx, - content, - authServerMetadata, - resourceMetadata, - driveFlow, - expectedState, - pkceVerifier, - clientID, - clientSecret, - redirectURI, - resourceIndicator, - ) - if err != nil { - return err + if consumeErr != nil { + return consumeErr } if driveFlow { @@ -1133,12 +1166,11 @@ func (t *oauthTransport) handleUnmanagedOAuthFlow(ctx context.Context, authServe // the stored PKCE verifier + client credentials to make the exchange). // // If the client mixes shapes (e.g. supplies both access_token and code), the -// access_token wins to preserve the legacy behavior. +// access_token wins to preserve the client-driven behavior. func (t *oauthTransport) consumeUnmanagedElicitationReply( ctx context.Context, content map[string]any, authServerMetadata *AuthorizationServerMetadata, - resourceMetadata protectedResourceMetadata, driveFlow bool, expectedState, pkceVerifier, clientID, clientSecret, redirectURI, resourceIndicator string, ) (*OAuthToken, error) { @@ -1170,7 +1202,20 @@ func (t *oauthTransport) consumeUnmanagedElicitationReply( if subtle.ConstantTimeCompare([]byte(state), []byte(expectedState)) != 1 { return nil, errors.New("state mismatch in elicitation reply") } + return t.exchangeAuthorizationCode(ctx, code, authServerMetadata, pkceVerifier, clientID, clientSecret, redirectURI, resourceIndicator) +} +// exchangeAuthorizationCode posts to the auth server's token endpoint and +// returns the resulting token. Shared between the elicitation-reply path +// (after state validation against the client-supplied state) and the +// out-of-band callback path (where state was already validated by the +// pending-OAuth registry lookup, so no in-flow state check is needed). +func (t *oauthTransport) exchangeAuthorizationCode( + ctx context.Context, + code string, + authServerMetadata *AuthorizationServerMetadata, + pkceVerifier, clientID, clientSecret, redirectURI, resourceIndicator string, +) (*OAuthToken, error) { slog.DebugContext(ctx, "Exchanging authorization code received from client") token, err := exchangeCodeForToken( ctx, @@ -1186,6 +1231,5 @@ func (t *oauthTransport) consumeUnmanagedElicitationReply( if err != nil { return nil, fmt.Errorf("failed to exchange code for token: %w", err) } - _ = resourceMetadata // captured for future audit logging; intentionally unused here return token, nil } diff --git a/pkg/tools/mcp/oauth_test.go b/pkg/tools/mcp/oauth_test.go index b947df679..6663c1d10 100644 --- a/pkg/tools/mcp/oauth_test.go +++ b/pkg/tools/mcp/oauth_test.go @@ -1764,6 +1764,53 @@ func TestUnmanagedOAuthFlow_DriveFlow_AbortsOnParentCtxCancellation(t *testing.T "token endpoint must NOT be hit when the parent ctx is cancelled before any callback") } +// TestUnmanagedOAuthFlow_DriveFlow_TimesOutWhenNoReplyArrives ensures the +// per-flow deadline releases the streaming lock even when the MCP client +// disconnects silently and never produces an elicitation reply. +// +// Without unmanagedOAuthWaitTimeout, requestElicitation would block on a +// dead session indefinitely, the per-session lock at the SessionManager +// level would stay held, and every subsequent message from that session +// would return 409 / ErrSessionBusy until a process restart. +func TestUnmanagedOAuthFlow_DriveFlow_TimesOutWhenNoReplyArrives(t *testing.T) { + original := unmanagedOAuthWaitTimeout + unmanagedOAuthWaitTimeout = 200 * time.Millisecond + t.Cleanup(func() { unmanagedOAuthWaitTimeout = original }) + + srv := newUnmanagedOAuthTestServer(t) + defer srv.Close() + + const redirectURI = "https://example.test/oauth/cb" + capture := &elicitCaptured{} + // Block until the test's t.Context is cancelled (after the + // roundtrip has returned). This emulates a silent client + // disconnect: requestElicitation honors its ctx and returns when + // elicCtx hits its deadline, surfacing the deadline-exceeded + // error on elicCh. + capture.replyFn = func(req *gomcp.ElicitParams) tools.ElicitationResult { + <-t.Context().Done() + return tools.ElicitationResult{Action: tools.ElicitationActionDecline} + } + transport, _ := newUnmanagedTestTransport(t, srv.URL, redirectURI, capture) + + req, err := http.NewRequestWithContext(t.Context(), http.MethodPost, srv.URL, strings.NewReader("{}")) + require.NoError(t, err) + req.Header.Set("Content-Type", "application/json") + + resp, rtErr := transport.RoundTrip(req) + if resp != nil { + _ = resp.Body.Close() + } + require.Error(t, rtErr) + assert.True(t, + strings.Contains(rtErr.Error(), "timed out") || + strings.Contains(rtErr.Error(), "context deadline exceeded"), + "expected timeout error, got: %v", rtErr, + ) + assert.Equal(t, int32(0), srv.tokenCalls.Load(), + "token endpoint must NOT be hit on timeout") +} + // TestUnmanagedRedirectURI_PerToolsetTakesPrecedence verifies the precedence // order: per-toolset RemoteOAuthConfig.CallbackRedirectURL overrides the // runtime-wide --mcp-oauth-redirect-uri.