diff --git a/CMakeLists.txt b/CMakeLists.txt
index c42de48..f7259cb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -86,7 +86,7 @@ if (GGML_METAL)
 
     # copy ggml-metal.metal to bin directory
 
-    configure_file(ggml/src/ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
+    configure_file(ggml/src/ggml-metal/ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
 
 endif()
 
diff --git a/bert.cpp b/bert.cpp
index d5b90f4..da19f81 100644
--- a/bert.cpp
+++ b/bert.cpp
@@ -64,7 +64,9 @@ For these use the llama.cpp interface.
 #include <cmath>
 #include <fstream>
 #include <algorithm>
+#include <memory>
 #include <stdexcept>
+#include <utility>
 
 #define BERT_MAX_NODES 4096
 #define KEY_TOKEN_LIST "tokenizer.ggml.tokens"
@@ -1045,6 +1047,13 @@ BERT_API struct bert_ctx * bert_load_from_file_wordpiece(const char * fname, boo
 // Buffer management
 // ---------------------------------------------------------------------------
 
+// Forward declarations for graph caching — full definitions live with the
+// graph-construction code further below. bert_free needs the complete type
+// to call ~bert_graph_cache_map, so we provide a small helper there too.
+struct bert_cached_graph;
+struct bert_graph_cache_map;
+static void bert_destroy_graph_cache(void * p);
+
 int32_t bert_n_embd      (bert_ctx * ctx) { return ctx->model.hparams.n_embd; }
 int32_t bert_n_max_tokens(bert_ctx * ctx) { return ctx->model.hparams.n_max_tokens; }
 
@@ -1088,6 +1097,10 @@ void bert_deallocate_buffers(bert_ctx * ctx) {
 
 void bert_free(bert_ctx * ctx) {
     if (!ctx) return;
+    if (ctx->graph_cache) {
+        bert_destroy_graph_cache(ctx->graph_cache);
+        ctx->graph_cache = nullptr;
+    }
     if (ctx->compute_alloc) {
         ggml_gallocr_free(ctx->compute_alloc);
         ctx->compute_alloc = nullptr;
@@ -1114,184 +1127,264 @@ void bert_free(bert_ctx * ctx) {
 // ---------------------------------------------------------------------------
 // Graph construction
 // ---------------------------------------------------------------------------
+//
+// The graph is parameterised by (max_len, n_batch) only — input contents are
+// filled at forward time. We cache one graph per distinct (max_len, n_batch)
+// in ctx->graph_cache so repeated forwards at the same shape skip both the
+// graph build and the gallocr planning step. This is the dominant cost for
+// short-input latency.
+
+struct bert_cached_graph {
+    int                  max_len    = 0;
+    int                  n_batch    = 0;
+
+    std::vector<uint8_t> buf_meta;             // backing for ctx_compute
+    ggml_context       * ctx_compute = nullptr;
+    ggml_cgraph        * gf          = nullptr;
+    ggml_gallocr_t       alloc       = nullptr;
+
+    // Cached input/output tensor pointers — set during build, used by every
+    // subsequent forward to avoid name lookups.
+    ggml_tensor * token_layer = nullptr;
+    ggml_tensor * token_types = nullptr;  // pre-filled with zeros at build time
+    ggml_tensor * pad_mask    = nullptr;
+    ggml_tensor * positions   = nullptr;  // pre-filled with [0..max_len-1] at build time
+    ggml_tensor * sum         = nullptr;
+    ggml_tensor * minus_one   = nullptr;
+    ggml_tensor * output      = nullptr;
+
+    // Per-call scratch buffers — sized once per (max_len, n_batch) to avoid
+    // heap churn on every forward. Only the contents are rewritten per call.
+    std::vector<int32_t> scratch_tl;   // token ids
+    std::vector<float>   scratch_pm;   // pad mask (1=real, 0=padding)
+    std::vector<float>   scratch_sv;   // mean-pool weights (1/seq_len on real, 0 on pad)
+
+    // Constant buffers — filled once at build, re-uploaded each call on CPU
+    // (gallocr quirk: input-flagged tensors don't stay stable between computes).
+    std::vector<int32_t> scratch_pos;
+    std::vector<int32_t> scratch_types;
+
+    ~bert_cached_graph() {
+        if (alloc)       ggml_gallocr_free(alloc);
+        if (ctx_compute) ggml_free(ctx_compute);
+    }
+};
 
-struct ggml_cgraph * bert_build_graph(bert_ctx * ctx, bert_batch batch) {
+using bert_graph_key = std::pair<int,int>;   // (max_len, n_batch)
+struct bert_graph_cache_map {
+    std::map<bert_graph_key, std::unique_ptr<bert_cached_graph>> entries;
+};
+
+static bert_graph_cache_map & bert_get_cache(bert_ctx * ctx) {
+    if (!ctx->graph_cache) ctx->graph_cache = new bert_graph_cache_map();
+    return *static_cast<bert_graph_cache_map *>(ctx->graph_cache);
+}
+
+static void bert_destroy_graph_cache(void * p) {
+    delete static_cast<bert_graph_cache_map *>(p);
+}
+
+// Build the compute graph for fixed (max_len, n_batch). The returned struct
+// owns its ggml_context / cgraph / allocator and is cached in bert_ctx.
+//
+// Notes:
+//   - Self-attention uses ggml_flash_attn_ext (fused scaled-dot-product
+//     attention with mask) instead of an unrolled mul_mat / softmax / mul_mat
+//     so we get the Metal fused kernel and skip 3 cont+permute copies.
+//   - Mean-pooling uses a 3D mul_mat that preserves the batch dim, fixing the
+//     bs>1 reshape assertion in the previous implementation.
+//   - L2 normalisation is still done on the host after backend_tensor_get.
+static std::unique_ptr<bert_cached_graph>
+build_cached_graph(bert_ctx * ctx, int max_len, int n_batch) {
     const bert_hparams & hp    = ctx->model.hparams;
     const bert_model   & model = ctx->model;
 
     const int   n_embd         = hp.n_embd;
     const int   n_layer        = hp.n_layer;
-    const int   n_max_tokens   = hp.n_max_tokens;
     const int   n_head         = hp.n_head;
     const float layer_norm_eps = hp.layer_norm_eps;
     const int   d_head         = n_embd / n_head;
 
-    const int n_batch = (int)batch.size();
-    int cur_max_len = 0;
-    for (const auto & seq : batch)
-        cur_max_len = std::max(cur_max_len, (int)seq.size());
-
-    if (cur_max_len > n_max_tokens) {
-        ggml_log_internal(GGML_LOG_LEVEL_ERROR,
-            "Too many tokens: max %d, got %d\n", n_max_tokens, cur_max_len);
-        return nullptr;
-    }
-
-    struct ggml_init_params p = {
-        ctx->buf_compute_meta.size(),
-        ctx->buf_compute_meta.data(),
-        true
-    };
-    struct ggml_context * ctx0 = ggml_init(p);
-    struct ggml_cgraph  * gf   = ggml_new_graph_custom(ctx0, BERT_MAX_NODES, false);
-
-    // Input tensors — declared no-alloc; gallocr fills them in bert_forward_batch
-    struct ggml_tensor * token_layer = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cur_max_len * n_batch);
-    ggml_set_name(token_layer, "token_layer");
-
-    struct ggml_tensor * token_types = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cur_max_len * n_batch);
-    ggml_set_name(token_types, "token_types");
-
-    struct ggml_tensor * pad_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 1, cur_max_len, 1, n_batch);
-    ggml_set_name(pad_mask, "pad_mask");
-
-    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cur_max_len * n_batch);
-    ggml_set_name(positions, "positions");
-
-    struct ggml_tensor * sum = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, cur_max_len, 1, n_batch);
-    ggml_set_name(sum, "sum");
-
-    struct ggml_tensor * minus_one = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-    ggml_set_name(minus_one, "minus_one");
-
-    ggml_set_input(token_layer);
-    ggml_set_input(token_types);
-    ggml_set_input(pad_mask);
-    ggml_set_input(positions);
-    ggml_set_input(sum);
-    ggml_set_input(minus_one);
-
-    // ---------------------------------------------------------------------------
-    // Attention mask
-    // pad_mask is [1, cur_max_len, 1, n_batch] — a column vector per batch.
-    // mul_mat gives the outer product [cur_max_len, cur_max_len, 1, n_batch]:
-    //   active×active = 1.0,  padded×anything = 0.0
-    // Scale to 100000 then subtract 100000:
-    //   active pair → 0.0  (no masking)
-    //   padded pair → -100000.0  (suppressed after softmax)
-    // ---------------------------------------------------------------------------
-    struct ggml_tensor * attn_mask = ggml_mul_mat(ctx0, pad_mask, pad_mask);
-    attn_mask = ggml_scale_inplace(ctx0, attn_mask, 100000.0f);
-    struct ggml_tensor * large_offset = ggml_scale(ctx0, minus_one, 100000.0f);
-    attn_mask = ggml_add(ctx0, attn_mask, large_offset);
-
-    // ---------------------------------------------------------------------------
-    // Embeddings — all shaped [n_embd, cur_max_len, n_batch]
-    // ---------------------------------------------------------------------------
-    struct ggml_tensor * word_embd = ggml_get_rows(ctx0, model.word_embeddings, token_layer);
-    word_embd = ggml_reshape_3d(ctx0, word_embd, n_embd, cur_max_len, n_batch);
-
-    struct ggml_tensor * type_embd = ggml_get_rows(ctx0, model.token_type_embeddings, token_types);
-    type_embd = ggml_reshape_3d(ctx0, type_embd, n_embd, cur_max_len, n_batch);
-
-    struct ggml_tensor * pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
-    pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, cur_max_len, n_batch);
-
-    struct ggml_tensor * inpL = ggml_add(ctx0, word_embd, type_embd);
+    auto cg = std::make_unique<bert_cached_graph>();
+    cg->max_len = max_len;
+    cg->n_batch = n_batch;
+
+    cg->buf_meta.resize(BERT_MAX_NODES * ggml_tensor_overhead() + ggml_graph_overhead());
+    ggml_init_params p = { cg->buf_meta.size(), cg->buf_meta.data(), /*no_alloc=*/true };
+    cg->ctx_compute = ggml_init(p);
+    cg->gf = ggml_new_graph_custom(cg->ctx_compute, BERT_MAX_NODES, false);
+    ggml_context * ctx0 = cg->ctx_compute;
+
+    // Input tensors
+    cg->token_layer = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, max_len * n_batch);
+    ggml_set_name(cg->token_layer, "token_layer");
+    cg->token_types = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, max_len * n_batch);
+    ggml_set_name(cg->token_types, "token_types");
+    cg->pad_mask    = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 1, max_len, 1, n_batch);
+    ggml_set_name(cg->pad_mask, "pad_mask");
+    cg->positions   = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, max_len * n_batch);
+    ggml_set_name(cg->positions, "positions");
+    cg->sum         = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, max_len, 1, n_batch);
+    ggml_set_name(cg->sum, "sum");
+    cg->minus_one   = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(cg->minus_one, "minus_one");
+
+    ggml_set_input(cg->token_layer);
+    ggml_set_input(cg->token_types);
+    ggml_set_input(cg->pad_mask);
+    ggml_set_input(cg->positions);
+    ggml_set_input(cg->sum);
+    ggml_set_input(cg->minus_one);
+
+    // Attention mask: 0 for unmasked pairs, -large for masked.
+    // pad_mask [1, max_len, 1, n_batch] outer product -> [max_len, max_len, 1, n_batch]
+    ggml_tensor * attn_mask_f32 = ggml_mul_mat(ctx0, cg->pad_mask, cg->pad_mask);
+    attn_mask_f32 = ggml_scale_inplace(ctx0, attn_mask_f32, 100000.0f);
+    ggml_tensor * large_offset = ggml_scale(ctx0, cg->minus_one, 100000.0f);
+    attn_mask_f32 = ggml_add(ctx0, attn_mask_f32, large_offset);
+    // flash_attn_ext requires f16 contiguous mask
+    ggml_tensor * attn_mask = ggml_cast(ctx0, attn_mask_f32, GGML_TYPE_F16);
+
+    // Embeddings — all shaped [n_embd, max_len, n_batch]
+    ggml_tensor * word_embd = ggml_get_rows(ctx0, model.word_embeddings, cg->token_layer);
+    word_embd = ggml_reshape_3d(ctx0, word_embd, n_embd, max_len, n_batch);
+    ggml_tensor * type_embd = ggml_get_rows(ctx0, model.token_type_embeddings, cg->token_types);
+    type_embd = ggml_reshape_3d(ctx0, type_embd, n_embd, max_len, n_batch);
+    ggml_tensor * pos_embd  = ggml_get_rows(ctx0, model.position_embeddings, cg->positions);
+    pos_embd  = ggml_reshape_3d(ctx0, pos_embd, n_embd, max_len, n_batch);
+
+    ggml_tensor * inpL = ggml_add(ctx0, word_embd, type_embd);
     inpL = ggml_add(ctx0, inpL, pos_embd);
-
-    // Embedding LayerNorm
     inpL = ggml_norm_inplace(ctx0, inpL, layer_norm_eps);
     inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.ln_e_w), model.ln_e_b);
 
-    // ---------------------------------------------------------------------------
-    // Transformer layers
-    // ---------------------------------------------------------------------------
+    const float attn_scale = 1.0f / sqrtf((float)d_head);
+
     for (int il = 0; il < n_layer; ++il) {
-        const bert_layer & layer = model.layers[il];
-        struct ggml_tensor * cur = inpL;
-
-        // Self-attention
-        {
-            // Project Q, K, V and reshape to [d_head, cur_max_len, n_head, n_batch]
-            auto proj = [&](struct ggml_tensor * w, struct ggml_tensor * b) {
-                struct ggml_tensor * x = ggml_add(ctx0, ggml_mul_mat(ctx0, w, cur), b);
-                x = ggml_reshape_4d(ctx0, x, d_head, n_head, cur_max_len, n_batch);
-                return ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
-            };
-            struct ggml_tensor * Q = proj(layer.q_w, layer.q_b);
-            struct ggml_tensor * K = proj(layer.k_w, layer.k_b);
-            struct ggml_tensor * V = proj(layer.v_w, layer.v_b);
-
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-            KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
-            KQ = ggml_add(ctx0, KQ, attn_mask);
-            KQ = ggml_soft_max(ctx0, KQ);
-
-            V = ggml_cont(ctx0, ggml_transpose(ctx0, V));
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
-            KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3));
-            cur = ggml_reshape_3d(ctx0, KQV, n_embd, cur_max_len, n_batch);
-        }
+        const bert_layer & L = model.layers[il];
+        ggml_tensor * cur = inpL;
+
+        // Self-attention: Q, K, V projections then flash_attn_ext
+        // Projections produce [n_embd, max_len, n_batch]. Reshape+permute to
+        // [d_head, max_len, n_head, n_batch] — the layout flash_attn_ext wants.
+        // No ggml_cont needed; flash_attn handles non-contiguous q/k/v.
+        auto proj = [&](ggml_tensor * w, ggml_tensor * b) {
+            ggml_tensor * x = ggml_add(ctx0, ggml_mul_mat(ctx0, w, cur), b);
+            x = ggml_reshape_4d(ctx0, x, d_head, n_head, max_len, n_batch);
+            return ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
+        };
+        ggml_tensor * Q = proj(L.q_w, L.q_b);
+        ggml_tensor * K = proj(L.k_w, L.k_b);
+        ggml_tensor * V = proj(L.v_w, L.v_b);
+
+        // flash_attn_ext result: [d_head, n_head, max_len, n_batch] contiguous.
+        // Reshape directly to [n_embd, max_len, n_batch] — d_head and n_head
+        // are contiguous in memory so they merge cleanly.
+        // Precision: F16 accumulate on accelerated backends is ~1.3-1.5x faster
+        // (native fp16 hw) with negligible numerical drift for sentence embeddings.
+        // CPU has no native fp16 path, so F16 there would just upconvert and pay
+        // extra work — force F32 in that case.
+        ggml_tensor * sdpa = ggml_flash_attn_ext(ctx0, Q, K, V, attn_mask,
+                                                 attn_scale, 0.0f, 0.0f);
+        if (ggml_backend_is_cpu(ctx->backend))
+            ggml_flash_attn_ext_set_prec(sdpa, GGML_PREC_F32);
+        cur = ggml_reshape_3d(ctx0, sdpa, n_embd, max_len, n_batch);
 
         // Output projection + residual + post-attention LayerNorm
-        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.o_w, cur), layer.o_b);
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, L.o_w, cur), L.o_b);
         cur = ggml_add(ctx0, cur, inpL);
         cur = ggml_norm_inplace(ctx0, cur, layer_norm_eps);
-        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_att_w), layer.ln_att_b);
+        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, L.ln_att_w), L.ln_att_b);
 
-        struct ggml_tensor * att_out = cur;
+        ggml_tensor * att_out = cur;
 
         // FFN + residual + output LayerNorm
-        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_i_w, cur), layer.ff_i_b);
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, L.ff_i_w, cur), L.ff_i_b);
         cur = ggml_gelu(ctx0, cur);
-        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_o_w, cur), layer.ff_o_b);
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, L.ff_o_w, cur), L.ff_o_b);
         cur = ggml_add(ctx0, att_out, cur);
         cur = ggml_norm_inplace(ctx0, cur, layer_norm_eps);
-        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_out_w), layer.ln_out_b);
+        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, L.ln_out_w), L.ln_out_b);
 
         inpL = cur;
     }
 
-    // ---------------------------------------------------------------------------
-    // Mean pooling
-    //
-    // inpL:   [n_embd, cur_max_len, n_batch]
-    // sum:    [cur_max_len, 1, n_batch]  — each active position holds 1/seq_len,
-    //                                      padding positions hold 0.
+    // Mean pooling via 3D mul_mat that preserves the batch dim.
+    //   inpL: [n_embd, max_len, n_batch]
+    //   sum:  [max_len, 1, n_batch]    (1/seq_len on active positions, 0 on padding)
     //
-    // We multiply element-wise to zero out padding and scale active tokens,
-    // then sum-reduce across the sequence dimension via mul_mat.
-    // Result: [n_embd, n_batch]
-    // ---------------------------------------------------------------------------
-
-    // Reshape sum to [1, cur_max_len, n_batch] so it broadcasts over n_embd
-    struct ggml_tensor * sum_3d = ggml_reshape_3d(ctx0, sum, 1, cur_max_len, n_batch);
-
-    // Scale + zero-pad: [n_embd, cur_max_len, n_batch]
-    struct ggml_tensor * masked_tokens = ggml_mul(ctx0, inpL, sum_3d);
-
-    // Flatten to [n_embd, cur_max_len * n_batch]
-    struct ggml_tensor * inpL_2d = ggml_reshape_2d(ctx0, masked_tokens, n_embd, cur_max_len * n_batch);
+    // For broadcasted batched mul_mat(A, B):
+    //   A: [k, n, batch], B: [k, m, batch]  -> [n, m, batch]
+    // We want [n_embd, 1, n_batch]. So set A = inpL_T with shape [max_len, n_embd, n_batch]
+    // (transpose dims 0/1 only — batch dim preserved) and B = sum.
+    ggml_tensor * inpL_T = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));   // [max_len, n_embd, n_batch]
+    ggml_tensor * pooled = ggml_mul_mat(ctx0, inpL_T, cg->sum);           // [n_embd, 1, n_batch]
+    pooled = ggml_reshape_2d(ctx0, pooled, n_embd, n_batch);
+
+    cg->output = pooled;
+    ggml_set_output(cg->output);
+    ggml_build_forward_expand(cg->gf, cg->output);
+
+    // Allocate compute buffers for this fixed graph shape.
+    ggml_backend_dev_t          dev  = ggml_backend_get_device(ctx->backend);
+    ggml_backend_buffer_type_t  buft = ggml_backend_dev_buffer_type(dev);
+    cg->alloc = ggml_gallocr_new(buft);
+    if (!ggml_gallocr_alloc_graph(cg->alloc, cg->gf)) {
+        fprintf(stderr, "%s: gallocr_alloc_graph failed (max_len=%d, n_batch=%d)\n",
+                __func__, max_len, n_batch);
+        return nullptr;
+    }
 
-    // Flatten sum to [cur_max_len * n_batch]
-    struct ggml_tensor * sum_1d = ggml_reshape_1d(ctx0, sum, cur_max_len * n_batch);
+    // ----------------------------------------------------------------------
+    // One-time fill for tensors that never vary across forwards at this shape:
+    //   - positions: always [0, 1, ..., max_len-1] for every batch element
+    //   - token_types: always 0 for sentence-embedding BERT
+    //   - minus_one: literal -1.0
+    // After this point the backend buffers hold the correct values forever,
+    // so bert_forward_batch can skip writing them on every call.
+    // ----------------------------------------------------------------------
+    {
+        cg->scratch_pos.assign((size_t)max_len * n_batch, 0);
+        for (int ba = 0; ba < n_batch; ++ba)
+            for (int i = 0; i < max_len; ++i)
+                cg->scratch_pos[(size_t)ba * max_len + i] = i;
+        cg->scratch_types.assign((size_t)max_len * n_batch, 0);
+        const float m1 = -1.0f;
 
-    // Transpose to [cur_max_len * n_batch, n_embd] for mul_mat reduction
-    struct ggml_tensor * inpL_2d_T = ggml_cont(ctx0, ggml_transpose(ctx0, inpL_2d));
+        ggml_backend_tensor_set(cg->positions,   cg->scratch_pos.data(),   0, ggml_nbytes(cg->positions));
+        ggml_backend_tensor_set(cg->token_types, cg->scratch_types.data(), 0, ggml_nbytes(cg->token_types));
+        ggml_backend_tensor_set(cg->minus_one,   &m1,                      0, sizeof(m1));
+    }
 
-    // mul_mat: [cur_max_len*n_batch, n_embd] × [cur_max_len*n_batch, 1]
-    //       → [n_embd, n_batch]  (after transpose below)
-    struct ggml_tensor * pooled = ggml_mul_mat(ctx0, inpL_2d_T, sum_1d);
+    // Size per-call scratch once — every forward at this shape rewrites contents
+    // in place, never reallocs.
+    cg->scratch_tl.resize((size_t)max_len * n_batch);
+    cg->scratch_pm.resize((size_t)max_len * n_batch);
+    cg->scratch_sv.resize((size_t)max_len * n_batch);
 
-    // Rotate back to [n_embd, n_batch]
-    inpL = ggml_cont(ctx0, ggml_transpose(ctx0, pooled));
-    inpL = ggml_reshape_2d(ctx0, inpL, n_embd, n_batch);
+    return cg;
+}
 
-    ggml_set_output(inpL);
-    ggml_build_forward_expand(gf, inpL);
-    ggml_free(ctx0);
-    return gf;
+// Kept for ABI compatibility with bert.h: builds (and caches) a graph sized to
+// the given batch. The returned cgraph is owned by ctx — callers MUST NOT free it.
+struct ggml_cgraph * bert_build_graph(bert_ctx * ctx, bert_batch batch) {
+    int max_len = 0;
+    for (const auto & seq : batch) max_len = std::max(max_len, (int)seq.size());
+    if (max_len > ctx->model.hparams.n_max_tokens) {
+        ggml_log_internal(GGML_LOG_LEVEL_ERROR,
+            "Too many tokens: max %d, got %d\n", ctx->model.hparams.n_max_tokens, max_len);
+        return nullptr;
+    }
+    const int n_batch = (int)batch.size();
+    auto & cache = bert_get_cache(ctx);
+    auto key = std::make_pair(max_len, n_batch);
+    auto it = cache.entries.find(key);
+    if (it == cache.entries.end()) {
+        auto cg = build_cached_graph(ctx, max_len, n_batch);
+        if (!cg) return nullptr;
+        it = cache.entries.emplace(key, std::move(cg)).first;
+    }
+    return it->second->gf;
 }
 
 // ---------------------------------------------------------------------------
@@ -1300,95 +1393,71 @@ struct ggml_cgraph * bert_build_graph(bert_ctx * ctx, bert_batch batch) {
 
 void bert_forward_batch(bert_ctx * ctx, bert_batch batch,
                         float * embeddings, int32_t n_threads) {
+    const int n_batch = (int)batch.size();
+    if (n_batch == 0) return;
 
-#if 1
-    // Resize meta buffer to fit this graph — safe to resize each call
-    ctx->buf_compute_meta.resize(
-        BERT_MAX_NODES * ggml_tensor_overhead() + ggml_graph_overhead());
-#endif
-
-    struct ggml_cgraph * gf = bert_build_graph(ctx, batch);
-    if (!gf) { fprintf(stderr, "%s: build graph failed\n", __func__); return; }
-
-    // Scoped allocator — fresh for every call, freed at the end
-    ggml_backend_dev_t          dev        = ggml_backend_get_device(ctx->backend);
-    ggml_backend_buffer_type_t  buft       = ggml_backend_dev_buffer_type(dev);
-    ggml_gallocr_t              local_alloc = ggml_gallocr_new(buft);
-
-    if (!ggml_gallocr_alloc_graph(local_alloc, gf)) {
-        fprintf(stderr, "%s: local graph allocation failed\n", __func__);
-        ggml_gallocr_free(local_alloc);
+    int max_len = 0;
+    for (const auto & s : batch) max_len = std::max(max_len, (int)s.size());
+    if (max_len == 0) return;
+    if (max_len > ctx->model.hparams.n_max_tokens) {
+        ggml_log_internal(GGML_LOG_LEVEL_ERROR,
+            "Too many tokens: max %d, got %d\n", ctx->model.hparams.n_max_tokens, max_len);
         return;
     }
 
-    const int n_batch = (int)batch.size();
-    int cur_max_len = 0;
-    for (const auto & s : batch)
-        cur_max_len = std::max(cur_max_len, (int)s.size());
-
-    struct ggml_tensor * token_layer = ggml_graph_get_tensor(gf, "token_layer");
-    struct ggml_tensor * token_types = ggml_graph_get_tensor(gf, "token_types");
-    struct ggml_tensor * pad_mask    = ggml_graph_get_tensor(gf, "pad_mask");
-    struct ggml_tensor * positions   = ggml_graph_get_tensor(gf, "positions");
-    struct ggml_tensor * sum         = ggml_graph_get_tensor(gf, "sum");
-    struct ggml_tensor * minus_one   = ggml_graph_get_tensor(gf, "minus_one");
-
-    // Fill input buffers
+    auto & cache = bert_get_cache(ctx);
+    auto key = std::make_pair(max_len, n_batch);
+    auto it = cache.entries.find(key);
+    if (it == cache.entries.end()) {
+        auto cg = build_cached_graph(ctx, max_len, n_batch);
+        if (!cg) { fprintf(stderr, "%s: build graph failed\n", __func__); return; }
+        it = cache.entries.emplace(key, std::move(cg)).first;
+    }
+    bert_cached_graph & cg = *it->second;
+
+    // Fill per-call input buffers. positions / token_types / minus_one were
+    // pre-filled at build time and never change. Scratch vectors are sized
+    // once in build_cached_graph; we only rewrite their contents here.
     {
-        std::vector<int32_t> tl(cur_max_len * n_batch);
-        std::vector<int32_t> tt(cur_max_len * n_batch, 0);
-        std::vector<float>   pm(cur_max_len * n_batch, 0.0f);
-        std::vector<int32_t> pos(cur_max_len * n_batch);
-        std::vector<float>   sv(cur_max_len * n_batch, 0.0f);
-        const float m1 = -1.0f;
+        int32_t * tl = cg.scratch_tl.data();
+        float   * pm = cg.scratch_pm.data();
+        float   * sv = cg.scratch_sv.data();
 
         for (int ba = 0; ba < n_batch; ++ba) {
             const int cl = (int)batch[ba].size();
-            // Number of real (non-padding) tokens for mean-pool weight.
-            // cl includes [CLS] and [SEP]; to exclude them from pooling,
-            // use cl-2 as the divisor and skip positions 0 and cl-1 below.
-            // Currently we include them (standard all-token mean pool).
             const float w = (cl > 0) ? 1.0f / (float)cl : 0.0f;
-
-            for (int i = 0; i < cur_max_len; ++i) {
-                const int idx = ba * cur_max_len + i;
-                if (i < cl) {
-                    tl[idx]  = batch[ba][i];
-                    pm[idx]  = 1.0f;
-                    sv[idx]  = w;
-                } else {
-                    tl[idx]  = 0;   // padding token id (unused, masked out)
-                    pm[idx]  = 0.0f;
-                    sv[idx]  = 0.0f;
-                }
-                pos[idx] = i;
+            const int row = ba * max_len;
+            for (int i = 0; i < cl; ++i) {
+                tl[row + i] = batch[ba][i];
+                pm[row + i] = 1.0f;
+                sv[row + i] = w;
             }
+            for (int i = cl; i < max_len; ++i) {
+                tl[row + i] = 0;
+                pm[row + i] = 0.0f;
+                sv[row + i] = 0.0f;
+            }
+        }
+        ggml_backend_tensor_set(cg.token_layer, tl, 0, ggml_nbytes(cg.token_layer));
+        ggml_backend_tensor_set(cg.pad_mask,    pm, 0, ggml_nbytes(cg.pad_mask));
+        ggml_backend_tensor_set(cg.sum,         sv, 0, ggml_nbytes(cg.sum));
+        // Workaround: CPU gallocr appears to reuse these slots between
+        // compute calls even though they're flagged ggml_set_input. Metal
+        // keeps them stable. Cheap to rewrite each call.
+        if (ggml_backend_is_cpu(ctx->backend)) {
+            ggml_backend_tensor_set(cg.positions,   cg.scratch_pos.data(),   0, ggml_nbytes(cg.positions));
+            ggml_backend_tensor_set(cg.token_types, cg.scratch_types.data(), 0, ggml_nbytes(cg.token_types));
         }
-        ggml_backend_tensor_set(token_layer, tl.data(),  0, ggml_nbytes(token_layer));
-        ggml_backend_tensor_set(token_types, tt.data(),  0, ggml_nbytes(token_types));
-        ggml_backend_tensor_set(pad_mask,    pm.data(),  0, ggml_nbytes(pad_mask));
-        ggml_backend_tensor_set(positions,   pos.data(), 0, ggml_nbytes(positions));
-        ggml_backend_tensor_set(sum,         sv.data(),  0, ggml_nbytes(sum));
-        ggml_backend_tensor_set(minus_one,   &m1,        0, sizeof(m1));
     }
 
-    if (verbosity >= 3) ggml_graph_print(gf);
-
     if (ggml_backend_is_cpu(ctx->backend))
         ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
 
-    ggml_backend_graph_compute(ctx->backend, gf);
-
-    // Output is the last node in the graph (set via ggml_set_output in build)
-    struct ggml_tensor * output = ggml_graph_node(gf, -1);
+    ggml_backend_graph_compute(ctx->backend, cg.gf);
 
-    // Copy embeddings to host
-    ggml_backend_tensor_get(output, embeddings, 0, ggml_nbytes(output));
+    ggml_backend_tensor_get(cg.output, embeddings, 0, ggml_nbytes(cg.output));
 
-    // ---------------------------------------------------------------------------
-    // L2 normalisation (in-place, per embedding vector)
-    // Sentence-transformer models expect unit-norm vectors for cosine similarity.
-    // ---------------------------------------------------------------------------
+    // L2 normalisation (in-place) — sentence-transformer convention.
     const int n_embd = ctx->model.hparams.n_embd;
     for (int b = 0; b < n_batch; ++b) {
         float * vec = embeddings + b * n_embd;
@@ -1399,8 +1468,6 @@ void bert_forward_batch(bert_ctx * ctx, bert_batch batch,
             for (int i = 0; i < n_embd; ++i) vec[i] /= norm;
         }
     }
-
-    ggml_gallocr_free(local_alloc);
 }
 
 void bert_encode_batch(bert_ctx * ctx, bert_strings texts,
diff --git a/bert.h b/bert.h
index dc1566a..a5057fb 100644
--- a/bert.h
+++ b/bert.h
@@ -91,6 +91,11 @@ struct bert_ctx {
 
     std::string  model_name = "";
     std::string  model_arch = "bert";
+
+    // Opaque graph cache (bert_graph_cache_map *). Built lazily on first forward
+    // at a given (max_len, batch_size); reused for all subsequent forwards at the
+    // same shape so we avoid rebuilding the graph and re-running gallocr.
+    void * graph_cache = nullptr;
 };
 
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index f619d96..8357803 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -3,5 +3,8 @@ include_directories(${CMAKE_SOURCE_DIR}/)
 add_executable(main main.cpp)
 target_link_libraries(main PRIVATE bert ggml)
 
+add_executable(bench bench.cpp)
+target_link_libraries(bench PRIVATE bert ggml)
+
 #add_executable(basic basic.cpp)
 #target_link_libraries(basic PRIVATE bert ggml)
diff --git a/examples/bench.cpp b/examples/bench.cpp
new file mode 100644
index 0000000..479440e
--- /dev/null
+++ b/examples/bench.cpp
@@ -0,0 +1,142 @@
+#include "bert.h"
+#include "ggml.h"
+
+#include <algorithm>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+
+struct bench_params {
+    const char * model     = nullptr;
+    int          n_threads = 6;
+    int          warmup    = 5;
+    int          iters     = 50;
+    bool         use_cpu   = false;
+};
+
+static void usage(const char * argv0) {
+    fprintf(stderr,
+        "usage: %s -m MODEL [-c] [-t N] [-w N] [-n N]\n"
+        "  -m  model path (gguf)\n"
+        "  -c  CPU backend (default: accelerated)\n"
+        "  -t  threads (default 6)\n"
+        "  -w  warmup iterations (default 5)\n"
+        "  -n  measured iterations (default 50)\n", argv0);
+}
+
+static bool parse(int argc, char ** argv, bench_params & p) {
+    for (int i = 1; i < argc; i++) {
+        std::string a = argv[i];
+        if      (a == "-m") p.model     = argv[++i];
+        else if (a == "-c") p.use_cpu   = true;
+        else if (a == "-t") p.n_threads = std::stoi(argv[++i]);
+        else if (a == "-w") p.warmup    = std::stoi(argv[++i]);
+        else if (a == "-n") p.iters     = std::stoi(argv[++i]);
+        else if (a == "-h" || a == "--help") { usage(argv[0]); return false; }
+        else { fprintf(stderr, "unknown arg: %s\n", a.c_str()); usage(argv[0]); return false; }
+    }
+    return p.model != nullptr;
+}
+
+static double median(std::vector<double> v) {
+    std::sort(v.begin(), v.end());
+    size_t n = v.size();
+    return n % 2 ? v[n/2] : 0.5*(v[n/2 - 1] + v[n/2]);
+}
+
+static double percentile(std::vector<double> v, double p) {
+    std::sort(v.begin(), v.end());
+    size_t idx = (size_t)((v.size() - 1) * p);
+    return v[idx];
+}
+
+static double mean(const std::vector<double> & v) {
+    double s = 0; for (double x : v) s += x; return s / v.size();
+}
+
+// build a prompt of roughly `target_tokens` tokens (each filler word becomes ~1 token)
+static std::string make_prompt(int target_tokens) {
+    static const char * filler = "the quick brown fox jumps over the lazy dog ";
+    std::string s;
+    // -2 for [CLS]/[SEP]
+    int words = std::max(1, target_tokens - 2);
+    // each filler word is ~1 token; pad with single tokens
+    s.reserve(words * 6);
+    while ((int)s.size()/5 < words) s += filler;
+    return s;
+}
+
+static void run_scenario(bert_ctx * ctx, const bench_params & p,
+                         const char * label, int target_tokens, int batch_size) {
+    std::string prompt = make_prompt(target_tokens);
+    bert_tokens toks = bert_tokenize(ctx, prompt, bert_n_max_tokens(ctx));
+    int actual_tokens = (int)toks.size();
+
+    bert_batch batch(batch_size, toks);
+    const int n_embd = bert_n_embd(ctx);
+    std::vector<float> embed((size_t)batch_size * n_embd);
+
+    // warmup
+    for (int i = 0; i < p.warmup; i++) {
+        bert_forward_batch(ctx, batch, embed.data(), p.n_threads);
+    }
+
+    // measure
+    std::vector<double> ms;
+    ms.reserve(p.iters);
+    for (int i = 0; i < p.iters; i++) {
+        int64_t t0 = ggml_time_us();
+        bert_forward_batch(ctx, batch, embed.data(), p.n_threads);
+        int64_t t1 = ggml_time_us();
+        ms.push_back((t1 - t0) / 1000.0);
+    }
+
+    double m  = mean(ms);
+    double md = median(ms);
+    double p95= percentile(ms, 0.95);
+    double mn = *std::min_element(ms.begin(), ms.end());
+    double mx = *std::max_element(ms.begin(), ms.end());
+    double tokens_per_call = (double)actual_tokens * batch_size;
+    double tok_per_sec     = tokens_per_call / (m / 1000.0);
+    double seq_per_sec     = (double)batch_size / (m / 1000.0);
+
+    printf("  %-14s tok=%3d  bs=%3d  | mean=%7.2f ms  med=%7.2f  p95=%7.2f  min=%6.2f  max=%6.2f  | %8.0f tok/s  %7.1f seq/s\n",
+           label, actual_tokens, batch_size, m, md, p95, mn, mx, tok_per_sec, seq_per_sec);
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+    bench_params p;
+    if (!parse(argc, argv, p)) return 1;
+
+    printf("Loading %s on %s backend (threads=%d)...\n",
+           p.model, p.use_cpu ? "CPU" : "accelerated", p.n_threads);
+    int64_t tl0 = ggml_time_us();
+    bert_ctx * ctx = bert_load_from_file(p.model, p.use_cpu);
+    int64_t tl1 = ggml_time_us();
+    if (!ctx) { fprintf(stderr, "failed to load model\n"); return 1; }
+    printf("Model: %s  arch=%s  n_embd=%d  load=%.0f ms\n",
+           std::string(bert_get_model_name(ctx)).c_str(),
+           std::string(bert_get_architecture(ctx)).c_str(),
+           bert_n_embd(ctx),
+           (tl1 - tl0) / 1000.0);
+    printf("Warmup=%d iters, measure=%d iters\n\n", p.warmup, p.iters);
+
+    printf("== latency by input length (batch=1) ==\n");
+    run_scenario(ctx, p, "very-short", 8,   1);
+    run_scenario(ctx, p, "short",      32,  1);
+    run_scenario(ctx, p, "medium",     128, 1);
+    run_scenario(ctx, p, "long",       256, 1);
+    run_scenario(ctx, p, "max",        512, 1);
+
+    printf("\n== throughput by batch size (tok=128) ==\n");
+    run_scenario(ctx, p, "bs=1",   128, 1);
+    run_scenario(ctx, p, "bs=4",   128, 4);
+    run_scenario(ctx, p, "bs=8",   128, 8);
+    run_scenario(ctx, p, "bs=16",  128, 16);
+    run_scenario(ctx, p, "bs=32",  128, 32);
+
+    bert_free(ctx);
+    return 0;
+}