diff --git a/CMakeLists.txt b/CMakeLists.txt index c42de48..f7259cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,7 +86,7 @@ if (GGML_METAL) # copy ggml-metal.metal to bin directory - configure_file(ggml/src/ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) + configure_file(ggml/src/ggml-metal/ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) endif() diff --git a/bert.cpp b/bert.cpp index d5b90f4..da19f81 100644 --- a/bert.cpp +++ b/bert.cpp @@ -64,7 +64,9 @@ For these use the llama.cpp interface. #include #include #include +#include #include +#include #define BERT_MAX_NODES 4096 #define KEY_TOKEN_LIST "tokenizer.ggml.tokens" @@ -1045,6 +1047,13 @@ BERT_API struct bert_ctx * bert_load_from_file_wordpiece(const char * fname, boo // Buffer management // --------------------------------------------------------------------------- +// Forward declarations for graph caching — full definitions live with the +// graph-construction code further below. bert_free needs the complete type +// to call ~bert_graph_cache_map, so we provide a small helper there too. +struct bert_cached_graph; +struct bert_graph_cache_map; +static void bert_destroy_graph_cache(void * p); + int32_t bert_n_embd (bert_ctx * ctx) { return ctx->model.hparams.n_embd; } int32_t bert_n_max_tokens(bert_ctx * ctx) { return ctx->model.hparams.n_max_tokens; } @@ -1088,6 +1097,10 @@ void bert_deallocate_buffers(bert_ctx * ctx) { void bert_free(bert_ctx * ctx) { if (!ctx) return; + if (ctx->graph_cache) { + bert_destroy_graph_cache(ctx->graph_cache); + ctx->graph_cache = nullptr; + } if (ctx->compute_alloc) { ggml_gallocr_free(ctx->compute_alloc); ctx->compute_alloc = nullptr; @@ -1114,184 +1127,264 @@ void bert_free(bert_ctx * ctx) { // --------------------------------------------------------------------------- // Graph construction // --------------------------------------------------------------------------- +// +// The graph is parameterised by (max_len, n_batch) only — input contents are +// filled at forward time. We cache one graph per distinct (max_len, n_batch) +// in ctx->graph_cache so repeated forwards at the same shape skip both the +// graph build and the gallocr planning step. This is the dominant cost for +// short-input latency. + +struct bert_cached_graph { + int max_len = 0; + int n_batch = 0; + + std::vector buf_meta; // backing for ctx_compute + ggml_context * ctx_compute = nullptr; + ggml_cgraph * gf = nullptr; + ggml_gallocr_t alloc = nullptr; + + // Cached input/output tensor pointers — set during build, used by every + // subsequent forward to avoid name lookups. + ggml_tensor * token_layer = nullptr; + ggml_tensor * token_types = nullptr; // pre-filled with zeros at build time + ggml_tensor * pad_mask = nullptr; + ggml_tensor * positions = nullptr; // pre-filled with [0..max_len-1] at build time + ggml_tensor * sum = nullptr; + ggml_tensor * minus_one = nullptr; + ggml_tensor * output = nullptr; + + // Per-call scratch buffers — sized once per (max_len, n_batch) to avoid + // heap churn on every forward. Only the contents are rewritten per call. + std::vector scratch_tl; // token ids + std::vector scratch_pm; // pad mask (1=real, 0=padding) + std::vector scratch_sv; // mean-pool weights (1/seq_len on real, 0 on pad) + + // Constant buffers — filled once at build, re-uploaded each call on CPU + // (gallocr quirk: input-flagged tensors don't stay stable between computes). + std::vector scratch_pos; + std::vector scratch_types; + + ~bert_cached_graph() { + if (alloc) ggml_gallocr_free(alloc); + if (ctx_compute) ggml_free(ctx_compute); + } +}; -struct ggml_cgraph * bert_build_graph(bert_ctx * ctx, bert_batch batch) { +using bert_graph_key = std::pair; // (max_len, n_batch) +struct bert_graph_cache_map { + std::map> entries; +}; + +static bert_graph_cache_map & bert_get_cache(bert_ctx * ctx) { + if (!ctx->graph_cache) ctx->graph_cache = new bert_graph_cache_map(); + return *static_cast(ctx->graph_cache); +} + +static void bert_destroy_graph_cache(void * p) { + delete static_cast(p); +} + +// Build the compute graph for fixed (max_len, n_batch). The returned struct +// owns its ggml_context / cgraph / allocator and is cached in bert_ctx. +// +// Notes: +// - Self-attention uses ggml_flash_attn_ext (fused scaled-dot-product +// attention with mask) instead of an unrolled mul_mat / softmax / mul_mat +// so we get the Metal fused kernel and skip 3 cont+permute copies. +// - Mean-pooling uses a 3D mul_mat that preserves the batch dim, fixing the +// bs>1 reshape assertion in the previous implementation. +// - L2 normalisation is still done on the host after backend_tensor_get. +static std::unique_ptr +build_cached_graph(bert_ctx * ctx, int max_len, int n_batch) { const bert_hparams & hp = ctx->model.hparams; const bert_model & model = ctx->model; const int n_embd = hp.n_embd; const int n_layer = hp.n_layer; - const int n_max_tokens = hp.n_max_tokens; const int n_head = hp.n_head; const float layer_norm_eps = hp.layer_norm_eps; const int d_head = n_embd / n_head; - const int n_batch = (int)batch.size(); - int cur_max_len = 0; - for (const auto & seq : batch) - cur_max_len = std::max(cur_max_len, (int)seq.size()); - - if (cur_max_len > n_max_tokens) { - ggml_log_internal(GGML_LOG_LEVEL_ERROR, - "Too many tokens: max %d, got %d\n", n_max_tokens, cur_max_len); - return nullptr; - } - - struct ggml_init_params p = { - ctx->buf_compute_meta.size(), - ctx->buf_compute_meta.data(), - true - }; - struct ggml_context * ctx0 = ggml_init(p); - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, BERT_MAX_NODES, false); - - // Input tensors — declared no-alloc; gallocr fills them in bert_forward_batch - struct ggml_tensor * token_layer = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cur_max_len * n_batch); - ggml_set_name(token_layer, "token_layer"); - - struct ggml_tensor * token_types = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cur_max_len * n_batch); - ggml_set_name(token_types, "token_types"); - - struct ggml_tensor * pad_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 1, cur_max_len, 1, n_batch); - ggml_set_name(pad_mask, "pad_mask"); - - struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cur_max_len * n_batch); - ggml_set_name(positions, "positions"); - - struct ggml_tensor * sum = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, cur_max_len, 1, n_batch); - ggml_set_name(sum, "sum"); - - struct ggml_tensor * minus_one = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(minus_one, "minus_one"); - - ggml_set_input(token_layer); - ggml_set_input(token_types); - ggml_set_input(pad_mask); - ggml_set_input(positions); - ggml_set_input(sum); - ggml_set_input(minus_one); - - // --------------------------------------------------------------------------- - // Attention mask - // pad_mask is [1, cur_max_len, 1, n_batch] — a column vector per batch. - // mul_mat gives the outer product [cur_max_len, cur_max_len, 1, n_batch]: - // active×active = 1.0, padded×anything = 0.0 - // Scale to 100000 then subtract 100000: - // active pair → 0.0 (no masking) - // padded pair → -100000.0 (suppressed after softmax) - // --------------------------------------------------------------------------- - struct ggml_tensor * attn_mask = ggml_mul_mat(ctx0, pad_mask, pad_mask); - attn_mask = ggml_scale_inplace(ctx0, attn_mask, 100000.0f); - struct ggml_tensor * large_offset = ggml_scale(ctx0, minus_one, 100000.0f); - attn_mask = ggml_add(ctx0, attn_mask, large_offset); - - // --------------------------------------------------------------------------- - // Embeddings — all shaped [n_embd, cur_max_len, n_batch] - // --------------------------------------------------------------------------- - struct ggml_tensor * word_embd = ggml_get_rows(ctx0, model.word_embeddings, token_layer); - word_embd = ggml_reshape_3d(ctx0, word_embd, n_embd, cur_max_len, n_batch); - - struct ggml_tensor * type_embd = ggml_get_rows(ctx0, model.token_type_embeddings, token_types); - type_embd = ggml_reshape_3d(ctx0, type_embd, n_embd, cur_max_len, n_batch); - - struct ggml_tensor * pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); - pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, cur_max_len, n_batch); - - struct ggml_tensor * inpL = ggml_add(ctx0, word_embd, type_embd); + auto cg = std::make_unique(); + cg->max_len = max_len; + cg->n_batch = n_batch; + + cg->buf_meta.resize(BERT_MAX_NODES * ggml_tensor_overhead() + ggml_graph_overhead()); + ggml_init_params p = { cg->buf_meta.size(), cg->buf_meta.data(), /*no_alloc=*/true }; + cg->ctx_compute = ggml_init(p); + cg->gf = ggml_new_graph_custom(cg->ctx_compute, BERT_MAX_NODES, false); + ggml_context * ctx0 = cg->ctx_compute; + + // Input tensors + cg->token_layer = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, max_len * n_batch); + ggml_set_name(cg->token_layer, "token_layer"); + cg->token_types = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, max_len * n_batch); + ggml_set_name(cg->token_types, "token_types"); + cg->pad_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 1, max_len, 1, n_batch); + ggml_set_name(cg->pad_mask, "pad_mask"); + cg->positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, max_len * n_batch); + ggml_set_name(cg->positions, "positions"); + cg->sum = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, max_len, 1, n_batch); + ggml_set_name(cg->sum, "sum"); + cg->minus_one = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_set_name(cg->minus_one, "minus_one"); + + ggml_set_input(cg->token_layer); + ggml_set_input(cg->token_types); + ggml_set_input(cg->pad_mask); + ggml_set_input(cg->positions); + ggml_set_input(cg->sum); + ggml_set_input(cg->minus_one); + + // Attention mask: 0 for unmasked pairs, -large for masked. + // pad_mask [1, max_len, 1, n_batch] outer product -> [max_len, max_len, 1, n_batch] + ggml_tensor * attn_mask_f32 = ggml_mul_mat(ctx0, cg->pad_mask, cg->pad_mask); + attn_mask_f32 = ggml_scale_inplace(ctx0, attn_mask_f32, 100000.0f); + ggml_tensor * large_offset = ggml_scale(ctx0, cg->minus_one, 100000.0f); + attn_mask_f32 = ggml_add(ctx0, attn_mask_f32, large_offset); + // flash_attn_ext requires f16 contiguous mask + ggml_tensor * attn_mask = ggml_cast(ctx0, attn_mask_f32, GGML_TYPE_F16); + + // Embeddings — all shaped [n_embd, max_len, n_batch] + ggml_tensor * word_embd = ggml_get_rows(ctx0, model.word_embeddings, cg->token_layer); + word_embd = ggml_reshape_3d(ctx0, word_embd, n_embd, max_len, n_batch); + ggml_tensor * type_embd = ggml_get_rows(ctx0, model.token_type_embeddings, cg->token_types); + type_embd = ggml_reshape_3d(ctx0, type_embd, n_embd, max_len, n_batch); + ggml_tensor * pos_embd = ggml_get_rows(ctx0, model.position_embeddings, cg->positions); + pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, max_len, n_batch); + + ggml_tensor * inpL = ggml_add(ctx0, word_embd, type_embd); inpL = ggml_add(ctx0, inpL, pos_embd); - - // Embedding LayerNorm inpL = ggml_norm_inplace(ctx0, inpL, layer_norm_eps); inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.ln_e_w), model.ln_e_b); - // --------------------------------------------------------------------------- - // Transformer layers - // --------------------------------------------------------------------------- + const float attn_scale = 1.0f / sqrtf((float)d_head); + for (int il = 0; il < n_layer; ++il) { - const bert_layer & layer = model.layers[il]; - struct ggml_tensor * cur = inpL; - - // Self-attention - { - // Project Q, K, V and reshape to [d_head, cur_max_len, n_head, n_batch] - auto proj = [&](struct ggml_tensor * w, struct ggml_tensor * b) { - struct ggml_tensor * x = ggml_add(ctx0, ggml_mul_mat(ctx0, w, cur), b); - x = ggml_reshape_4d(ctx0, x, d_head, n_head, cur_max_len, n_batch); - return ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3)); - }; - struct ggml_tensor * Q = proj(layer.q_w, layer.q_b); - struct ggml_tensor * K = proj(layer.k_w, layer.k_b); - struct ggml_tensor * V = proj(layer.v_w, layer.v_b); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head)); - KQ = ggml_add(ctx0, KQ, attn_mask); - KQ = ggml_soft_max(ctx0, KQ); - - V = ggml_cont(ctx0, ggml_transpose(ctx0, V)); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3)); - cur = ggml_reshape_3d(ctx0, KQV, n_embd, cur_max_len, n_batch); - } + const bert_layer & L = model.layers[il]; + ggml_tensor * cur = inpL; + + // Self-attention: Q, K, V projections then flash_attn_ext + // Projections produce [n_embd, max_len, n_batch]. Reshape+permute to + // [d_head, max_len, n_head, n_batch] — the layout flash_attn_ext wants. + // No ggml_cont needed; flash_attn handles non-contiguous q/k/v. + auto proj = [&](ggml_tensor * w, ggml_tensor * b) { + ggml_tensor * x = ggml_add(ctx0, ggml_mul_mat(ctx0, w, cur), b); + x = ggml_reshape_4d(ctx0, x, d_head, n_head, max_len, n_batch); + return ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3)); + }; + ggml_tensor * Q = proj(L.q_w, L.q_b); + ggml_tensor * K = proj(L.k_w, L.k_b); + ggml_tensor * V = proj(L.v_w, L.v_b); + + // flash_attn_ext result: [d_head, n_head, max_len, n_batch] contiguous. + // Reshape directly to [n_embd, max_len, n_batch] — d_head and n_head + // are contiguous in memory so they merge cleanly. + // Precision: F16 accumulate on accelerated backends is ~1.3-1.5x faster + // (native fp16 hw) with negligible numerical drift for sentence embeddings. + // CPU has no native fp16 path, so F16 there would just upconvert and pay + // extra work — force F32 in that case. + ggml_tensor * sdpa = ggml_flash_attn_ext(ctx0, Q, K, V, attn_mask, + attn_scale, 0.0f, 0.0f); + if (ggml_backend_is_cpu(ctx->backend)) + ggml_flash_attn_ext_set_prec(sdpa, GGML_PREC_F32); + cur = ggml_reshape_3d(ctx0, sdpa, n_embd, max_len, n_batch); // Output projection + residual + post-attention LayerNorm - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.o_w, cur), layer.o_b); + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, L.o_w, cur), L.o_b); cur = ggml_add(ctx0, cur, inpL); cur = ggml_norm_inplace(ctx0, cur, layer_norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_att_w), layer.ln_att_b); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, L.ln_att_w), L.ln_att_b); - struct ggml_tensor * att_out = cur; + ggml_tensor * att_out = cur; // FFN + residual + output LayerNorm - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_i_w, cur), layer.ff_i_b); + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, L.ff_i_w, cur), L.ff_i_b); cur = ggml_gelu(ctx0, cur); - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_o_w, cur), layer.ff_o_b); + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, L.ff_o_w, cur), L.ff_o_b); cur = ggml_add(ctx0, att_out, cur); cur = ggml_norm_inplace(ctx0, cur, layer_norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_out_w), layer.ln_out_b); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, L.ln_out_w), L.ln_out_b); inpL = cur; } - // --------------------------------------------------------------------------- - // Mean pooling - // - // inpL: [n_embd, cur_max_len, n_batch] - // sum: [cur_max_len, 1, n_batch] — each active position holds 1/seq_len, - // padding positions hold 0. + // Mean pooling via 3D mul_mat that preserves the batch dim. + // inpL: [n_embd, max_len, n_batch] + // sum: [max_len, 1, n_batch] (1/seq_len on active positions, 0 on padding) // - // We multiply element-wise to zero out padding and scale active tokens, - // then sum-reduce across the sequence dimension via mul_mat. - // Result: [n_embd, n_batch] - // --------------------------------------------------------------------------- - - // Reshape sum to [1, cur_max_len, n_batch] so it broadcasts over n_embd - struct ggml_tensor * sum_3d = ggml_reshape_3d(ctx0, sum, 1, cur_max_len, n_batch); - - // Scale + zero-pad: [n_embd, cur_max_len, n_batch] - struct ggml_tensor * masked_tokens = ggml_mul(ctx0, inpL, sum_3d); - - // Flatten to [n_embd, cur_max_len * n_batch] - struct ggml_tensor * inpL_2d = ggml_reshape_2d(ctx0, masked_tokens, n_embd, cur_max_len * n_batch); + // For broadcasted batched mul_mat(A, B): + // A: [k, n, batch], B: [k, m, batch] -> [n, m, batch] + // We want [n_embd, 1, n_batch]. So set A = inpL_T with shape [max_len, n_embd, n_batch] + // (transpose dims 0/1 only — batch dim preserved) and B = sum. + ggml_tensor * inpL_T = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); // [max_len, n_embd, n_batch] + ggml_tensor * pooled = ggml_mul_mat(ctx0, inpL_T, cg->sum); // [n_embd, 1, n_batch] + pooled = ggml_reshape_2d(ctx0, pooled, n_embd, n_batch); + + cg->output = pooled; + ggml_set_output(cg->output); + ggml_build_forward_expand(cg->gf, cg->output); + + // Allocate compute buffers for this fixed graph shape. + ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend); + ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev); + cg->alloc = ggml_gallocr_new(buft); + if (!ggml_gallocr_alloc_graph(cg->alloc, cg->gf)) { + fprintf(stderr, "%s: gallocr_alloc_graph failed (max_len=%d, n_batch=%d)\n", + __func__, max_len, n_batch); + return nullptr; + } - // Flatten sum to [cur_max_len * n_batch] - struct ggml_tensor * sum_1d = ggml_reshape_1d(ctx0, sum, cur_max_len * n_batch); + // ---------------------------------------------------------------------- + // One-time fill for tensors that never vary across forwards at this shape: + // - positions: always [0, 1, ..., max_len-1] for every batch element + // - token_types: always 0 for sentence-embedding BERT + // - minus_one: literal -1.0 + // After this point the backend buffers hold the correct values forever, + // so bert_forward_batch can skip writing them on every call. + // ---------------------------------------------------------------------- + { + cg->scratch_pos.assign((size_t)max_len * n_batch, 0); + for (int ba = 0; ba < n_batch; ++ba) + for (int i = 0; i < max_len; ++i) + cg->scratch_pos[(size_t)ba * max_len + i] = i; + cg->scratch_types.assign((size_t)max_len * n_batch, 0); + const float m1 = -1.0f; - // Transpose to [cur_max_len * n_batch, n_embd] for mul_mat reduction - struct ggml_tensor * inpL_2d_T = ggml_cont(ctx0, ggml_transpose(ctx0, inpL_2d)); + ggml_backend_tensor_set(cg->positions, cg->scratch_pos.data(), 0, ggml_nbytes(cg->positions)); + ggml_backend_tensor_set(cg->token_types, cg->scratch_types.data(), 0, ggml_nbytes(cg->token_types)); + ggml_backend_tensor_set(cg->minus_one, &m1, 0, sizeof(m1)); + } - // mul_mat: [cur_max_len*n_batch, n_embd] × [cur_max_len*n_batch, 1] - // → [n_embd, n_batch] (after transpose below) - struct ggml_tensor * pooled = ggml_mul_mat(ctx0, inpL_2d_T, sum_1d); + // Size per-call scratch once — every forward at this shape rewrites contents + // in place, never reallocs. + cg->scratch_tl.resize((size_t)max_len * n_batch); + cg->scratch_pm.resize((size_t)max_len * n_batch); + cg->scratch_sv.resize((size_t)max_len * n_batch); - // Rotate back to [n_embd, n_batch] - inpL = ggml_cont(ctx0, ggml_transpose(ctx0, pooled)); - inpL = ggml_reshape_2d(ctx0, inpL, n_embd, n_batch); + return cg; +} - ggml_set_output(inpL); - ggml_build_forward_expand(gf, inpL); - ggml_free(ctx0); - return gf; +// Kept for ABI compatibility with bert.h: builds (and caches) a graph sized to +// the given batch. The returned cgraph is owned by ctx — callers MUST NOT free it. +struct ggml_cgraph * bert_build_graph(bert_ctx * ctx, bert_batch batch) { + int max_len = 0; + for (const auto & seq : batch) max_len = std::max(max_len, (int)seq.size()); + if (max_len > ctx->model.hparams.n_max_tokens) { + ggml_log_internal(GGML_LOG_LEVEL_ERROR, + "Too many tokens: max %d, got %d\n", ctx->model.hparams.n_max_tokens, max_len); + return nullptr; + } + const int n_batch = (int)batch.size(); + auto & cache = bert_get_cache(ctx); + auto key = std::make_pair(max_len, n_batch); + auto it = cache.entries.find(key); + if (it == cache.entries.end()) { + auto cg = build_cached_graph(ctx, max_len, n_batch); + if (!cg) return nullptr; + it = cache.entries.emplace(key, std::move(cg)).first; + } + return it->second->gf; } // --------------------------------------------------------------------------- @@ -1300,95 +1393,71 @@ struct ggml_cgraph * bert_build_graph(bert_ctx * ctx, bert_batch batch) { void bert_forward_batch(bert_ctx * ctx, bert_batch batch, float * embeddings, int32_t n_threads) { + const int n_batch = (int)batch.size(); + if (n_batch == 0) return; -#if 1 - // Resize meta buffer to fit this graph — safe to resize each call - ctx->buf_compute_meta.resize( - BERT_MAX_NODES * ggml_tensor_overhead() + ggml_graph_overhead()); -#endif - - struct ggml_cgraph * gf = bert_build_graph(ctx, batch); - if (!gf) { fprintf(stderr, "%s: build graph failed\n", __func__); return; } - - // Scoped allocator — fresh for every call, freed at the end - ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend); - ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev); - ggml_gallocr_t local_alloc = ggml_gallocr_new(buft); - - if (!ggml_gallocr_alloc_graph(local_alloc, gf)) { - fprintf(stderr, "%s: local graph allocation failed\n", __func__); - ggml_gallocr_free(local_alloc); + int max_len = 0; + for (const auto & s : batch) max_len = std::max(max_len, (int)s.size()); + if (max_len == 0) return; + if (max_len > ctx->model.hparams.n_max_tokens) { + ggml_log_internal(GGML_LOG_LEVEL_ERROR, + "Too many tokens: max %d, got %d\n", ctx->model.hparams.n_max_tokens, max_len); return; } - const int n_batch = (int)batch.size(); - int cur_max_len = 0; - for (const auto & s : batch) - cur_max_len = std::max(cur_max_len, (int)s.size()); - - struct ggml_tensor * token_layer = ggml_graph_get_tensor(gf, "token_layer"); - struct ggml_tensor * token_types = ggml_graph_get_tensor(gf, "token_types"); - struct ggml_tensor * pad_mask = ggml_graph_get_tensor(gf, "pad_mask"); - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - struct ggml_tensor * sum = ggml_graph_get_tensor(gf, "sum"); - struct ggml_tensor * minus_one = ggml_graph_get_tensor(gf, "minus_one"); - - // Fill input buffers + auto & cache = bert_get_cache(ctx); + auto key = std::make_pair(max_len, n_batch); + auto it = cache.entries.find(key); + if (it == cache.entries.end()) { + auto cg = build_cached_graph(ctx, max_len, n_batch); + if (!cg) { fprintf(stderr, "%s: build graph failed\n", __func__); return; } + it = cache.entries.emplace(key, std::move(cg)).first; + } + bert_cached_graph & cg = *it->second; + + // Fill per-call input buffers. positions / token_types / minus_one were + // pre-filled at build time and never change. Scratch vectors are sized + // once in build_cached_graph; we only rewrite their contents here. { - std::vector tl(cur_max_len * n_batch); - std::vector tt(cur_max_len * n_batch, 0); - std::vector pm(cur_max_len * n_batch, 0.0f); - std::vector pos(cur_max_len * n_batch); - std::vector sv(cur_max_len * n_batch, 0.0f); - const float m1 = -1.0f; + int32_t * tl = cg.scratch_tl.data(); + float * pm = cg.scratch_pm.data(); + float * sv = cg.scratch_sv.data(); for (int ba = 0; ba < n_batch; ++ba) { const int cl = (int)batch[ba].size(); - // Number of real (non-padding) tokens for mean-pool weight. - // cl includes [CLS] and [SEP]; to exclude them from pooling, - // use cl-2 as the divisor and skip positions 0 and cl-1 below. - // Currently we include them (standard all-token mean pool). const float w = (cl > 0) ? 1.0f / (float)cl : 0.0f; - - for (int i = 0; i < cur_max_len; ++i) { - const int idx = ba * cur_max_len + i; - if (i < cl) { - tl[idx] = batch[ba][i]; - pm[idx] = 1.0f; - sv[idx] = w; - } else { - tl[idx] = 0; // padding token id (unused, masked out) - pm[idx] = 0.0f; - sv[idx] = 0.0f; - } - pos[idx] = i; + const int row = ba * max_len; + for (int i = 0; i < cl; ++i) { + tl[row + i] = batch[ba][i]; + pm[row + i] = 1.0f; + sv[row + i] = w; } + for (int i = cl; i < max_len; ++i) { + tl[row + i] = 0; + pm[row + i] = 0.0f; + sv[row + i] = 0.0f; + } + } + ggml_backend_tensor_set(cg.token_layer, tl, 0, ggml_nbytes(cg.token_layer)); + ggml_backend_tensor_set(cg.pad_mask, pm, 0, ggml_nbytes(cg.pad_mask)); + ggml_backend_tensor_set(cg.sum, sv, 0, ggml_nbytes(cg.sum)); + // Workaround: CPU gallocr appears to reuse these slots between + // compute calls even though they're flagged ggml_set_input. Metal + // keeps them stable. Cheap to rewrite each call. + if (ggml_backend_is_cpu(ctx->backend)) { + ggml_backend_tensor_set(cg.positions, cg.scratch_pos.data(), 0, ggml_nbytes(cg.positions)); + ggml_backend_tensor_set(cg.token_types, cg.scratch_types.data(), 0, ggml_nbytes(cg.token_types)); } - ggml_backend_tensor_set(token_layer, tl.data(), 0, ggml_nbytes(token_layer)); - ggml_backend_tensor_set(token_types, tt.data(), 0, ggml_nbytes(token_types)); - ggml_backend_tensor_set(pad_mask, pm.data(), 0, ggml_nbytes(pad_mask)); - ggml_backend_tensor_set(positions, pos.data(), 0, ggml_nbytes(positions)); - ggml_backend_tensor_set(sum, sv.data(), 0, ggml_nbytes(sum)); - ggml_backend_tensor_set(minus_one, &m1, 0, sizeof(m1)); } - if (verbosity >= 3) ggml_graph_print(gf); - if (ggml_backend_is_cpu(ctx->backend)) ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); - ggml_backend_graph_compute(ctx->backend, gf); - - // Output is the last node in the graph (set via ggml_set_output in build) - struct ggml_tensor * output = ggml_graph_node(gf, -1); + ggml_backend_graph_compute(ctx->backend, cg.gf); - // Copy embeddings to host - ggml_backend_tensor_get(output, embeddings, 0, ggml_nbytes(output)); + ggml_backend_tensor_get(cg.output, embeddings, 0, ggml_nbytes(cg.output)); - // --------------------------------------------------------------------------- - // L2 normalisation (in-place, per embedding vector) - // Sentence-transformer models expect unit-norm vectors for cosine similarity. - // --------------------------------------------------------------------------- + // L2 normalisation (in-place) — sentence-transformer convention. const int n_embd = ctx->model.hparams.n_embd; for (int b = 0; b < n_batch; ++b) { float * vec = embeddings + b * n_embd; @@ -1399,8 +1468,6 @@ void bert_forward_batch(bert_ctx * ctx, bert_batch batch, for (int i = 0; i < n_embd; ++i) vec[i] /= norm; } } - - ggml_gallocr_free(local_alloc); } void bert_encode_batch(bert_ctx * ctx, bert_strings texts, diff --git a/bert.h b/bert.h index dc1566a..a5057fb 100644 --- a/bert.h +++ b/bert.h @@ -91,6 +91,11 @@ struct bert_ctx { std::string model_name = ""; std::string model_arch = "bert"; + + // Opaque graph cache (bert_graph_cache_map *). Built lazily on first forward + // at a given (max_len, batch_size); reused for all subsequent forwards at the + // same shape so we avoid rebuilding the graph and re-running gallocr. + void * graph_cache = nullptr; }; diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index f619d96..8357803 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -3,5 +3,8 @@ include_directories(${CMAKE_SOURCE_DIR}/) add_executable(main main.cpp) target_link_libraries(main PRIVATE bert ggml) +add_executable(bench bench.cpp) +target_link_libraries(bench PRIVATE bert ggml) + #add_executable(basic basic.cpp) #target_link_libraries(basic PRIVATE bert ggml) diff --git a/examples/bench.cpp b/examples/bench.cpp new file mode 100644 index 0000000..479440e --- /dev/null +++ b/examples/bench.cpp @@ -0,0 +1,142 @@ +#include "bert.h" +#include "ggml.h" + +#include +#include +#include +#include +#include + +struct bench_params { + const char * model = nullptr; + int n_threads = 6; + int warmup = 5; + int iters = 50; + bool use_cpu = false; +}; + +static void usage(const char * argv0) { + fprintf(stderr, + "usage: %s -m MODEL [-c] [-t N] [-w N] [-n N]\n" + " -m model path (gguf)\n" + " -c CPU backend (default: accelerated)\n" + " -t threads (default 6)\n" + " -w warmup iterations (default 5)\n" + " -n measured iterations (default 50)\n", argv0); +} + +static bool parse(int argc, char ** argv, bench_params & p) { + for (int i = 1; i < argc; i++) { + std::string a = argv[i]; + if (a == "-m") p.model = argv[++i]; + else if (a == "-c") p.use_cpu = true; + else if (a == "-t") p.n_threads = std::stoi(argv[++i]); + else if (a == "-w") p.warmup = std::stoi(argv[++i]); + else if (a == "-n") p.iters = std::stoi(argv[++i]); + else if (a == "-h" || a == "--help") { usage(argv[0]); return false; } + else { fprintf(stderr, "unknown arg: %s\n", a.c_str()); usage(argv[0]); return false; } + } + return p.model != nullptr; +} + +static double median(std::vector v) { + std::sort(v.begin(), v.end()); + size_t n = v.size(); + return n % 2 ? v[n/2] : 0.5*(v[n/2 - 1] + v[n/2]); +} + +static double percentile(std::vector v, double p) { + std::sort(v.begin(), v.end()); + size_t idx = (size_t)((v.size() - 1) * p); + return v[idx]; +} + +static double mean(const std::vector & v) { + double s = 0; for (double x : v) s += x; return s / v.size(); +} + +// build a prompt of roughly `target_tokens` tokens (each filler word becomes ~1 token) +static std::string make_prompt(int target_tokens) { + static const char * filler = "the quick brown fox jumps over the lazy dog "; + std::string s; + // -2 for [CLS]/[SEP] + int words = std::max(1, target_tokens - 2); + // each filler word is ~1 token; pad with single tokens + s.reserve(words * 6); + while ((int)s.size()/5 < words) s += filler; + return s; +} + +static void run_scenario(bert_ctx * ctx, const bench_params & p, + const char * label, int target_tokens, int batch_size) { + std::string prompt = make_prompt(target_tokens); + bert_tokens toks = bert_tokenize(ctx, prompt, bert_n_max_tokens(ctx)); + int actual_tokens = (int)toks.size(); + + bert_batch batch(batch_size, toks); + const int n_embd = bert_n_embd(ctx); + std::vector embed((size_t)batch_size * n_embd); + + // warmup + for (int i = 0; i < p.warmup; i++) { + bert_forward_batch(ctx, batch, embed.data(), p.n_threads); + } + + // measure + std::vector ms; + ms.reserve(p.iters); + for (int i = 0; i < p.iters; i++) { + int64_t t0 = ggml_time_us(); + bert_forward_batch(ctx, batch, embed.data(), p.n_threads); + int64_t t1 = ggml_time_us(); + ms.push_back((t1 - t0) / 1000.0); + } + + double m = mean(ms); + double md = median(ms); + double p95= percentile(ms, 0.95); + double mn = *std::min_element(ms.begin(), ms.end()); + double mx = *std::max_element(ms.begin(), ms.end()); + double tokens_per_call = (double)actual_tokens * batch_size; + double tok_per_sec = tokens_per_call / (m / 1000.0); + double seq_per_sec = (double)batch_size / (m / 1000.0); + + printf(" %-14s tok=%3d bs=%3d | mean=%7.2f ms med=%7.2f p95=%7.2f min=%6.2f max=%6.2f | %8.0f tok/s %7.1f seq/s\n", + label, actual_tokens, batch_size, m, md, p95, mn, mx, tok_per_sec, seq_per_sec); +} + +int main(int argc, char ** argv) { + ggml_time_init(); + bench_params p; + if (!parse(argc, argv, p)) return 1; + + printf("Loading %s on %s backend (threads=%d)...\n", + p.model, p.use_cpu ? "CPU" : "accelerated", p.n_threads); + int64_t tl0 = ggml_time_us(); + bert_ctx * ctx = bert_load_from_file(p.model, p.use_cpu); + int64_t tl1 = ggml_time_us(); + if (!ctx) { fprintf(stderr, "failed to load model\n"); return 1; } + printf("Model: %s arch=%s n_embd=%d load=%.0f ms\n", + std::string(bert_get_model_name(ctx)).c_str(), + std::string(bert_get_architecture(ctx)).c_str(), + bert_n_embd(ctx), + (tl1 - tl0) / 1000.0); + printf("Warmup=%d iters, measure=%d iters\n\n", p.warmup, p.iters); + + printf("== latency by input length (batch=1) ==\n"); + run_scenario(ctx, p, "very-short", 8, 1); + run_scenario(ctx, p, "short", 32, 1); + run_scenario(ctx, p, "medium", 128, 1); + run_scenario(ctx, p, "long", 256, 1); + run_scenario(ctx, p, "max", 512, 1); + + printf("\n== throughput by batch size (tok=128) ==\n"); + run_scenario(ctx, p, "bs=1", 128, 1); + run_scenario(ctx, p, "bs=4", 128, 4); + run_scenario(ctx, p, "bs=8", 128, 8); + run_scenario(ctx, p, "bs=16", 128, 16); + run_scenario(ctx, p, "bs=32", 128, 32); + + bert_free(ctx); + return 0; +}