-
Notifications
You must be signed in to change notification settings - Fork 106
Expand file tree
/
Copy pathclassify_zerocopy.py
More file actions
67 lines (51 loc) · 2.69 KB
/
Copy pathclassify_zerocopy.py
File metadata and controls
67 lines (51 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python3
"""Zero-copy GPU inference with trtcpp + CuPy.
Demonstrates the binding's zero-copy contract end to end: a CuPy array on the GPU is fed to the
engine with no host<->device copy (via __cuda_array_interface__), inference runs on the caller's
CUDA stream with the GIL released, and the output Tensor is wrapped back into a CuPy array (again
zero-copy) for GPU-side post-processing. The only host transfer is the final tiny top-5 result.
Usage: python classify_zerocopy.py <model.onnx|engine> [image.npy]
If an image .npy ([3,H,W] or [1,3,H,W] float32, ImageNet-normalized) is given it is used;
otherwise a random input is generated. (Decoding/normalizing an image file is exactly what the
preproc sublib + the C++ examples do; this script focuses on the zero-copy Python plumbing.)
"""
import sys
import cupy as cp
import numpy as np
import trtcpp
def main() -> int:
    """Classify one input with a trtcpp engine while keeping the data on the GPU.

    The model path is read from ``sys.argv[1]``. If ``sys.argv[2]`` is given it is
    loaded as a ``.npy`` array, cast to float32 and reshaped to ``(1, 3, H, W)``
    using the engine's input height/width; otherwise a random float32 input of that
    shape is generated directly on the GPU. The engine output is flattened,
    soft-maxed on the GPU, and the five most probable classes are printed.

    Returns:
        Process exit status: 2 when the model argument is missing (a usage line is
        written to stderr), 0 after the top-5 results have been printed.
    """
    if len(sys.argv) < 2:
        print(f"usage: {sys.argv[0]} <model.onnx|engine> [image.npy]", file=sys.stderr)
        return 2
    model_path = sys.argv[1]

    bo = trtcpp.BuildOptions()
    bo.precision = trtcpp.Precision.Fp16
    bo.engine_cache_dir = "engines"
    engine = trtcpp.EngineBuilder().build_and_load(model_path, bo)

    # Only the first input is used; its shape is unpacked as four dims and the last
    # two are taken as height and width.
    in_name = engine.input_names()[0]
    _, _, in_h, in_w = engine.tensor_shape(in_name).dims()

    # Build the input as a CuPy array ON THE GPU (no host buffer in the hot path).
    if len(sys.argv) > 2:
        arr = np.load(sys.argv[2]).astype(np.float32).reshape(1, 3, in_h, in_w)
        gpu_input = cp.asarray(arr)
    else:
        gpu_input = cp.random.randn(1, 3, in_h, in_w, dtype=cp.float32)

    # Adopt CuPy's current stream so trtcpp and CuPy order work on the same stream.
    stream = trtcpp.Stream.wrap(cp.cuda.get_current_stream().ptr)

    # Zero-copy in: gpu_input exposes __cuda_array_interface__, so the engine binds its
    # device pointer directly -- no copy. GIL released during inference.
    out = engine.infer_single({in_name: gpu_input}, stream)

    # Zero-copy out: wrap the output Tensor as a CuPy array (shares device memory), then
    # do the softmax/top-k on the GPU. `out` must stay alive while gpu_out aliases it.
    gpu_out = cp.asarray(out).reshape(-1)
    stream.synchronize()  # the engine never implicitly syncs; do it before reading

    # Numerically stable softmax: subtract the max before exponentiating.
    probs = cp.exp(gpu_out - gpu_out.max())
    probs /= probs.sum()

    # Select the top-5 on the GPU and gather their probabilities there as well, so the
    # host receives two tiny arrays instead of reading each probability from device
    # memory one element at a time inside the print loop.
    top_idx = cp.argsort(probs)[::-1][:5]
    top_classes = cp.asnumpy(top_idx)
    top_probs = cp.asnumpy(probs[top_idx])

    print(f"input {gpu_input.shape} on {gpu_input.device}, zero-copy in/out")
    print("top-5:")
    for cls, prob in zip(top_classes, top_probs):
        print(f" class {int(cls):4d} {float(prob) * 100:6.2f}%")
    return 0
# Script entry point: hand main()'s return value to the interpreter as the exit status.
if __name__ == "__main__":
    sys.exit(main())