aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--pyecsca/sca/stacked_traces/combine.py9
1 files changed, 9 insertions, 0 deletions
diff --git a/pyecsca/sca/stacked_traces/combine.py b/pyecsca/sca/stacked_traces/combine.py
index 853df70..1e7f7fb 100644
--- a/pyecsca/sca/stacked_traces/combine.py
+++ b/pyecsca/sca/stacked_traces/combine.py
@@ -278,6 +278,8 @@ class GPUTraceManager(BaseTraceManager):
self._traces.samples.shape[1] + self._chunk_size - 1
) // self._chunk_size
streams = [cuda.stream() for _ in range(self._stream_count)]
+ events: List[Union[None, cuda.Event]] = [
+ None for _ in range(self._stream_count)]
# Pre-allocate pinned memory for each stream
pinned_input_buffers = [
@@ -297,6 +299,10 @@ class GPUTraceManager(BaseTraceManager):
end = min((chunk + 1) * self._chunk_size,
self._traces.samples.shape[1])
stream = streams[chunk % self._stream_count]
+ event = events[chunk % self._stream_count]
+ if event is not None:
+ event.wait(stream=stream)
+ # stream.synchronize()
pinned_input = pinned_input_buffers[chunk % self._stream_count]
np.copyto(pinned_input, self._traces.samples[:, start:end])
@@ -313,6 +319,9 @@ class GPUTraceManager(BaseTraceManager):
bpg = (end - start + self._tpb - 1) // self._tpb
func[bpg, self._tpb, stream](device_input, *device_outputs)
+ event = cuda.event()
+ event.record(stream=stream)
+ events[chunk % self._stream_count] = event
for output_i, device_output in enumerate(device_outputs):
# Allocating pinned memory for results