| author | Tomáš Jusko | 2023-09-12 12:57:36 +0200 |
|---|---|---|
| committer | Tomáš Jusko | 2023-09-12 12:57:36 +0200 |
| commit | b8b273c33bc2cda29d77d69f9f273c5f9ac0046e (patch) | |
| tree | 144c28b3f7e0aef0b3a46782096f0d17f7ad8d87 | /pyecsca/sca |
| parent | 88b6cab0279f131388c2795adf4630789626fe0b (diff) | |
| download | pyecsca-b8b273c33bc2cda29d77d69f9f273c5f9ac0046e.tar.gz, pyecsca-b8b273c33bc2cda29d77d69f9f273c5f9ac0046e.tar.zst, pyecsca-b8b273c33bc2cda29d77d69f9f273c5f9ac0046e.zip | |
feat: Changed streams to a fixed number
Diffstat (limited to 'pyecsca/sca')
| | | |
|---|---|---|
| -rw-r--r-- | pyecsca/sca/stacked_traces/combine.py | 23 |

1 file changed, 13 insertions(+), 10 deletions(-)
```diff
diff --git a/pyecsca/sca/stacked_traces/combine.py b/pyecsca/sca/stacked_traces/combine.py
index 40da4b8..160c486 100644
--- a/pyecsca/sca/stacked_traces/combine.py
+++ b/pyecsca/sca/stacked_traces/combine.py
@@ -119,7 +119,7 @@ class GPUTraceManager(BaseTraceManager):
 
         if chunk_size is not None and chunk_size <= 0:
             raise ValueError("Chunk size should be positive")
-
+
         chunk = (chunk
                  or chunk_size is not None
                  or chunk_memory_ratio is not None)
@@ -231,8 +231,8 @@ class GPUTraceManager(BaseTraceManager):
             self._traces.samples.shape[1] + self._chunk_size - 1
         ) // self._chunk_size
 
-        data_stream = cuda.stream()
-        compute_stream = cuda.stream()
+        stream_count = 4
+        streams = [cuda.stream() for _ in range(stream_count)]
 
         chunk_results: List[List[npt.NDArray[np.number]]] = [
             list()
@@ -245,23 +245,26 @@ class GPUTraceManager(BaseTraceManager):
 
             device_input = cuda.to_device(
                 self._traces.samples[:, start:end],
-                stream=data_stream
+                stream=streams[chunk % stream_count]
             )
             device_outputs = [cuda.device_array(
-                end - start, stream=data_stream) for _ in range(output_count)]
+                end - start,
+                stream=streams[chunk % stream_count])
+                for _ in range(output_count)]
             bpg = (end - start + self._tpb - 1) // self._tpb
 
-            func[bpg, self._tpb, compute_stream](
+            func[bpg, self._tpb, streams[chunk % stream_count]](
                 device_input,
                 *device_outputs
            )
-            compute_stream.synchronize()
+            # streams[chunk % stream_count].synchronize()
 
             for output_i, device_output in enumerate(device_outputs):
                 chunk_results[output_i].append(
-                    device_output.copy_to_host(stream=data_stream))
-
-            data_stream.synchronize()
+                    device_output.copy_to_host(
+                        stream=streams[chunk % stream_count]))
+            # data_stream.synchronize()
+        cuda.synchronize()
 
         return [np.concatenate(chunk_result)
                 for chunk_result in chunk_results]
```
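The change replaces the previous two-stream split (one stream for transfers, one for compute, synchronized after every chunk) with a fixed pool of four streams assigned round-robin and a single `cuda.synchronize()` barrier after the loop. Dropping the per-chunk synchronization lets the driver overlap one chunk's host-device copies with another chunk's kernel execution. Below is a minimal, self-contained sketch of the same pattern in Numba CUDA; the kernel and function names and the parameter defaults (`average_kernel`, `chunked_average`, `chunk_size`, `tpb`) are illustrative stand-ins, not pyecsca API — only the stream-pool scheme mirrors the patch.

```python
# Minimal sketch of round-robin CUDA streams, assuming a hypothetical
# column-averaging workload. Requires numba and a CUDA-capable GPU.
import numpy as np
from numba import cuda


@cuda.jit
def average_kernel(samples, out):
    # One thread per column: average that column over all rows (traces).
    i = cuda.grid(1)
    if i < samples.shape[1]:
        acc = 0.0
        for row in range(samples.shape[0]):
            acc += samples[row, i]
        out[i] = acc / samples.shape[0]


def chunked_average(samples, chunk_size=1024, tpb=128, stream_count=4):
    # Fixed pool of streams, as in the patch.
    streams = [cuda.stream() for _ in range(stream_count)]
    results = []
    n = samples.shape[1]
    for chunk, start in enumerate(range(0, n, chunk_size)):
        end = min(start + chunk_size, n)
        stream = streams[chunk % stream_count]  # round-robin assignment
        # Upload, kernel, and download are enqueued on the same stream so
        # they stay ordered relative to each other, while remaining free
        # to overlap with work queued on the other streams.
        dev_in = cuda.to_device(
            np.ascontiguousarray(samples[:, start:end]), stream=stream)
        dev_out = cuda.device_array(end - start, stream=stream)
        bpg = (end - start + tpb - 1) // tpb
        average_kernel[bpg, tpb, stream](dev_in, dev_out)
        # No per-chunk synchronize here, matching the commented-out calls.
        results.append(dev_out.copy_to_host(stream=stream))
    cuda.synchronize()  # single barrier after the loop, as in the patch
    return np.concatenate(results)


if __name__ == "__main__":
    traces = np.random.rand(32, 5000).astype(np.float32)
    print(np.allclose(chunked_average(traces), traces.mean(axis=0), atol=1e-5))
```

One caveat on the design: `copy_to_host(stream=...)` into ordinary pageable host memory degrades to an effectively synchronous copy at the CUDA level, so pinned buffers (e.g. via `cuda.pinned_array`) would be needed for the downloads themselves to overlap; the kernel launches on different streams can still run concurrently either way.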
