| author | Tomáš Jusko | 2023-09-12 12:57:36 +0200 |
|---|---|---|
| committer | Tomáš Jusko | 2023-09-12 12:57:36 +0200 |
| commit | b8b273c33bc2cda29d77d69f9f273c5f9ac0046e (patch) | |
| tree | 144c28b3f7e0aef0b3a46782096f0d17f7ad8d87 | /pyecsca/sca |
| parent | 88b6cab0279f131388c2795adf4630789626fe0b (diff) | |
| download | pyecsca-b8b273c33bc2cda29d77d69f9f273c5f9ac0046e.tar.gz, pyecsca-b8b273c33bc2cda29d77d69f9f273c5f9ac0046e.tar.zst, pyecsca-b8b273c33bc2cda29d77d69f9f273c5f9ac0046e.zip | |
feat: Changed streams to a fixed number
Diffstat (limited to 'pyecsca/sca')
| | | |
|---|---|---|
| -rw-r--r-- | pyecsca/sca/stacked_traces/combine.py | 23 |

1 file changed, 13 insertions(+), 10 deletions(-)
```diff
diff --git a/pyecsca/sca/stacked_traces/combine.py b/pyecsca/sca/stacked_traces/combine.py
index 40da4b8..160c486 100644
--- a/pyecsca/sca/stacked_traces/combine.py
+++ b/pyecsca/sca/stacked_traces/combine.py
@@ -119,7 +119,7 @@ class GPUTraceManager(BaseTraceManager):
 
         if chunk_size is not None and chunk_size <= 0:
             raise ValueError("Chunk size should be positive")
-
+
         chunk = (chunk
                  or chunk_size is not None
                  or chunk_memory_ratio is not None)
@@ -231,8 +231,8 @@ class GPUTraceManager(BaseTraceManager):
             self._traces.samples.shape[1] + self._chunk_size - 1
         ) // self._chunk_size
 
-        data_stream = cuda.stream()
-        compute_stream = cuda.stream()
+        stream_count = 4
+        streams = [cuda.stream() for _ in range(stream_count)]
 
         chunk_results: List[List[npt.NDArray[np.number]]] = [
             list()
@@ -245,23 +245,26 @@ class GPUTraceManager(BaseTraceManager):
 
             device_input = cuda.to_device(
                 self._traces.samples[:, start:end],
-                stream=data_stream
+                stream=streams[chunk % stream_count]
             )
             device_outputs = [cuda.device_array(
-                end - start, stream=data_stream) for _ in range(output_count)]
+                end - start,
+                stream=streams[chunk % stream_count])
+                for _ in range(output_count)]
             bpg = (end - start + self._tpb - 1) // self._tpb
 
-            func[bpg, self._tpb, compute_stream](
+            func[bpg, self._tpb, streams[chunk % stream_count]](
                 device_input,
                 *device_outputs
            )
-            compute_stream.synchronize()
+            # streams[chunk % stream_count].synchronize()
 
             for output_i, device_output in enumerate(device_outputs):
                 chunk_results[output_i].append(
-                    device_output.copy_to_host(stream=data_stream))
-
-            data_stream.synchronize()
+                    device_output.copy_to_host(
+                        stream=streams[chunk % stream_count]))
+            # data_stream.synchronize()
+        cuda.synchronize()
 
         return [np.concatenate(chunk_result)
                 for chunk_result in chunk_results]
```
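The change replaces the previous two-stream split (one stream for transfers, one for compute, synchronized after every chunk) with a fixed pool of four streams assigned round-robin and a single `cuda.synchronize()` barrier after the loop. Dropping the per-chunk synchronization lets the driver overlap one chunk's host-device copies with another chunk's kernel execution. Below is a minimal, self-contained sketch of the same pattern in Numba CUDA; the kernel and function names and the parameter defaults (`average_kernel`, `chunked_average`, `chunk_size`, `tpb`) are illustrative stand-ins, not pyecsca API — only the stream-pool scheme mirrors the patch.

```python
# Minimal sketch of round-robin CUDA streams, assuming a hypothetical
# column-averaging workload. Requires numba and a CUDA-capable GPU.
import numpy as np
from numba import cuda


@cuda.jit
def average_kernel(samples, out):
    # One thread per column: average that column over all rows (traces).
    i = cuda.grid(1)
    if i < samples.shape[1]:
        acc = 0.0
        for row in range(samples.shape[0]):
            acc += samples[row, i]
        out[i] = acc / samples.shape[0]


def chunked_average(samples, chunk_size=1024, tpb=128, stream_count=4):
    # Fixed pool of streams, as in the patch.
    streams = [cuda.stream() for _ in range(stream_count)]
    results = []
    n = samples.shape[1]
    for chunk, start in enumerate(range(0, n, chunk_size)):
        end = min(start + chunk_size, n)
        stream = streams[chunk % stream_count]  # round-robin assignment
        # Upload, kernel, and download are enqueued on the same stream so
        # they stay ordered relative to each other, while remaining free
        # to overlap with work queued on the other streams.
        dev_in = cuda.to_device(
            np.ascontiguousarray(samples[:, start:end]), stream=stream)
        dev_out = cuda.device_array(end - start, stream=stream)
        bpg = (end - start + tpb - 1) // tpb
        average_kernel[bpg, tpb, stream](dev_in, dev_out)
        # No per-chunk synchronize here, matching the commented-out calls.
        results.append(dev_out.copy_to_host(stream=stream))
    cuda.synchronize()  # single barrier after the loop, as in the patch
    return np.concatenate(results)


if __name__ == "__main__":
    traces = np.random.rand(32, 5000).astype(np.float32)
    print(np.allclose(chunked_average(traces), traces.mean(axis=0), atol=1e-5))
```

One caveat on the design: `copy_to_host(stream=...)` into ordinary pageable host memory degrades to an effectively synchronous copy at the CUDA level, so pinned buffers (e.g. via `cuda.pinned_array`) would be needed for the downloads themselves to overlap; the kernel launches on different streams can still run concurrently either way.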
