add parse_int experiment.

This commit is contained in:
Syoyo Fujita
2025-07-26 05:50:03 +09:00
parent dd9e1649f7
commit c07d4dae6a
3 changed files with 495 additions and 0 deletions

View File

@@ -0,0 +1,12 @@
# Build, clean and smoke-test targets for the parse_int experiment.
# `all` compiles parse_int.cc with clang++-17 against libc++ (-O2 with debug info).
all:
clang++-17 -O2 -g -stdlib=libc++ parse_int.cc -o parse_int
# `clean` removes the built binary and any stray a.out.
clean:
rm -f parse_int a.out
# `test` rebuilds, then runs the benchmark on 1,000,000 elements with a
# trailing delimiter, using 1, 4 and 8 threads
# (arguments: num_elements delim_at_end num_threads).
test: all
./parse_int 1000000 1 1
./parse_int 1000000 1 4
./parse_int 1000000 1 8
.PHONY: all clean test

View File

@@ -0,0 +1,64 @@
# Efficient Integer Array Parser
Based on the efficient float parsing implementation in `../parse_fp`, this is an optimized integer array parser that can handle large arrays with multithreading support.
## Features
- **Fast lexing**: Efficient tokenization of integer arrays in `[1,2,3,...]` format
- **Multithreaded parsing**: Uses `std::from_chars`, spread across worker threads for large arrays
- **Memory efficient**: Zero-copy lexing using spans pointing to original input
- **Robust error handling**: Comprehensive validation and error reporting
- **Configurable**: Support for trailing delimiters and custom separators
## Usage
```bash
make
./parse_int [num_elements] [delim_at_end] [num_threads]
```
### Parameters
- `num_elements`: Number of integers to generate and parse (default: 33554432)
- `delim_at_end`: Allow trailing comma (1=yes, 0=no, default: 1)
- `num_threads`: Number of threads for parsing (default: 1)
### Examples
```bash
# Parse 1M integers with 4 threads
./parse_int 1000000 1 4
# Parse 10M integers, no trailing comma, single-threaded
./parse_int 10000000 0 1
```
## Architecture
### Two-Phase Parsing
1. **Lexing Phase**: Fast scan through input to identify integer boundaries
- Returns `int_lex_span` objects with pointer + length
- Handles whitespace, delimiters, and validation
- O(n) single pass through input
2. **Parsing Phase**: Convert lexed spans to actual integers
- Uses fast `std::from_chars` for conversion
- Automatic multithreading for arrays > 128K elements
- Thread-safe with atomic counters
### Key Data Structures
- `int_lex_span`: Zero-copy span representing an integer token
- `Lexer`: Stateful lexer with position tracking and error reporting
- Worker threads that claim work from a shared atomic counter during the parsing phase
## Performance Notes
- Optimized for large integer arrays (millions of elements)
- Multithreading kicks in automatically for arrays > 131,072 elements
- Uses `std::from_chars` which is typically faster than `std::stoi` or `atoi`
- Memory usage scales linearly with input size
## TODO
- Add support for different integer types (int32, uint64, etc.)
- Implement vector parsing (e.g., `[(1,2), (3,4)]`)
- Add SIMD optimizations for lexing phase
- Support for hexadecimal and binary integer formats

View File

@@ -0,0 +1,419 @@
#include <algorithm>
#include <atomic>
#include <charconv>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <mutex>
#include <random>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
/// Build a bracketed, comma-separated list of `n` random integers, e.g. "[12,-7,3]".
///
/// Values are drawn uniformly from [-1000000, 1000000] using a Mersenne Twister
/// seeded from std::random_device, so the output differs from run to run.
///
/// @param n             number of integers to emit
/// @param delim_at_end  when true a ',' also follows the last element ("[1,2,]")
/// @return the generated text, always starting with '[' and ending with ']'
std::string gen_intarray(size_t n, bool delim_at_end) {
  std::random_device seed_source;
  std::mt19937 rng(seed_source());
  std::uniform_int_distribution<int64_t> value_dist(-1000000, 1000000);

  std::string text = "[";
  for (size_t idx = 0; idx != n; ++idx) {
    text += std::to_string(value_dist(rng));

    // Every element is followed by a separator, except the final one when the
    // caller asked for no trailing delimiter.
    const bool is_last = (idx + 1 == n);
    if (delim_at_end || !is_last) {
      text += ",";
    }
  }
  text += "]";
  return text;
}
/// Cursor-based lexer over the character range [p_begin, p_end).
///
/// The lexer never copies or modifies the input; it only moves `curr` forward
/// and keeps a (row, column) position, both zero-based, for error messages.
/// Error messages are accumulated in an internal string (see push_error()).
struct Lexer {
  /// Attach the lexer to [_p_begin, _p_end) and rewind the cursor.
  ///
  /// @param _p_begin first character of the input
  /// @param _p_end   one past the last character of the input
  /// @param row      initial row used for error reporting
  /// @param column   initial column used for error reporting
  void init(const char *_p_begin, const char *_p_end, size_t row = 0, size_t column = 0) {
    p_begin = _p_begin;
    p_end = _p_end;
    curr = p_begin;
    row_ = row;
    column_ = column;
  }

  /// Advance past any run of ' ', '\t', '\f', '\n', '\r', '\v'.
  /// A "\r\n" pair is consumed together and counted as one line break.
  void skip_whitespaces() {
    while (!eof() && is_space(*curr)) {
      advance_merging_crlf();
    }
  }

  /// Advance until the cursor sits on `delim` or `close_paren`.
  ///
  /// @return true when one of the two characters was found (it is NOT
  ///         consumed); false when the end of input was reached first.
  bool skip_until_delim_or_close_paren(const char delim, const char close_paren) {
    while (!eof()) {
      const char s = *curr;
      if ((s == delim) || (s == close_paren)) {
        return true;
      }
      advance_merging_crlf();
    }
    return false;
  }

  /// Read the current character into `*result` and consume it.
  /// @return false (leaving `*result` untouched) at end of input.
  bool char1(char *result) {
    if (eof()) {
      return false;
    }
    *result = *curr;
    advance_single();
    return true;
  }

  /// Peek at the current character without consuming it.
  /// @return false (leaving `*result` untouched) at end of input.
  bool look_char1(char *result) const {
    if (eof()) {
      return false;
    }
    *result = *curr;
    return true;
  }

  /// Consume the current character.
  ///
  /// Position tracking matches char1(): the column advances for ordinary
  /// characters (previously it was left stale, which made every error
  /// reported after lex_int() point at the wrong column).
  /// @return false at end of input.
  bool consume_char1() {
    if (eof()) {
      return false;
    }
    advance_single();
    return true;
  }

  /// True once the cursor has reached (or passed) the end of the input.
  inline bool eof() const {
    return (curr >= p_end);
  }

  /// Lex an optionally signed run of decimal digits starting at the cursor.
  ///
  /// At most 256 characters are consumed. A token that fills that limit is
  /// flagged as truncated so the caller can skip the remainder.
  ///
  /// @param[out] len       number of characters consumed for the token
  ///                       (0 on failure)
  /// @param[out] truncated true when the 256-character limit was hit;
  ///                       always written, false on failure
  /// @return true when at least one digit was read. On failure, a lone sign
  ///         character may already have been consumed.
  bool lex_int(uint16_t &len, bool &truncated) {
    constexpr size_t n_trunc_chars = 256;

    len = 0;
    truncated = false;

    size_t n = 0;
    bool has_sign = false;
    bool found_digit = false;
    while (!eof() && (n < n_trunc_chars)) {
      const char c = *curr;
      if ((c == '-') || (c == '+')) {
        // A sign is only valid as the very first character of the token.
        if (has_sign || found_digit) {
          break;
        }
        has_sign = true;
      } else if ((c >= '0') && (c <= '9')) {
        found_digit = true;
      } else {
        break;
      }
      advance_single();
      n++;
    }

    if (!found_digit) {
      return false;
    }
    truncated = (n >= n_trunc_chars);
    len = uint16_t(n);
    return true;
  }

  /// Append `msg`, annotated with the current row/column, to the error log.
  void push_error(const std::string &msg) {
    err_ += msg + " (near line " + std::to_string(row_) + ", column " + std::to_string(column_) + ")\n";
  }

  /// All error messages recorded so far, one per line.
  std::string get_error() const {
    return err_;
  }

  const char *p_begin{nullptr};  // start of the input range
  const char *p_end{nullptr};    // one past the end of the input range
  const char *curr{nullptr};     // current read position
  size_t row_{0};                // zero-based line of `curr`
  size_t column_{0};             // zero-based column of `curr`

 private:
  /// The whitespace set recognised by skip_whitespaces().
  static bool is_space(const char s) {
    return (s == ' ') || (s == '\t') || (s == '\f') || (s == '\n') || (s == '\r') || (s == '\v');
  }

  /// Consume exactly one character; '\r' and '\n' each start a new row.
  /// Precondition: !eof().
  /// NOTE: a "\r\n" pair consumed through two separate calls is therefore
  /// counted as two line breaks, as in the original char1().
  void advance_single() {
    const char c = *curr;
    curr++;
    column_++;
    if ((c == '\r') || (c == '\n')) {
      row_++;
      column_ = 0;
    }
  }

  /// Consume one character, swallowing the '\n' of a "\r\n" pair so that the
  /// pair counts as a single line break. Precondition: !eof().
  void advance_merging_crlf() {
    const char c = *curr;
    curr++;
    column_++;
    if (c == '\r') {
      if (!eof() && (*curr == '\n')) {
        curr++;
      }
      row_++;
      column_ = 0;
    } else if (c == '\n') {
      row_++;
      column_ = 0;
    }
  }

  std::string err_;
};
/// Non-owning view of one integer token inside the input buffer:
/// a start pointer plus a character count. Defaults to an empty span.
struct int_lex_span {
  const char *p_begin = nullptr;  ///< first character of the token
  uint16_t length = 0;            ///< number of characters in the token
};
// Fixed-size group of N integer token spans, stored inline.
// NOTE(review): not referenced anywhere else in the visible code; presumably a
// placeholder for lexing N-component tuples -- confirm before relying on it.
template<size_t N>
struct vec_lex_span {
int_lex_span vspans[N];  // the N token spans
};
bool lex_int_array(
const char *p_begin,
const char *p_end,
std::vector<int_lex_span> &result,
std::string &err,
const bool allow_delim_at_last = true,
const char delim = ',',
const char open_paren = '[',
const char close_paren = ']') {
if (p_begin >= p_end) {
err = "Invalid input\n";
return false;
}
Lexer lexer;
lexer.p_begin = p_begin;
lexer.p_end = p_end;
lexer.curr = p_begin;
char c;
if (!lexer.char1(&c)) {
err = "Input too short.\n";
return false;
}
if (c != open_paren) {
err = "Input does not begin with open parenthesis character.\n";
return false;
}
lexer.skip_whitespaces();
while (!lexer.eof()) {
bool prev_is_delim = false;
{
char c;
if (!lexer.look_char1(&c)) {
lexer.push_error("Invalid character found.");
err = lexer.get_error();
return false;
}
if (c == delim) {
if (result.empty()) {
lexer.push_error("Array element starts with the delimiter character.");
err = lexer.get_error();
return false;
}
prev_is_delim = true;
lexer.consume_char1();
}
lexer.skip_whitespaces();
}
{
char c;
if (!lexer.look_char1(&c)) {
lexer.push_error("Failed to read a character.");
err = lexer.get_error();
return false;
}
if (c == close_paren) {
if (prev_is_delim) {
if (allow_delim_at_last) {
return true;
} else {
lexer.push_error("Delimiter character is not allowed before the closing parenthesis character.");
err = lexer.get_error();
return false;
}
} else {
return true;
}
}
}
int_lex_span sp;
sp.p_begin = lexer.curr;
uint16_t length{0};
bool truncated{false};
if (!lexer.lex_int(length, truncated)) {
lexer.push_error("Input is not an integer literal.");
err = lexer.get_error();
return false;
}
sp.length = length;
if (truncated) {
if (!lexer.skip_until_delim_or_close_paren(delim, close_paren)) {
lexer.push_error("Failed to seek to delimiter or closing parenthesis character.");
err = lexer.get_error();
return false;
}
}
result.emplace_back(std::move(sp));
lexer.skip_whitespaces();
}
return true;
}
bool do_parse(
uint32_t nthreads,
const std::vector<int_lex_span> &spans,
std::vector<int64_t> &results) {
auto start = std::chrono::steady_clock::now();
results.resize(spans.size());
if (spans.size() > (1024*128)) {
nthreads = (std::min)((std::max)(1u, nthreads), 256u);
std::mutex mutex;
std::atomic<size_t> cnt(0);
std::atomic<bool> parse_failed{false};
std::vector<std::thread> threads;
for (uint32_t i = 0; i < nthreads; i++) {
threads.emplace_back(std::thread([&] {
size_t j;
while ((j = cnt++) < results.size()) {
int64_t val;
auto answer = std::from_chars(spans[j].p_begin, spans[j].p_begin + spans[j].length, val);
if (answer.ec != std::errc()) {
parse_failed = true;
}
results[j] = val;
}
}));
}
for (auto &&th : threads) {
th.join();
}
if (parse_failed) {
std::cerr << "parsing failure\n";
return false;
}
} else {
for (size_t i = 0; i < spans.size(); i++) {
int64_t val;
auto answer = std::from_chars(spans[i].p_begin, spans[i].p_begin + spans[i].length, val);
if (answer.ec != std::errc()) {
std::cerr << "parsing failure\n";
return false;
}
results[i] = val;
}
}
auto end = std::chrono::steady_clock::now();
std::cout << "n threads: " << nthreads << "\n";
std::cout << "n elems: " << spans.size() << "\n";
std::cout << "parse time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " [ms]\n";
return true;
}
/// Benchmark driver: generate a random integer array, lex it, then parse it.
///
/// Usage: parse_int [num_elements] [delim_at_end] [num_threads]
///  - num_elements: how many integers to generate (default 32 * 1024 * 1024)
///  - delim_at_end: > 0 emits a trailing delimiter (default 1)
///  - num_threads:  worker threads requested for the parse phase (default 1)
///
/// @return 0 on success, -1 on invalid arguments or a lex/parse failure.
int main(int argc, char **argv) {
  uint32_t nthreads = 1;
  bool delim_at_end = true;
  size_t n = 1024 * 1024 * 32;

  // Strict numeric argument parsing: the whole argument must be a number that
  // fits the destination type. Unlike std::stoi this never throws, and a
  // negative value cannot wrap around into a huge unsigned count.
  const auto parse_arg = [](const char *text, auto &out) {
    const std::string arg(text);
    const char *first = arg.data();
    const char *last = first + arg.size();
    const auto res = std::from_chars(first, last, out);
    return (res.ec == std::errc()) && (res.ptr == last);
  };

  int delim_flag = 1;
  bool args_ok = true;
  if (argc > 1) {
    args_ok = parse_arg(argv[1], n) && args_ok;
  }
  if (argc > 2) {
    args_ok = parse_arg(argv[2], delim_flag) && args_ok;
  }
  if (argc > 3) {
    args_ok = parse_arg(argv[3], nthreads) && args_ok;
  }
  if (!args_ok) {
    std::cerr << "Usage: " << argv[0] << " [num_elements] [delim_at_end] [num_threads]\n";
    return -1;
  }
  delim_at_end = delim_flag > 0;

  std::vector<int_lex_span> lex_results;
  lex_results.reserve(n);

  // Generated outside the timed region; the spans point into this string, so
  // it must stay alive until parsing has finished.
  const std::string input = gen_intarray(n, delim_at_end);

  const auto start = std::chrono::steady_clock::now();
  std::string err;
  if (!lex_int_array(input.c_str(), input.c_str() + input.size(), lex_results, err)) {
    std::cerr << "parse error\n";
    std::cerr << err << "\n";
    return -1;
  }
  const auto end = std::chrono::steady_clock::now();

  std::cout << "n elems " << lex_results.size() << "\n";
  std::cout << "size " << input.size() << "\n";
  std::cout << "lex time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " [ms]\n";

  std::vector<int64_t> parse_results;
  parse_results.reserve(n);
  // Propagate a parse failure through the exit code instead of dropping it.
  if (!do_parse(nthreads, lex_results, parse_results)) {
    return -1;
  }
  return 0;
}