Context
I was looking at Shell script to count chess game outcomes, where the run time of my answer is dominated by the grep command which selects lines beginning with [Result. I reckoned that a special-purpose filter could be much faster, but still general enough to be useful for similar problems:
- It doesn't need a full regexp engine, but unlike grep -F, it only needs to look at the start of each line.
- The order in which the lines are emitted doesn't matter (as long as they are intact - not intercut with parts of other lines).
- We don't need to maintain bookkeeping for the lines, as we are not printing any location information.
- We only need to read from regular files which can be memory-mapped.
The result is this program, which can be executed in either of two ways: with a list of files following the prefix to match, or with the list of files supplied on standard input - as emitted by find -print0.
The code
#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <mutex>
#include <iostream>
#include <print>
#include <string>
#include <string_view>
#include <vector>
#include <unistd.h>
#include <sys/uio.h>
#include <boost/interprocess/file_mapping.hpp>
#include <boost/interprocess/mapped_region.hpp>
// View that chunks a std::string_view into its constituent lines.
// The underlying content is expected to represent a text file -
// comprising complete lines, each ending with newline.
class lines_view
{
char const *const first;
char const *const last;
public:
// The passed string view must remain alive until after the last
// access through this object's iterators.
lines_view(std::string_view sv)
: first{sv.data()},
last{std::invoke([sv]{
// Last iterator points to the character after the final
// newline - usually same as `sv.end()`. If the text file is
// malformed (missing its final newline), the incomplete final line
// is excluded.
if (auto last_nl = sv.rfind('\n'); last_nl == std::string_view::npos) {
return sv.data();
} else {
return sv.data() + last_nl + 1;
}
})}
{}
struct line_iterator
{
// Can't be forward_iterator because we return a string-view
// rather than a real reference. But it does provide the
// multipass guarantee like forward iterators.
using iterator_category = std::input_iterator_tag;
using difference_type = std::ptrdiff_t;
using value_type = std::string_view;
using pointer = value_type const*;
using reference = value_type;
char const* p; // start of line
mutable char const* nl = nullptr; // one past the terminating newline, once located
// lazy accessor - must not be called on the end iterator
auto *get_nl() const
{
return nl ? nl : (nl = std::strchr(p, '\n') + 1);
}
public:
bool operator==(const line_iterator& other) const {
return p == other.p;
}
// The returned string view is valid for as long as the
// underlying string view that was passed to lines_view
// constructor.
value_type operator*() const
{
return {p, static_cast<std::string_view::size_type>(get_nl() - p)};
}
line_iterator& operator++()
{
p = get_nl();
nl = nullptr;
return *this;
}
line_iterator operator++(int)
{
auto it = *this;
++*this;
return it;
}
};
auto begin() const
{
return line_iterator{first};
}
auto end() const
{
return line_iterator{last};
}
};
// This might be overkill, as I've never seen writev() write less than
// requested. But it's certainly permitted to.
static ssize_t writev_all(int fd, iovec *outputs, int count)
{
ssize_t total = 0;
while (count > 0) {
static auto const max_iov = static_cast<int>(sysconf(_SC_IOV_MAX));
auto n = max_iov > 0 && count > max_iov ? max_iov : count;
auto w_ret = writev(fd, outputs, n);
if (w_ret < 0) {
return w_ret;
}
total += w_ret;
auto written = static_cast<std::size_t>(w_ret);
for (; count > 0 && outputs->iov_len <= written; ++outputs, --count) [[likely]] {
written -= outputs->iov_len;
}
if (count > 0 && written > 0) [[unlikely]] {
outputs->iov_base = static_cast<char*>(outputs->iov_base) + written;
outputs->iov_len -= written;
}
}
return total;
}
static int grep_single_file(std::string_view const prefix,
char const *const filename)
{
namespace bi = boost::interprocess;
// This is the only function that writes to standard output stream,
// so we control the access here.
static std::mutex out_mutex;
std::vector<iovec> outputs = {};
try {
bi::file_mapping fm(filename, bi::read_only);
bi::mapped_region mr(fm, bi::read_only);
std::string_view sv(static_cast<char const*>(mr.get_address()), mr.get_size());
for (auto line: lines_view(sv)) {
if (line.starts_with(prefix)) [[unlikely]] {
// iovec has writable char* for use with readv() - treated as const by writev()
outputs.emplace_back(const_cast<char*>(line.data()), line.size());
}
}
std::unique_lock lock{out_mutex};
if (writev_all(STDOUT_FILENO, outputs.data(), static_cast<int>(outputs.size())) < 0) {
perror("writev");
return EXIT_FAILURE;
}
} catch (std::exception& e) {
std::println(stderr, "{}: {}", filename, e.what());
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
static auto& read_nul_terminated_line(std::string& filename)
{
// This is the only function that reads from standard input stream,
// so we control the access here.
static std::mutex in_mutex;
std::unique_lock lock{in_mutex};
return std::getline(std::cin, filename, '\0');
}
int main(int argc, char **argv)
{
if (argc < 2) {
std::println(stderr, "Usage: {} PREFIX [FILE...]", argv[0] ? argv[0] : "program");
return EXIT_FAILURE;
}
std::string_view const prefix(argv[1]);
// Any thread may declare failure
std::atomic<int> status = EXIT_SUCCESS;
if (argc == 2) {
// Filenames are supplied on stdin.
#pragma omp parallel
{
std::string filename;
while (status == EXIT_SUCCESS && read_nul_terminated_line(filename)) {
if (grep_single_file(prefix, filename.c_str())) {
status = EXIT_FAILURE;
}
}
}
} else {
// argc > 2: Filenames are supplied as arguments.
// Dynamic schedule required because files vary in length.
// GCC's "guided,1" over-populates first thread: see
// <URL: https://stackoverflow.com/a/43047074/4850040 >
#pragma omp parallel for schedule(dynamic,1)
for (int i = 2; i < argc; ++i) {
if (status == EXIT_SUCCESS) {
char const *const fname = argv[i];
if (grep_single_file(prefix, fname)) {
status = EXIT_FAILURE;
}
}
}
}
// Report errors found during parallel loop (where we couldn't
// return directly).
return status;
}
Input data
I used input data from ChessData on GitHub (Warning: 3.6 GB), which contains around 10 million matching lines across 7.2 GB of input text. Some of these files contain single games, but many are concatenations of hundreds or thousands. Here's a simple order-of-magnitude distribution of the file sizes:
$ find ChessData/ -type f -name '*.pgn' -exec stat -c '%s' {} + |
> sort -n | tr '[0-9]' x | uniq -c
8 xxx
107 xxxx
916 xxxxx
723 xxxxxx
1310 xxxxxxx
144 xxxxxxxx
Functional test
I made sure it reliably feeds comparable input to the AWK script in the linked question. The output is the same whether the file names are read from stdin or passed as arguments, with GNU grep used as the baseline.
find ChessData/ -type f -name '*.pgn' -print0 | ./78086a '[Result ' |./78086.awk
find ChessData/ -type f -name '*.pgn' -exec ./78086a '[Result ' {} + |./78086.awk
find ChessData/ -type f -name '*.pgn' -exec grep -h '^\[Result ' {} + |./78086.awk
10316025 3956979 3035935 3323111
10316025 3956979 3035935 3323111
10316025 3956979 3035935 3323111
Performance
I compiled with GCC 14, with -O3 and -march tuned for my Intel i7-3770.
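For reference, a plausible build command (the source file name and exact -march value are my assumptions; Boost.Interprocess is header-only, and the OpenMP pragmas need -fopenmp):

g++ -std=c++23 -O3 -march=native -fopenmp -o 78086a 78086a.cpp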
I ran the same three commands, each with a pipe through cat to stop grep from spotting that its output goes to /dev/null and effectively turning on -q; the pipe is in all three commands for fairness.
time find ChessData/ -type f -name '*.pgn' -print0 | ./78086a '[Result ' | cat >/dev/null
time find ChessData/ -type f -name '*.pgn' -exec ./78086a '[Result ' {} + | cat >/dev/null
time find ChessData/ -type f -name '*.pgn' -exec grep -h '^\[Result ' {} + | cat >/dev/null
After a couple of runs (populating file cache), the time stabilised; here's a representative run:
real 0m0.991s
user 0m4.236s
sys 0m1.757s
real 0m0.981s
user 0m4.229s
sys 0m1.695s
real 0m5.911s
user 0m5.035s
sys 0m1.009s
As you can see, we've taken around 16% off the processing (user) time, but more importantly, the time to completion (real) is reduced by a factor of around 6. On my system it saturated three cores; parallelising further didn't increase throughput any more.
Ideas not used
In the filenames-as-arguments path, I tried sorting the files by size as reported by stat(), largest first, in the hope that this would reduce the straggler effect (as some of the inputs are many megabytes, comprising multiple games). And in the filenames-via-stdin path, I tried something similar with a std::priority_queue between the filename-reading thread and the grep workers. In both cases, this produced no measurable benefit.
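For the record, a minimal sketch of the sort-by-size idea as I understand it (my reconstruction, not the code that was actually benchmarked):

#include <algorithm>
#include <functional>
#include <utility>
#include <vector>
#include <sys/stat.h>

// Sort the file list largest-first before the parallel loop, so long files
// are dispatched early and short files fill the gaps at the end.
static std::vector<char*> sorted_by_size(char **first, char **last)
{
    std::vector<std::pair<off_t, char*>> sized;
    for (char **p = first; p != last; ++p) {
        struct stat st {};
        sized.emplace_back(::stat(*p, &st) == 0 ? st.st_size : 0, *p);
    }
    std::ranges::sort(sized, std::ranges::greater{},
                      &std::pair<off_t, char*>::first);
    std::vector<char*> files;
    for (auto& [size, name] : sized) {
        files.push_back(name);
    }
    return files;
}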
2 Answers
Create wrappers for the C parts
You are using low-level POSIX functions that have C APIs, and they clash with the modern C++ code. Seeing std::string_views and char*s used in the same places is jarring, and it's hard to tell if you are doing things correctly and safely.
I strongly recommend that you take all the C parts from your code and write nice and clean C++ wrappers for them. Then you can have a clear distinction between the C and C++ parts of your code. I'd like to see the C++ parts look like this:
int grep_single_file(std::string_view prefix, std::filesystem::path filename) {
    ...
    try {
        ...
        auto sv = to_string_view(mr);
        auto iov = lines_view(sv)
                 | std::views::filter([&](auto line){ return line.starts_with(prefix); })
                 | std::ranges::to<io_vector>();
        std::unique_lock lock{out_mutex};
        iov.write(STDOUT_FILENO);
    } ...
}
This requires adding a class io_vector that internally deals with a range of struct iovecs, and a function to_string_view() that takes a boost::interprocess::mapped_region and turns it into a std::string_view.
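A sketch of what those helpers might look like (the bodies are my guesses at the intent; writev_all() here is the function from the question, and errors become exceptions):

#include <cerrno>
#include <string_view>
#include <system_error>
#include <vector>
#include <sys/uio.h>
#include <boost/interprocess/mapped_region.hpp>

inline std::string_view to_string_view(boost::interprocess::mapped_region const& mr)
{
    return {static_cast<char const*>(mr.get_address()), mr.get_size()};
}

// Thin owner of a std::vector<iovec>; push_back() makes it a valid target
// for std::ranges::to, and write() sends the whole batch to one fd.
class io_vector
{
    std::vector<iovec> iov_;
public:
    void push_back(std::string_view line)
    {
        // iov_base is non-const (for readv()); writev() treats it as const
        iov_.push_back({const_cast<char*>(line.data()), line.size()});
    }
    void write(int fd)
    {
        // writev_all() as defined in the question's code
        if (writev_all(fd, iov_.data(), static_cast<int>(iov_.size())) < 0) {
            throw std::system_error(errno, std::generic_category(), "writev");
        }
    }
};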
Write idiomatic C++ code
Don't return POSIX-style status codes from C++ functions; instead throw errors, return a bool, or perhaps even a std::expected<void, E>.

In main(), I would also just write std::atomic<bool> success = true, and only at the end convert it to an exit code: return success ? EXIT_SUCCESS : EXIT_FAILURE;

Don't call perror(); use std::println(std::cerr, ...) instead (note: std::cerr, not stderr).
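Taken together, a hedged sketch of how those three suggestions might combine (the error type, message text, and stub body are my choices):

#include <atomic>
#include <cerrno>
#include <cstdlib>
#include <expected>
#include <iostream>
#include <print>
#include <string>
#include <string_view>
#include <system_error>

// Stub standing in for the real scanning work; on failure it returns the
// errno message instead of calling perror().
std::expected<void, std::string> grep_single_file(std::string_view /*prefix*/,
                                                  char const *filename)
{
    if (filename == nullptr) {
        return std::unexpected(
            std::error_code(EINVAL, std::generic_category()).message());
    }
    return {};
}

int main()
{
    std::atomic<bool> success = true;
    if (auto r = grep_single_file("[Result ", nullptr); !r) {
        std::println(std::cerr, "error: {}", r.error());  // std::cerr, not stderr
        success = false;
    }
    return success ? EXIT_SUCCESS : EXIT_FAILURE;
}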
Be consistent
Is it filename or fname? Why are you using std::string_view in many places, but then use two char*s for first and last in class lines_view, only to create std::string_views from them later?
Why does writev_all() take an int count when the size of outputs is clearly a std::size_t?
In the latter case I would not even try to make it act like the POSIX function writev() does; instead, make it more idiomatic C++: return void, throw any errors you encounter, and pass a std::span<iovec> or something equivalent.
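A sketch of that variant (my reading of the suggestion; the retry logic mirrors the original writev_all()):

#include <cerrno>
#include <cstddef>
#include <span>
#include <system_error>
#include <sys/uio.h>
#include <unistd.h>

// Write every buffer in `outputs`, retrying on short writes; throws on error.
void writev_all(int fd, std::span<iovec> outputs)
{
    while (!outputs.empty()) {
        static long const max_iov = sysconf(_SC_IOV_MAX);
        auto n = outputs.size();
        if (max_iov > 0 && n > static_cast<std::size_t>(max_iov)) {
            n = static_cast<std::size_t>(max_iov);
        }
        auto w_ret = ::writev(fd, outputs.data(), static_cast<int>(n));
        if (w_ret < 0) {
            throw std::system_error(errno, std::generic_category(), "writev");
        }
        auto written = static_cast<std::size_t>(w_ret);
        // drop fully-written buffers from the front of the span
        while (!outputs.empty() && outputs.front().iov_len <= written) {
            written -= outputs.front().iov_len;
            outputs = outputs.subspan(1);
        }
        if (!outputs.empty() && written > 0) {
            // partially-written buffer: advance its base pointer
            outputs.front().iov_base =
                static_cast<char*>(outputs.front().iov_base) + written;
            outputs.front().iov_len -= written;
        }
    }
}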
I'm not sure mmap is the most efficient way to read small files, especially without MAP_POPULATE; a PGN tends to be less than 4 KiB, so not even one full page. You're paying one round-trip to the kernel for an mmap system call, then another for a page fault when you read the page. With less than one page of file data, that page-fault cost doesn't even get amortized by fault-ahead / fault-around wiring up nearby pages into the page table, like it would when reading through a big file.
Letting a read system call copy into a per-thread 8K buffer that you reuse should be good for this use-case; I don't think read even needs to invalidate any TLB entries when the destination page has already been touched, since it doesn't need to allocate a freshly zeroed page (like it would if you were reading into a large buffer you'd just allocated with mmap(MAP_ANONYMOUS), directly or via a large malloc / new).
Small-file I/O overhead is definitely a challenge here. io_uring might be useful for submitting multiple system calls to the kernel with a single request, although I haven't used it myself. (man page)
I notice your version has higher sys time than grep. That might be from page-fault handlers, and from not buffering your output I/O (one writev system call per input file, which for your data set is one writev per output line unless you have any big files with many PGNs concatenated).
Obviously it's a tradeoff between development time vs. performance, and between specializing your tool for this use-case vs. making it generally useful. Perhaps if each thread had its own C++ output stream open on STDOUT_FD, you could get buffering that way; a sketch of one way to read that idea follows. But OpenMP for threading makes it harder to get fancy, I think.
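One possible reading of that buffering idea, as a sketch (the threshold and flush strategy are my invention; a real version must also flush each thread's remainder before exit):

#include <mutex>
#include <string>
#include <string_view>
#include <unistd.h>

// Accumulate matched lines per thread and write them out in large batches,
// instead of one writev() per input file.
void emit_line(std::string_view line)
{
    static std::mutex out_mutex;
    thread_local std::string buf;
    buf += line;
    if (buf.size() >= 64 * 1024) {  // arbitrary flush threshold
        std::unique_lock lock{out_mutex};
        // a robust version would loop on short writes
        (void)::write(STDOUT_FILENO, buf.data(), buf.size());
        buf.clear();
    }
}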
Use libc search functions which are hand-written in asm
memmem would be a good choice for scanning for the user-specified string inside a known-length buffer, instead of doing a bunch of std::string and std::string_view line-at-a-time work. Or strstr if you want to stick to standard functions; memmem is available on GNU and *BSD systems, and in the MUSL libc, and works with explicit-length strings and buffers like your program does.
Your "needle" for it can be the user's string prepended with '\n'. I guess that makes the first line of the file a special case, so memcmp that - unless you can read into a buffer that starts with a fake newline, so your search sees a newline before the first line of the file. For large files you might want to mmap, or at least page-align your read buffer, but for small files this could be a good trick. So e.g. alignas(4096) static char buf[8192]; (or thread_local?) with buf[0] = '\n';, and for each small file, read(fd, buf+1, sizeof(buf)-1);.
Unfortunately open doesn't tell you the file length, and fstat is an extra system call, so maybe it's best to just start with a smallish read, and only go to mmap if that doesn't reach EOF.
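Putting the last two paragraphs together, a sketch of the open/read flow (the buffer size, fallback policy, and names are my assumptions):

#include <cstddef>
#include <fcntl.h>
#include <unistd.h>

// Try one small read first; only fall back to mmap (or chunked reads) if the
// file turns out to be bigger than the buffer.
void scan_file(char const *filename)
{
    alignas(4096) thread_local char buf[8192];
    int fd = ::open(filename, O_RDONLY);
    if (fd < 0) {
        return;  // real code would report the error
    }
    buf[0] = '\n';  // fake newline so a '\n'-prefixed needle matches line 1
    ssize_t n = ::read(fd, buf + 1, sizeof(buf) - 1);
    if (n >= 0 && static_cast<std::size_t>(n) < sizeof(buf) - 1) {
        // reached EOF: the whole file is in buf[0 .. n], scan it here
    } else if (n >= 0) {
        // file may be larger than the buffer: mmap it (or keep read()ing)
    }
    ::close(fd);
}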
These search functions are typically heavily optimized in good libc implementations, usually with hand-written SIMD asm tuned for the system's CPU ISA and extensions (e.g. SSE4 or AVX2 on x86-64). They may have more startup overhead than memchr (which, unlike strchr, can easily check that it's not too close to the end of the buffer to do a full SIMD vector load right away).
You expect that a PGN will have one [Result line per maybe 1 KiB or so, but often not in the first couple of lines, so stopping at each newline is more work than ideal. And your code doesn't stop at the first match in a file, allowing for big files of concatenated PGNs, where memmem (or strstr) can shine, scanning fast through all the text between [Result lines. You do still want memchr to find the ending newline of the lines you've found.
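A sketch of the whole-buffer scan described above (memmem() is a GNU/BSD extension, available via <cstring> on glibc; the emit callback and first-line handling are my framing):

#include <cstring>      // memmem() - GNU/BSD extension, ::memmem on glibc
#include <string>
#include <string_view>

// Find every line in `buf` starting with `prefix`, using a '\n'-prefixed
// needle so memmem() skips non-matching text fast; lines keep their newline.
template <typename Emit>
void scan_buffer(std::string_view buf, std::string_view prefix, Emit emit)
{
    char const *const buf_end = buf.data() + buf.size();
    // The needle can't match the file's first line (no preceding '\n'),
    // so check that line separately.
    if (buf.starts_with(prefix)) {
        if (auto nl = static_cast<char const*>(
                std::memchr(buf.data(), '\n', buf.size()))) {
            emit(std::string_view(buf.data(),
                                  static_cast<std::size_t>(nl + 1 - buf.data())));
        }
    }
    std::string const needle = '\n' + std::string(prefix);
    char const *p = buf.data();
    while (p < buf_end) {
        auto hit = static_cast<char const*>(
            ::memmem(p, static_cast<std::size_t>(buf_end - p),
                     needle.data(), needle.size()));
        if (!hit) {
            break;
        }
        char const *line = hit + 1;  // skip the '\n' the needle matched
        auto nl = static_cast<char const*>(
            std::memchr(line, '\n', static_cast<std::size_t>(buf_end - line)));
        char const *end = nl ? nl + 1 : buf_end;
        emit(std::string_view(line, static_cast<std::size_t>(end - line)));
        // resume at the terminating '\n' so back-to-back matches are found
        p = nl ? nl : buf_end;
    }
}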
- BTW, the data set does have some big multi-game files and also some tiny ones. I'll see if I can make a histogram and add that to the question. - Toby Speight, Oct 3, 2024 at 6:31
- @TobySpeight: Ok, in that case there's probably some room for improvement with AVX2 string-searching, maybe looking for a 2-byte [R or maybe su pattern and then checking whether that's part of a [Result line. (Re6 etc. is a valid move so Re isn't a great key, and [R appears in [Round as well. su can appear inside other text tags easily enough, too, but doesn't in the example PGN on wikipedia :P) 2-byte keys are a good tradeoff between breaking out of the no-match loop too often vs. too much work per vector: 2x vpcmpeqw + 1x vpor + vpmovmskb + test per 32-byte load. (A sketch of this probe appears after these comments.) - Peter Cordes, Oct 3, 2024 at 6:42
- @G.Sliepen: Compilers can optimize loops into memset. But they can't auto-vectorize loops whose trip-count can't be calculated ahead of the first iteration, so they can't auto-vectorize search loops like memchr. (Except for ICC-classic, and I think I read something about work on this in recent GCC or Clang, I forget which.) Idiom-recognition instead of auto-vectorization is possible for memchr but I haven't heard of it, and current GCC and Clang don't do it: godbolt.org/z/o8TGG96sz - Peter Cordes, Oct 3, 2024 at 10:43
- @G.Sliepen: I don't know what std::string_view algorithms do internally; they might be manually vectorized in good implementations, or at least use memchr when possible. But you don't need all that complexity of having separate lines of file data in separate C++ objects if you simply scan a whole buffer for a pattern that includes a newline, and memmem or strstr give you that. - Peter Cordes, Oct 3, 2024 at 10:45
- @G.Sliepen: I wouldn't say that GCC and Clang "recognize that code is equivalent to memchr". Using g++ -E and searching the included headers, I see libstdc++ has a specialization for char, template<> struct char_traits<char>, with a find function that uses __builtin_memchr if it's not std::__is_constant_evaluated, else it uses __gnu_cxx::char_traits<char_type>::find to allow compile-time eval. So it's not the compiler recognizing equivalent patterns, it's the library code dispatching to memchr or memcmp. - Peter Cordes, Oct 3, 2024 at 19:51
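For concreteness, the 2-byte AVX2 probe from Peter Cordes's comment might look roughly like this sketch (key choice, tail handling, and the full [Result verification are all left out; it only shows the 2x vpcmpeqw + vpor + vpmovmskb + test core, and needs -mavx2):

#include <immintrin.h>

// Check a 32-byte chunk for the 2-byte key "[R" at any offset: compare
// 16-bit lanes at even offsets (v0) and odd offsets (v1), OR the results,
// and extract a byte mask. Requires 33 readable bytes at p.
inline bool chunk_may_match(char const *p)
{
    __m256i const key = _mm256_set1_epi16(('R' << 8) | '[');  // "[R", little-endian
    __m256i v0 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(p));
    __m256i v1 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(p + 1));
    __m256i hits = _mm256_or_si256(_mm256_cmpeq_epi16(v0, key),   // vpcmpeqw x2
                                   _mm256_cmpeq_epi16(v1, key));  // vpor
    return _mm256_movemask_epi8(hits) != 0;                      // vpmovmskb + test
}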
- […] the time that find ChessData/ -type f -name '*.pgn' takes. The timing might be more controlled if the output of find was stored in a text file and the text file was used for timing.
- find alone ran in about 40ms (with files in cache), of which 0ms is user-space time. In other words, an insignificantly small proportion of the time.