Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit e27b31c

Browse files
committed
Add prefix bloom filter support
1 parent 9ddc0cd commit e27b31c

File tree

13 files changed

+1774
-14
lines changed

13 files changed

+1774
-14
lines changed

‎src/compaction/worker.rs‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,7 @@ fn merge_segments(
363363
opts.tree_id,
364364
opts.config.cache.clone(),
365365
opts.config.descriptor_table.clone(),
366+
opts.config.prefix_extractor.clone(),
366367
payload.dest_level <= 2, // TODO: look at configuration
367368
payload.dest_level <= 2, // TODO: look at configuration
368369
#[cfg(feature = "metrics")]

‎src/config.rs‎

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22
// This source code is licensed under both the Apache 2.0 and MIT License
33
// (found in the LICENSE-* files in the repository)
44

5-
use crate::{path::absolute_path, BlobTree, Cache, CompressionType, DescriptorTable, Tree};
5+
use crate::{
6+
path::absolute_path, prefix::SharedPrefixExtractor, BlobTree, Cache, CompressionType,
7+
DescriptorTable, Tree,
8+
};
69
use std::{
710
path::{Path, PathBuf},
811
sync::Arc,
@@ -94,6 +97,10 @@ pub struct Config {
9497
/// Descriptor table to use
9598
#[doc(hidden)]
9699
pub descriptor_table: Arc<DescriptorTable>,
100+
101+
/// Prefix extractor for bloom filters
102+
#[doc(hidden)]
103+
pub prefix_extractor: Option<SharedPrefixExtractor>,
97104
}
98105

99106
impl Default for Config {
@@ -115,6 +122,7 @@ impl Default for Config {
115122
compression: CompressionType::None,
116123
blob_compression: CompressionType::None,
117124
bloom_bits_per_key: 10,
125+
prefix_extractor: None,
118126

119127
blob_file_target_size: /* 64 MiB */ 64 * 1_024 * 1_024,
120128
blob_file_separation_threshold: /* 4 KiB */ 4 * 1_024,
@@ -312,6 +320,30 @@ impl Config {
312320
self
313321
}
314322

323+
/// Sets the prefix extractor for bloom filters.
324+
///
325+
/// A prefix extractor allows bloom filters to index prefixes of keys
326+
/// instead of (or in addition to) the full keys. This enables efficient
327+
/// filtering for prefix-based queries.
328+
///
329+
/// # Example
330+
///
331+
/// ```
332+
/// # use lsm_tree::Config;
333+
/// use lsm_tree::prefix::FixedPrefixExtractor;
334+
/// use std::sync::Arc;
335+
///
336+
/// # let path = tempfile::tempdir()?;
337+
/// let config = Config::new(path)
338+
/// .prefix_extractor(Arc::new(FixedPrefixExtractor::new(8)));
339+
/// # Ok::<(), Box<dyn std::error::Error>>(())
340+
/// ```
341+
#[must_use]
342+
pub fn prefix_extractor(mut self, extractor: SharedPrefixExtractor) -> Self {
343+
self.prefix_extractor = Some(extractor);
344+
self
345+
}
346+
315347
/// Opens a tree using the config.
316348
///
317349
/// # Errors

‎src/lib.rs‎

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,9 @@ pub mod mvcc_stream;
175175

176176
mod path;
177177

178+
/// Prefix extraction for bloom filters
179+
pub mod prefix;
180+
178181
#[doc(hidden)]
179182
pub mod range;
180183

‎src/metrics.rs‎

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,14 @@ impl Metrics {
4545
let hits = self.bloom_filter_hits.load(Relaxed) as f64;
4646
hits / queries
4747
}
48+
49+
/// Number of bloom filter queries performed.
50+
pub fn bloom_filter_queries(&self) -> usize {
51+
self.bloom_filter_queries.load(Relaxed)
52+
}
53+
54+
/// Number of bloom filter hits (queries that avoided disk I/O).
55+
pub fn bloom_filter_hits(&self) -> usize {
56+
self.bloom_filter_hits.load(Relaxed)
57+
}
4858
}

‎src/prefix.rs‎

Lines changed: 297 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,297 @@
1+
// Copyright (c) 2024-present, fjall-rs
2+
// This source code is licensed under both the Apache 2.0 and MIT License
3+
// (found in the LICENSE-* files in the repository)
4+
5+
use std::sync::Arc;
6+
7+
/// Trait for extracting prefixes from keys for prefix bloom filters.
///
/// A prefix extractor allows the bloom filter to index prefixes of keys
/// instead of (or in addition to) the full keys. This enables efficient
/// filtering for prefix-based queries.
///
/// Implementations must be `Send + Sync`.
///
/// # Examples
///
/// ## Simple fixed-length prefix:
/// ```
/// use lsm_tree::prefix::PrefixExtractor;
///
/// struct FixedPrefixExtractor(usize);
///
/// impl PrefixExtractor for FixedPrefixExtractor {
///     fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
///         // Keys shorter than the prefix length are used as-is.
///         Box::new(std::iter::once(key.get(0..self.0).unwrap_or(key)))
///     }
///
///     fn name(&self) -> &str {
///         "fixed_prefix"
///     }
/// }
/// ```
///
/// ## Segmented prefixes (e.g., account_id#user_id):
/// ```
/// use lsm_tree::prefix::PrefixExtractor;
///
/// struct SegmentedPrefixExtractor;
///
/// impl PrefixExtractor for SegmentedPrefixExtractor {
///     fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
///         // One prefix per `#` separator (everything before it) ...
///         let segments = key
///             .iter()
///             .enumerate()
///             .filter(|&(_, &byte)| byte == b'#')
///             .map(move |(i, _)| &key[..i]);
///
///         // ... followed by the full key itself.
///         Box::new(segments.chain(std::iter::once(key)))
///     }
///
///     fn name(&self) -> &str {
///         "segmented_prefix"
///     }
/// }
///
/// let prefixes: Vec<&[u8]> = SegmentedPrefixExtractor.extract(b"acct#user").collect();
/// assert_eq!(prefixes, [&b"acct"[..], &b"acct#user"[..]]);
/// ```
pub trait PrefixExtractor: Send + Sync {
    /// Extracts zero or more prefixes from a key.
    ///
    /// All prefixes will be added to the bloom filter during segment construction.
    ///
    /// An empty iterator means the key is "out of domain" and won't be added to the bloom filter.
    ///
    /// The returned slices borrow from `key`, not from the extractor.
    fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a>;

    /// Returns a unique name for this prefix extractor.
    // NOTE(review): how the name is consumed is not visible from this file;
    // confirm what "unique" has to mean for extractors that take parameters.
    fn name(&self) -> &str;
}
70+
71+
/// A prefix extractor that returns the full key.
72+
///
73+
/// This is the default behavior if no prefix extractor is specified.
74+
pub struct FullKeyExtractor;
75+
76+
impl PrefixExtractor for FullKeyExtractor {
77+
fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
78+
Box::new(std::iter::once(key))
79+
}
80+
81+
fn name(&self) -> &str {
82+
"full_key"
83+
}
84+
}
85+
86+
/// A prefix extractor that returns a fixed-length prefix.
87+
///
88+
/// If the key is shorter than the prefix length, returns the full key.
89+
pub struct FixedPrefixExtractor {
90+
length: usize,
91+
}
92+
93+
impl FixedPrefixExtractor {
94+
/// Creates a new fixed-length prefix extractor.
95+
#[must_use]
96+
pub fn new(length: usize) -> Self {
97+
Self { length }
98+
}
99+
}
100+
101+
impl PrefixExtractor for FixedPrefixExtractor {
102+
fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
103+
if key.len() <= self.length {
104+
Box::new(std::iter::once(key))
105+
} else {
106+
Box::new(std::iter::once(&key[0..self.length]))
107+
}
108+
}
109+
110+
fn name(&self) -> &str {
111+
"fixed_prefix"
112+
}
113+
}
114+
115+
/// A prefix extractor that requires keys to be at least a certain length.
116+
///
117+
/// Keys shorter than the required length are considered "out of domain"
118+
/// and won't be added to the bloom filter. This matches RocksDB's behavior.
119+
pub struct FixedLengthExtractor {
120+
length: usize,
121+
}
122+
123+
impl FixedLengthExtractor {
124+
/// Creates a new fixed-length extractor.
125+
#[must_use]
126+
pub fn new(length: usize) -> Self {
127+
Self { length }
128+
}
129+
}
130+
131+
impl PrefixExtractor for FixedLengthExtractor {
132+
fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
133+
if key.len() < self.length {
134+
// Key is too short - out of domain
135+
Box::new(std::iter::empty())
136+
} else {
137+
Box::new(std::iter::once(&key[0..self.length]))
138+
}
139+
}
140+
141+
fn name(&self) -> &str {
142+
"fixed_length"
143+
}
144+
}
145+
146+
/// Examples of custom multi-prefix extractors.
147+
///
148+
/// Users can implement their own prefix extractors that return multiple prefixes.
149+
/// The bloom filter will include all returned prefixes.
150+
///
151+
/// # Example
152+
///
153+
/// ```
154+
/// use lsm_tree::prefix::PrefixExtractor;
155+
/// use std::sync::Arc;
156+
///
157+
/// // Example 1: Hierarchical prefix extractor based on delimiter
158+
/// // For key "user/123/data" with delimiter '/', generates:
159+
/// // - "user"
160+
/// // - "user/123"
161+
/// // - "user/123/data" (full key)
162+
/// struct HierarchicalPrefixExtractor {
163+
/// delimiter: u8,
164+
/// }
165+
///
166+
/// impl PrefixExtractor for HierarchicalPrefixExtractor {
167+
/// fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
168+
/// let delimiter = self.delimiter;
169+
/// let mut prefixes = Vec::new();
170+
///
171+
/// // Generate all prefixes up to each delimiter
172+
/// for (i, &byte) in key.iter().enumerate() {
173+
/// if byte == delimiter {
174+
/// prefixes.push(&key[0..i]);
175+
/// }
176+
/// }
177+
///
178+
/// // Always include the full key
179+
/// prefixes.push(key);
180+
///
181+
/// Box::new(prefixes.into_iter())
182+
/// }
183+
///
184+
/// fn name(&self) -> &str {
185+
/// "hierarchical_prefix"
186+
/// }
187+
/// }
188+
///
189+
/// // Example 2: Domain-based extractor for email-like keys
190+
/// // For "user@example.com", generates:
191+
/// // - "example.com" (domain)
192+
/// // - "user@example.com" (full key)
193+
/// struct EmailDomainExtractor;
194+
///
195+
/// impl PrefixExtractor for EmailDomainExtractor {
196+
/// fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
197+
/// if let Ok(key_str) = std::str::from_utf8(key) {
198+
/// if let Some(at_pos) = key_str.find('@') {
199+
/// // Return both domain and full email
200+
/// let domain = &key[at_pos + 1..];
201+
/// return Box::new(vec![domain, key].into_iter());
202+
/// }
203+
/// }
204+
/// // If not an email format, just return the full key
205+
/// Box::new(std::iter::once(key))
206+
/// }
207+
///
208+
/// fn name(&self) -> &str {
209+
/// "email_domain"
210+
/// }
211+
/// }
212+
///
213+
/// // Usage:
214+
/// # let path = tempfile::tempdir()?;
215+
/// let tree = lsm_tree::Config::new(path)
216+
/// .prefix_extractor(Arc::new(HierarchicalPrefixExtractor { delimiter: b'/' }))
217+
/// .open()?;
218+
/// # Ok::<(), Box<dyn std::error::Error>>(())
219+
/// ```
220+
221+
/// Type alias for a shared prefix extractor
222+
pub type SharedPrefixExtractor = Arc<dyn PrefixExtractor>;
223+
224+
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `extractor` over `key` and gathers every prefix it yields, in order.
    fn prefixes_of<'a>(extractor: &dyn PrefixExtractor, key: &'a [u8]) -> Vec<&'a [u8]> {
        extractor.extract(key).collect()
    }

    #[test]
    fn test_full_key_extractor() {
        // The full-key extractor yields the key itself, once.
        assert_eq!(
            prefixes_of(&FullKeyExtractor, b"test_key"),
            [&b"test_key"[..]]
        );
    }

    #[test]
    fn test_fixed_prefix_extractor() {
        let five_bytes = FixedPrefixExtractor::new(5);

        // Longer than the prefix length: truncated.
        assert_eq!(prefixes_of(&five_bytes, b"longer_key"), [&b"longe"[..]]);

        // Shorter than the prefix length: passed through unchanged.
        assert_eq!(prefixes_of(&five_bytes, b"key"), [&b"key"[..]]);

        // Exactly the prefix length: passed through unchanged.
        assert_eq!(prefixes_of(&five_bytes, b"exact"), [&b"exact"[..]]);
    }

    #[test]
    fn test_empty_key() {
        let empty: &[u8] = b"";

        // Both extractors still yield one (empty) prefix for an empty key.
        assert_eq!(prefixes_of(&FullKeyExtractor, empty), [empty]);
        assert_eq!(prefixes_of(&FixedPrefixExtractor::new(5), empty), [empty]);
    }

    #[test]
    fn test_fixed_length_extractor() {
        let at_least_five = FixedLengthExtractor::new(5);

        // Shorter than the required length: out of domain, nothing yielded.
        assert!(prefixes_of(&at_least_five, b"abc").is_empty());

        // Exactly the required length.
        assert_eq!(prefixes_of(&at_least_five, b"exact"), [&b"exact"[..]]);

        // Longer than the required length: truncated.
        assert_eq!(prefixes_of(&at_least_five, b"longer_key"), [&b"longe"[..]]);
    }
}

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /