| 
 | 1 | +// Copyright (c) 2024-present, fjall-rs  | 
 | 2 | +// This source code is licensed under both the Apache 2.0 and MIT License  | 
 | 3 | +// (found in the LICENSE-* files in the repository)  | 
 | 4 | + | 
 | 5 | +use std::sync::Arc;  | 
 | 6 | + | 
 | 7 | +/// Trait for extracting prefixes from keys for prefix bloom filters.  | 
 | 8 | +///  | 
 | 9 | +/// A prefix extractor allows the bloom filter to index prefixes of keys  | 
 | 10 | +/// instead of (or in addition to) the full keys. This enables efficient  | 
 | 11 | +/// filtering for prefix-based queries.  | 
 | 12 | +///  | 
 | 13 | +/// # Examples  | 
 | 14 | +///  | 
 | 15 | +/// ## Simple fixed-length prefix:  | 
 | 16 | +/// ```  | 
 | 17 | +/// use lsm_tree::prefix::PrefixExtractor;  | 
 | 18 | +///  | 
 | 19 | +/// struct FixedPrefixExtractor(usize);  | 
 | 20 | +///  | 
 | 21 | +/// impl PrefixExtractor for FixedPrefixExtractor {  | 
 | 22 | +/// fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {  | 
 | 23 | +/// Box::new(std::iter::once(key.get(0..self.0).unwrap_or(key)))  | 
 | 24 | +/// }  | 
 | 25 | +///   | 
 | 26 | +/// fn name(&self) -> &str {  | 
 | 27 | +/// "fixed_prefix"  | 
 | 28 | +/// }  | 
 | 29 | +/// }  | 
 | 30 | +/// ```  | 
 | 31 | +///  | 
 | 32 | +/// ## Segmented prefixes (e.g., account_id#user_id):  | 
 | 33 | +/// ```  | 
 | 34 | +/// use lsm_tree::prefix::PrefixExtractor;  | 
 | 35 | +///  | 
 | 36 | +/// struct SegmentedPrefixExtractor;  | 
 | 37 | +///  | 
 | 38 | +/// impl PrefixExtractor for SegmentedPrefixExtractor {  | 
 | 39 | +/// fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {  | 
 | 40 | +/// let mut prefixes = vec![];  | 
 | 41 | +/// let mut end = 0;  | 
 | 42 | +/// for (i, &byte) in key.iter().enumerate() {  | 
 | 43 | +/// if byte == b'#' {  | 
 | 44 | +/// prefixes.push(&key[0..i]);  | 
 | 45 | +/// end = i;  | 
 | 46 | +/// }  | 
 | 47 | +/// }  | 
 | 48 | +/// if end < key.len() {  | 
 | 49 | +/// prefixes.push(key);  | 
 | 50 | +/// }  | 
 | 51 | +/// Box::new(prefixes.into_iter())  | 
 | 52 | +/// }  | 
 | 53 | +///   | 
 | 54 | +/// fn name(&self) -> &str {  | 
 | 55 | +/// "segmented_prefix"  | 
 | 56 | +/// }  | 
 | 57 | +/// }  | 
 | 58 | +/// ```  | 
 | 59 | +pub trait PrefixExtractor: Send + Sync {  | 
 | 60 | + /// Extracts zero or more prefixes from a key.  | 
 | 61 | + ///  | 
 | 62 | + /// All prefixes will be added to the bloom filter during segment construction.  | 
 | 63 | + ///  | 
 | 64 | + /// An empty iterator means the key is "out of domain" and won't be added to the bloom filter.  | 
 | 65 | + fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a>;  | 
 | 66 | + | 
 | 67 | + /// Returns a unique name for this prefix extractor.  | 
 | 68 | + fn name(&self) -> &str;  | 
 | 69 | +}  | 
 | 70 | + | 
 | 71 | +/// A prefix extractor that returns the full key.  | 
 | 72 | +///  | 
 | 73 | +/// This is the default behavior if no prefix extractor is specified.  | 
 | 74 | +pub struct FullKeyExtractor;  | 
 | 75 | + | 
 | 76 | +impl PrefixExtractor for FullKeyExtractor {  | 
 | 77 | + fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {  | 
 | 78 | + Box::new(std::iter::once(key))  | 
 | 79 | + }  | 
 | 80 | + | 
 | 81 | + fn name(&self) -> &str {  | 
 | 82 | + "full_key"  | 
 | 83 | + }  | 
 | 84 | +}  | 
 | 85 | + | 
 | 86 | +/// A prefix extractor that returns a fixed-length prefix.  | 
 | 87 | +///  | 
 | 88 | +/// If the key is shorter than the prefix length, returns the full key.  | 
 | 89 | +pub struct FixedPrefixExtractor {  | 
 | 90 | + length: usize,  | 
 | 91 | +}  | 
 | 92 | + | 
 | 93 | +impl FixedPrefixExtractor {  | 
 | 94 | + /// Creates a new fixed-length prefix extractor.  | 
 | 95 | + #[must_use]  | 
 | 96 | + pub fn new(length: usize) -> Self {  | 
 | 97 | + Self { length }  | 
 | 98 | + }  | 
 | 99 | +}  | 
 | 100 | + | 
 | 101 | +impl PrefixExtractor for FixedPrefixExtractor {  | 
 | 102 | + fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {  | 
 | 103 | + if key.len() <= self.length {  | 
 | 104 | + Box::new(std::iter::once(key))  | 
 | 105 | + } else {  | 
 | 106 | + Box::new(std::iter::once(&key[0..self.length]))  | 
 | 107 | + }  | 
 | 108 | + }  | 
 | 109 | + | 
 | 110 | + fn name(&self) -> &str {  | 
 | 111 | + "fixed_prefix"  | 
 | 112 | + }  | 
 | 113 | +}  | 
 | 114 | + | 
 | 115 | +/// A prefix extractor that requires keys to be at least a certain length.  | 
 | 116 | +///  | 
 | 117 | +/// Keys shorter than the required length are considered "out of domain"  | 
 | 118 | +/// and won't be added to the bloom filter. This matches RocksDB's behavior.  | 
 | 119 | +pub struct FixedLengthExtractor {  | 
 | 120 | + length: usize,  | 
 | 121 | +}  | 
 | 122 | + | 
 | 123 | +impl FixedLengthExtractor {  | 
 | 124 | + /// Creates a new fixed-length extractor.  | 
 | 125 | + #[must_use]  | 
 | 126 | + pub fn new(length: usize) -> Self {  | 
 | 127 | + Self { length }  | 
 | 128 | + }  | 
 | 129 | +}  | 
 | 130 | + | 
 | 131 | +impl PrefixExtractor for FixedLengthExtractor {  | 
 | 132 | + fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {  | 
 | 133 | + if key.len() < self.length {  | 
 | 134 | + // Key is too short - out of domain  | 
 | 135 | + Box::new(std::iter::empty())  | 
 | 136 | + } else {  | 
 | 137 | + Box::new(std::iter::once(&key[0..self.length]))  | 
 | 138 | + }  | 
 | 139 | + }  | 
 | 140 | + | 
 | 141 | + fn name(&self) -> &str {  | 
 | 142 | + "fixed_length"  | 
 | 143 | + }  | 
 | 144 | +}  | 
 | 145 | + | 
 | 146 | +/// Examples of custom multi-prefix extractors.  | 
 | 147 | +///  | 
 | 148 | +/// Users can implement their own prefix extractors that return multiple prefixes.  | 
 | 149 | +/// The bloom filter will include all returned prefixes.  | 
 | 150 | +///  | 
 | 151 | +/// # Example  | 
 | 152 | +///  | 
 | 153 | +/// ```  | 
 | 154 | +/// use lsm_tree::prefix::PrefixExtractor;  | 
 | 155 | +/// use std::sync::Arc;  | 
 | 156 | +///  | 
 | 157 | +/// // Example 1: Hierarchical prefix extractor based on delimiter  | 
 | 158 | +/// // For key "user/123/data" with delimiter '/', generates:  | 
 | 159 | +/// // - "user"  | 
 | 160 | +/// // - "user/123"  | 
 | 161 | +/// // - "user/123/data" (full key)  | 
 | 162 | +/// struct HierarchicalPrefixExtractor {  | 
 | 163 | +/// delimiter: u8,  | 
 | 164 | +/// }  | 
 | 165 | +///  | 
 | 166 | +/// impl PrefixExtractor for HierarchicalPrefixExtractor {  | 
 | 167 | +/// fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {  | 
 | 168 | +/// let delimiter = self.delimiter;  | 
 | 169 | +/// let mut prefixes = Vec::new();  | 
 | 170 | +///   | 
 | 171 | +/// // Generate all prefixes up to each delimiter  | 
 | 172 | +/// for (i, &byte) in key.iter().enumerate() {  | 
 | 173 | +/// if byte == delimiter {  | 
 | 174 | +/// prefixes.push(&key[0..i]);  | 
 | 175 | +/// }  | 
 | 176 | +/// }  | 
 | 177 | +///   | 
 | 178 | +/// // Always include the full key  | 
 | 179 | +/// prefixes.push(key);  | 
 | 180 | +///   | 
 | 181 | +/// Box::new(prefixes.into_iter())  | 
 | 182 | +/// }  | 
 | 183 | +///   | 
 | 184 | +/// fn name(&self) -> &str {  | 
 | 185 | +/// "hierarchical_prefix"  | 
 | 186 | +/// }  | 
 | 187 | +/// }  | 
 | 188 | +///  | 
 | 189 | +/// // Example 2: Domain-based extractor for email-like keys  | 
 | 190 | +/// // For "user@example.com", generates:  | 
 | 191 | +/// // - "example.com" (domain)  | 
 | 192 | +/// // - "user@example.com" (full key)  | 
 | 193 | +/// struct EmailDomainExtractor;  | 
 | 194 | +///  | 
 | 195 | +/// impl PrefixExtractor for EmailDomainExtractor {  | 
 | 196 | +/// fn extract<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {  | 
 | 197 | +/// if let Ok(key_str) = std::str::from_utf8(key) {  | 
 | 198 | +/// if let Some(at_pos) = key_str.find('@') {  | 
 | 199 | +/// // Return both domain and full email  | 
 | 200 | +/// let domain = &key[at_pos + 1..];  | 
 | 201 | +/// return Box::new(vec![domain, key].into_iter());  | 
 | 202 | +/// }  | 
 | 203 | +/// }  | 
 | 204 | +/// // If not an email format, just return the full key  | 
 | 205 | +/// Box::new(std::iter::once(key))  | 
 | 206 | +/// }  | 
 | 207 | +///   | 
 | 208 | +/// fn name(&self) -> &str {  | 
 | 209 | +/// "email_domain"  | 
 | 210 | +/// }  | 
 | 211 | +/// }  | 
 | 212 | +///  | 
 | 213 | +/// // Usage:  | 
 | 214 | +/// # let path = tempfile::tempdir()?;  | 
 | 215 | +/// let tree = lsm_tree::Config::new(path)  | 
 | 216 | +/// .prefix_extractor(Arc::new(HierarchicalPrefixExtractor { delimiter: b'/' }))  | 
 | 217 | +/// .open()?;  | 
 | 218 | +/// # Ok::<(), Box<dyn std::error::Error>>(())  | 
 | 219 | +/// ```  | 
 | 220 | + | 
 | 221 | +/// Type alias for a shared prefix extractor  | 
 | 222 | +pub type SharedPrefixExtractor = Arc<dyn PrefixExtractor>;  | 
 | 223 | + | 
 | 224 | +#[cfg(test)]  | 
 | 225 | +mod tests {  | 
 | 226 | + use super::*;  | 
 | 227 | + | 
 | 228 | + #[test]  | 
 | 229 | + fn test_full_key_extractor() {  | 
 | 230 | + let extractor = FullKeyExtractor;  | 
 | 231 | + let key = b"test_key";  | 
 | 232 | + let prefixes: Vec<_> = extractor.extract(key).collect();  | 
 | 233 | + assert_eq!(prefixes.len(), 1);  | 
 | 234 | + assert_eq!(prefixes[0], b"test_key");  | 
 | 235 | + }  | 
 | 236 | + | 
 | 237 | + #[test]  | 
 | 238 | + fn test_fixed_prefix_extractor() {  | 
 | 239 | + let extractor = FixedPrefixExtractor::new(5);  | 
 | 240 | + | 
 | 241 | + // Key longer than prefix  | 
 | 242 | + let key = b"longer_key";  | 
 | 243 | + let prefixes: Vec<_> = extractor.extract(key).collect();  | 
 | 244 | + assert_eq!(prefixes.len(), 1);  | 
 | 245 | + assert_eq!(prefixes[0], b"longe");  | 
 | 246 | + | 
 | 247 | + // Key shorter than prefix  | 
 | 248 | + let key = b"key";  | 
 | 249 | + let prefixes: Vec<_> = extractor.extract(key).collect();  | 
 | 250 | + assert_eq!(prefixes.len(), 1);  | 
 | 251 | + assert_eq!(prefixes[0], b"key");  | 
 | 252 | + | 
 | 253 | + // Key exactly prefix length  | 
 | 254 | + let key = b"exact";  | 
 | 255 | + let prefixes: Vec<_> = extractor.extract(key).collect();  | 
 | 256 | + assert_eq!(prefixes.len(), 1);  | 
 | 257 | + assert_eq!(prefixes[0], b"exact");  | 
 | 258 | + }  | 
 | 259 | + | 
 | 260 | + #[test]  | 
 | 261 | + fn test_empty_key() {  | 
 | 262 | + let full_key = FullKeyExtractor;  | 
 | 263 | + let fixed = FixedPrefixExtractor::new(5);  | 
 | 264 | + | 
 | 265 | + let key = b"";  | 
 | 266 | + | 
 | 267 | + let prefixes: Vec<_> = full_key.extract(key).collect();  | 
 | 268 | + assert_eq!(prefixes.len(), 1);  | 
 | 269 | + assert_eq!(prefixes[0], b"");  | 
 | 270 | + | 
 | 271 | + let prefixes: Vec<_> = fixed.extract(key).collect();  | 
 | 272 | + assert_eq!(prefixes.len(), 1);  | 
 | 273 | + assert_eq!(prefixes[0], b"");  | 
 | 274 | + }  | 
 | 275 | + | 
 | 276 | + #[test]  | 
 | 277 | + fn test_fixed_length_extractor() {  | 
 | 278 | + let extractor = FixedLengthExtractor::new(5);  | 
 | 279 | + | 
 | 280 | + // Key shorter than required length - out of domain  | 
 | 281 | + let key = b"abc";  | 
 | 282 | + let prefixes: Vec<_> = extractor.extract(key).collect();  | 
 | 283 | + assert_eq!(prefixes.len(), 0); // Empty iterator  | 
 | 284 | + | 
 | 285 | + // Key exactly required length  | 
 | 286 | + let key = b"exact";  | 
 | 287 | + let prefixes: Vec<_> = extractor.extract(key).collect();  | 
 | 288 | + assert_eq!(prefixes.len(), 1);  | 
 | 289 | + assert_eq!(prefixes[0], b"exact");  | 
 | 290 | + | 
 | 291 | + // Key longer than required length  | 
 | 292 | + let key = b"longer_key";  | 
 | 293 | + let prefixes: Vec<_> = extractor.extract(key).collect();  | 
 | 294 | + assert_eq!(prefixes.len(), 1);  | 
 | 295 | + assert_eq!(prefixes[0], b"longe");  | 
 | 296 | + }  | 
 | 297 | +}  | 
0 commit comments