Name	Name	Last commit message	Last commit date
Latest commit History 6 Commits
benchmark	benchmark
example	example
lib	lib
scripts	scripts
test	test
.gitignore	.gitignore
.pubignore	.pubignore
CHANGELOG.md	CHANGELOG.md
LICENSE	LICENSE
README.md	README.md
analysis_options.yaml	analysis_options.yaml
pubspec.lock	pubspec.lock
pubspec.yaml	pubspec.yaml
vocab.txt	vocab.txt

dart_bert_tokenizer

A lightweight, pure Dart implementation of BERT WordPiece tokenizer.

Features

Pure Dart - Zero dependencies, works everywhere (Flutter, Server, CLI, Web)
Memory Efficient - Typed arrays (Int32List, Uint8List) for 50-70% memory reduction
Full API - Encoding, decoding, padding, truncation, offset mapping
Batch Processing - Sequential and parallel (Isolate-based) batch encoding
HuggingFace tokenizer.json - Load directly from HuggingFace tokenizer files
Well Tested - 328 tests with 100% pass rate

Installation

dependencies:
 dart_bert_tokenizer: ^1.1.0

Quick Start

import 'package:dart_bert_tokenizer/dart_bert_tokenizer.dart';
void main() {
 // Load tokenizer
 final tokenizer = WordPieceTokenizer.fromVocabFileSync('vocab.txt');
 // Encode text
 final encoding = tokenizer.encode('Hello, world!');
 print(encoding.tokens); // [[CLS], hello, ,, world, !, [SEP]]
 print(encoding.ids); // [101, 7592, 1010, 2088, 999, 102]
 // Decode back to text
 final text = tokenizer.decode(encoding.ids, skipSpecialTokens: true);
 print(text); // hello , world !
}

Loading from tokenizer.json

Load directly from HuggingFace tokenizer.json files — normalizer, post-processor, and model settings are automatically extracted:

// From file (async)
final tokenizer = await WordPieceTokenizer.fromTokenizerJson('tokenizer.json');
// From file (sync)
final tokenizer = WordPieceTokenizer.fromTokenizerJsonSync('tokenizer.json');
// From JSON string (e.g., embedded asset)
final tokenizer = WordPieceTokenizer.fromTokenizerJsonString(jsonString);
// Override extracted config if needed
final tokenizer = WordPieceTokenizer.fromTokenizerJsonSync(
 'tokenizer.json',
 configOverride: WordPieceConfig(lowercase: false),
);

Usage

Single Text Encoding

final encoding = tokenizer.encode('Hello world');
print(encoding.tokens); // Token strings
print(encoding.ids); // Token IDs (Int32List)
print(encoding.attentionMask); // Attention mask (Uint8List)
print(encoding.typeIds); // Type IDs (Uint8List)
print(encoding.offsets); // Character offsets [(start, end), ...]
print(encoding.wordIds); // Word indices
print(encoding.sequenceIds); // Sequence indices (0, 1, or null)
// Without special tokens
final raw = tokenizer.encode('Hello', addSpecialTokens: false);

Sentence Pair Encoding

// For QA, NLI, sentence similarity tasks
final encoding = tokenizer.encodePair(
 'What is machine learning?',
 'Machine learning is a subset of AI.',
);
print(encoding.typeIds); // [0,0,0,0,0,0, 1,1,1,1,1,1,1]
print(encoding.sequenceIds); // [null,0,0,0,0,null, 1,1,1,1,1,1,null]

Batch Encoding

// Sequential batch
final encodings = tokenizer.encodeBatch(['Hello', 'World', 'Test']);
// Parallel batch (uses Isolates for batches >= 8)
final encodings = await tokenizer.encodeBatchParallel(texts);
// Pair batch
final pairs = [('Q1', 'A1'), ('Q2', 'A2')];
final encodings = tokenizer.encodePairBatch(pairs);

Padding

// Fluent API
final tokenizer = WordPieceTokenizer.fromVocabFileSync('vocab.txt')
 ..enablePadding(length: 512, direction: PaddingDirection.right);
// Or pad to longest in batch
tokenizer.enablePadding(); // Auto-pads to longest
// Manual padding
final padded = encoding.withPadding(
 targetLength: 128,
 padTokenId: tokenizer.vocab.padTokenId,
 padOnRight: true,
);
// Pad to multiple of N
final padded = encoding.withPaddingToMultipleOf(
 multiple: 8,
 padTokenId: tokenizer.vocab.padTokenId,
);

Truncation

// Fluent API
final tokenizer = WordPieceTokenizer.fromVocabFileSync('vocab.txt')
 ..enableTruncation(maxLength: 512, direction: TruncationDirection.right);
// Manual truncation
final truncated = encoding.withTruncation(maxLength: 64);
// Truncation strategies for pairs
tokenizer.encodePair(textA, textB,
 maxLength: 128,
 truncationStrategy: TruncationStrategy.longestFirst,
);

Truncation Strategies:

longestFirst - Iteratively remove tokens from whichever sequence is currently longest
onlyFirst - Truncate first sequence only
onlySecond - Truncate second sequence only
doNotTruncate - No truncation

Offset Mapping

final encoding = tokenizer.encode('Hello world');
// Character position -> Token index
final tokenIdx = encoding.charToToken(6); // 'w' -> token index
// Token index -> Character span
final (start, end) = encoding.tokenToChars(1)!; // token -> (0, 5)
// Word index -> Token span
final (startToken, endToken) = encoding.wordToTokens(0)!;
// Token -> Word index
final wordIdx = encoding.tokenToWord(1);
// Token -> Sequence index (0, 1, or null for special tokens)
final seqIdx = encoding.tokenToSequence(1);

Vocabulary Access

print(tokenizer.vocab.size); // 30522
print(tokenizer.vocab.clsTokenId); // 101
print(tokenizer.vocab.sepTokenId); // 102
print(tokenizer.vocab.padTokenId); // 0
print(tokenizer.vocab.unkTokenId); // 100
print(tokenizer.vocab.maskTokenId); // 103
// Token <-> ID conversion
tokenizer.convertTokensToIds(['hello', 'world']); // [7592, 2088]
tokenizer.convertIdsToTokens([7592, 2088]); // ['hello', 'world']
// Check if token exists
tokenizer.vocab.contains('hello'); // true

Decoding

// Decode with special tokens
final text = tokenizer.decode(encoding.ids, skipSpecialTokens: false);
// Decode without special tokens (skipSpecialTokens defaults to true)
final text = tokenizer.decode(encoding.ids);
// Batch decode
final texts = tokenizer.decodeBatch(idsBatch);

ONNX Runtime Integration

Use with ONNX Runtime for on-device ML inference:

import 'package:dart_bert_tokenizer/dart_bert_tokenizer.dart';
import 'dart:typed_data';
final tokenizer = WordPieceTokenizer.fromVocabFileSync('vocab.txt')
 ..enableTruncation(maxLength: 512);
final encoding = tokenizer.encode('search_query: What is ML?');
// ids (Int32List) and attentionMask (Uint8List) are typed lists; convert both to Int64List for ONNX
final inputIds = Int64List.fromList(encoding.ids);
final attentionMask = Int64List.fromList(encoding.attentionMask);
// Pass to ONNX session
// final outputs = await session.run({
// 'input_ids': inputIds,
// 'attention_mask': attentionMask,
// });

Configuration

final tokenizer = WordPieceTokenizer(
 vocab: vocab,
 config: WordPieceConfig(
 lowercase: true, // Convert to lowercase (default: true)
 stripAccents: true, // Remove accents (default: true)
 handleChineseChars: true, // Add spaces around Chinese characters (default: true)
 subwordPrefix: '##', // Subword prefix (default: '##')
 maxWordLength: 200, // Max word length before [UNK] (default: 200)
 addClsToken: true, // Add [CLS] token (default: true)
 addSepToken: true, // Add [SEP] token (default: true)
 ),
);

API Reference

WordPieceTokenizer

Method	Description
`fromVocabFile(path)`	Load from vocab file (async)
`fromVocabFileSync(path)`	Load from vocab file (sync)
`fromTokenizerJson(path)`	Load from tokenizer.json (async)
`fromTokenizerJsonSync(path)`	Load from tokenizer.json (sync)
`fromTokenizerJsonString(json)`	Load from JSON string
`encode(text)`	Encode single text
`encodePair(textA, textB)`	Encode text pair
`encodeBatch(texts)`	Encode multiple texts
`encodeBatchParallel(texts)`	Parallel batch encoding
`decode(ids)`	Decode IDs to text
`decodeBatch(idsBatch)`	Batch decode
`enablePadding()` / `noPadding()`	Configure padding
`enableTruncation()` / `noTruncation()`	Configure truncation
`convertTokensToIds(tokens)`	Convert tokens to IDs
`convertIdsToTokens(ids)`	Convert IDs to tokens
`numSpecialTokensToAdd(isPair)`	Get special token count

Encoding

Property	Type	Description
`tokens`	`List<String>`	Token strings
`ids`	`Int32List`	Token IDs
`attentionMask`	`Uint8List`	Attention mask (1=attend, 0=ignore)
`typeIds`	`Uint8List`	Token type IDs (0=first, 1=second)
`specialTokensMask`	`Uint8List`	Special token mask
`offsets`	`List<(int, int)>`	Character offsets
`wordIds`	`List<int?>`	Word indices
`sequenceIds`	`List<int?>`	Sequence indices
`length`	`int`	Number of tokens

Performance

Metric	Value
Throughput	~2M tokens/sec
Vocab loading	~40ms (30K tokens)
Memory (vocab)	~5MB
Lookup complexity	O(m) per token
HuggingFace compatibility	100% (34 test cases)

Vocabulary Files

You can load from either vocab.txt or tokenizer.json:

Format	Method	Description
`vocab.txt`	`fromVocabFile()` / `fromVocabFileSync()`	One token per line; zero-based line index = token ID
`tokenizer.json`	`fromTokenizerJson()` / `fromTokenizerJsonSync()`	Full HuggingFace pipeline config

Download from HuggingFace:

Testing

# Run all tests (328 tests)
dart test
# Run specific test file
dart test test/hf_compatible_test.dart
# Run benchmarks
dart run benchmark/performance_benchmark.dart
# Run HuggingFace compatibility benchmark
dart run benchmark/hf_compatibility_benchmark.dart

HuggingFace Compatibility Verification

# Run HuggingFace compatibility benchmark (85 tests, 100% accuracy)
dart run benchmark/hf_compatibility_benchmark.dart
# Regenerate benchmark expected values (requires Python + tokenizers)
pip install tokenizers
python scripts/generate_hf_benchmark_data.py

License

MIT License

Folders and files

Latest commit

History

Repository files navigation

dart_bert_tokenizer

Features

Installation

Quick Start

Loading from tokenizer.json

Usage

Single Text Encoding

Sentence Pair Encoding

Batch Encoding

Padding

Truncation

Offset Mapping

Vocabulary Access

Decoding

ONNX Runtime Integration

Configuration

API Reference

WordPieceTokenizer

Encoding

Performance

Vocabulary Files

Testing

HuggingFace Compatibility Verification

License

About

Topics

Resources

License

Uh oh!

Stars

Watchers

Forks

Releases 2

Packages 0

Uh oh!

Contributors

Uh oh!

Languages

Packages