Deep learning library, implemented from scratch in numpy for fun.
- Tensor-based autograd (see the short sketch after this list)
- Object-oriented PyTorch-like API
- Layers: multihead/rotary/grouped-query/sparse attention with KV caching, batch/layer/RMS norm, conv2d, max/avg pooling, dropout
- NLP: byte-pair encoding, SentencePiece enc/dec, LoRA fine-tuning, top-k/nucleus/beam search, speculative sampling, chat templates (Llama chat, ChatML), streaming chat UI
- Models: Mixtral, Mamba, Llama, GPT, ResNet
- Lightweight Tensorboard-like dashboarding
- Chat with Mixtral 8x7B 🌅 (notebook) (model)
- LoRA fine-tuning Llama on my Messenger chats (notebook)
- Chat with Mamba 🐍 (notebook) (model)
- Chat with Llama 🦙 (notebook) (model)
- Chat with GPT2 (notebook) (model)
- Universal and Transferable Adversarial Attacks on Aligned Language Models (Zou et al. 2023) repro (notebook)
- Speculative sampling experiments (notebook)
- KV-caching speedup and memory usage (notebook)
- Beam search, top-p, top-k sampling quality (notebook)
- Training ResNet20 on CIFAR10 (notebook)
- Experiments with μParameterization / μTransfer (notebook)
- Experiments with Neural Tangent Kernels (notebook)
- Grokking: Generalization Beyond Overfitting (Power et al. 2022) reproduction (notebook)
(long after the model has overfit the training set, test accuracy suddenly begins increasing; a fun result that feels surprising at first)
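
A minimal sketch of the Tensor-based autograd and the PyTorch-like API. It uses only calls that also appear in the GPT example below (`candle.Linear`, `F.gelu`, `F.cross_entropy_loss`, `Tensor.backward`); the layer sizes and random data are illustrative, not from the library's docs.

```python
import numpy as np

import candle
import candle.functions as F
from candle import Tensor

# Tiny two-layer network over a (batch, seqlen, features) input, mirroring the
# shapes used by the GPT example below. All sizes are arbitrary.
linear1 = candle.Linear(4, 16)
linear2 = candle.Linear(16, 3)

x = Tensor(np.random.randn(2, 5, 4))                     # batch 2, seqlen 5, 4 features
targets = Tensor(np.random.randint(0, 3, size=(2, 5)))   # integer class labels in [0, 3)

logits = linear2(F.gelu(linear1(x)))                     # shape (2, 5, 3)
loss = F.cross_entropy_loss(logits, targets)             # same call as in the GPT example
loss.backward()                                          # backprop through every intermediate Tensor
```

The full GPT implementation follows.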
```python
import numpy as np

import candle
import candle.functions as F
from candle import Module, Tensor


class GPT(Module):

    def __init__(self,
                 n_layers: int,
                 n_heads: int,
                 embed_dim: int,
                 vocab_size: int,
                 block_size: int,
                 dropout_p: float):
        super().__init__()
        self.n_layers = n_layers
        self.embed_dim = embed_dim
        self.block_size = block_size

        self.dropout = candle.Dropout(dropout_p)
        self.word_embeddings = candle.Embedding(vocab_size, embed_dim)
        self.position_embeddings = candle.Embedding(block_size, embed_dim)
        self.decoder_blocks = candle.ParameterList([DecoderBlock(embed_dim, n_heads, dropout_p)
                                                    for _ in range(n_layers)])
        self.layer_norm = candle.LayerNorm(embed_dim)

        # Tie output projection weights to word embeddings. See "Weight Tying" paper.
        self.output_projection = self.word_embeddings.embeddings

    def forward(self,
                indices: Tensor,
                use_kv_cache: bool = False):
        offset = self.get_kv_cache_seqlen() if use_kv_cache else 0
        position_indices = Tensor(np.arange(indices.shape[1]) + offset)
        x = self.word_embeddings(indices) + self.position_embeddings(position_indices)
        x = self.dropout(x)  # shape (batch, seqlen, embed_dim)

        for decoder_block in self.decoder_blocks:
            x = decoder_block(x, use_kv_cache)

        x = self.layer_norm(x)

        return x @ self.output_projection.T

    def get_kv_cache_seqlen(self):
        """Gets KV cache seqlen."""
        return self.decoder_blocks[0].attn.get_kv_cache_seqlen()


class DecoderBlock(Module):

    def __init__(self,
                 embed_dim: int,
                 n_heads: int,
                 dropout_p: float):
        super().__init__()
        self.dropout = candle.Dropout(dropout_p)
        self.ln1 = candle.LayerNorm(embed_dim)
        self.attn = candle.MultiheadAttention(embed_dim, n_heads, dropout_p, batch_first=True)
        self.ln2 = candle.LayerNorm(embed_dim)
        self.ffn = FeedForwardBlock(input_dim=embed_dim, hidden_dim=4 * embed_dim)

    def forward(self,
                x: Tensor,
                use_kv_cache: bool):
        # x: Tensor with shape (batch, seqlen, embed_dim)
        x = x + self.dropout(self.self_attn(self.ln1(x), use_kv_cache))
        x = x + self.dropout(self.ffn(self.ln2(x)))

        return x

    def self_attn(self, x: Tensor, use_kv_cache: bool):
        """Self-attention with causal mask."""
        # causal_attn_mask[i, j] = 0 means that query[i] attends to key[j], and so
        # causal_attn_mask[i, j] = 0 if i >= j and 1 otherwise.
        causal_attn_mask = Tensor(1 - np.tri(x.shape[1]))

        (attn_output, attn_scores) = self.attn(x, x, x,
                                               attn_mask=causal_attn_mask,
                                               use_kv_cache=use_kv_cache)

        return attn_output


class FeedForwardBlock(Module):

    def __init__(self, input_dim: int, hidden_dim: int):
        super().__init__()
        self.linear1 = candle.Linear(input_dim, hidden_dim)
        self.linear2 = candle.Linear(hidden_dim, input_dim)

    def forward(self, x):
        x = self.linear1(x)
        x = F.gelu(x)
        x = self.linear2(x)

        return x
```
```python
model = GPT(n_layers=12,
            n_heads=12,
            embed_dim=768,
            vocab_size=50257,
            block_size=1024,
            dropout_p=0.1)

tokenizer = candle.models.gpt.GPT2BPETokenizer()
indices = candle.Tensor([tokenizer.encode(
    'Once upon a time, there is a cat whose name is Maukoo. He loves eating and cuddling.'
)])

targets = indices[:, 1:]          # next-token targets, shape (batch, seqlen - 1)
logits = model(indices[:, :-1])   # shape (batch, seqlen - 1, vocab_size)
loss = F.cross_entropy_loss(logits, targets)
loss.backward()
```
```python
model = candle.models.gpt.GPT.from_pretrained('gpt2-large')

with candle.no_grad():
    generator = candle.nlp.beam_search_decoder(model, indices[0],
                                               n_tokens_to_generate=50,
                                               beam_size=1,
                                               top_p=0.90,
                                               top_k=100,
                                               use_kv_cache=True)
    response_indices = np.concatenate(list(generator))

print(tokenizer.decode(response_indices))
# Output: A lot. He also loves drinking. (But it's an odd habit for a cat that loves eating
# and cuddling.) This little kitty is not the sort of kitty you would expect to be a
```