Commit 2c006e9

Update the package modules and fix the import error.

2 parents a404585 + 966155e · commit 2c006e9

15 files changed: +309 additions, -156 deletions

.github/workflows/ci.yml

Lines changed: 0 additions & 96 deletions
This file was deleted.

diffusionLM/__init__.py

Lines changed: 8 additions & 3 deletions
```diff
@@ -1,13 +1,17 @@
 """DiffusionLM: A Diffusion-based Language Model Package"""

 from .trainer import trainer, TrainingError, evaluate
-from .model import DiffusionLLM, DiffusionConfig
+from .model import (
+    DiffusionLLM,
+    DiffusionConfig,
+    mask_tokens_for_diffusion
+)
 from .save_model import (
     save_model,
     load_model,
     ModelSaveError,
     registerANDpush,
-    ModelRegistrationError,
+    ModelRegistrationError
 )
 from .utils import (
     prepare_dataset,
@@ -16,7 +20,7 @@
     DatasetError,
     setup_logging,
     DiffusionLMError,
-    handle_errors,
+    handle_errors
 )

 __version__ = "0.1.0"
@@ -43,4 +47,5 @@
     "DatasetError",
     "prepare_dataset",
     "DatasetPreparationError",
+    "mask_tokens_for_diffusion",
 ]
```
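
With this change, `mask_tokens_for_diffusion` is re-exported at the package root, which matches the commit's stated goal of fixing the import error. A minimal sketch of the now-available top-level import (assuming the package is installed under the name `diffusionLM`):

```python
# Sketch: top-level imports enabled by the __init__.py change above.
# The symbol names come from the diff; installing the package under the
# name "diffusionLM" is an assumption.
from diffusionLM import (
    DiffusionLLM,
    DiffusionConfig,
    mask_tokens_for_diffusion,
)
```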

diffusionLM/model/MLP.py

Lines changed: 22 additions & 2 deletions
```diff
@@ -1,10 +1,21 @@
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F

 class MLP(nn.Module):
-    """Feed-forward network"""
+    """
+    Feed-forward neural network (MLP) used in transformer blocks.
+
+    This class implements a two-layer feed-forward network with GELU activation and dropout.
+
+    Args:
+        config: Configuration object containing model hyperparameters.
+
+    Attributes:
+        fc1: First linear layer.
+        fc2: Second linear layer.
+        dropout: Dropout layer applied after the second linear layer.
+    """
     def __init__(self, config):
         super().__init__()
         self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
@@ -19,6 +30,15 @@ def _init_weights(self, module):
             nn.init.zeros_(module.bias)

     def forward(self, x):
+        """
+        Perform a forward pass through the MLP.
+
+        Args:
+            x: Input tensor of shape [batch_size, seq_length, hidden_size].
+
+        Returns:
+            Output tensor of shape [batch_size, seq_length, hidden_size].
+        """
         h = F.gelu(self.fc1(x))
         h = self.fc2(h)
         h = self.dropout(h)
```
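
For reference, here is a self-contained sketch of the two-layer GELU feed-forward block the new docstring describes. Passing explicit sizes instead of a config object is an illustrative simplification:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SketchMLP(nn.Module):
    """Two-layer feed-forward block with GELU and dropout, as documented above."""
    def __init__(self, hidden_size: int, intermediate_size: int, dropout: float = 0.1):
        super().__init__()
        self.fc1 = nn.Linear(hidden_size, intermediate_size)  # expand
        self.fc2 = nn.Linear(intermediate_size, hidden_size)  # project back
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = F.gelu(self.fc1(x))
        return self.dropout(self.fc2(h))

x = torch.randn(2, 16, 64)  # [batch_size, seq_length, hidden_size]
assert SketchMLP(64, 256)(x).shape == x.shape
```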

diffusionLM/model/__init__.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -2,9 +2,19 @@

 from .transformers_model import DiffusionLLM, DiffusionConfig
 from .mask_token import mask_tokens_for_diffusion
+from .attention import MultiHeadAttention
+from .diffusionLM import LLaDAModel
+from .MLP import MLP
+from .time_embedding import TimeEmbedding
+from .transformer_block import TransformerBlock

 __all__ = [
     "DiffusionLLM",
     "DiffusionConfig",
     "mask_tokens_for_diffusion",
+    "MultiHeadAttention",
+    "LLaDAModel",
+    "MLP",
+    "TimeEmbedding",
+    "TransformerBlock",
 ]
```

diffusionLM/model/attention.py

Lines changed: 28 additions & 2 deletions
```diff
@@ -1,11 +1,23 @@
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F


 class MultiHeadAttention(nn.Module):
-    """Multi-head attention module"""
+    """
+    Multi-head self-attention mechanism.
+    This class implements the scaled dot-product attention mechanism with multiple attention heads.
+    Args:
+        config: Configuration object containing model hyperparameters.
+
+    Attributes:
+        q_proj: Linear layer for projecting queries.
+        k_proj: Linear layer for projecting keys.
+        v_proj: Linear layer for projecting values.
+        out_proj: Linear layer for projecting the output.
+        attn_dropout: Dropout layer for attention probabilities.
+        resid_dropout: Dropout layer for residual connections.
+    """
     def __init__(self, config):
         super().__init__()
         self.config = config
@@ -48,6 +60,20 @@ def merge_heads(self, x, batch_size):
         return x.view(batch_size, -1, self.hidden_size)

     def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
+        """
+        Perform a forward pass through the multi-head attention mechanism.
+
+        Args:
+            hidden_states: Input tensor of shape [batch_size, seq_length, hidden_size].
+            attention_mask: Optional mask to avoid attending to padding tokens.
+            head_mask: Optional mask for specific attention heads.
+            output_attentions: Whether to return attention probabilities.
+
+        Returns:
+            A tuple containing:
+                - output: Output tensor of shape [batch_size, seq_length, hidden_size].
+                - attention_probs (optional): Attention probabilities if output_attentions=True.
+        """
         batch_size, seq_length = hidden_states.shape[:2]

         # Project to queries, keys, values
```
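
The documented forward pass reduces to scaled dot-product attention over split heads. Here is a hedged sketch of that core step; the additive masking convention (large negative values at padded positions) is an assumption, since the diff does not show how attention_mask is applied:

```python
import math
import torch

def scaled_dot_product_attention(q, k, v, attention_mask=None):
    # q, k, v: [batch_size, num_heads, seq_length, head_dim]
    scores = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1))
    if attention_mask is not None:
        scores = scores + attention_mask  # assumed additive mask (-inf at padding)
    probs = torch.softmax(scores, dim=-1)  # the probabilities forward() can return
    return probs @ v, probs
```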

diffusionLM/model/diffusionLM.py

Lines changed: 24 additions & 4 deletions
```diff
@@ -8,9 +8,31 @@

 class LLaDAModel(nn.Module):
     """
-    LLaDA: Large Language Diffusion Model
-    Uses a transformer backbone with timestep conditioning for diffusion-based language modeling
+    A torch-based language model that incorporates diffusion-based generation through time step conditioning.
+    The model allows for various text generation strategies including random sampling, confidence-based sampling,
+    semi-autoregressive generation, and beam search.
+    Attributes:
+        config: Configuration object containing model hyperparameters
+        wte (nn.Embedding): Token embeddings
+        wpe (nn.Embedding): Position embeddings
+        dropout (nn.Dropout): Dropout layer
+        h (nn.ModuleList): List of transformer blocks
+        ln_f (nn.LayerNorm): Final layer normalization
+        time_embed (TimeEmbedding): Time step embedding module
+        time_proj (nn.ModuleList): Time projection layers for each transformer block
+        lm_head (nn.Linear): Output projection to vocabulary
+    Methods:
+        forward(input_ids, attention_mask, timesteps, labels):
+            Forward pass through the model for training and inference
+        generate(prompt, max_length, num_inference_steps, temperature, strategy, top_p, top_k, num_beams, return_scores):
+            Generate text using various sampling strategies and the reverse diffusion process
+        generate_stream(prompt, max_length, num_inference_steps, temperature, strategy, top_p, top_k, num_beams, callback_fn):
+    Example:
+        >>> config = ModelConfig(vocab_size=50257, hidden_size=768)
+        >>> model = LLaDAModel(config)
+        >>> output = model.generate(prompt="Hello", max_length=50, temperature=0.7)
     """
+
     def __init__(self, config):
         super().__init__()
         self.config = config
@@ -61,13 +83,11 @@ def forward(
     ) -> Dict[str, torch.Tensor]:
         """
         Forward pass through the model
-
         Args:
             input_ids: Tensor of token ids [batch_size, seq_len]
             attention_mask: Mask tensor [batch_size, seq_len]
             timesteps: Current diffusion timesteps [batch_size]
             labels: Target token ids for masked positions [batch_size, seq_len]
-
         Returns:
             dict with loss and logits
         """
```

diffusionLM/model/mask_token.py

Lines changed: 9 additions & 5 deletions
```diff
@@ -8,15 +8,19 @@ def mask_tokens_for_diffusion(
     mask_token_id: int,
 ):
     """
-    Apply forward diffusion process by masking tokens according to timestep.
+    Apply forward diffusion process by masking tokens according to the timestep.

     Args:
-        batch: Batch of token sequences
-        timestep: Current time step (between 0 and 1)
-        mask_token_id: ID of the mask token
+        batch: A dictionary containing input token sequences and attention masks.
+        timestep: Current timestep (between 0 and 1) for masking probability.
+        mask_token_id: ID of the mask token to replace selected tokens.

     Returns:
-        Dictionary with masked inputs and labels
+        A dictionary containing:
+            - input_ids: Masked input token IDs.
+            - attention_mask: Attention mask for the input.
+            - labels: Labels for the masked tokens (-100 for unmasked tokens).
+            - mask_ratio: Ratio of tokens that were masked.
     """
     input_ids = batch["input_ids"].clone()
     attention_mask = batch["attention_mask"]
```
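
The expanded Returns section pins the contract down enough to sketch the masking step: sample a per-token Bernoulli mask whose rate follows the timestep, hide those tokens behind mask_token_id, and supervise only them via -100 labels. A sketch under those assumptions; the repository's actual sampling scheme may differ:

```python
import torch

def sketch_mask_tokens(batch, timestep, mask_token_id):
    input_ids = batch["input_ids"].clone()
    attention_mask = batch["attention_mask"]
    # Mask real (non-padding) tokens with probability equal to the timestep.
    mask = (torch.rand(input_ids.shape) < timestep) & attention_mask.bool()
    labels = torch.full_like(input_ids, -100)  # -100 = ignored by cross-entropy
    labels[mask] = input_ids[mask]             # supervise only masked positions
    input_ids[mask] = mask_token_id
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
        "mask_ratio": mask.float().mean(),
    }
```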

diffusionLM/model/time_embedding.py

Lines changed: 16 additions & 3 deletions
```diff
@@ -3,7 +3,17 @@
 import torch

 class TimeEmbedding(nn.Module):
-    """Embedding for diffusion timesteps."""
+    """
+    Embedding layer for diffusion timesteps.
+
+    This class generates sinusoidal embeddings for diffusion timesteps and projects them to a higher-dimensional space.
+
+    Args:
+        time_embed_dim: Dimensionality of the time embedding.
+
+    Attributes:
+        time_embed: Sequential model for projecting sinusoidal embeddings.
+    """
     def __init__(self, time_embed_dim):
         super().__init__()
         self.time_embed_dim = time_embed_dim
@@ -15,10 +25,13 @@ def __init__(self, time_embed_dim):

     def forward(self, timesteps):
         """
+        Generate time embeddings for the given timesteps.
+
         Args:
-            timesteps: [batch_size] tensor of timestep values
+            timesteps: Tensor of shape [batch_size] containing timestep values.
+
         Returns:
-            [batch_size, time_embed_dim] tensor of embeddings
+            Tensor of shape [batch_size, time_embed_dim] containing time embeddings.
         """
         half_dim = self.time_embed_dim // 8
         emb = math.log(10000) / (half_dim - 1)
```
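
The forward() body fixes the frequency constants (half_dim = time_embed_dim // 8, log-spaced frequencies from log(10000) / (half_dim - 1)), which is enough to sketch the sinusoidal features. How the sin/cos features are then widened by the time_embed projection is not visible in the diff, so this sketch stops at the raw features:

```python
import math
import torch

def sinusoidal_features(timesteps, time_embed_dim):
    # Constants taken from the diff; everything downstream is an assumption.
    half_dim = time_embed_dim // 8
    emb = math.log(10000) / (half_dim - 1)
    freqs = torch.exp(-emb * torch.arange(half_dim, dtype=torch.float))
    args = timesteps.float()[:, None] * freqs[None, :]  # [batch_size, half_dim]
    return torch.cat([torch.sin(args), torch.cos(args)], dim=-1)

feats = sinusoidal_features(torch.tensor([0.1, 0.9]), 64)
assert feats.shape == (2, 16)  # 2 * (64 // 8); time_embed projects this up
```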
