Commit 2c006e9

Update the package modules and fix the import error.

2 parents a404585 + 966155e · commit 2c006e9

15 files changed: +309 additions, -156 deletions

.github/workflows/ci.yml

Lines changed: 0 additions & 96 deletions
This file was deleted.

diffusionLM/__init__.py

Lines changed: 8 additions & 3 deletions
```diff
@@ -1,13 +1,17 @@
 """DiffusionLM: A Diffusion-based Language Model Package"""

 from .trainer import trainer, TrainingError, evaluate
-from .model import DiffusionLLM, DiffusionConfig
+from .model import (
+    DiffusionLLM,
+    DiffusionConfig,
+    mask_tokens_for_diffusion
+)
 from .save_model import (
     save_model,
     load_model,
     ModelSaveError,
     registerANDpush,
-    ModelRegistrationError,
+    ModelRegistrationError
 )
 from .utils import (
     prepare_dataset,
@@ -16,7 +20,7 @@
     DatasetError,
     setup_logging,
     DiffusionLMError,
-    handle_errors,
+    handle_errors
 )

 __version__ = "0.1.0"
@@ -43,4 +47,5 @@
     "DatasetError",
     "prepare_dataset",
     "DatasetPreparationError",
+    "mask_tokens_for_diffusion",
 ]
```
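
With this change, `mask_tokens_for_diffusion` is re-exported at the package root, which matches the commit's stated goal of fixing the import error. A minimal sketch of the now-available top-level import (assuming the package is installed under the name `diffusionLM`):

```python
# Sketch: top-level imports enabled by the __init__.py change above.
# The symbol names come from the diff; installing the package under the
# name "diffusionLM" is an assumption.
from diffusionLM import (
    DiffusionLLM,
    DiffusionConfig,
    mask_tokens_for_diffusion,
)
```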

diffusionLM/model/MLP.py

Lines changed: 22 additions & 2 deletions
```diff
@@ -1,10 +1,21 @@
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F

 class MLP(nn.Module):
-    """Feed-forward network"""
+    """
+    Feed-forward neural network (MLP) used in transformer blocks.
+
+    This class implements a two-layer feed-forward network with GELU activation and dropout.
+
+    Args:
+        config: Configuration object containing model hyperparameters.
+
+    Attributes:
+        fc1: First linear layer.
+        fc2: Second linear layer.
+        dropout: Dropout layer applied after the second linear layer.
+    """
     def __init__(self, config):
         super().__init__()
         self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
@@ -19,6 +30,15 @@ def _init_weights(self, module):
             nn.init.zeros_(module.bias)

     def forward(self, x):
+        """
+        Perform a forward pass through the MLP.
+
+        Args:
+            x: Input tensor of shape [batch_size, seq_length, hidden_size].
+
+        Returns:
+            Output tensor of shape [batch_size, seq_length, hidden_size].
+        """
         h = F.gelu(self.fc1(x))
         h = self.fc2(h)
         h = self.dropout(h)
```
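
For reference, here is a self-contained sketch of the two-layer GELU feed-forward block the new docstring describes. Passing explicit sizes instead of a config object is an illustrative simplification:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SketchMLP(nn.Module):
    """Two-layer feed-forward block with GELU and dropout, as documented above."""
    def __init__(self, hidden_size: int, intermediate_size: int, dropout: float = 0.1):
        super().__init__()
        self.fc1 = nn.Linear(hidden_size, intermediate_size)  # expand
        self.fc2 = nn.Linear(intermediate_size, hidden_size)  # project back
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = F.gelu(self.fc1(x))
        return self.dropout(self.fc2(h))

x = torch.randn(2, 16, 64)  # [batch_size, seq_length, hidden_size]
assert SketchMLP(64, 256)(x).shape == x.shape
```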

diffusionLM/model/__init__.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -2,9 +2,19 @@

 from .transformers_model import DiffusionLLM, DiffusionConfig
 from .mask_token import mask_tokens_for_diffusion
+from .attention import MultiHeadAttention
+from .diffusionLM import LLaDAModel
+from .MLP import MLP
+from .time_embedding import TimeEmbedding
+from .transformer_block import TransformerBlock

 __all__ = [
     "DiffusionLLM",
     "DiffusionConfig",
     "mask_tokens_for_diffusion",
+    "MultiHeadAttention",
+    "LLaDAModel",
+    "MLP",
+    "TimeEmbedding",
+    "TransformerBlock",
 ]
```

diffusionLM/model/attention.py

Lines changed: 28 additions & 2 deletions
```diff
@@ -1,11 +1,23 @@
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F


 class MultiHeadAttention(nn.Module):
-    """Multi-head attention module"""
+    """
+    Multi-head self-attention mechanism.
+    This class implements the scaled dot-product attention mechanism with multiple attention heads.
+    Args:
+        config: Configuration object containing model hyperparameters.
+
+    Attributes:
+        q_proj: Linear layer for projecting queries.
+        k_proj: Linear layer for projecting keys.
+        v_proj: Linear layer for projecting values.
+        out_proj: Linear layer for projecting the output.
+        attn_dropout: Dropout layer for attention probabilities.
+        resid_dropout: Dropout layer for residual connections.
+    """
     def __init__(self, config):
         super().__init__()
         self.config = config
@@ -48,6 +60,20 @@ def merge_heads(self, x, batch_size):
         return x.view(batch_size, -1, self.hidden_size)

     def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
+        """
+        Perform a forward pass through the multi-head attention mechanism.
+
+        Args:
+            hidden_states: Input tensor of shape [batch_size, seq_length, hidden_size].
+            attention_mask: Optional mask to avoid attending to padding tokens.
+            head_mask: Optional mask for specific attention heads.
+            output_attentions: Whether to return attention probabilities.
+
+        Returns:
+            A tuple containing:
+                - output: Output tensor of shape [batch_size, seq_length, hidden_size].
+                - attention_probs (optional): Attention probabilities if output_attentions=True.
+        """
         batch_size, seq_length = hidden_states.shape[:2]

         # Project to queries, keys, values
```
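
The documented forward pass reduces to scaled dot-product attention over split heads. Here is a hedged sketch of that core step; the additive masking convention (large negative values at padded positions) is an assumption, since the diff does not show how attention_mask is applied:

```python
import math
import torch

def scaled_dot_product_attention(q, k, v, attention_mask=None):
    # q, k, v: [batch_size, num_heads, seq_length, head_dim]
    scores = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1))
    if attention_mask is not None:
        scores = scores + attention_mask  # assumed additive mask (-inf at padding)
    probs = torch.softmax(scores, dim=-1)  # the probabilities forward() can return
    return probs @ v, probs
```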

diffusionLM/model/diffusionLM.py

Lines changed: 24 additions & 4 deletions
```diff
@@ -8,9 +8,31 @@

 class LLaDAModel(nn.Module):
     """
-    LLaDA: Large Language Diffusion Model
-    Uses a transformer backbone with timestep conditioning for diffusion-based language modeling
+    A torch-based language model that incorporates diffusion-based generation through time step conditioning.
+    The model allows for various text generation strategies including random sampling, confidence-based sampling,
+    semi-autoregressive generation, and beam search.
+    Attributes:
+        config: Configuration object containing model hyperparameters
+        wte (nn.Embedding): Token embeddings
+        wpe (nn.Embedding): Position embeddings
+        dropout (nn.Dropout): Dropout layer
+        h (nn.ModuleList): List of transformer blocks
+        ln_f (nn.LayerNorm): Final layer normalization
+        time_embed (TimeEmbedding): Time step embedding module
+        time_proj (nn.ModuleList): Time projection layers for each transformer block
+        lm_head (nn.Linear): Output projection to vocabulary
+    Methods:
+        forward(input_ids, attention_mask, timesteps, labels):
+            Forward pass through the model for training and inference
+        generate(prompt, max_length, num_inference_steps, temperature, strategy, top_p, top_k, num_beams, return_scores):
+            Generate text using various sampling strategies and the reverse diffusion process
+        generate_stream(prompt, max_length, num_inference_steps, temperature, strategy, top_p, top_k, num_beams, callback_fn):
+    Example:
+        >>> config = ModelConfig(vocab_size=50257, hidden_size=768)
+        >>> model = LLaDAModel(config)
+        >>> output = model.generate(prompt="Hello", max_length=50, temperature=0.7)
     """
+
     def __init__(self, config):
         super().__init__()
         self.config = config
@@ -61,13 +83,11 @@ def forward(
     ) -> Dict[str, torch.Tensor]:
         """
         Forward pass through the model
-
         Args:
             input_ids: Tensor of token ids [batch_size, seq_len]
             attention_mask: Mask tensor [batch_size, seq_len]
             timesteps: Current diffusion timesteps [batch_size]
             labels: Target token ids for masked positions [batch_size, seq_len]
-
         Returns:
             dict with loss and logits
         """
```

diffusionLM/model/mask_token.py

Lines changed: 9 additions & 5 deletions
```diff
@@ -8,15 +8,19 @@ def mask_tokens_for_diffusion(
     mask_token_id: int,
 ):
     """
-    Apply forward diffusion process by masking tokens according to timestep.
+    Apply forward diffusion process by masking tokens according to the timestep.

     Args:
-        batch: Batch of token sequences
-        timestep: Current time step (between 0 and 1)
-        mask_token_id: ID of the mask token
+        batch: A dictionary containing input token sequences and attention masks.
+        timestep: Current timestep (between 0 and 1) for masking probability.
+        mask_token_id: ID of the mask token to replace selected tokens.

     Returns:
-        Dictionary with masked inputs and labels
+        A dictionary containing:
+            - input_ids: Masked input token IDs.
+            - attention_mask: Attention mask for the input.
+            - labels: Labels for the masked tokens (-100 for unmasked tokens).
+            - mask_ratio: Ratio of tokens that were masked.
     """
     input_ids = batch["input_ids"].clone()
     attention_mask = batch["attention_mask"]
```
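
The expanded Returns section pins the contract down enough to sketch the masking step: sample a per-token Bernoulli mask whose rate follows the timestep, hide those tokens behind mask_token_id, and supervise only them via -100 labels. A sketch under those assumptions; the repository's actual sampling scheme may differ:

```python
import torch

def sketch_mask_tokens(batch, timestep, mask_token_id):
    input_ids = batch["input_ids"].clone()
    attention_mask = batch["attention_mask"]
    # Mask real (non-padding) tokens with probability equal to the timestep.
    mask = (torch.rand(input_ids.shape) < timestep) & attention_mask.bool()
    labels = torch.full_like(input_ids, -100)  # -100 = ignored by cross-entropy
    labels[mask] = input_ids[mask]             # supervise only masked positions
    input_ids[mask] = mask_token_id
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
        "mask_ratio": mask.float().mean(),
    }
```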

diffusionLM/model/time_embedding.py

Lines changed: 16 additions & 3 deletions
```diff
@@ -3,7 +3,17 @@
 import torch

 class TimeEmbedding(nn.Module):
-    """Embedding for diffusion timesteps."""
+    """
+    Embedding layer for diffusion timesteps.
+
+    This class generates sinusoidal embeddings for diffusion timesteps and projects them to a higher-dimensional space.
+
+    Args:
+        time_embed_dim: Dimensionality of the time embedding.
+
+    Attributes:
+        time_embed: Sequential model for projecting sinusoidal embeddings.
+    """
     def __init__(self, time_embed_dim):
         super().__init__()
         self.time_embed_dim = time_embed_dim
@@ -15,10 +25,13 @@ def __init__(self, time_embed_dim):

     def forward(self, timesteps):
         """
+        Generate time embeddings for the given timesteps.
+
         Args:
-            timesteps: [batch_size] tensor of timestep values
+            timesteps: Tensor of shape [batch_size] containing timestep values.
+
         Returns:
-            [batch_size, time_embed_dim] tensor of embeddings
+            Tensor of shape [batch_size, time_embed_dim] containing time embeddings.
         """
         half_dim = self.time_embed_dim // 8
         emb = math.log(10000) / (half_dim - 1)
```
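
The forward() body fixes the frequency constants (half_dim = time_embed_dim // 8, log-spaced frequencies from log(10000) / (half_dim - 1)), which is enough to sketch the sinusoidal features. How the sin/cos features are then widened by the time_embed projection is not visible in the diff, so this sketch stops at the raw features:

```python
import math
import torch

def sinusoidal_features(timesteps, time_embed_dim):
    # Constants taken from the diff; everything downstream is an assumption.
    half_dim = time_embed_dim // 8
    emb = math.log(10000) / (half_dim - 1)
    freqs = torch.exp(-emb * torch.arange(half_dim, dtype=torch.float))
    args = timesteps.float()[:, None] * freqs[None, :]  # [batch_size, half_dim]
    return torch.cat([torch.sin(args), torch.cos(args)], dim=-1)

feats = sinusoidal_features(torch.tensor([0.1, 0.9]), 64)
assert feats.shape == (2, 16)  # 2 * (64 // 8); time_embed projects this up
```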
