
@ruvnet
Last active January 15, 2025 03:29

Revisions

  1. ruvnet revised this gist Dec 10, 2023. 1 changed file with 31 additions and 21 deletions.
    52 changes: 31 additions & 21 deletions MoE.py
    @@ -14,53 +14,63 @@

# Define the Expert class
class Expert(nn.Module):
    # Each expert is a small feed-forward neural network
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Expert, self).__init__()
        # First fully connected layer (input to hidden)
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        # Second fully connected layer (hidden to output)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    # Forward pass for each expert
    def forward(self, x):
        # Activation function applied to the first layer's output
        x = F.relu(self.fc1(x))
        # The second layer's output is returned
        return self.fc2(x)

# Define the Gating Network class
class GatingNetwork(nn.Module):
    # Determines the gating mechanism for the experts
    def __init__(self, input_dim, num_experts):
        super(GatingNetwork, self).__init__()
        # Fully connected layer that outputs a probability distribution over experts
        self.gate = nn.Linear(input_dim, num_experts)

    # Forward pass for the gating network
    def forward(self, x):
        # Softmax function to create a probability distribution over experts
-        return F.softmax(self.gate(x), dim=1)
+        return F.softmax(self.gate(x), dim=2)

# Define the Mixture of Experts Layer class
class MoELayer(nn.Module):
    # Combines all experts and the gating mechanism
    def __init__(self, input_dim, hidden_dim, output_dim, num_experts):
        super(MoELayer, self).__init__()
        # A list of expert networks
        self.experts = nn.ModuleList([Expert(input_dim, hidden_dim, output_dim) for _ in range(num_experts)])
        # The gating network
        self.gate = GatingNetwork(input_dim, num_experts)

    # Forward pass for the MoE layer
-    def forward(self, x):
-        gating_scores = self.gate(x)  # Shape: [batch_size, num_tokens, num_experts]
-        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=1)  # Shape: [batch_size, num_experts, num_tokens, output_dim]
-        expert_outputs = expert_outputs.transpose(1, 2)  # Shape: [batch_size, num_tokens, num_experts, output_dim]
-        output = torch.einsum('bte,bteo->bto', gating_scores, expert_outputs)  # Corrected einsum operation to combine outputs
+    def forward(self, x, num_experts_per_tok):
+        gating_scores = self.gate(x)
+        topk_gating_scores, topk_indices = gating_scores.topk(num_experts_per_tok, dim=2, sorted=False)
+        # Create a mask to zero out the contributions of non-topk experts
+        mask = torch.zeros_like(gating_scores).scatter_(2, topk_indices, 1)
+        # Use the mask to retain only the topk gating scores
+        gating_scores = gating_scores * mask
+        # Normalize the gating scores to sum to 1 across the selected top experts
+        gating_scores = F.normalize(gating_scores, p=1, dim=2)
+
+        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=1)
+        expert_outputs = expert_outputs.transpose(1, 2)
+        output = torch.einsum('bte,bteo->bto', gating_scores, expert_outputs)
        return output

-# Rest of your TransformerWithMoE class remains unchanged
+# Define the overall Transformer model with integrated MoE
+class TransformerWithMoE(nn.Module):
+    def __init__(self, num_layers, dim, head_dim, hidden_dim, n_heads, num_experts, vocab_size, num_experts_per_tok):
+        super(TransformerWithMoE, self).__init__()
+        self.num_experts_per_tok = num_experts_per_tok
+        self.embedding = nn.Embedding(vocab_size, dim)
+        self.layers = nn.ModuleList([nn.TransformerEncoderLayer(d_model=dim, nhead=n_heads) for _ in range(num_layers)])
+        self.moe_layer = MoELayer(dim, hidden_dim, dim, num_experts)
+        self.output_layer = nn.Linear(dim, vocab_size)
+
+    def forward(self, x):
+        x = self.embedding(x)
+        for layer in self.layers:
+            x = layer(x)
+        x = self.moe_layer(x, self.num_experts_per_tok)
+        logits = self.output_layer(x)
+        return logits

    # Initialize the model with configurations matching Mixtral 8x7B
    model = TransformerWithMoE(
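
The top-k routing added in this revision can be exercised on its own. The sketch below is illustrative and not part of the gist: it assumes the revised MoELayer definition above and uses small, made-up dimensions to check that the routed gating weights are non-zero for exactly num_experts_per_tok experts per token and still sum to 1.

# Illustrative smoke test for the top-k routed MoELayer above (toy dimensions, not the Mixtral settings).
batch_size, num_tokens, dim, num_experts = 2, 5, 16, 8
moe = MoELayer(input_dim=dim, hidden_dim=32, output_dim=dim, num_experts=num_experts)
x = torch.randn(batch_size, num_tokens, dim)

gating_scores = moe.gate(x)                                    # [2, 5, 8], softmax over the expert axis
topk_scores, topk_idx = gating_scores.topk(2, dim=2)           # keep the 2 strongest experts per token
mask = torch.zeros_like(gating_scores).scatter_(2, topk_idx, 1)
routed = F.normalize(gating_scores * mask, p=1, dim=2)         # renormalize over the kept experts

print((routed > 0).sum(dim=2))               # 2 non-zero experts for every token
print(routed.sum(dim=2))                     # each token's weights sum to 1
print(moe(x, num_experts_per_tok=2).shape)   # torch.Size([2, 5, 16])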
  2. ruvnet revised this gist Dec 10, 2023. 1 changed file with 5 additions and 31 deletions.
    36 changes: 5 additions & 31 deletions MoE.py
    @@ -54,39 +54,13 @@ def __init__(self, input_dim, hidden_dim, output_dim, num_experts):

    # Forward pass for the MoE layer
    def forward(self, x):
-        # Get gating scores for each token in the sequence over all experts
        gating_scores = self.gate(x)  # Shape: [batch_size, num_tokens, num_experts]
-        # Obtain the outputs from all experts
        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=1)  # Shape: [batch_size, num_experts, num_tokens, output_dim]
-        # Weighted sum of expert outputs by gating scores, combining expert contributions
-        output = torch.einsum('bte,bteh->bth', gating_scores, expert_outputs)  # Shape: [batch_size, num_tokens, output_dim]
+        expert_outputs = expert_outputs.transpose(1, 2)  # Shape: [batch_size, num_tokens, num_experts, output_dim]
+        output = torch.einsum('bte,bteo->bto', gating_scores, expert_outputs)  # Corrected einsum operation to combine outputs
        return output

-# Define the overall Transformer model with integrated MoE
-class TransformerWithMoE(nn.Module):
-    # The main model that will include the transformer layers and the MoE layer
-    def __init__(self, num_layers, dim, head_dim, hidden_dim, n_heads, num_experts, vocab_size, num_experts_per_tok):
-        super(TransformerWithMoE, self).__init__()
-        # Embedding layer for input tokens
-        self.embedding = nn.Embedding(vocab_size, dim)
-        # Define a list of transformer layers
-        self.layers = nn.ModuleList([nn.TransformerEncoderLayer(d_model=dim, nhead=n_heads) for _ in range(num_layers)])
-        # The MoE layer, placed after transformer layers, for demonstration purposes
-        self.moe_layer = MoELayer(dim, hidden_dim, dim, num_experts)
-        # Output layer to map from the transformer's feature space to the vocabulary space
-        self.output_layer = nn.Linear(dim, vocab_size)
-
-    def forward(self, x):
-        # Embedding tokens
-        x = self.embedding(x)
-        # Passing through each transformer layer
-        for layer in self.layers:
-            x = layer(x)
-        # Passing through the MoE layer
-        x = self.moe_layer(x)
-        # Projecting back to vocabulary space for output
-        logits = self.output_layer(x)
-        return logits
+# Rest of your TransformerWithMoE class remains unchanged

    # Initialize the model with configurations matching Mixtral 8x7B
    model = TransformerWithMoE(
    @@ -96,6 +70,6 @@ def forward(self, x):
    hidden_dim=14336,      # Hidden dimensionality in the feed-forward network within the transformer
    n_heads=32,            # Number of attention heads
    num_experts=8,         # Number of experts in the MoE layer
-    vocab_size=32000,      # Vocabulary size for the embedding layer
-    num_experts_per_tok=2  # Number of experts activated per token
+    vocab_size=32000,      # Vocabulary size for the embedding layer
+    num_experts_per_tok=2  # Number of experts activated per token
)
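
The substantive fix in this revision is the einsum: the stacked expert outputs have shape [batch_size, num_experts, num_tokens, output_dim], so the original 'bte,bteh->bth' labels the expert axis as the token axis and errors whenever num_tokens differs from num_experts. Transposing to [batch_size, num_tokens, num_experts, output_dim] and contracting with 'bte,bteo->bto' gives the intended per-token weighted sum. A small self-contained check, with toy shapes assumed for illustration only:

import torch

# Toy shapes: batch, tokens, experts, output_dim.
b, t, e, o = 2, 5, 8, 16
gating_scores = torch.softmax(torch.randn(b, t, e), dim=2)  # [batch, tokens, experts]
expert_outputs = torch.randn(b, t, e, o)                    # already transposed: [batch, tokens, experts, out]

# The corrected contraction versus an explicit weighted sum over the expert axis.
via_einsum = torch.einsum('bte,bteo->bto', gating_scores, expert_outputs)
via_sum = (gating_scores.unsqueeze(-1) * expert_outputs).sum(dim=2)
print(torch.allclose(via_einsum, via_sum))  # True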
  3. ruvnet created this gist Dec 10, 2023.
    101 changes: 101 additions & 0 deletions MoE.py
    @@ -0,0 +1,101 @@
    """
    This model integrates the MoE concept within a Transformer architecture. Each token's
    representation is processed by a subset of experts, determined by the gating mechanism.
    This architecture allows for efficient and specialized handling of different aspects of the
    data, aiming for the adaptability and efficiency noted in the Mixtral 8x7B model's design
    philosophy. The model activates only a fraction of the available experts for each token,
    significantly reducing the computational resources needed compared to activating all experts
    for all tokens.
    """

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

# Define the Expert class
class Expert(nn.Module):
    # Each expert is a small feed-forward neural network
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Expert, self).__init__()
        # First fully connected layer (input to hidden)
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        # Second fully connected layer (hidden to output)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    # Forward pass for each expert
    def forward(self, x):
        # Activation function applied to the first layer's output
        x = F.relu(self.fc1(x))
        # The second layer's output is returned
        return self.fc2(x)

# Define the Gating Network class
class GatingNetwork(nn.Module):
    # Determines the gating mechanism for the experts
    def __init__(self, input_dim, num_experts):
        super(GatingNetwork, self).__init__()
        # Fully connected layer that outputs a probability distribution over experts
        self.gate = nn.Linear(input_dim, num_experts)

    # Forward pass for the gating network
    def forward(self, x):
        # Softmax function to create a probability distribution over experts
        return F.softmax(self.gate(x), dim=1)

# Define the Mixture of Experts Layer class
class MoELayer(nn.Module):
    # Combines all experts and the gating mechanism
    def __init__(self, input_dim, hidden_dim, output_dim, num_experts):
        super(MoELayer, self).__init__()
        # A list of expert networks
        self.experts = nn.ModuleList([Expert(input_dim, hidden_dim, output_dim) for _ in range(num_experts)])
        # The gating network
        self.gate = GatingNetwork(input_dim, num_experts)

    # Forward pass for the MoE layer
    def forward(self, x):
        # Get gating scores for each token in the sequence over all experts
        gating_scores = self.gate(x)  # Shape: [batch_size, num_tokens, num_experts]
        # Obtain the outputs from all experts
        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=1)  # Shape: [batch_size, num_experts, num_tokens, output_dim]
        # Weighted sum of expert outputs by gating scores, combining expert contributions
        output = torch.einsum('bte,bteh->bth', gating_scores, expert_outputs)  # Shape: [batch_size, num_tokens, output_dim]
        return output

# Define the overall Transformer model with integrated MoE
class TransformerWithMoE(nn.Module):
    # The main model that will include the transformer layers and the MoE layer
    def __init__(self, num_layers, dim, head_dim, hidden_dim, n_heads, num_experts, vocab_size, num_experts_per_tok):
        super(TransformerWithMoE, self).__init__()
        # Embedding layer for input tokens
        self.embedding = nn.Embedding(vocab_size, dim)
        # Define a list of transformer layers
        self.layers = nn.ModuleList([nn.TransformerEncoderLayer(d_model=dim, nhead=n_heads) for _ in range(num_layers)])
        # The MoE layer, placed after transformer layers, for demonstration purposes
        self.moe_layer = MoELayer(dim, hidden_dim, dim, num_experts)
        # Output layer to map from the transformer's feature space to the vocabulary space
        self.output_layer = nn.Linear(dim, vocab_size)

    def forward(self, x):
        # Embedding tokens
        x = self.embedding(x)
        # Passing through each transformer layer
        for layer in self.layers:
            x = layer(x)
        # Passing through the MoE layer
        x = self.moe_layer(x)
        # Projecting back to vocabulary space for output
        logits = self.output_layer(x)
        return logits

# Initialize the model with configurations matching Mixtral 8x7B
model = TransformerWithMoE(
    num_layers=32,         # Number of transformer layers
    dim=4096,              # Dimension of the model
    head_dim=128,          # Dimension of each head in the multi-head attention mechanisms
    hidden_dim=14336,      # Hidden dimensionality in the feed-forward network within the transformer
    n_heads=32,            # Number of attention heads
    num_experts=8,         # Number of experts in the MoE layer
    vocab_size=32000,      # Vocabulary size for the embedding layer
    num_experts_per_tok=2  # Number of experts activated per token
)
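
For reference, instantiating the Mixtral-like settings above produces a model with billions of parameters, so a toy-sized run is more practical for trying the code out. The sketch below is illustrative only: it assumes the classes in their latest revision (the Dec 10, 2023 top-k version at the top of this page), and the hyperparameters are made-up small values rather than anything from the gist.

# Toy configuration for a quick end-to-end check (assumed values, not the Mixtral 8x7B settings).
toy_model = TransformerWithMoE(
    num_layers=2, dim=64, head_dim=16, hidden_dim=128,
    n_heads=4, num_experts=4, vocab_size=1000, num_experts_per_tok=2
)
tokens = torch.randint(0, 1000, (2, 10))  # [batch_size, sequence_length] of token ids
logits = toy_model(tokens)
print(logits.shape)                       # torch.Size([2, 10, 1000]): per-token logits over the vocabulary
# Note: nn.TransformerEncoderLayer defaults to batch_first=False, so, as in the gist, the batch and
# sequence axes are effectively swapped inside the encoder layers; the shapes still flow end to end.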