Last active: January 15, 2025 03:29
Revisions
ruvnet revised this gist
Dec 10, 2023. 1 changed file with 31 additions and 21 deletions.
@@ -14,53 +14,63 @@

# Define the Expert class
class Expert(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Expert, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# Define the Gating Network class
class GatingNetwork(nn.Module):
    def __init__(self, input_dim, num_experts):
        super(GatingNetwork, self).__init__()
        self.gate = nn.Linear(input_dim, num_experts)

    def forward(self, x):
        return F.softmax(self.gate(x), dim=2)

# Define the Mixture of Experts Layer class
class MoELayer(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_experts):
        super(MoELayer, self).__init__()
        self.experts = nn.ModuleList([Expert(input_dim, hidden_dim, output_dim) for _ in range(num_experts)])
        self.gate = GatingNetwork(input_dim, num_experts)

    def forward(self, x, num_experts_per_tok):
        gating_scores = self.gate(x)
        topk_gating_scores, topk_indices = gating_scores.topk(num_experts_per_tok, dim=2, sorted=False)
        # Create a mask to zero out the contributions of non-topk experts
        mask = torch.zeros_like(gating_scores).scatter_(2, topk_indices, 1)
        # Use the mask to retain only the topk gating scores
        gating_scores = gating_scores * mask
        # Normalize the gating scores to sum to 1 across the selected top experts
        gating_scores = F.normalize(gating_scores, p=1, dim=2)
        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=1)
        expert_outputs = expert_outputs.transpose(1, 2)
        output = torch.einsum('bte,bteo->bto', gating_scores, expert_outputs)
        return output

# Define the overall Transformer model with integrated MoE
class TransformerWithMoE(nn.Module):
    def __init__(self, num_layers, dim, head_dim, hidden_dim, n_heads, num_experts, vocab_size, num_experts_per_tok):
        super(TransformerWithMoE, self).__init__()
        self.num_experts_per_tok = num_experts_per_tok
        self.embedding = nn.Embedding(vocab_size, dim)
        self.layers = nn.ModuleList([nn.TransformerEncoderLayer(d_model=dim, nhead=n_heads) for _ in range(num_layers)])
        self.moe_layer = MoELayer(dim, hidden_dim, dim, num_experts)
        self.output_layer = nn.Linear(dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x)
        x = self.moe_layer(x, self.num_experts_per_tok)
        logits = self.output_layer(x)
        return logits

# Initialize the model with configurations matching Mixtral 8x7B
model = TransformerWithMoE(
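
The key change in this revision is the top-k routing inside MoELayer.forward: the gating scores are masked down to num_experts_per_tok experts per token and then renormalized so the surviving weights sum to 1. The standalone sketch below is not part of the gist; it reproduces just that mask-and-renormalize step on dummy tensors, and the batch size, token count, expert count, and k are arbitrary illustrative assumptions.

import torch
import torch.nn.functional as F

batch_size, num_tokens, num_experts, k = 2, 4, 8, 2
# A softmax over the expert dimension stands in for the gating network's output
gating_scores = F.softmax(torch.randn(batch_size, num_tokens, num_experts), dim=2)

# Keep only the top-k experts per token and zero out the rest
topk_scores, topk_indices = gating_scores.topk(k, dim=2, sorted=False)
mask = torch.zeros_like(gating_scores).scatter_(2, topk_indices, 1)
masked_scores = gating_scores * mask

# Renormalize so the k surviving weights sum to 1 for every token
renormalized = F.normalize(masked_scores, p=1, dim=2)

print((renormalized > 0).sum(dim=2))   # k non-zero experts per token
print(renormalized.sum(dim=2))         # each token's weights sum to 1.0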
ruvnet revised this gist
Dec 10, 2023. 1 changed file with 5 additions and 31 deletions.
@@ -54,39 +54,13 @@ def __init__(self, input_dim, hidden_dim, output_dim, num_experts):

    # Forward pass for the MoE layer
    def forward(self, x):
        gating_scores = self.gate(x)  # Shape: [batch_size, num_tokens, num_experts]
        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=1)  # Shape: [batch_size, num_experts, num_tokens, output_dim]
        expert_outputs = expert_outputs.transpose(1, 2)  # Shape: [batch_size, num_tokens, num_experts, output_dim]
        output = torch.einsum('bte,bteo->bto', gating_scores, expert_outputs)  # Corrected einsum operation to combine outputs
        return output

# Rest of your TransformerWithMoE class remains unchanged

# Initialize the model with configurations matching Mixtral 8x7B
model = TransformerWithMoE(

@@ -96,6 +70,6 @@ def forward(self, x):
    hidden_dim=14336,        # Hidden dimensionality in the feed-forward network within the transformer
    n_heads=32,              # Number of attention heads
    num_experts=8,           # Number of experts in the MoE layer
    vocab_size=32000,        # Vocabulary size for the embedding layer
    num_experts_per_tok=2    # Number of experts activated per token
)
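
The corrected einsum in this revision, 'bte,bteo->bto', is a per-token weighted sum of the expert outputs using the gating scores as weights. As a quick sanity check that is not part of any revision of the gist, the snippet below compares the einsum against an explicit broadcast-and-sum over the expert dimension; the tensor sizes are arbitrary assumptions.

import torch

b, t, e, o = 2, 3, 4, 5  # batch, tokens, experts, output_dim (illustrative sizes)
gating_scores = torch.rand(b, t, e)
expert_outputs = torch.rand(b, t, e, o)

combined = torch.einsum('bte,bteo->bto', gating_scores, expert_outputs)
reference = (gating_scores.unsqueeze(-1) * expert_outputs).sum(dim=2)

print(torch.allclose(combined, reference))  # True: the einsum is the per-token weighted sum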
ruvnet created this gist
Dec 10, 2023.
@@ -0,0 +1,101 @@

"""
This model integrates the MoE concept within a Transformer architecture. Each token's representation
is processed by a subset of experts, determined by the gating mechanism. This architecture allows for
efficient and specialized handling of different aspects of the data, aiming for the adaptability and
efficiency noted in the Mixtral 8x7B model's design philosophy. The model activates only a fraction
of the available experts for each token, significantly reducing the computational resources needed
compared to activating all experts for all tokens.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F

# Define the Expert class
class Expert(nn.Module):
    # Each expert is a small feed-forward neural network
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Expert, self).__init__()
        # First fully connected layer (input to hidden)
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        # Second fully connected layer (hidden to output)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    # Forward pass for each expert
    def forward(self, x):
        # Activation function applied to the first layer's output
        x = F.relu(self.fc1(x))
        # The second layer's output is returned
        return self.fc2(x)

# Define the Gating Network class
class GatingNetwork(nn.Module):
    # Determines the gating mechanism for the experts
    def __init__(self, input_dim, num_experts):
        super(GatingNetwork, self).__init__()
        # Fully connected layer that outputs a probability distribution over experts
        self.gate = nn.Linear(input_dim, num_experts)

    # Forward pass for the gating network
    def forward(self, x):
        # Softmax function to create a probability distribution over experts
        return F.softmax(self.gate(x), dim=1)

# Define the Mixture of Experts Layer class
class MoELayer(nn.Module):
    # Combines all experts and the gating mechanism
    def __init__(self, input_dim, hidden_dim, output_dim, num_experts):
        super(MoELayer, self).__init__()
        # A list of expert networks
        self.experts = nn.ModuleList([Expert(input_dim, hidden_dim, output_dim) for _ in range(num_experts)])
        # The gating network
        self.gate = GatingNetwork(input_dim, num_experts)

    # Forward pass for the MoE layer
    def forward(self, x):
        # Get gating scores for each token in the sequence over all experts
        gating_scores = self.gate(x)  # Shape: [batch_size, num_tokens, num_experts]
        # Obtain the outputs from all experts
        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=1)  # Shape: [batch_size, num_experts, num_tokens, output_dim]
        # Weighted sum of expert outputs by gating scores, combining expert contributions
        output = torch.einsum('bte,bteh->bth', gating_scores, expert_outputs)  # Shape: [batch_size, num_tokens, output_dim]
        return output

# Define the overall Transformer model with integrated MoE
class TransformerWithMoE(nn.Module):
    # The main model that will include the transformer layers and the MoE layer
    def __init__(self, num_layers, dim, head_dim, hidden_dim, n_heads, num_experts, vocab_size, num_experts_per_tok):
        super(TransformerWithMoE, self).__init__()
        # Embedding layer for input tokens
        self.embedding = nn.Embedding(vocab_size, dim)
        # Define a list of transformer layers
        self.layers = nn.ModuleList([nn.TransformerEncoderLayer(d_model=dim, nhead=n_heads) for _ in range(num_layers)])
        # The MoE layer, placed after transformer layers, for demonstration purposes
        self.moe_layer = MoELayer(dim, hidden_dim, dim, num_experts)
        # Output layer to map from the transformer's feature space to the vocabulary space
        self.output_layer = nn.Linear(dim, vocab_size)

    def forward(self, x):
        # Embedding tokens
        x = self.embedding(x)
        # Passing through each transformer layer
        for layer in self.layers:
            x = layer(x)
        # Passing through the MoE layer
        x = self.moe_layer(x)
        # Projecting back to vocabulary space for output
        logits = self.output_layer(x)
        return logits

# Initialize the model with configurations matching Mixtral 8x7B
model = TransformerWithMoE(
    num_layers=32,          # Number of transformer layers
    dim=4096,               # Dimension of the model
    head_dim=128,           # Dimension of each head in the multi-head attention mechanisms
    hidden_dim=14336,       # Hidden dimensionality in the feed-forward network within the transformer
    n_heads=32,             # Number of attention heads
    num_experts=8,          # Number of experts in the MoE layer
    vocab_size=32000,       # Vocabulary size for the embedding layer
    num_experts_per_tok=2   # Number of experts activated per token
)
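
To run the gist end to end, a scaled-down configuration is enough for a quick smoke test. The sketch below is illustrative and not part of any revision; it assumes the classes from the most recent revision (the top-k version in the first entry above), whose MoELayer.forward takes num_experts_per_tok, and every dimension is shrunk far below the Mixtral-like values so the forward pass runs quickly on CPU.

import torch

tiny_model = TransformerWithMoE(
    num_layers=2,           # far fewer layers than Mixtral's 32
    dim=64,
    head_dim=16,            # kept for interface parity; unused by nn.TransformerEncoderLayer here
    hidden_dim=128,
    n_heads=4,
    num_experts=8,
    vocab_size=1000,
    num_experts_per_tok=2
)

dummy_tokens = torch.randint(0, 1000, (2, 16))   # [batch_size, sequence_length]
logits = tiny_model(dummy_tokens)
print(logits.shape)                              # expected: torch.Size([2, 16, 1000])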