
@ruvnet
Last active January 15, 2025 03:29

Revisions

  1. ruvnet revised this gist Dec 10, 2023. 1 changed file with 31 additions and 21 deletions.
    52 changes: 31 additions & 21 deletions MoE.py
    @@ -14,53 +14,63 @@

# Define the Expert class
class Expert(nn.Module):
    # Each expert is a small feed-forward neural network
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Expert, self).__init__()
        # First fully connected layer (input to hidden)
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        # Second fully connected layer (hidden to output)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    # Forward pass for each expert
    def forward(self, x):
        # Activation function applied to the first layer's output
        x = F.relu(self.fc1(x))
        # The second layer's output is returned
        return self.fc2(x)

# Define the Gating Network class
class GatingNetwork(nn.Module):
    # Determines the gating mechanism for the experts
    def __init__(self, input_dim, num_experts):
        super(GatingNetwork, self).__init__()
        # Fully connected layer that outputs a probability distribution over experts
        self.gate = nn.Linear(input_dim, num_experts)

    # Forward pass for the gating network
    def forward(self, x):
        # Softmax function to create a probability distribution over experts
-        return F.softmax(self.gate(x), dim=1)
+        return F.softmax(self.gate(x), dim=2)

# Define the Mixture of Experts Layer class
class MoELayer(nn.Module):
    # Combines all experts and the gating mechanism
    def __init__(self, input_dim, hidden_dim, output_dim, num_experts):
        super(MoELayer, self).__init__()
        # A list of expert networks
        self.experts = nn.ModuleList([Expert(input_dim, hidden_dim, output_dim) for _ in range(num_experts)])
        # The gating network
        self.gate = GatingNetwork(input_dim, num_experts)

    # Forward pass for the MoE layer
-    def forward(self, x):
-        gating_scores = self.gate(x)  # Shape: [batch_size, num_tokens, num_experts]
-        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=1)  # Shape: [batch_size, num_experts, num_tokens, output_dim]
-        expert_outputs = expert_outputs.transpose(1, 2)  # Shape: [batch_size, num_tokens, num_experts, output_dim]
-        output = torch.einsum('bte,bteo->bto', gating_scores, expert_outputs)  # Corrected einsum operation to combine outputs
+    def forward(self, x, num_experts_per_tok):
+        gating_scores = self.gate(x)
+        topk_gating_scores, topk_indices = gating_scores.topk(num_experts_per_tok, dim=2, sorted=False)
+        # Create a mask to zero out the contributions of non-topk experts
+        mask = torch.zeros_like(gating_scores).scatter_(2, topk_indices, 1)
+        # Use the mask to retain only the topk gating scores
+        gating_scores = gating_scores * mask
+        # Normalize the gating scores to sum to 1 across the selected top experts
+        gating_scores = F.normalize(gating_scores, p=1, dim=2)
+
+        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=1)
+        expert_outputs = expert_outputs.transpose(1, 2)
+        output = torch.einsum('bte,bteo->bto', gating_scores, expert_outputs)
        return output

-# Rest of your TransformerWithMoE class remains unchanged
+# Define the overall Transformer model with integrated MoE
+class TransformerWithMoE(nn.Module):
+    def __init__(self, num_layers, dim, head_dim, hidden_dim, n_heads, num_experts, vocab_size, num_experts_per_tok):
+        super(TransformerWithMoE, self).__init__()
+        self.num_experts_per_tok = num_experts_per_tok
+        self.embedding = nn.Embedding(vocab_size, dim)
+        self.layers = nn.ModuleList([nn.TransformerEncoderLayer(d_model=dim, nhead=n_heads) for _ in range(num_layers)])
+        self.moe_layer = MoELayer(dim, hidden_dim, dim, num_experts)
+        self.output_layer = nn.Linear(dim, vocab_size)
+
+    def forward(self, x):
+        x = self.embedding(x)
+        for layer in self.layers:
+            x = layer(x)
+        x = self.moe_layer(x, self.num_experts_per_tok)
+        logits = self.output_layer(x)
+        return logits

    # Initialize the model with configurations matching Mixtral 8x7B
    model = TransformerWithMoE(
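
The top-k routing added in this revision can be exercised on its own. The sketch below is illustrative and not part of the gist: it assumes the revised MoELayer definition above and uses small, made-up dimensions to check that the routed gating weights are non-zero for exactly num_experts_per_tok experts per token and still sum to 1.

# Illustrative smoke test for the top-k routed MoELayer above (toy dimensions, not the Mixtral settings).
batch_size, num_tokens, dim, num_experts = 2, 5, 16, 8
moe = MoELayer(input_dim=dim, hidden_dim=32, output_dim=dim, num_experts=num_experts)
x = torch.randn(batch_size, num_tokens, dim)

gating_scores = moe.gate(x)                                    # [2, 5, 8], softmax over the expert axis
topk_scores, topk_idx = gating_scores.topk(2, dim=2)           # keep the 2 strongest experts per token
mask = torch.zeros_like(gating_scores).scatter_(2, topk_idx, 1)
routed = F.normalize(gating_scores * mask, p=1, dim=2)         # renormalize over the kept experts

print((routed > 0).sum(dim=2))               # 2 non-zero experts for every token
print(routed.sum(dim=2))                     # each token's weights sum to 1
print(moe(x, num_experts_per_tok=2).shape)   # torch.Size([2, 5, 16])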
  2. ruvnet revised this gist Dec 10, 2023. 1 changed file with 5 additions and 31 deletions.
    36 changes: 5 additions & 31 deletions MoE.py
    @@ -54,39 +54,13 @@ def __init__(self, input_dim, hidden_dim, output_dim, num_experts):

    # Forward pass for the MoE layer
    def forward(self, x):
-        # Get gating scores for each token in the sequence over all experts
        gating_scores = self.gate(x)  # Shape: [batch_size, num_tokens, num_experts]
-        # Obtain the outputs from all experts
        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=1)  # Shape: [batch_size, num_experts, num_tokens, output_dim]
-        # Weighted sum of expert outputs by gating scores, combining expert contributions
-        output = torch.einsum('bte,bteh->bth', gating_scores, expert_outputs)  # Shape: [batch_size, num_tokens, output_dim]
+        expert_outputs = expert_outputs.transpose(1, 2)  # Shape: [batch_size, num_tokens, num_experts, output_dim]
+        output = torch.einsum('bte,bteo->bto', gating_scores, expert_outputs)  # Corrected einsum operation to combine outputs
        return output

-# Define the overall Transformer model with integrated MoE
-class TransformerWithMoE(nn.Module):
-    # The main model that will include the transformer layers and the MoE layer
-    def __init__(self, num_layers, dim, head_dim, hidden_dim, n_heads, num_experts, vocab_size, num_experts_per_tok):
-        super(TransformerWithMoE, self).__init__()
-        # Embedding layer for input tokens
-        self.embedding = nn.Embedding(vocab_size, dim)
-        # Define a list of transformer layers
-        self.layers = nn.ModuleList([nn.TransformerEncoderLayer(d_model=dim, nhead=n_heads) for _ in range(num_layers)])
-        # The MoE layer, placed after transformer layers, for demonstration purposes
-        self.moe_layer = MoELayer(dim, hidden_dim, dim, num_experts)
-        # Output layer to map from the transformer's feature space to the vocabulary space
-        self.output_layer = nn.Linear(dim, vocab_size)
-
-    def forward(self, x):
-        # Embedding tokens
-        x = self.embedding(x)
-        # Passing through each transformer layer
-        for layer in self.layers:
-            x = layer(x)
-        # Passing through the MoE layer
-        x = self.moe_layer(x)
-        # Projecting back to vocabulary space for output
-        logits = self.output_layer(x)
-        return logits
+# Rest of your TransformerWithMoE class remains unchanged

    # Initialize the model with configurations matching Mixtral 8x7B
    model = TransformerWithMoE(
    @@ -96,6 +70,6 @@ def forward(self, x):
    hidden_dim=14336,      # Hidden dimensionality in the feed-forward network within the transformer
    n_heads=32,            # Number of attention heads
    num_experts=8,         # Number of experts in the MoE layer
-    vocab_size=32000,      # Vocabulary size for the embedding layer
-    num_experts_per_tok=2  # Number of experts activated per token
+    vocab_size=32000,      # Vocabulary size for the embedding layer
+    num_experts_per_tok=2  # Number of experts activated per token
)
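
The substantive fix in this revision is the einsum: the stacked expert outputs have shape [batch_size, num_experts, num_tokens, output_dim], so the original 'bte,bteh->bth' labels the expert axis as the token axis and errors whenever num_tokens differs from num_experts. Transposing to [batch_size, num_tokens, num_experts, output_dim] and contracting with 'bte,bteo->bto' gives the intended per-token weighted sum. A small self-contained check, with toy shapes assumed for illustration only:

import torch

# Toy shapes: batch, tokens, experts, output_dim.
b, t, e, o = 2, 5, 8, 16
gating_scores = torch.softmax(torch.randn(b, t, e), dim=2)  # [batch, tokens, experts]
expert_outputs = torch.randn(b, t, e, o)                    # already transposed: [batch, tokens, experts, out]

# The corrected contraction versus an explicit weighted sum over the expert axis.
via_einsum = torch.einsum('bte,bteo->bto', gating_scores, expert_outputs)
via_sum = (gating_scores.unsqueeze(-1) * expert_outputs).sum(dim=2)
print(torch.allclose(via_einsum, via_sum))  # True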
  3. ruvnet created this gist Dec 10, 2023.
    101 changes: 101 additions & 0 deletions MoE.py
    @@ -0,0 +1,101 @@
    """
    This model integrates the MoE concept within a Transformer architecture. Each token's
    representation is processed by a subset of experts, determined by the gating mechanism.
    This architecture allows for efficient and specialized handling of different aspects of the
    data, aiming for the adaptability and efficiency noted in the Mixtral 8x7B model's design
    philosophy. The model activates only a fraction of the available experts for each token,
    significantly reducing the computational resources needed compared to activating all experts
    for all tokens.
    """

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

# Define the Expert class
class Expert(nn.Module):
    # Each expert is a small feed-forward neural network
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Expert, self).__init__()
        # First fully connected layer (input to hidden)
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        # Second fully connected layer (hidden to output)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    # Forward pass for each expert
    def forward(self, x):
        # Activation function applied to the first layer's output
        x = F.relu(self.fc1(x))
        # The second layer's output is returned
        return self.fc2(x)

# Define the Gating Network class
class GatingNetwork(nn.Module):
    # Determines the gating mechanism for the experts
    def __init__(self, input_dim, num_experts):
        super(GatingNetwork, self).__init__()
        # Fully connected layer that outputs a probability distribution over experts
        self.gate = nn.Linear(input_dim, num_experts)

    # Forward pass for the gating network
    def forward(self, x):
        # Softmax function to create a probability distribution over experts
        return F.softmax(self.gate(x), dim=1)

# Define the Mixture of Experts Layer class
class MoELayer(nn.Module):
    # Combines all experts and the gating mechanism
    def __init__(self, input_dim, hidden_dim, output_dim, num_experts):
        super(MoELayer, self).__init__()
        # A list of expert networks
        self.experts = nn.ModuleList([Expert(input_dim, hidden_dim, output_dim) for _ in range(num_experts)])
        # The gating network
        self.gate = GatingNetwork(input_dim, num_experts)

    # Forward pass for the MoE layer
    def forward(self, x):
        # Get gating scores for each token in the sequence over all experts
        gating_scores = self.gate(x)  # Shape: [batch_size, num_tokens, num_experts]
        # Obtain the outputs from all experts
        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=1)  # Shape: [batch_size, num_experts, num_tokens, output_dim]
        # Weighted sum of expert outputs by gating scores, combining expert contributions
        output = torch.einsum('bte,bteh->bth', gating_scores, expert_outputs)  # Shape: [batch_size, num_tokens, output_dim]
        return output

# Define the overall Transformer model with integrated MoE
class TransformerWithMoE(nn.Module):
    # The main model that will include the transformer layers and the MoE layer
    def __init__(self, num_layers, dim, head_dim, hidden_dim, n_heads, num_experts, vocab_size, num_experts_per_tok):
        super(TransformerWithMoE, self).__init__()
        # Embedding layer for input tokens
        self.embedding = nn.Embedding(vocab_size, dim)
        # Define a list of transformer layers
        self.layers = nn.ModuleList([nn.TransformerEncoderLayer(d_model=dim, nhead=n_heads) for _ in range(num_layers)])
        # The MoE layer, placed after transformer layers, for demonstration purposes
        self.moe_layer = MoELayer(dim, hidden_dim, dim, num_experts)
        # Output layer to map from the transformer's feature space to the vocabulary space
        self.output_layer = nn.Linear(dim, vocab_size)

    def forward(self, x):
        # Embedding tokens
        x = self.embedding(x)
        # Passing through each transformer layer
        for layer in self.layers:
            x = layer(x)
        # Passing through the MoE layer
        x = self.moe_layer(x)
        # Projecting back to vocabulary space for output
        logits = self.output_layer(x)
        return logits

# Initialize the model with configurations matching Mixtral 8x7B
model = TransformerWithMoE(
    num_layers=32,         # Number of transformer layers
    dim=4096,              # Dimension of the model
    head_dim=128,          # Dimension of each head in the multi-head attention mechanisms
    hidden_dim=14336,      # Hidden dimensionality in the feed-forward network within the transformer
    n_heads=32,            # Number of attention heads
    num_experts=8,         # Number of experts in the MoE layer
    vocab_size=32000,      # Vocabulary size for the embedding layer
    num_experts_per_tok=2  # Number of experts activated per token
)
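
For reference, instantiating the Mixtral-like settings above produces a model with billions of parameters, so a toy-sized run is more practical for trying the code out. The sketch below is illustrative only: it assumes the classes in their latest revision (the Dec 10, 2023 top-k version at the top of this page), and the hyperparameters are made-up small values rather than anything from the gist.

# Toy configuration for a quick end-to-end check (assumed values, not the Mixtral 8x7B settings).
toy_model = TransformerWithMoE(
    num_layers=2, dim=64, head_dim=16, hidden_dim=128,
    n_heads=4, num_experts=4, vocab_size=1000, num_experts_per_tok=2
)
tokens = torch.randint(0, 1000, (2, 10))  # [batch_size, sequence_length] of token ids
logits = toy_model(tokens)
print(logits.shape)                       # torch.Size([2, 10, 1000]): per-token logits over the vocabulary
# Note: nn.TransformerEncoderLayer defaults to batch_first=False, so, as in the gist, the batch and
# sequence axes are effectively swapped inside the encoder layers; the shapes still flow end to end.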