add z-image support

leejet · leejet · commit d3f1bf4dd6bb · 2025-11-30T03:10:24.000+08:00
diff --git a/conditioner.hpp b/conditioner.hpp
@@ -1638,6 +1638,8 @@ struct LLMEmbedder : public Conditioner {
         LLM::LLMArch arch = LLM::LLMArch::QWEN2_5_VL;
         if (sd_version_is_flux2(version)) {
             arch = LLM::LLMArch::MISTRAL_SMALL_3_2;
+        } else if (sd_version_is_z_image(version)) {
+            arch = LLM::LLMArch::QWEN3;
         }
         if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2) {
             tokenizer = std::make_shared<LLM::MistralTokenizer>();
@@ -1785,9 +1787,9 @@ struct LLMEmbedder : public Conditioner {
             prompt = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n";
             prompt += img_prompt;
 
-            prompt_attn_range.first = prompt.size();
+            prompt_attn_range.first = static_cast<int>(prompt.size());
             prompt += conditioner_params.text;
-            prompt_attn_range.second = prompt.size();
+            prompt_attn_range.second = static_cast<int>(prompt.size());
 
             prompt += "<|im_end|>\n<|im_start|>assistant\n";
         } else if (sd_version_is_flux2(version)) {
@@ -1796,19 +1798,30 @@ struct LLMEmbedder : public Conditioner {
 
             prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
 
-            prompt_attn_range.first = prompt.size();
+            prompt_attn_range.first = static_cast<int>(prompt.size());
             prompt += conditioner_params.text;
-            prompt_attn_range.second = prompt.size();
+            prompt_attn_range.second = static_cast<int>(prompt.size());
 
             prompt += "[/INST]";
+        } else if (sd_version_is_z_image(version)) {
+            prompt_template_encode_start_idx = 0;
+            out_layers                       = {35};  // -2
+
+            prompt = "<|im_start|>user\n";
+
+            prompt_attn_range.first = static_cast<int>(prompt.size());
+            prompt += conditioner_params.text;
+            prompt_attn_range.second = static_cast<int>(prompt.size());
+
+            prompt += "<|im_end|>\n<|im_start|>assistant\n";
         } else {
             prompt_template_encode_start_idx = 34;
 
             prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n";
 
-            prompt_attn_range.first = prompt.size();
+            prompt_attn_range.first = static_cast<int>(prompt.size());
             prompt += conditioner_params.text;
-            prompt_attn_range.second = prompt.size();
+            prompt_attn_range.second = static_cast<int>(prompt.size());
 
             prompt += "<|im_end|>\n<|im_start|>assistant\n";
         }
diff --git a/diffusion_model.hpp b/diffusion_model.hpp
@@ -6,6 +6,7 @@
 #include "qwen_image.hpp"
 #include "unet.hpp"
 #include "wan.hpp"
+#include "z_image.hpp"
 
 struct DiffusionParams {
     struct ggml_tensor* x                     = nullptr;
@@ -357,4 +358,67 @@ struct QwenImageModel : public DiffusionModel {
     }
 };
 
+struct ZImageModel : public DiffusionModel {  // DiffusionModel implementation that forwards each call to a ZImage::ZImageRunner
+    std::string prefix;
+    ZImage::ZImageRunner z_image;
+    // Keeps a copy of the tensor-name prefix and builds the runner from the same arguments.
+    ZImageModel(ggml_backend_t backend,
+                bool offload_params_to_cpu,
+                const String2TensorStorage& tensor_storage_map = {},
+                const std::string prefix                       = "model.diffusion_model",
+                SDVersion version                              = VERSION_Z_IMAGE)
+        : prefix(prefix), z_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
+    }
+    // The description, buffer-management and adapter methods below delegate straight to z_image.
+    std::string get_desc() override {
+        return z_image.get_desc();
+    }
+
+    void alloc_params_buffer() override {
+        z_image.alloc_params_buffer();
+    }
+
+    void free_params_buffer() override {
+        z_image.free_params_buffer();
+    }
+
+    void free_compute_buffer() override {
+        z_image.free_compute_buffer();
+    }
+    // Collects the runner's parameter tensors into `tensors`, passing the prefix stored at construction.
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
+        z_image.get_param_tensors(tensors, prefix);
+    }
+
+    size_t get_params_buffer_size() override {
+        return z_image.get_params_buffer_size();
+    }
+
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        z_image.set_weight_adapter(adapter);
+    }
+    // Returns a fixed 768 rather than querying the runner. NOTE(review): confirm this value is what callers expect for z-image.
+    int64_t get_adm_in_channels() override {
+        return 768;
+    }
+    // NOTE(review): the only method here without `override` - confirm how the base class declares it.
+    void set_flash_attn_enabled(bool enabled) {
+        z_image.set_flash_attention_enabled(enabled);
+    }
+    // Unpacks x, timesteps, context and ref_latents from diffusion_params for the runner; increase_ref_index is always true here.
+    void compute(int n_threads,
+                 DiffusionParams diffusion_params,
+                 struct ggml_tensor** output     = nullptr,
+                 struct ggml_context* output_ctx = nullptr) override {
+        return z_image.compute(n_threads,
+                               diffusion_params.x,
+                               diffusion_params.timesteps,
+                               diffusion_params.context,
+                               diffusion_params.ref_latents,
+                               true,  // increase_ref_index
+                               output,
+                               output_ctx);
+    }
+};
+
 #endif
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
@@ -1653,8 +1653,14 @@ void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy)
     }
 }
 
+#include "z_image.hpp"
+
 int main(int argc, const char* argv[]) {
     SDParams params;
+    // params.verbose = true;
+    // sd_set_log_callback(sd_log_cb, (void*)&params);
+    // ZImage::ZImageRunner::load_from_file_and_test(argv[1]);
+    // return 1;
     parse_args(argc, argv, params);
     preview_path = params.preview_path;
     if (params.video_frames > 4) {
diff --git a/llm.hpp b/llm.hpp
@@ -1,5 +1,5 @@
-#ifndef __QWENVL_HPP__
-#define __QWENVL_HPP__
+#ifndef __LLM_HPP__
+#define __LLM_HPP__
 
 #include <algorithm>
 #include <fstream>
@@ -469,12 +469,14 @@ namespace LLM {
 
     enum class LLMArch {
         QWEN2_5_VL,
+        QWEN3,  // arch chosen by LLMEmbedder for z-image versions; llm_arch_to_str lists "qwen3" at the same position
         MISTRAL_SMALL_3_2,
         ARCH_COUNT,
     };
 
     static const char* llm_arch_to_str[] = {
         "qwen2.5vl",
+        "qwen3",  // sits at the same position as LLMArch::QWEN3 in the enum
         "mistral_small3.2",
     };
 
@@ -501,6 +503,7 @@ namespace LLM {
         int64_t num_kv_heads      = 4;
         int64_t head_dim          = 128;
         bool qkv_bias             = true;
+        bool qk_norm              = false;
         int64_t vocab_size        = 152064;
         float rms_norm_eps        = 1e-06f;
         LLMVisionParams vision;
@@ -813,14 +816,19 @@ namespace LLM {
         int64_t head_dim;
         int64_t num_heads;
         int64_t num_kv_heads;
+        bool qk_norm;
 
     public:
         Attention(const LLMParams& params)
-            : num_heads(params.num_heads), num_kv_heads(params.num_kv_heads), head_dim(params.head_dim), arch(params.arch) {
+            : arch(params.arch), num_heads(params.num_heads), num_kv_heads(params.num_kv_heads), head_dim(params.head_dim), qk_norm(params.qk_norm) {
             blocks["q_proj"] = std::make_shared<Linear>(params.hidden_size, num_heads * head_dim, params.qkv_bias);
             blocks["k_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
             blocks["v_proj"] = std::make_shared<Linear>(params.hidden_size, num_kv_heads * head_dim, params.qkv_bias);
             blocks["o_proj"] = std::make_shared<Linear>(num_heads * head_dim, params.hidden_size, false);
+            if (params.qk_norm) {  // q_norm/k_norm are registered only when qk_norm is set; forward() fetches them under the same flag
+                blocks["q_norm"] = std::make_shared<RMSNorm>(head_dim, params.rms_norm_eps);
+                blocks["k_norm"] = std::make_shared<RMSNorm>(head_dim, params.rms_norm_eps);
+            }
         }
 
         struct ggml_tensor* forward(GGMLRunnerContext* ctx,
@@ -842,9 +850,20 @@ namespace LLM {
             k = ggml_reshape_4d(ctx->ggml_ctx, k, head_dim, num_kv_heads, n_token, N);  // [N, n_token, num_kv_heads, head_dim]
             v = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_kv_heads, n_token, N);  // [N, n_token, num_kv_heads, head_dim]
 
+            if (qk_norm) {
+                auto q_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm"]);
+                auto k_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["k_norm"]);
+
+                q = q_norm->forward(ctx, q);
+                k = k_norm->forward(ctx, k);
+            }
+
             if (arch == LLMArch::MISTRAL_SMALL_3_2) {
                 q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NORMAL, 131072, 1000000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
                 k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NORMAL, 131072, 1000000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
+            } else if (arch == LLMArch::QWEN3) {
+                q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NORMAL, 151936, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
+                k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NORMAL, 151936, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
             } else {
                 int sections[4] = {16, 24, 24, 0};
                 q               = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
@@ -1063,6 +1082,17 @@ namespace LLM {
                 params.qkv_bias          = false;
                 params.vocab_size        = 131072;
                 params.rms_norm_eps      = 1e-5f;
+            } else if (arch == LLMArch::QWEN3) {
+                params.num_layers        = 36;
+                params.hidden_size       = 2560;
+                params.intermediate_size = 9728;
+                params.head_dim          = 128;
+                params.num_heads         = 32;
+                params.num_kv_heads      = 8;
+                params.qkv_bias          = false;
+                params.qk_norm           = true;
+                params.vocab_size        = 151936;
+                params.rms_norm_eps      = 1e-6f;
             }
             bool have_vision_weight = false;
             bool llama_cpp_style    = false;
@@ -1132,7 +1162,7 @@ namespace LLM {
             }
 
             int64_t n_tokens = input_ids->ne[0];
-            if (params.arch == LLMArch::MISTRAL_SMALL_3_2) {
+            if (params.arch == LLMArch::MISTRAL_SMALL_3_2 || params.arch == LLMArch::QWEN3) {
                 input_pos_vec.resize(n_tokens);
                 for (int i = 0; i < n_tokens; ++i) {
                     input_pos_vec[i] = i;
@@ -1420,7 +1450,8 @@ namespace LLM {
 
             struct ggml_context* work_ctx = ggml_init(params);
             GGML_ASSERT(work_ctx != nullptr);
-            bool test_mistral          = true;
+            bool test_mistral          = false;
+            bool test_qwen3            = true;
             bool test_vit              = false;
             bool test_decoder_with_vit = false;
 
@@ -1455,9 +1486,9 @@ namespace LLM {
                 std::pair<int, int> prompt_attn_range;
                 std::string text = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n";
                 text += img_prompt;
-                prompt_attn_range.first = text.size();
+                prompt_attn_range.first = static_cast<int>(text.size());
                 text += "change 'flux.cpp' to 'edit.cpp'";
-                prompt_attn_range.second = text.size();
+                prompt_attn_range.second = static_cast<int>(text.size());
                 text += "<|im_end|>\n<|im_start|>assistant\n";
 
                 auto tokens_and_weights     = tokenize(text, prompt_attn_range, 0, false);
@@ -1496,9 +1527,9 @@ namespace LLM {
             } else if (test_mistral) {
                 std::pair<int, int> prompt_attn_range;
                 std::string text        = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
-                prompt_attn_range.first = text.size();
+                prompt_attn_range.first = static_cast<int>(text.size());
                 text += "a lovely cat";
-                prompt_attn_range.second = text.size();
+                prompt_attn_range.second = static_cast<int>(text.size());
                 text += "[/INST]";
                 auto tokens_and_weights     = tokenize(text, prompt_attn_range, 0, false);
                 std::vector<int>& tokens    = std::get<0>(tokens_and_weights);
@@ -1514,14 +1545,37 @@ namespace LLM {
                 model.compute(8, input_ids, {}, {10, 20, 30}, &out, work_ctx);
                 int t1 = ggml_time_ms();
 
+                print_ggml_tensor(out);
+                LOG_DEBUG("llm test done in %dms", t1 - t0);
+            } else if (test_qwen3) {
+                std::pair<int, int> prompt_attn_range;
+                std::string text        = "<|im_start|>user\n";
+                prompt_attn_range.first = static_cast<int>(text.size());
+                text += "a lovely cat";
+                prompt_attn_range.second = static_cast<int>(text.size());
+                text += "<|im_end|>\n<|im_start|>assistant\n";
+                auto tokens_and_weights     = tokenize(text, prompt_attn_range, 0, false);
+                std::vector<int>& tokens    = std::get<0>(tokens_and_weights);
+                std::vector<float>& weights = std::get<1>(tokens_and_weights);
+                for (auto token : tokens) {
+                    printf("%d ", token);
+                }
+                printf("\n");
+                auto input_ids          = vector_to_ggml_tensor_i32(work_ctx, tokens);
+                struct ggml_tensor* out = nullptr;
+
+                int t0 = ggml_time_ms();
+                model.compute(8, input_ids, {}, {35}, &out, work_ctx);
+                int t1 = ggml_time_ms();
+
                 print_ggml_tensor(out);
                 LOG_DEBUG("llm test done in %dms", t1 - t0);
             } else {
                 std::pair<int, int> prompt_attn_range;
                 std::string text        = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n";
-                prompt_attn_range.first = text.size();
+                prompt_attn_range.first = static_cast<int>(text.size());
                 text += "a lovely cat";
-                prompt_attn_range.second = text.size();
+                prompt_attn_range.second = static_cast<int>(text.size());
                 text += "<|im_end|>\n<|im_start|>assistant\n";
                 auto tokens_and_weights     = tokenize(text, prompt_attn_range, 0, false);
                 std::vector<int>& tokens    = std::get<0>(tokens_and_weights);
@@ -1563,7 +1617,7 @@ namespace LLM {
                 }
             }
 
-            LLMArch arch = LLMArch::MISTRAL_SMALL_3_2;
+            LLMArch arch = LLMArch::QWEN3;
 
             std::shared_ptr<LLMEmbedder> llm = std::make_shared<LLMEmbedder>(arch,
                                                                              backend,
@@ -1587,6 +1641,6 @@ namespace LLM {
             llm->test();
         }
     };
-};  // Qwen
+};  // LLM
 
-#endif  // __QWENVL_HPP__
+#endif  // __LLM_HPP__
diff --git a/mmdit.hpp b/mmdit.hpp
@@ -101,10 +101,14 @@ struct TimestepEmbedder : public GGMLBlock {
 
 public:
     TimestepEmbedder(int64_t hidden_size,
-                     int64_t frequency_embedding_size = 256)
+                     int64_t frequency_embedding_size = 256,
+                     int64_t out_channels             = 0)
         : frequency_embedding_size(frequency_embedding_size) {
+        if (out_channels <= 0) {  // non-positive (including the default 0) falls back to hidden_size, matching the previous mlp.2 shape
+            out_channels = hidden_size;
+        }
         blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size, true, true));
-        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
+        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, out_channels, true, true));
     }
 
     struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* t) {
diff --git a/model.cpp b/model.cpp
@@ -1067,6 +1067,9 @@ SDVersion ModelLoader::get_sd_version() {
             if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) {
                 return VERSION_FLUX2;
             }
+            if (tensor_storage.name.find("model.diffusion_model.cap_embedder.0.weight") != std::string::npos) {
+                return VERSION_Z_IMAGE;
+            }
             if (tensor_storage.name.find("model.diffusion_model.blocks.0.cross_attn.norm_k.weight") != std::string::npos) {
                 is_wan = true;
             }
diff --git a/model.h b/model.h
@@ -44,6 +44,7 @@ enum SDVersion {
     VERSION_WAN2_2_TI2V,
     VERSION_QWEN_IMAGE,
     VERSION_FLUX2,
+    VERSION_Z_IMAGE,
     VERSION_COUNT,
 };
 
@@ -116,6 +117,13 @@ static inline bool sd_version_is_qwen_image(SDVersion version) {
     return false;
 }
 
+static inline bool sd_version_is_z_image(SDVersion version) {  // true only when version == VERSION_Z_IMAGE
+    if (version == VERSION_Z_IMAGE) {
+        return true;
+    }
+    return false;
+}
+
 static inline bool sd_version_is_inpaint(SDVersion version) {
     if (version == VERSION_SD1_INPAINT ||
         version == VERSION_SD2_INPAINT ||
@@ -132,7 +140,8 @@ static inline bool sd_version_is_dit(SDVersion version) {
         sd_version_is_flux2(version) ||
         sd_version_is_sd3(version) ||
         sd_version_is_wan(version) ||
-        sd_version_is_qwen_image(version)) {
+        sd_version_is_qwen_image(version) ||
+        sd_version_is_z_image(version)) {
         return true;
     }
     return false;
diff --git a/rope.hpp b/rope.hpp
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
diff --git a/z_image.hpp b/z_image.hpp

Original file line number	Diff line number	Diff line change
`@@ -1653,8 +1653,14 @@ void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy)`
`1653`	`1653`	`}`
`1654`	`1654`	`}`
`1655`	`1655`
	`1656`	`+#include "z_image.hpp"`
	`1657`	`+`
`1656`	`1658`	`int main(int argc, const char* argv[]) {`
`1657`	`1659`	`SDParams params;`
	`1660`	`+ // params.verbose = true;`
	`1661`	`+ // sd_set_log_callback(sd_log_cb, (void*)&params);`
	`1662`	`+ // ZImage::ZImageRunner::load_from_file_and_test(argv[1]);`
	`1663`	`+ // return 1;`
`1658`	`1664`	`parse_args(argc, argv, params);`
`1659`	`1665`	`preview_path = params.preview_path;`
`1660`	`1666`	`if (params.video_frames > 4) {`
Original file line number	Diff line number	Diff line change
`@@ -1067,6 +1067,9 @@ SDVersion ModelLoader::get_sd_version() {`
`1067`	`1067`	`if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) {`
`1068`	`1068`	`return VERSION_FLUX2;`
`1069`	`1069`	`}`
	`1070`	`+ if (tensor_storage.name.find("model.diffusion_model.cap_embedder.0.weight") != std::string::npos) {`
	`1071`	`+ return VERSION_Z_IMAGE;`
	`1072`	`+ }`
`1070`	`1073`	`if (tensor_storage.name.find("model.diffusion_model.blocks.0.cross_attn.norm_k.weight") != std::string::npos) {`
`1071`	`1074`	`is_wan = true;`
`1072`	`1075`	`}`