Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ API and command-line option may change frequently.***
- [Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md)
- [ERNIE-Image](./docs/ernie_image.md)
- Image Edit Models
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
Expand Down Expand Up @@ -144,6 +145,7 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
- [🔥Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md)
- [ERNIE-Image](./docs/ernie_image.md)
- [LoRA](./docs/lora.md)
- [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
Expand Down
Binary file added assets/ernie_image/example.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/ernie_image/turbo_example.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
35 changes: 35 additions & 0 deletions docs/ernie_image.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# How to Use

You can run ERNIE-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or even less.

## Download weights

- Download ERNIE-Image-Turbo
- safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/diffusion_models
- gguf: https://huggingface.co/unsloth/ERNIE-Image-Turbo-GGUF/tree/main
- Download ERNIE-Image
- safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/diffusion_models
- gguf: https://huggingface.co/unsloth/ERNIE-Image-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/vae
- Download Ministral 3B
- safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/text_encoders
- gguf: https://huggingface.co/unsloth/Ministral-3-3B-Instruct-2512-GGUF/tree/main

## Examples

### ERNIE-Image-Turbo

```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\ernie-image-turbo.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\ministral-3-3b.safetensors -p "a lovely cat" --cfg-scale 1.0 --steps 8 -v --offload-to-cpu --diffusion-fa
```

<img width="256" alt="ERNIE-Image Turbo example" src="../assets/ernie_image/turbo_example.png" />

### ERNIE-Image

```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\ernie-image-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\ministral-3-3b.safetensors -p "a lovely cat" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa
```

<img width="256" alt="ERNIE-Image example" src="../assets/ernie_image/example.png" />
18 changes: 9 additions & 9 deletions src/auto_encoder_kl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -533,7 +533,7 @@ class AutoEncoderKLModel : public GGMLBlock {
const std::string& prefix = "")
: version(version), decode_only(decode_only), use_video_decoder(use_video_decoder) {
if (sd_version_is_dit(version)) {
if (sd_version_is_flux2(version)) {
if (sd_version_uses_flux2_vae(version)) {
dd_config.z_channels = 32;
embed_dim = 32;
} else {
Expand Down Expand Up @@ -578,7 +578,7 @@ class AutoEncoderKLModel : public GGMLBlock {

ggml_tensor* decode(GGMLRunnerContext* ctx, ggml_tensor* z) {
// z: [N, z_channels, h, w]
if (sd_version_is_flux2(version)) {
if (sd_version_uses_flux2_vae(version)) {
// [N, C*p*p, h, w] -> [N, C, h*p, w*p]
int64_t p = 2;

Expand Down Expand Up @@ -617,7 +617,7 @@ class AutoEncoderKLModel : public GGMLBlock {
auto quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["quant_conv"]);
z = quant_conv->forward(ctx, z); // [N, 2*embed_dim, h/8, w/8]
}
if (sd_version_is_flux2(version)) {
if (sd_version_uses_flux2_vae(version)) {
z = ggml_ext_chunk(ctx->ggml_ctx, z, 2, 2)[0];

// [N, C, H, W] -> [N, C*p*p, H/p, W/p]
Expand All @@ -640,7 +640,7 @@ class AutoEncoderKLModel : public GGMLBlock {

int get_encoder_output_channels() {
int factor = dd_config.double_z ? 2 : 1;
if (sd_version_is_flux2(version)) {
if (sd_version_uses_flux2_vae(version)) {
return dd_config.z_channels * 4;
}
return dd_config.z_channels * factor;
Expand Down Expand Up @@ -673,7 +673,7 @@ struct AutoEncoderKL : public VAE {
} else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) {
scale_factor = 0.3611f;
shift_factor = 0.1159f;
} else if (sd_version_is_flux2(version)) {
} else if (sd_version_uses_flux2_vae(version)) {
scale_factor = 1.0f;
shift_factor = 0.f;
}
Expand Down Expand Up @@ -747,7 +747,7 @@ struct AutoEncoderKL : public VAE {
}

sd::Tensor<float> vae_output_to_latents(const sd::Tensor<float>& vae_output, std::shared_ptr<RNG> rng) override {
if (sd_version_is_flux2(version)) {
if (sd_version_uses_flux2_vae(version)) {
return vae_output;
} else if (version == VERSION_SD1_PIX2PIX) {
return sd::ops::chunk(vae_output, 2, 2)[0];
Expand All @@ -758,7 +758,7 @@ struct AutoEncoderKL : public VAE {

std::pair<sd::Tensor<float>, sd::Tensor<float>> get_latents_mean_std(const sd::Tensor<float>& latents, int channel_dim) {
GGML_ASSERT(channel_dim >= 0 && static_cast<size_t>(channel_dim) < static_cast<size_t>(latents.dim()));
if (sd_version_is_flux2(version)) {
if (sd_version_uses_flux2_vae(version)) {
GGML_ASSERT(latents.shape()[channel_dim] == 128);
std::vector<int64_t> stats_shape(static_cast<size_t>(latents.dim()), 1);
stats_shape[static_cast<size_t>(channel_dim)] = latents.shape()[channel_dim];
Expand Down Expand Up @@ -804,7 +804,7 @@ struct AutoEncoderKL : public VAE {
}

sd::Tensor<float> diffusion_to_vae_latents(const sd::Tensor<float>& latents) override {
if (sd_version_is_flux2(version)) {
if (sd_version_uses_flux2_vae(version)) {
int channel_dim = 2;
auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim);
return (latents * std_tensor) / scale_factor + mean_tensor;
Expand All @@ -813,7 +813,7 @@ struct AutoEncoderKL : public VAE {
}

sd::Tensor<float> vae_to_diffusion_latents(const sd::Tensor<float>& latents) override {
if (sd_version_is_flux2(version)) {
if (sd_version_uses_flux2_vae(version)) {
int channel_dim = 2;
auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim);
return ((latents - mean_tensor) * scale_factor) / std_tensor;
Expand Down
11 changes: 10 additions & 1 deletion src/conditioner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1621,10 +1621,12 @@ struct LLMEmbedder : public Conditioner {
LLM::LLMArch arch = LLM::LLMArch::QWEN2_5_VL;
if (version == VERSION_FLUX2) {
arch = LLM::LLMArch::MISTRAL_SMALL_3_2;
} else if (sd_version_is_ernie_image(version)) {
arch = LLM::LLMArch::MINISTRAL_3_3B;
} else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
arch = LLM::LLMArch::QWEN3;
}
if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2) {
if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2 || arch == LLM::LLMArch::MINISTRAL_3_3B) {
tokenizer = std::make_shared<MistralTokenizer>();
} else {
tokenizer = std::make_shared<Qwen2Tokenizer>();
Expand Down Expand Up @@ -1867,6 +1869,13 @@ struct LLMEmbedder : public Conditioner {
prompt_attn_range.second = static_cast<int>(prompt.size());

prompt += "[/INST]";
} else if (sd_version_is_ernie_image(version)) {
prompt_template_encode_start_idx = 0;
out_layers = {25}; // -2

prompt_attn_range.first = 0;
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
} else if (sd_version_is_z_image(version)) {
prompt_template_encode_start_idx = 0;
out_layers = {35}; // -2
Expand Down
63 changes: 63 additions & 0 deletions src/diffusion_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <optional>
#include "anima.hpp"
#include "ernie_image.hpp"
#include "flux.hpp"
#include "mmdit.hpp"
#include "qwen_image.hpp"
Expand Down Expand Up @@ -516,4 +517,66 @@ struct ZImageModel : public DiffusionModel {
}
};

struct ErnieImageModel : public DiffusionModel {
std::string prefix;
ErnieImage::ErnieImageRunner ernie_image;

ErnieImageModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model")
: prefix(prefix), ernie_image(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
}

std::string get_desc() override {
return ernie_image.get_desc();
}

void alloc_params_buffer() override {
ernie_image.alloc_params_buffer();
}

void free_params_buffer() override {
ernie_image.free_params_buffer();
}

void free_compute_buffer() override {
ernie_image.free_compute_buffer();
}

void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
ernie_image.get_param_tensors(tensors, prefix);
}

size_t get_params_buffer_size() override {
return ernie_image.get_params_buffer_size();
}

void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
ernie_image.set_weight_adapter(adapter);
}

int64_t get_adm_in_channels() override {
return 768;
}

void set_flash_attention_enabled(bool enabled) {
ernie_image.set_flash_attention_enabled(enabled);
}

void set_circular_axes(bool circular_x, bool circular_y) override {
ernie_image.set_circular_axes(circular_x, circular_y);
}

sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
return ernie_image.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context));
}
};

#endif
Loading
Loading