Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ API and command-line option may change frequently.***
- [Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md)
- [ERNIE-Image](./docs/ernie_image.md)
- Image Edit Models
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
Expand Down Expand Up @@ -144,6 +145,7 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
- [🔥Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md)
- [ERNIE-Image](./docs/ernie_image.md)
- [LoRA](./docs/lora.md)
- [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
Expand Down
Binary file added assets/ernie_image/example.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/ernie_image/turbo_example.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
35 changes: 35 additions & 0 deletions docs/ernie_image.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# How to Use

You can run ERNIE-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or even less.

## Download weights

- Download ERNIE-Image-Turbo
- safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/diffusion_models
- gguf: https://huggingface.co/unsloth/ERNIE-Image-Turbo-GGUF/tree/main
- Download ERNIE-Image
- safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/diffusion_models
- gguf: https://huggingface.co/unsloth/ERNIE-Image-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/vae
- Download Ministral 3B
- safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/text_encoders
- gguf: https://huggingface.co/unsloth/Ministral-3-3B-Instruct-2512-GGUF/tree/main

## Examples

### ERNIE-Image-Turbo

```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\ernie-image-turbo.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\ministral-3-3b.safetensors -p "a lovely cat" --cfg-scale 1.0 --steps 8 -v --offload-to-cpu --diffusion-fa
```

<img width="256" alt="ERNIE-Image Turbo example" src="../assets/ernie_image/turbo_example.png" />

### ERNIE-Image

```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\ernie-image-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\ministral-3-3b.safetensors -p "a lovely cat" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa
```

<img width="256" alt="ERNIE-Image example" src="../assets/ernie_image/example.png" />
18 changes: 9 additions & 9 deletions src/auto_encoder_kl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -533,7 +533,7 @@ class AutoEncoderKLModel : public GGMLBlock {
const std::string& prefix = "")
: version(version), decode_only(decode_only), use_video_decoder(use_video_decoder) {
if (sd_version_is_dit(version)) {
if (sd_version_is_flux2(version)) {
if (sd_version_uses_flux2_vae(version)) {
dd_config.z_channels = 32;
embed_dim = 32;
} else {
Expand Down Expand Up @@ -578,7 +578,7 @@ class AutoEncoderKLModel : public GGMLBlock {

ggml_tensor* decode(GGMLRunnerContext* ctx, ggml_tensor* z) {
// z: [N, z_channels, h, w]
if (sd_version_is_flux2(version)) {
if (sd_version_uses_flux2_vae(version)) {
// [N, C*p*p, h, w] -> [N, C, h*p, w*p]
int64_t p = 2;

Expand Down Expand Up @@ -617,7 +617,7 @@ class AutoEncoderKLModel : public GGMLBlock {
auto quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["quant_conv"]);
z = quant_conv->forward(ctx, z); // [N, 2*embed_dim, h/8, w/8]
}
if (sd_version_is_flux2(version)) {
if (sd_version_uses_flux2_vae(version)) {
z = ggml_ext_chunk(ctx->ggml_ctx, z, 2, 2)[0];

// [N, C, H, W] -> [N, C*p*p, H/p, W/p]
Expand All @@ -640,7 +640,7 @@ class AutoEncoderKLModel : public GGMLBlock {

int get_encoder_output_channels() {
int factor = dd_config.double_z ? 2 : 1;
if (sd_version_is_flux2(version)) {
if (sd_version_uses_flux2_vae(version)) {
return dd_config.z_channels * 4;
}
return dd_config.z_channels * factor;
Expand Down Expand Up @@ -673,7 +673,7 @@ struct AutoEncoderKL : public VAE {
} else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) {
scale_factor = 0.3611f;
shift_factor = 0.1159f;
} else if (sd_version_is_flux2(version)) {
} else if (sd_version_uses_flux2_vae(version)) {
scale_factor = 1.0f;
shift_factor = 0.f;
}
Expand Down Expand Up @@ -747,7 +747,7 @@ struct AutoEncoderKL : public VAE {
}

sd::Tensor<float> vae_output_to_latents(const sd::Tensor<float>& vae_output, std::shared_ptr<RNG> rng) override {
if (sd_version_is_flux2(version)) {
if (sd_version_uses_flux2_vae(version)) {
return vae_output;
} else if (version == VERSION_SD1_PIX2PIX) {
return sd::ops::chunk(vae_output, 2, 2)[0];
Expand All @@ -758,7 +758,7 @@ struct AutoEncoderKL : public VAE {

std::pair<sd::Tensor<float>, sd::Tensor<float>> get_latents_mean_std(const sd::Tensor<float>& latents, int channel_dim) {
GGML_ASSERT(channel_dim >= 0 && static_cast<size_t>(channel_dim) < static_cast<size_t>(latents.dim()));
if (sd_version_is_flux2(version)) {
if (sd_version_uses_flux2_vae(version)) {
GGML_ASSERT(latents.shape()[channel_dim] == 128);
std::vector<int64_t> stats_shape(static_cast<size_t>(latents.dim()), 1);
stats_shape[static_cast<size_t>(channel_dim)] = latents.shape()[channel_dim];
Expand Down Expand Up @@ -804,7 +804,7 @@ struct AutoEncoderKL : public VAE {
}

sd::Tensor<float> diffusion_to_vae_latents(const sd::Tensor<float>& latents) override {
if (sd_version_is_flux2(version)) {
if (sd_version_uses_flux2_vae(version)) {
int channel_dim = 2;
auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim);
return (latents * std_tensor) / scale_factor + mean_tensor;
Expand All @@ -813,7 +813,7 @@ struct AutoEncoderKL : public VAE {
}

sd::Tensor<float> vae_to_diffusion_latents(const sd::Tensor<float>& latents) override {
if (sd_version_is_flux2(version)) {
if (sd_version_uses_flux2_vae(version)) {
int channel_dim = 2;
auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim);
return ((latents - mean_tensor) * scale_factor) / std_tensor;
Expand Down
11 changes: 10 additions & 1 deletion src/conditioner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1621,10 +1621,12 @@ struct LLMEmbedder : public Conditioner {
LLM::LLMArch arch = LLM::LLMArch::QWEN2_5_VL;
if (version == VERSION_FLUX2) {
arch = LLM::LLMArch::MISTRAL_SMALL_3_2;
} else if (sd_version_is_ernie_image(version)) {
arch = LLM::LLMArch::MINISTRAL_3_3B;
} else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
arch = LLM::LLMArch::QWEN3;
}
if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2) {
if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2 || arch == LLM::LLMArch::MINISTRAL_3_3B) {
tokenizer = std::make_shared<MistralTokenizer>();
} else {
tokenizer = std::make_shared<Qwen2Tokenizer>();
Expand Down Expand Up @@ -1867,6 +1869,13 @@ struct LLMEmbedder : public Conditioner {
prompt_attn_range.second = static_cast<int>(prompt.size());

prompt += "[/INST]";
} else if (sd_version_is_ernie_image(version)) {
prompt_template_encode_start_idx = 0;
out_layers = {25}; // -2

prompt_attn_range.first = 0;
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
} else if (sd_version_is_z_image(version)) {
prompt_template_encode_start_idx = 0;
out_layers = {35}; // -2
Expand Down
63 changes: 63 additions & 0 deletions src/diffusion_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <optional>
#include "anima.hpp"
#include "ernie_image.hpp"
#include "flux.hpp"
#include "mmdit.hpp"
#include "qwen_image.hpp"
Expand Down Expand Up @@ -516,4 +517,66 @@ struct ZImageModel : public DiffusionModel {
}
};

struct ErnieImageModel : public DiffusionModel {
std::string prefix;
ErnieImage::ErnieImageRunner ernie_image;

ErnieImageModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model")
: prefix(prefix), ernie_image(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
}

std::string get_desc() override {
return ernie_image.get_desc();
}

void alloc_params_buffer() override {
ernie_image.alloc_params_buffer();
}

void free_params_buffer() override {
ernie_image.free_params_buffer();
}

void free_compute_buffer() override {
ernie_image.free_compute_buffer();
}

void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
ernie_image.get_param_tensors(tensors, prefix);
}

size_t get_params_buffer_size() override {
return ernie_image.get_params_buffer_size();
}

void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
ernie_image.set_weight_adapter(adapter);
}

int64_t get_adm_in_channels() override {
return 768;
}

void set_flash_attention_enabled(bool enabled) {
ernie_image.set_flash_attention_enabled(enabled);
}

void set_circular_axes(bool circular_x, bool circular_y) override {
ernie_image.set_circular_axes(circular_x, circular_y);
}

sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
return ernie_image.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context));
}
};

#endif
Loading
Loading