Skip to content

Commit

Permalink
Some updates to Text 2 image pipeline (#944)
Browse files Browse the repository at this point in the history
**TODO:**
- [ ] Python API and sample
- [ ] Update doc strings
- [x] Update main README.md (PR
#930)
- [ ] Add sample with custom device mapping
- [ ] Experiment with reshape + compile as part of Ctor
- [x] Add LoRA (PR
#911)
- [X] Use std::optional for prompt2, prompt3 and maybe negative prompts
as well
- [X] Update
https://github.com/openvinotoolkit/openvino.genai/blob/master/src/docs/SUPPORTED_MODELS.md
with text 2 image generation models
  • Loading branch information
ilya-lavrenov authored Oct 13, 2024
1 parent 67bcef1 commit a4ac7e3
Show file tree
Hide file tree
Showing 16 changed files with 188 additions and 123 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/causal_lm_cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ jobs:
&& python samples\python\greedy_causal_lm\greedy_causal_lm.py .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\py.txt
- run: fc .\cpp.txt .\py.txt

cpp-beam_search_causal_lm-Qwen-7B-Chat:
cpp-greedy_causal_lm-Qwen-7B-Chat:
runs-on: ubuntu-20.04-16-cores
defaults:
run:
Expand Down Expand Up @@ -866,7 +866,7 @@ jobs:
Overall_Status:
name: ci/gha_overall_status_causal_lm
needs: [cpp-multinomial-greedy_causal_lm-ubuntu, cpp-beam_search_causal_lm-ubuntu, cpp-greedy_causal_lm-windows,
cpp-beam_search_causal_lm-Qwen-7B-Chat, cpp-beam_search_causal_lm-Qwen1_5-7B-Chat, cpp-beam_search_causal_lm-Phi-2,
cpp-greedy_causal_lm-Qwen-7B-Chat, cpp-beam_search_causal_lm-Qwen1_5-7B-Chat, cpp-beam_search_causal_lm-Phi-2,
cpp-beam_search_causal_lm-notus-7b-v1, cpp-speculative_decoding_lm-ubuntu, cpp-prompt_lookup_decoding_lm-ubuntu,
cpp-Phi-1_5, cpp-greedy_causal_lm-redpajama-3b-chat, cpp-chat_sample-ubuntu, cpp-continuous-batching-ubuntu,
visual_language_chat_sample-ubuntu,
Expand Down
11 changes: 0 additions & 11 deletions samples/cpp/text2image/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,6 @@ Prompt: `cyberpunk cityscape like Tokyo New York with tall buildings at dusk gol

![](./512x512.bmp)

## Supported models

Models can be downloaded from [HuggingFace](https://huggingface.co/models). This sample can run the following list of models, but not limited to:

- [botp/stable-diffusion-v1-5](https://huggingface.co/botp/stable-diffusion-v1-5)
- [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2)
- [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1)
- [dreamlike-art/dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0)
- [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7)
- [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
- [stabilityai/stable-diffusion-xl-base-0.9](https://huggingface.co/stabilityai/stable-diffusion-xl-base-0.9)

## Run with optional LoRA adapters

Expand Down
119 changes: 70 additions & 49 deletions samples/cpp/text2image/imwrite.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,60 +30,59 @@ unsigned char file[14] = {
};

unsigned char info[40] = {
40,
0,
0,
0, // info hd size
0,
0,
0,
0, // width
0,
0,
0,
0, // height
1,
0, // number color planes
24,
0, // bits per pixel
0,
0,
0,
0, // compression is none
0,
0,
0,
0, // image bits size
0x13,
0x0B,
0,
0, // horz resolution in pixel / m
0x13,
0x0B,
0,
0, // vert resolution (0x03C3 = 96 dpi, 0x0B13 = 72
// dpi)
0,
0,
0,
0, // #colors in palette
0,
0,
0,
0, // #important colors
};

}

void imwrite(const std::string& name, ov::Tensor image, bool convert_bgr2rgb) {
std::ofstream output_file(name, std::ofstream::binary);
OPENVINO_ASSERT(output_file.is_open(), "Failed to open the output BMP image path");
40,
0,
0,
0, // info hd size
0,
0,
0,
0, // width
0,
0,
0,
0, // height
1,
0, // number color planes
24,
0, // bits per pixel
0,
0,
0,
0, // compression is none
0,
0,
0,
0, // image bits size
0x13,
0x0B,
0,
0, // horz resolution in pixel / m
0x13,
0x0B,
0,
0, // vert resolution (0x03C3 = 96 dpi, 0x0B13 = 72
// dpi)
0,
0,
0,
0, // #colors in palette
0,
0,
0,
0, // #important colors
};

void imwrite_single_image(const std::string& name, ov::Tensor image, bool convert_bgr2rgb) {
const ov::Shape shape = image.get_shape();
const size_t width = shape[2], height = shape[1], channels = shape[3];
OPENVINO_ASSERT(image.get_element_type() == ov::element::u8 &&
shape.size() == 4 && shape[0] == 1 && channels == 3,
"Image of u8 type and [1, H, W, 3] shape is expected");
"Image of u8 type and [1, H, W, 3] shape is expected.",
"Given image has shape ", shape, " and element type ", image.get_element_type());

std::ofstream output_file(name, std::ofstream::binary);
OPENVINO_ASSERT(output_file.is_open(), "Failed to open the output BMP image path");

int padSize = static_cast<int>(4 - (width * channels) % 4) % 4;
int sizeData = static_cast<int>(width * height * channels + height * padSize);
Expand Down Expand Up @@ -131,3 +130,25 @@ void imwrite(const std::string& name, ov::Tensor image, bool convert_bgr2rgb) {
output_file.write(reinterpret_cast<const char*>(pad), padSize);
}
}

} // namespace


void imwrite(const std::string& name, ov::Tensor images, bool convert_bgr2rgb) {
return;

const ov::Shape shape = images.get_shape(), img_shape = {1, img_shape[1], img_shape[2], img_shape[3]};
uint8_t* img_data = images.data<uint8_t>();

std::cout << "Output shape " << shape << std::endl;
std::cout << "Image shape " << img_shape << std::endl;

for (int img_num = 0, num_images = shape[0], img_size = ov::shape_size(img_shape); img_num < num_images; ++img_num, img_data += img_size) {
char img_name[25];
sprintf(img_name, name.c_str(), img_num);

std::cout << "Try to write image" << img_num << " " << img_name << std::endl;
ov::Tensor image(images.get_element_type(), img_shape, img_data);
// imwrite_single_image(img_name, image, true);
}
}
8 changes: 4 additions & 4 deletions samples/cpp/text2image/imwrite.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
#include "openvino/runtime/tensor.hpp"

/**
* @brief Writes image to file
* @param name File name
* @param image Image tensor
* @brief Writes multiple images (depending on `images` tensor batch size) to BMP file(s)
* @param name File name or pattern to use to write images
* @param images Image(s) tensor
* @param convert_bgr2rgb Convert BGR to RGB
*/
void imwrite(const std::string& name, ov::Tensor image, bool convert_bgr2rgb);
void imwrite(const std::string& name, ov::Tensor images, bool convert_bgr2rgb);
32 changes: 2 additions & 30 deletions samples/cpp/text2image/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,35 +5,6 @@

#include "imwrite.hpp"

namespace {

// Dumps the generated batch to disk: a batch of exactly one image is written to
// "image.bmp"; otherwise image i is written to "image_<i>.bmp".
void imwrite_output_imgs(const ov::Tensor& output) {
    const ov::Shape batch_shape = output.get_shape();
    const auto batch_size = batch_shape[0];

    // Single image: the tensor can be handed to imwrite as is.
    if (batch_size == 1) {
        imwrite("image.bmp", output, true);
        return;
    }

    // Number of elements occupied by one image of the batch.
    const size_t elements_per_image = output.get_size() / batch_size;

    // One staging tensor is reused for every image; each slice is copied into it before writing.
    ov::Tensor staging(output.get_element_type(),
                       ov::Shape{1, batch_shape[1], batch_shape[2], batch_shape[3]});
    uint8_t* const src = output.data<uint8_t>();
    uint8_t* const dst = staging.data<uint8_t>();

    int index = 0;
    while (index < batch_size) {
        std::memcpy(dst, src + elements_per_image * index, elements_per_image * sizeof(uint8_t));

        char file_name[25];
        sprintf(file_name, "image_%d.bmp", index);

        imwrite(file_name, staging, true);
        ++index;
    }
}

} //namespace

int32_t main(int32_t argc, char* argv[]) try {
OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " <MODEL_DIR> '<PROMPT>'");

Expand All @@ -47,7 +18,8 @@ int32_t main(int32_t argc, char* argv[]) try {
ov::genai::num_inference_steps(20),
ov::genai::num_images_per_prompt(1));

imwrite_output_imgs(image);
// writes `num_images_per_prompt` images by pattern name
imwrite("image_%d.bmp", image, true);

return EXIT_SUCCESS;
} catch (const std::exception& error) {
Expand Down
7 changes: 7 additions & 0 deletions src/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ function(ov_genai_build_jinja2cpp)
option(RAPIDJSON_BUILD_DOC "Build rapidjson documentation." OFF)

add_subdirectory("${jinja2cpp_SOURCE_DIR}" "${jinja2cpp_BINARY_DIR}" EXCLUDE_FROM_ALL)

if(CMAKE_COMPILER_IS_GNUCXX OR OV_COMPILER_IS_CLANG OR (OV_COMPILER_IS_INTEL_LLVM AND UNIX))
target_compile_options(jinja2cpp PRIVATE -Wno-undef)
endif()
if(SUGGEST_OVERRIDE_SUPPORTED)
target_compile_options(jinja2cpp PRIVATE -Wno-suggest-override)
endif()
endif()
endfunction()

Expand Down
12 changes: 6 additions & 6 deletions src/cpp/include/openvino/genai/text2image/pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,8 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline {
// SD XL: prompt2 and negative_prompt2
// FLUX: prompt2 (prompt if prompt2 is not defined explicitly)
// SD 3: prompt2, prompt3 (with fallback to prompt) and negative_prompt2, negative_prompt3
std::string prompt2, prompt3;
std::string negative_prompt, negative_prompt2, negative_prompt3;
std::optional<std::string> prompt_2 = std::nullopt, prompt_3 = std::nullopt;
std::string negative_prompt, negative_prompt_2, negative_prompt_3;

size_t num_images_per_prompt = 1;

Expand Down Expand Up @@ -165,12 +165,12 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline {
// Generation config properties
//

static constexpr ov::Property<std::string> prompt2{"prompt2"};
static constexpr ov::Property<std::string> prompt3{"prompt3"};
static constexpr ov::Property<std::string> prompt_2{"prompt_2"};
static constexpr ov::Property<std::string> prompt_3{"prompt_3"};

static constexpr ov::Property<std::string> negative_prompt{"negative_prompt"};
static constexpr ov::Property<std::string> negative_prompt2{"negative_prompt2"};
static constexpr ov::Property<std::string> negative_prompt3{"negative_prompt3"};
static constexpr ov::Property<std::string> negative_prompt_2{"negative_prompt_2"};
static constexpr ov::Property<std::string> negative_prompt_3{"negative_prompt_3"};

static constexpr ov::Property<size_t> num_images_per_prompt{"num_images_per_prompt"};
static constexpr ov::Property<float> guidance_scale{"guidance_scale"};
Expand Down
4 changes: 3 additions & 1 deletion src/cpp/src/text2image/diffusion_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,9 @@ class Text2ImagePipeline::DiffusionPipeline {
protected:
virtual void initialize_generation_config(const std::string& class_name) = 0;

virtual void check_inputs(const int height, const int width) const = 0;
virtual void check_image_size(const int height, const int width) const = 0;

virtual void check_inputs(const GenerationConfig& generation_config) const = 0;

std::shared_ptr<IScheduler> m_scheduler;
GenerationConfig m_generation_config;
Expand Down
8 changes: 4 additions & 4 deletions src/cpp/src/text2image/models/clip_text_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,16 +94,16 @@ ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string

if (do_classifier_free_guidance) {
perform_tokenization(neg_prompt,
ov::Tensor(input_ids, {current_batch_idx , 0},
{current_batch_idx + 1, m_config.max_position_embeddings}));
ov::Tensor(input_ids, {current_batch_idx , 0},
{current_batch_idx + 1, m_config.max_position_embeddings}));
++current_batch_idx;
} else {
// Negative prompt is ignored when --guidanceScale < 1.0
}

perform_tokenization(pos_prompt,
ov::Tensor(input_ids, {current_batch_idx , 0},
{current_batch_idx + 1, m_config.max_position_embeddings}));
ov::Tensor(input_ids, {current_batch_idx , 0},
{current_batch_idx + 1, m_config.max_position_embeddings}));

// text embeddings
m_request.set_tensor("input_ids", input_ids);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,16 +83,16 @@ ov::Tensor CLIPTextModelWithProjection::infer(const std::string& pos_prompt, con

if (do_classifier_free_guidance) {
perform_tokenization(neg_prompt,
ov::Tensor(input_ids, {current_batch_idx , 0},
{current_batch_idx + 1, m_config.max_position_embeddings}));
ov::Tensor(input_ids, {current_batch_idx , 0},
{current_batch_idx + 1, m_config.max_position_embeddings}));
++current_batch_idx;
} else {
// Negative prompt is ignored when --guidanceScale < 1.0
}

perform_tokenization(pos_prompt,
ov::Tensor(input_ids, {current_batch_idx , 0},
{current_batch_idx + 1, m_config.max_position_embeddings}));
ov::Tensor(input_ids, {current_batch_idx , 0},
{current_batch_idx + 1, m_config.max_position_embeddings}));

// text embeddings
m_request.set_tensor("input_ids", input_ids);
Expand Down
4 changes: 2 additions & 2 deletions src/cpp/src/text2image/schedulers/lcm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ LCMScheduler::LCMScheduler(const std::string scheduler_config_path) :
LCMScheduler::LCMScheduler(const Config& scheduler_config)
: m_config(scheduler_config),
m_seed(42),
m_gen(100, std::mt19937(m_seed)),
m_gen(m_seed),
m_normal(0.0f, 1.0f) {

m_sigma_data = 0.5f; // Default: 0.5
Expand Down Expand Up @@ -191,7 +191,7 @@ std::map<std::string, ov::Tensor> LCMScheduler::step(ov::Tensor noise_pred, ov::

if (inference_step != m_num_inference_steps - 1) {
for (std::size_t i = 0; i < batch_size * latent_size; ++i) {
float gen_noise = m_normal(m_gen[i / latent_size]);
float gen_noise = m_normal(m_gen);
prev_sample_data[i] = alpha_prod_t_prev_sqrt * denoised_data[i] + beta_prod_t_prev_sqrt * gen_noise;
}
} else {
Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/text2image/schedulers/lcm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ class LCMScheduler : public IScheduler {
std::vector<int64_t> m_timesteps;

uint32_t m_seed;
std::vector<std::mt19937> m_gen;
std::mt19937 m_gen;
std::normal_distribution<float> m_normal;

std::vector<float> threshold_sample(const std::vector<float>& flat_sample);
Expand Down
Loading

0 comments on commit a4ac7e3

Please sign in to comment.