From d9075be494ac9796ed096dfb7741c36c33cba813 Mon Sep 17 00:00:00 2001
From: YiYi Xu
Date: Mon, 27 Nov 2023 06:52:36 -1000
Subject: [PATCH] [load_textual_inversion]: allow multiple tokens (#5837)

Co-authored-by: yiyixuxu
---
 src/diffusers/loaders/textual_inversion.py | 16 +++++++-
 tests/pipelines/test_pipelines.py          | 48 ++++++++++++++++++++++
 2 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/loaders/textual_inversion.py b/src/diffusers/loaders/textual_inversion.py
index e36f03437a45..d03bd74d5250 100644
--- a/src/diffusers/loaders/textual_inversion.py
+++ b/src/diffusers/loaders/textual_inversion.py
@@ -189,7 +189,7 @@ def _check_text_inv_inputs(self, tokenizer, text_encoder, pretrained_model_name_
                 f" `{self.load_textual_inversion.__name__}`"
             )

-        if len(pretrained_model_name_or_paths) != len(tokens):
+        if len(pretrained_model_name_or_paths) > 1 and len(pretrained_model_name_or_paths) != len(tokens):
             raise ValueError(
                 f"You have passed a list of models of length {len(pretrained_model_name_or_paths)}, and list of tokens of length {len(tokens)} "
                 f"Make sure both lists have the same length."
             )
@@ -382,7 +382,9 @@ def load_textual_inversion(
             if not isinstance(pretrained_model_name_or_path, list)
             else pretrained_model_name_or_path
         )
-        tokens = len(pretrained_model_name_or_paths) * [token] if (isinstance(token, str) or token is None) else token
+        tokens = [token] if not isinstance(token, list) else token
+        if tokens[0] is None:
+            tokens = tokens * len(pretrained_model_name_or_paths)

         # 3. Check inputs
         self._check_text_inv_inputs(tokenizer, text_encoder, pretrained_model_name_or_paths, tokens)
@@ -390,6 +392,16 @@
         # 4. Load state dicts of textual embeddings
         state_dicts = load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs)

+        # 4.1 Handle the special case when state_dict is a tensor that contains n embeddings for n tokens
+        if len(tokens) > 1 and len(state_dicts) == 1:
+            if isinstance(state_dicts[0], torch.Tensor):
+                state_dicts = list(state_dicts[0])
+                if len(tokens) != len(state_dicts):
+                    raise ValueError(
+                        f"You have passed a state_dict that contains {len(state_dicts)} embeddings, and a list of tokens of length {len(tokens)}. "
+                        f"Make sure both have the same length."
+                    )
+
         # 4. Retrieve tokens and embeddings
         tokens, embeddings = self._retrieve_tokens_and_embeddings(tokens, state_dicts, tokenizer)

diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py
index d812ce0ccb95..32ae81ddc2d8 100644
--- a/tests/pipelines/test_pipelines.py
+++ b/tests/pipelines/test_pipelines.py
@@ -792,6 +792,54 @@ def test_text_inversion_download(self):
         out = pipe(prompt, num_inference_steps=1, output_type="numpy").images
         assert out.shape == (1, 128, 128, 3)

+    def test_text_inversion_multi_tokens(self):
+        pipe1 = StableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+        )
+        pipe1 = pipe1.to(torch_device)
+
+        token1, token2 = "<*>", "<**>"
+        ten1 = torch.ones((32,))
+        ten2 = torch.ones((32,)) * 2
+
+        num_tokens = len(pipe1.tokenizer)
+
+        pipe1.load_textual_inversion(ten1, token=token1)
+        pipe1.load_textual_inversion(ten2, token=token2)
+        emb1 = pipe1.text_encoder.get_input_embeddings().weight
+
+        pipe2 = StableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+        )
+        pipe2 = pipe2.to(torch_device)
+        pipe2.load_textual_inversion([ten1, ten2], token=[token1, token2])
+        emb2 = pipe2.text_encoder.get_input_embeddings().weight
+
+        pipe3 = StableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+        )
+        pipe3 = pipe3.to(torch_device)
+        pipe3.load_textual_inversion(torch.stack([ten1, ten2], dim=0), token=[token1, token2])
+        emb3 = pipe3.text_encoder.get_input_embeddings().weight
+
+        assert len(pipe1.tokenizer) == len(pipe2.tokenizer) == len(pipe3.tokenizer) == num_tokens + 2
+        assert (
+            pipe1.tokenizer.convert_tokens_to_ids(token1)
+            == pipe2.tokenizer.convert_tokens_to_ids(token1)
+            == pipe3.tokenizer.convert_tokens_to_ids(token1)
+            == num_tokens
+        )
+        assert (
+            pipe1.tokenizer.convert_tokens_to_ids(token2)
+            == pipe2.tokenizer.convert_tokens_to_ids(token2)
+            == pipe3.tokenizer.convert_tokens_to_ids(token2)
+            == num_tokens + 1
+        )
+        assert emb1[num_tokens].sum().item() == emb2[num_tokens].sum().item() == emb3[num_tokens].sum().item()
+        assert (
+            emb1[num_tokens + 1].sum().item() == emb2[num_tokens + 1].sum().item() == emb3[num_tokens + 1].sum().item()
+        )
+
     def test_download_ignore_files(self):
         # Check https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe-ignore-files/blob/72f58636e5508a218c6b3f60550dc96445547817/model_index.json#L4
         with tempfile.TemporaryDirectory() as tmpdirname:
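
A minimal usage sketch of the call patterns this patch enables, mirroring
test_text_inversion_multi_tokens above. The tiny checkpoint and the 32-dim
embeddings come from the test model, not from a real Stable Diffusion
checkpoint, whose text encoder has a larger hidden size:

    import torch
    from diffusers import StableDiffusionPipeline

    pipe = StableDiffusionPipeline.from_pretrained(
        "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
    )

    # Each embedding's width must match the text encoder's hidden size
    # (32 for this tiny test model).
    emb_a = torch.ones((32,))
    emb_b = torch.ones((32,)) * 2

    # One call, several tokens: a list of embeddings paired with a list of tokens.
    pipe.load_textual_inversion([emb_a, emb_b], token=["<*>", "<**>"])

    # Equivalently, a single (n, dim) tensor holding n embeddings for n tokens;
    # the new "4.1" branch splits it into one embedding per token. Commented out
    # here only because "<*>" and "<**>" were already registered by the call above:
    # pipe.load_textual_inversion(torch.stack([emb_a, emb_b], dim=0), token=["<*>", "<**>"])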