Stable Diffusionのimg2imgをGoogle Colabでやってみる

cedro

2年前

1.はじめに

　AI画像生成を行うStable Diffusionに、img2img（画像と文から画像生成する機能）が追加されました。前回に続いて、Google Colabで動かしてみたいと思います。

2.コード

　Hugging Faceからアクセス・トークンの取得をしていない方は、前回のブログの「2.アクセス・トークンの取得」を参考に取得してから下記に進んで下さい。

　コードはGoogle Colabで動かす形にしてGithubに上げてありますので、それに沿って説明して行きます。自分で動かしてみたい方は、この「リンク」をクリックし表示されたノートブックの先頭にある「Colab on Web」ボタンをクリックすると動かせます。

　まず、セットアップをおこないます。

#@title **セットアップ**

# Install the libraries the notebook depends on.
# NOTE: apart from ipywidgets, nothing is version-pinned.
! pip install transformers gradio scipy ftfy "ipywidgets>=7,<8" datasets

# Clone the diffusers repository from GitHub, install the package from the
# same repository, then change into the cloned directory.
! git clone https://github.com/huggingface/diffusers.git
! pip install git+https://github.com/huggingface/diffusers.git
%cd diffusers

# 関数定義（追加）
import PIL
from PIL import Image
import numpy as np

def preprocess(image):
    """Convert a PIL image into the tensor passed to the img2img pipeline.

    The image is forced to RGB, shrunk so that both sides are integer
    multiples of 32, rescaled from [0, 255] to [-1, 1], and laid out as a
    batch of one in channels-first order.

    Args:
        image: A ``PIL.Image.Image`` of any mode.

    Returns:
        A float32 ``torch.Tensor`` of shape ``(1, 3, h, w)`` with values in
        ``[-1, 1]``.
    """
    # Imported here so the function does not rely on another notebook cell
    # having imported torch first.
    import torch

    # Grayscale, palette, or RGBA inputs would otherwise produce an array that
    # is not (h, w, 3), which breaks the channel transpose below.
    image = image.convert("RGB")
    w, h = image.size
    w, h = w - w % 32, h - h % 32  # round down to an integer multiple of 32
    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
    pixels = np.array(image).astype(np.float32) / 255.0
    pixels = pixels[None].transpose(0, 3, 1, 2)  # HWC -> NCHW, batch of one
    return 2.0 * torch.from_numpy(pixels) - 1.0

#@title **セットアップ**

# Install the libraries the notebook depends on.
# NOTE: apart from ipywidgets, nothing is version-pinned.

! pip install transformers gradio scipy ftfy "ipywidgets>=7,<8" datasets

# Clone the diffusers repository from GitHub, install the package from the
# same repository, then change into the cloned directory.

! git clone https://github.com/huggingface/diffusers.git

! pip install git+https://github.com/huggingface/diffusers.git

%cd diffusers

# 関数定義（追加）

import PIL

from PIL import Image

import numpy as np

def preprocess(image):
    """Convert a PIL image into the tensor passed to the img2img pipeline.

    The image is forced to RGB, shrunk so that both sides are integer
    multiples of 32, rescaled from [0, 255] to [-1, 1], and laid out as a
    batch of one in channels-first order.

    Args:
        image: A ``PIL.Image.Image`` of any mode.

    Returns:
        A float32 ``torch.Tensor`` of shape ``(1, 3, h, w)`` with values in
        ``[-1, 1]``.
    """
    # Imported here so the function does not rely on another notebook cell
    # having imported torch first.
    import torch

    # Grayscale, palette, or RGBA inputs would otherwise produce an array that
    # is not (h, w, 3), which breaks the channel transpose below.
    image = image.convert("RGB")
    w, h = image.size
    w, h = w - w % 32, h - h % 32  # round down to an integer multiple of 32
    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
    pixels = np.array(image).astype(np.float32) / 255.0
    pixels = pixels[None].transpose(0, 3, 1, 2)  # HWC -> NCHW, batch of one
    return 2.0 * torch.from_numpy(pixels) - 1.0

　次に、Hugging Faceにログインします。

#@title **Hugging Faceへログイン**
#@markdown　・事前にHugging Faceでアクセス・トークンを取得しておいて下さい

from huggingface_hub import notebook_login

# Log in to Hugging Face; an access token must be obtained beforehand.
notebook_login()

#@title **Hugging Faceへログイン**

#@markdown　・事前にHugging Faceでアクセス・トークンを取得しておいて下さい

from huggingface_hub import notebook_login

# Log in to Hugging Face; an access token must be obtained beforehand.

notebook_login()

　すると下記の表示が現れます。your Hugging Face tokens page をクリックしてHugging FaceのHPに飛び、Access Tokensをコピーして来ます。そして、「Token」のところへAccess Tokenをペーストし、「Login」をクリックします。

　それでは、本体プログラムを実行します。

#@title **本体プログラム**
import gradio as gr
import torch
from torch import autocast
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
import requests
from PIL import Image
from io import BytesIO
from IPython.display import clear_output ###

#from examples.inference.image_to_image import StableDiffusionImg2ImgPipeline, preprocess
from diffusers import StableDiffusionImg2ImgPipeline

# Scheduler handed to the text-to-image pipeline below.
lms = LMSDiscreteScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
)

# Text-to-image pipeline, used by infer() when no initial image is given.
# torch_dtype=torch.float16 keeps the fp16 checkpoint in half precision, the
# same as the img2img pipeline. Without it the weights are held as float32,
# roughly doubling this pipeline's GPU memory while both pipelines share one
# GPU.
pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    scheduler=lms,
    revision="fp16",
    torch_dtype=torch.float16,
    use_auth_token=True,
).to("cuda")

# Image-to-image pipeline, used by infer() when an initial image is given.
pipeimg = StableDiffusionImg2ImgPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    revision="fp16",
    torch_dtype=torch.float16,
    use_auth_token=True,
).to("cuda")

# Root Gradio container; the CSS caps the page width and centres it.
block = gr.Blocks(css=".container { max-width: 800px; margin: auto; }")

# Number of images generated per request.
num_samples = 2

def infer(prompt, init_image, strength):
    """Generate ``num_samples`` images for *prompt*.

    Args:
        prompt: Text prompt; it is repeated ``num_samples`` times to form the
            batch.
        init_image: PIL image used as the img2img starting point, or ``None``
            to run the text-to-image pipeline instead.
        strength: Forwarded to the img2img pipeline as ``strength``; unused
            when ``init_image`` is ``None``.

    Returns:
        Element ``[0]`` of the pipeline's output, which the UI passes to the
        gallery.
    """
    prompts = [prompt] * num_samples

    # Identity test: ``!= None`` would dispatch to the image object's own
    # comparison method instead of simply checking for "no image".
    if init_image is None:
        with autocast("cuda"):
            return pipe(prompts, guidance_scale=7.5)[0]

    # The initial image is always resized to 512x512 before preprocessing.
    init_tensor = preprocess(init_image.resize((512, 512)))
    with autocast("cuda"):
        return pipeimg(
            prompts,
            init_image=init_tensor,
            strength=strength,
            guidance_scale=7.5,
        )[0]


# Build the Gradio UI inside the root container, then launch it.
with block as demo:
    # Page header and one-line description.
    gr.Markdown("<h1><center>Stable Diffusion</center></h1>")
    gr.Markdown(
        "Stable Diffusion is an AI model that generates images from any prompt you give!"
    )
    with gr.Group():
        with gr.Box():
            # Prompt textbox and Run button share one row.
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                text = gr.Textbox(
                    label="Enter your prompt", show_label=False, max_lines=1
                ).style(
                    border=(True, False, True, True),
                    rounded=(True, False, False, True),
                    container=False,
                )
                btn = gr.Button("Run").style(
                    margin=False,
                    rounded=(False, True, True, False),
                )
        # Forwarded to infer() as ``strength``.
        strength_slider = gr.Slider(
            label="Strength",
            maximum=1,
            value=0.75,
        )
        # Optional starting image, handed to infer() as a PIL image.
        image = gr.Image(
            label="Initial Image",
            type="pil",
        )

        gallery = gr.Gallery(label="Generated images", show_label=False).style(
            grid=[2], height="auto"
        )
        # Submitting the textbox and clicking Run both call infer() with the
        # same inputs and send its result to the gallery.
        text.submit(infer, inputs=[text, image, strength_slider], outputs=gallery)
        btn.click(infer, inputs=[text, image, strength_slider], outputs=gallery)

    # Footer credit.
    gr.Markdown(
        """___
   <p style='text-align: center'>
   Created by CompVis and Stability AI
   <br/>
   </p>"""
    )

# Clear the cell's earlier output, then start the Gradio app.
clear_output()
demo.launch(debug=True)

#@title **本体プログラム**

import gradio as gr

import torch

from torch import autocast

from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler

import requests

from PIL import Image

from io import BytesIO

from IPython.display import clear_output ###

#from examples.inference.image_to_image import StableDiffusionImg2ImgPipeline, preprocess

from diffusers import StableDiffusionImg2ImgPipeline

# Scheduler handed to the text-to-image pipeline below.
lms = LMSDiscreteScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
)

# Text-to-image pipeline, used by infer() when no initial image is given.
# torch_dtype=torch.float16 keeps the fp16 checkpoint in half precision, the
# same as the img2img pipeline. Without it the weights are held as float32,
# roughly doubling this pipeline's GPU memory while both pipelines share one
# GPU.
pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    scheduler=lms,
    revision="fp16",
    torch_dtype=torch.float16,
    use_auth_token=True,
).to("cuda")

# Image-to-image pipeline, used by infer() when an initial image is given.
pipeimg = StableDiffusionImg2ImgPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    revision="fp16",
    torch_dtype=torch.float16,
    use_auth_token=True,
).to("cuda")

# Root Gradio container; the CSS caps the page width and centres it.
block = gr.Blocks(css=".container { max-width: 800px; margin: auto; }")

# Number of images generated per request.
num_samples = 2

def infer(prompt, init_image, strength):
    """Generate ``num_samples`` images for *prompt*.

    Args:
        prompt: Text prompt; it is repeated ``num_samples`` times to form the
            batch.
        init_image: PIL image used as the img2img starting point, or ``None``
            to run the text-to-image pipeline instead.
        strength: Forwarded to the img2img pipeline as ``strength``; unused
            when ``init_image`` is ``None``.

    Returns:
        Element ``[0]`` of the pipeline's output, which the UI passes to the
        gallery.
    """
    prompts = [prompt] * num_samples

    # Identity test: ``!= None`` would dispatch to the image object's own
    # comparison method instead of simply checking for "no image".
    if init_image is None:
        with autocast("cuda"):
            return pipe(prompts, guidance_scale=7.5)[0]

    # The initial image is always resized to 512x512 before preprocessing.
    init_tensor = preprocess(init_image.resize((512, 512)))
    with autocast("cuda"):
        return pipeimg(
            prompts,
            init_image=init_tensor,
            strength=strength,
            guidance_scale=7.5,
        )[0]

# Build the Gradio UI inside the root container, then launch it.
with block as demo:
    # Page header and one-line description.
    gr.Markdown("<h1><center>Stable Diffusion</center></h1>")
    gr.Markdown(
        "Stable Diffusion is an AI model that generates images from any prompt you give!"
    )
    with gr.Group():
        with gr.Box():
            # Prompt textbox and Run button share one row.
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                text = gr.Textbox(
                    label="Enter your prompt", show_label=False, max_lines=1
                ).style(
                    border=(True, False, True, True),
                    rounded=(True, False, False, True),
                    container=False,
                )
                btn = gr.Button("Run").style(
                    margin=False,
                    rounded=(False, True, True, False),
                )
        # Forwarded to infer() as ``strength``.
        strength_slider = gr.Slider(
            label="Strength",
            maximum=1,
            value=0.75,
        )
        # Optional starting image, handed to infer() as a PIL image.
        image = gr.Image(
            label="Initial Image",
            type="pil",
        )

        gallery = gr.Gallery(label="Generated images", show_label=False).style(
            grid=[2], height="auto"
        )
        # Submitting the textbox and clicking Run both call infer() with the
        # same inputs and send its result to the gallery.
        text.submit(infer, inputs=[text, image, strength_slider], outputs=gallery)
        btn.click(infer, inputs=[text, image, strength_slider], outputs=gallery)

    # Footer credit; the opening <p> tag pairs with the closing </p>.
    gr.Markdown(
        """___
   <p style='text-align: center'>
   Created by CompVis and Stability AI
   <br/>
   </p>"""
    )

# Clear the cell's earlier output, then start the Gradio app.
clear_output()
demo.launch(debug=True)

　実行が完了すると下記の画面が最後に表示されます。以降は、このGUIから操作を行います。

　それでは、実際に動かしてみましょう。まず、英文を記入します。ここでは、「The Lady of the rose」と記入しています。次に、Strengthで画像にどれだけ文を影響させるか（数字が大きいほど影響度大）スライダーで設定します。ここでは0.75と設定しています。

　そして、画像（jpg）をInitial Imageにドラッグ＆ドロップします（上手く行かない場合はクリックするとファイルウインドウが開くのでそこから選択します）。ここでは、バラの花を背にした女性の画像を使っています。最後に、「Run」をクリックします。

　変換が完了すると下に画像が２枚表示されます。これは画像に「The Lady of the rose」という文を0.75のStrength（強さ）で影響させた結果です。基本的な構成は変更せず細部だけオリジナルなものに変わっていることが分かります。

　画像のダウンロードは、画像上で右クリックして「名前を付けて画像を保存」を選択します。

　なお、Initial Imageを空白にして、文のみで画像生成しようとすると、「RuntimeError: CUDA out of memory.」でエラーが発生する場合があります。もしそうなった場合は、「ランタイム／ランタイムを接続解除して削除」をクリックして最初からやり直して下さい。

　この機能の応用例がTwitterにありましたので、参考に載せておきます。

#stablediffusion の #Img2Img (指定画像から画像を生成する)機能を使ってみました。
3分で描いた指示用雑絵(2枚目)に絵の要素のプロンプトを指示し1枚目を生成しました。
2枚とも生成時・指示時のもので、無編集。
いや、すごい…… pic.twitter.com/sDKYBNFCOA
— 852話 (@8co28) August 24, 2022

　この機能は色々な応用が出来そうで面白いですね。

　では、また。

2022.8.31 アップデート

　github.com/huggingface のコードの修正に伴い、コードをアップデートしました。