1.はじめに
2021年1月に、OpenAI が画像とテキストの類似度を測定できる CLIP を公開して以来、色々な画像生成系のネットワークと組み合わせて「文から絵を描く」のタスクが提案されています。このブログでも今まで、ディスクリートVAE、BigGAN、StyleGANとの組み合わせをご紹介して来ました。
今回ご紹介するのは、最適化プロセスを工夫することによって、「文から絵を描く」のタスクを今までとは違った生成過程で表現できる CLIPDraw という技術です。
*この論文は、2021.6に提出されました。
2.CLIPDraw
下記の図が、CLIPDraw のフレームワークです。CLIP は、画像エンコーダーとテキストエンコーダーから構成され、画像とテキストから同じ次元の特徴ベクトルを得ることができるので、両者のコサイン類似度を計算すれば、画像とテキストがどの程度一致しているかが分かります。従って、これを利用して指定したテキストに最も適合した画像を求めれば良いわけです。
今回のポイントは、初期値にランダムなベジェ曲線を使い、指定されたテキストに最も一致するように、勾配降下法でこれらの曲線を徐々に調整する手法を取ることです。途中で使われている Image Augmentation は、作成した絵の特徴ベクトルの値を安定させるためのものです。
それでは、早速コードを動かしてみましょう。
3.コード
コードはGoogle Colabで動かす形にしてGithubに上げてありますので、それに沿って説明して行きます。自分で動かしてみたい方は、この「リンク」をクリックし表示されたノートブックの先頭にある「Colab on Web」ボタンをクリックすると動かせます。
まず、ライブラリーをインストールします。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
#@title Pre Installation {vertical-output: true} import subprocess CUDA_version = [s for s in subprocess.check_output(["nvcc", "--version"]).decode("UTF-8").split(", ") if s.startswith("release")][0].split(" ")[-1] print("CUDA version:", CUDA_version) if CUDA_version == "10.0": torch_version_suffix = "+cu100" elif CUDA_version == "10.1": torch_version_suffix = "+cu101" elif CUDA_version == "10.2": torch_version_suffix = "" else: torch_version_suffix = "+cu110" # !pip install torch==1.7.1{torch_version_suffix} torchvision==0.8.2{torch_version_suffix} -f https://download.pytorch.org/whl/torch_stable.html ftfy regex %cd /content/ !pip install svgwrite !pip install svgpathtools !pip install cssutils !pip install numba !pip install torch-tools !pip install visdom %tensorflow_version 1.x ### !git clone https://github.com/BachiLi/diffvg %cd diffvg # !ls !git submodule update --init --recursive !python setup.py install !pip install ftfy regex tqdm !pip install git+https://github.com/openai/CLIP.git --no-deps |
次に、Colab Notebook の左上にある、ランタイム/「ランタイムを再起動」をクリックして下さい(クリックしないで次に進むと、後でエラーが発生しますので注意)。
次に、ライブラリーのインポートとクラス・関数の定義を行います。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
#@title Imports and Notebook Utilities {vertical-output: true} %tensorflow_version 2.x import os import io import PIL.Image, PIL.ImageDraw import base64 import zipfile import json import requests import numpy as np import matplotlib.pylab as pl import glob from IPython.display import Image, HTML, clear_output from tqdm import tqdm_notebook, tnrange os.environ['FFMPEG_BINARY'] = 'ffmpeg' import moviepy.editor as mvp from moviepy.video.io.ffmpeg_writer import FFMPEG_VideoWriter def imread(url, max_size=None, mode=None): if url.startswith(('http:', 'https:')): r = requests.get(url) f = io.BytesIO(r.content) else: f = url img = PIL.Image.open(f) if max_size is not None: img = img.resize((max_size, max_size)) if mode is not None: img = img.convert(mode) img = np.float32(img)/255.0 return img def np2pil(a): if a.dtype in [np.float32, np.float64]: a = np.uint8(np.clip(a, 0, 1)*255) return PIL.Image.fromarray(a) def imwrite(f, a, fmt=None): a = np.asarray(a) if isinstance(f, str): fmt = f.rsplit('.', 1)[-1].lower() if fmt == 'jpg': fmt = 'jpeg' f = open(f, 'wb') np2pil(a).save(f, fmt, quality=95) def imencode(a, fmt='jpeg'): a = np.asarray(a) if len(a.shape) == 3 and a.shape[-1] == 4: fmt = 'png' f = io.BytesIO() imwrite(f, a, fmt) return f.getvalue() def im2url(a, fmt='jpeg'): encoded = imencode(a, fmt) base64_byte_string = base64.b64encode(encoded).decode('ascii') return 'data:image/' + fmt.upper() + ';base64,' + base64_byte_string def imshow(a, fmt='jpeg'): display(Image(data=imencode(a, fmt))) def tile2d(a, w=None): a = np.asarray(a) if w is None: w = int(np.ceil(np.sqrt(len(a)))) th, tw = a.shape[1:3] pad = (w-len(a))%w a = np.pad(a, [(0, pad)]+[(0, 0)]*(a.ndim-1), 'constant') h = len(a)//w a = a.reshape([h, w]+list(a.shape[1:])) a = np.rollaxis(a, 2, 1).reshape([th*h, tw*w]+list(a.shape[4:])) return a from torchvision import utils def show_img(img): img = np.transpose(img, (1, 2, 0)) img = np.clip(img, 0, 1) img = np.uint8(img * 254) # img = np.repeat(img, 4, axis=0) 
# img = np.repeat(img, 4, axis=1) pimg = PIL.Image.fromarray(img, mode="RGB") imshow(pimg) def zoom(img, scale=4): img = np.repeat(img, scale, 0) img = np.repeat(img, scale, 1) return img class VideoWriter: def __init__(self, filename='_autoplay.mp4', fps=30.0, **kw): self.writer = None self.params = dict(filename=filename, fps=fps, **kw) def add(self, img): img = np.asarray(img) if self.writer is None: h, w = img.shape[:2] self.writer = FFMPEG_VideoWriter(size=(w, h), **self.params) if img.dtype in [np.float32, np.float64]: img = np.uint8(img.clip(0, 1)*255) if len(img.shape) == 2: img = np.repeat(img[..., None], 3, -1) self.writer.write_frame(img) def close(self): if self.writer: self.writer.close() def __enter__(self): return self def __exit__(self, *kw): self.close() if self.params['filename'] == '_autoplay.mp4': self.show() def show(self, **kw): self.close() fn = self.params['filename'] display(mvp.ipython_display(fn, **kw)) !nvidia-smi -L import numpy as np import torch import os # torch.set_default_tensor_type('torch.cuda.FloatTensor') print("Torch version:", torch.__version__) # !pip install DALL-E |
次に、CLIPをロードします。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
#@title Load CLIP {vertical-output: true}

# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import os
import clip
import torch
import torch.nn.functional as F
import torchvision
from torchvision import transforms
from torchvision.datasets import CIFAR100

# Load the model
# NOTE: the device is hard-coded to CUDA, so this cell requires a GPU runtime.
device = torch.device('cuda')
model, preprocess = clip.load('ViT-B/32', device, jit=False)

# Space-separated vocabulary. It is split on single spaces below, so
# multi-word entries (e.g. "north america") become separate list items.
nouns = "aardvark abyssinian accelerator accordion account accountant acknowledgment acoustic acrylic act action active activity actor actress adapter addition address adjustment adult advantage advertisement advice afghanistan africa aftermath afternoon aftershave afterthought age agenda agreement air airbus airmail airplane airport airship alarm albatross alcohol algebra algeria alibi alley alligator alloy almanac alphabet alto aluminium aluminum ambulance america amount amusement anatomy anethesiologist anger angle angora animal anime ankle answer ant antarctica anteater antelope anthony anthropology apartment apology apparatus apparel appeal appendix apple appliance approval april aquarius arch archaeology archeology archer architecture area argentina argument aries arithmetic arm armadillo armchair armenian army arrow art ash ashtray asia asparagus asphalt asterisk astronomy athlete atm atom attack attempt attention attic attraction august aunt australia australian author authorisation authority authorization avenue babies baboon baby back backbone bacon badge badger bag bagel bagpipe bail bait baker bakery balance balinese ball balloon bamboo banana band bandana bangladesh bangle banjo bank bankbook banker bar barbara barber barge baritone barometer base baseball basement basin basket basketball bass bassoon bat bath bathroom bathtub battery battle bay beach bead beam bean bear beard beast beat beautician beauty beaver bed bedroom bee beech beef beer beet beetle beggar beginner begonia behavior belgian belief believe bell belt bench bengal beret berry bestseller betty bibliography bicycle bike bill billboard biology biplane birch bird birth birthday bit bite black bladder blade blanket blinker blizzard block blood blouse blow blowgun blue board boat bobcat body bolt bomb bomber bone bongo bonsai book bookcase booklet boot border botany bottle bottom boundary bow bowl bowling box boy bra brace bracket brain brake branch brand brandy brass brazil bread break breakfast breath brian brick bridge british broccoli brochure broker bronze brother brother-in-law brow brown brush bubble bucket budget buffer buffet bugle building bulb bull bulldozer bumper bun burglar burma burn burst bus bush business butane butcher butter button buzzard cabbage cabinet cable cactus cafe cake calculator calculus calendar calf call camel camera camp can canada canadian cancer candle cannon canoe canvas cap capital cappelletti capricorn captain caption car caravan carbon card cardboard cardigan care carnation carol carp carpenter carriage carrot cart cartoon case cast castanet cat catamaran caterpillar cathedral catsup cattle cauliflower cause caution cave c-clamp cd ceiling celery celeste cell cellar cello celsius cement cemetery cent centimeter century ceramic cereal certification chain chair chalk chance change channel character chard charles chauffeur check cheek cheese cheetah chef chemistry cheque cherries cherry chess chest chick chicken chicory chief child children chill chime chimpanzee chin china chinese chive chocolate chord christmas christopher chronometer church cicada cinema circle circulation cirrus citizenship city clam clarinet class claus clave clef clerk click client climb clipper cloakroom clock close closet cloth cloud cloudy clover club clutch coach coal coast coat cobweb cockroach cocktail cocoa cod coffee coil coin coke cold collar college collision colombia colon colony color colt column columnist comb comfort comic comma command commission committee community company comparison competition competitor composer composition computer condition condor cone confirmation conga congo conifer connection consonant continent control cook cooking copper copy copyright cord cork cormorant corn cornet correspondent cost cotton couch cougar cough country course court cousin cover cow cowbell crab crack cracker craftsman crate crawdad crayfish crayon cream creator creature credit creditor creek crib cricket crime criminal crocodile crocus croissant crook crop cross crow crowd crown crush cry cub cuban cucumber cultivator cup cupboard cupcake curler currency current curtain curve cushion custard customer cut cuticle cycle cyclone cylinder cymbal dad daffodil dahlia daisy damage dance dancer danger daniel dash dashboard database date daughter david day dead deadline deal death deborah debt debtor decade december decimal decision decrease dedication deer defense deficit degree delete delivery den denim dentist deodorant department deposit description desert design desire desk dessert destruction detail detective development dew diamond diaphragm dibble dictionary dietician difference digestion digger digital dill dime dimple dinghy dinner dinosaur diploma dipstick direction dirt disadvantage discovery discussion disease disgust dish distance distribution distributor diving division divorced dock doctor dog dogsled doll dollar dolphin domain donald donkey donna door dorothy double doubt downtown dragon dragonfly drain drake drama draw drawbridge drawer dream dredger dress dresser dressing drill drink drive driver driving drizzle drop drug drum dry dryer duck duckling dugout dungeon dust eagle ear earth earthquake ease east edge edger editor editorial education edward eel effect egg eggnog eggplant egypt eight elbow element elephant elizabeth ellipse emery employee employer encyclopedia end enemy energy engine engineer engineering english enquiry entrance environment epoch epoxy equinox equipment era error estimate ethernet ethiopia euphonium europe evening event examination example exchange exclamation exhaust ex-husband existence expansion experience expert explanation ex-wife eye eyebrow eyelash eyeliner face facilities fact factory fahrenheit fairies fall family fan fang farm farmer fat father father-in-law faucet fear feast feather feature february fedelini feedback feeling feet felony female fender ferry ferryboat fertilizer fiber fiberglass fibre fiction field fifth fight fighter file find fine finger fir fire fired fireman fireplace firewall fish fisherman flag flame flare flat flavor flax flesh flight flock flood floor flower flugelhorn flute fly foam fog fold font food foot football footnote force forecast forehead forest forgery fork form format fortnight foundation fountain fowl fox foxglove fragrance frame france freckle freeze freezer freighter french freon friction friday fridge friend frog front frost frown fruit fuel fur furniture galley gallon game gander garage garden garlic gas gasoline gate gateway gauge gazelle gear gearshift geese gemini gender geography geology geometry george geranium german germany ghana ghost giant giraffe girdle girl gladiolus glass glider gliding glockenspiel glove glue goal goat gold goldfish golf gondola gong good-bye goose gore-tex gorilla gosling government governor grade grain gram granddaughter grandfather grandmother grandson grape graphic grass grasshopper gray grease great-grandfather great-grandmother greece greek green grenade grey grill grip ground group grouse growth guarantee guatemalan guide guilty guitar gum gun gym gymnast hacksaw hail hair haircut half-brother half-sister halibut hall hallway hamburger hammer hamster hand handball handicap handle handsaw harbor hardboard hardcover hardhat hardware harmonica harmony harp hat hate hawk head headlight headline health hearing heart heat heaven hedge height helen helicopter helium hell helmet help hemp hen heron herring hexagon hill himalayan hip hippopotamus history hobbies hockey hoe hole holiday home honey hood hook hope horn horse hose hospital hot hour hourglass house hovercraft hub hubcap humidity humor hurricane hyacinth hydrant hydrofoil hydrogen hyena hygienic ice icebreaker icicle icon idea ikebana illegal imprisonment improvement impulse inch income increase index india indonesia industry ink innocent input insect instruction instrument insulation insurance interactive interest internet interviewer intestine invention inventory invoice iran iraq iris iron island israel italian italy jacket jaguar jail jam james january japan japanese jar jasmine jason jaw jeans jeep jeff jelly jellyfish jennifer jet jewel jogging john join joke joseph journey judge judo juice july jumbo jump jumper june jury justice jute kale kamikaze kangaroo karate karen kayak kendo kenneth kenya ketchup kettle kettledrum kevin key keyboard keyboarding kick kidney kilogram kilometer kimberly kiss kitchen kite kitten kitty knee knickers knife knight knot knowledge kohlrabi korean laborer lace ladybug lake lamb lamp lan land landmine language larch lasagna latency latex lathe laugh laundry laura law lawyer layer lead leaf learning leather leek leg legal lemonade lentil leo leopard letter lettuce level libra library license lier lift light lightning lilac lily limit linda line linen link lion lip lipstick liquid liquor lisa list literature litter liver lizard llama loaf loan lobster lock locket locust look loss lotion love low lumber lunch lunchroom lung lunge lute luttuce lycra lynx lyocell lyre lyric macaroni machine macrame magazine magic magician maid mail mailbox mailman makeup malaysia male mall mallet man manager mandolin manicure manx map maple maraca marble march margaret margin maria marimba mark mark market married mary mascara mask mass match math mattock may mayonnaise meal measure meat mechanic medicine meeting melody memory men menu mercury message metal meteorology meter methane mexican mexico mice michael michelle microwave middle mile milk milkshake millennium millimeter millisecond mimosa mind mine minibus mini-skirt minister mint minute mirror missile mist mistake mitten moat modem mole mom monday money monkey month moon morning morocco mosque mosquito mother mother-in-law motion motorboat motorcycle mountain mouse moustache mouth move multi-hop multimedia muscle museum music musician mustard myanmar nail name nancy napkin narcissus nation neck need needle neon nepal nephew nerve nest net network news newsprint newsstand nic nickel niece nigeria night nitrogen node noise noodle north north america north korea norwegian nose note notebook notify novel november number numeric nurse nut nylon oak oatmeal objective oboe observation occupation ocean ocelot octagon octave october octopus odometer offence offer office oil okra olive onion open opera operation ophthalmologist opinion option orange orchestra orchid order organ organisation organization ornament ostrich otter ounce output outrigger oval oven overcoat owl owner ox oxygen oyster package packet page pail pain paint pair pajama pakistan palm pamphlet pan pancake pancreas panda pansy panther panties pantry pants panty pantyhose paper paperback parade parallelogram parcel parent parentheses park parrot parsnip part particle partner partridge party passbook passenger passive pasta paste pastor pastry patch path patient patio patricia paul payment pea peace peak peanut pear pedestrian pediatrician peen peer-to-peer pelican pen penalty pencil pendulum pentagon peony pepper perch perfume period periodical peripheral permission persian person peru pest pet pharmacist pheasant philippines philosophy phone physician piano piccolo pickle picture pie pig pigeon pike pillow pilot pimple pin pine ping pink pint pipe pisces pizza place plain plane planet plant plantation plaster plasterboard plastic plate platinum play playground playroom pleasure plier plot plough plow plywood pocket poet point poison poland police policeman polish politician pollution polo polyester pond popcorn poppy population porch porcupine port porter position possibility postage postbox pot potato poultry pound powder power precipitation preface prepared pressure price priest print printer prison probation process processing produce product production professor profit promotion propane property prose prosecution protest protocol pruner psychiatrist psychology ptarmigan puffin pull puma pump pumpkin punch punishment puppy purchase purple purpose push pvc pyjama pyramid quail quality quart quarter quartz queen question quicksand quiet quill quilt quince quit quiver quotation rabbi rabbit racing radar radiator radio radish raft rail railway rain rainbow raincoat rainstorm rake ramie random range rat rate raven ravioli ray rayon reaction reading reason receipt recess record recorder rectangle red reduction refrigerator refund regret reindeer relation relative religion relish reminder repair replace report representative request resolution respect responsibility rest restaurant result retailer revolve revolver reward rhinoceros rhythm rice richard riddle rifle ring rise risk river riverbed road roadway roast robert robin rock rocket rod roll romania romanian ronald roof room rooster root rose rotate route router rowboat rub rubber rugby rule run russia russian rutabaga ruth sack sagittarius sail sailboat sailor salad salary sale salesman salmon salt sampan samurai sand sandra sandwich santa sarah sardine satin saturday sauce saudi arabia sausage save saw saxophone scale scallion scanner scarecrow scarf scene scent schedule school science scissors scooter scorpio scorpion scraper screen screw screwdriver sea seagull seal seaplane search seashore season seat second secretary secure security seed seeder segment select selection self semicircle semicolon sense sentence separated september servant server session sex shade shadow shake shallot shame shampoo shape share shark sharon shears sheep sheet shelf shell shield shingle ship shirt shock shoe shoemaker shop shorts shoulder shovel show shrimp shrine siamese siberian side sideboard sidecar sidewalk sign signature silica silk silver sing singer single sink sister sister-in-law size skate skiing skill skin skirt sky slash slave sled sleep sleet slice slime slip slipper slope smash smell smile smoke snail snake sneeze snow snowboarding snowflake snowman snowplow snowstorm soap soccer society sociology sock soda sofa softball softdrink software soil soldier son song soprano sort sound soup sousaphone south africa south america south korea soy soybean space spade spaghetti spain spandex spark sparrow spear specialist speedboat sphere sphynx spider spike spinach spleen sponge spoon spot spring sprout spruce spy square squash squid squirrel stage staircase stamp star start starter state statement station statistic steam steel stem step step-aunt step-brother stepdaughter step-daughter step-father step-grandfather step-grandmother stepmother step-mother step-sister stepson step-son step-uncle steven stew stick stinger stitch stock stocking stomach stone stool stop stopsign stopwatch store storm story stove stranger straw stream street streetcar stretch string structure study sturgeon submarine substance subway success sudan suede sugar suggestion suit summer sun sunday sundial sunflower sunshine supermarket supply support surfboard surgeon surname surprise susan sushi swallow swamp swan sweater sweatshirt sweatshop swedish sweets swim swimming swing swiss switch sword swordfish sycamore syria syrup system table tablecloth tabletop tachometer tadpole tail tailor taiwan talk tank tanker tanzania target taste taurus tax taxi taxicab tea teacher teaching team technician teeth television teller temper temperature temple tempo tendency tennis tenor tent territory test text textbook texture thailand theater theory thermometer thing thistle thomas thought thread thrill throat throne thumb thunder thunderstorm thursday ticket tie tiger tights tile timbale time timer timpani tin tip tire titanium title toad toast toe toenail toilet tomato tom-tom ton tongue tooth toothbrush toothpaste top tornado tortellini tortoise touch tower town toy tractor trade traffic trail train tramp transaction transmission transport trapezoid tray treatment tree trial triangle trick trigonometry trip trombone trouble trousers trout trowel truck trumpet trunk t-shirt tsunami tub tuba tuesday tugboat tulip tuna tune turkey turkey turkish turn turnip turnover turret turtle tv twig twilight twine twist typhoon tyvek uganda ukraine ukrainian umbrella uncle underclothes underpants undershirt underwear unit united kingdom unshielded use utensil uzbekistan vacation vacuum valley value van vase vault vegetable vegetarian veil vein velvet venezuela venezuelan verdict vermicelli verse vessel vest veterinarian vibraphone vietnam view vinyl viola violet violin virgo viscose vise vision visitor voice volcano volleyball voyage vulture waiter waitress walk wall wallaby wallet walrus war warm wash washer wasp waste watch watchmaker water waterfall wave wax way wealth weapon weasel weather wedge wednesday weed weeder week weight whale wheel whip whiskey whistle white wholesaler whorl wilderness william willow wind windchime window windscreen windshield wine wing winter wire wish witch withdrawal witness wolf woman women wood wool woolen word work workshop worm wound wrecker wren wrench wrinkle wrist writer xylophone yacht yak yam yard yarn year yellow yew yogurt yoke yugoslavian zebra zephyr zinc zipper zone zoo zoology"
nouns = nouns.split(" ")
# One text prompt per vocabulary entry.
noun_prompts = ["a drawing of a " + x for x in nouns]

# Calculate features
# Encode every prompt once, without tracking gradients; the optimizer cell
# later compares image features against these to print "Top predictions".
with torch.no_grad():
    nouns_features = model.encode_text(torch.cat([clip.tokenize(noun_prompts).to(device)]))
print(nouns_features.shape, nouns_features.dtype)
そして、テキストから絵を生成します。5行目の「prompt =」に、テキストを入力して実行して下さい。ここでは、prompt = "Watercolor painting of an underwater submarine."(水中の潜水艦の水彩画)としています。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
#@title Curve Optimizer {vertical-output: true}

%cd /content/diffvg/apps/

# Text the drawing is optimized towards, plus two optional negative prompts.
prompt = "Watercolor painting of an underwater submarine."
neg_prompt = "A badly drawn sketch."
neg_prompt_2 = "Many ugly, messy drawings."
text_input = clip.tokenize(prompt).to(device)
text_input_neg1 = clip.tokenize(neg_prompt).to(device)
text_input_neg2 = clip.tokenize(neg_prompt_2).to(device)
use_negative = False # Use negative prompts?

# Thanks to Katherine Crowson for this.
# In the CLIPDraw code used to generate examples, we don't normalize images
# before passing into CLIP, but really you should. Turn this to True to do that.
use_normalized_clip = False

# Calculate features
# The text embeddings are constants of the optimization, so no gradients are needed.
with torch.no_grad():
    text_features = model.encode_text(text_input)
    text_features_neg1 = model.encode_text(text_input_neg1)
    text_features_neg2 = model.encode_text(text_input_neg2)

import pydiffvg
import torch
import skimage
import skimage.io
import random
import ttools.modules
import argparse
import math
import torchvision
import torchvision.transforms as transforms

pydiffvg.set_print_timing(False)

# Passed to pydiffvg.imwrite when saving frames.
gamma = 1.0

# ARGUMENTS. Feel free to play around with these, especially num_paths.
# A bare lambda is used here only as an object that attributes can be attached to.
args = lambda: None
args.num_paths = 256
args.num_iter = 1000
args.max_width = 50

# Use GPU if available
# NOTE(review): the device below is hard-coded to CUDA regardless of the
# availability check, so a GPU runtime is effectively required.
pydiffvg.set_use_gpu(torch.cuda.is_available())
device = torch.device('cuda')
pydiffvg.set_device(device)

canvas_width, canvas_height = 224, 224
num_paths = args.num_paths
max_width = args.max_width

# Image Augmentation Transformation
# Each rendered image is randomly perspective-warped (filled with 1) and
# randomly cropped/resized to 224 before being fed to CLIP.
augment_trans = transforms.Compose([
    transforms.RandomPerspective(fill=1, p=1, distortion_scale=0.5),
    transforms.RandomResizedCrop(224, scale=(0.7,0.9)),
])

# Same augmentations followed by a per-channel mean/std normalization.
if use_normalized_clip:
    augment_trans = transforms.Compose([
        transforms.RandomPerspective(fill=1, p=1, distortion_scale=0.5),
        transforms.RandomResizedCrop(224, scale=(0.7,0.9)),
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    ])


# Initialize Random Curves
shapes = []
shape_groups = []
for i in range(num_paths):
    # Each path gets 1-3 segments with 2 control points per segment.
    num_segments = random.randint(1, 3)
    num_control_points = torch.zeros(num_segments, dtype = torch.int32) + 2
    points = []
    # Random start point in normalized [0, 1) canvas coordinates.
    p0 = (random.random(), random.random())
    points.append(p0)
    for j in range(num_segments):
        radius = 0.1
        # Each new point is a small random offset (within +/- radius/2 per axis)
        # from the previous one; three points are appended per segment.
        p1 = (p0[0] + radius * (random.random() - 0.5), p0[1] + radius * (random.random() - 0.5))
        p2 = (p1[0] + radius * (random.random() - 0.5), p1[1] + radius * (random.random() - 0.5))
        p3 = (p2[0] + radius * (random.random() - 0.5), p2[1] + radius * (random.random() - 0.5))
        points.append(p1)
        points.append(p2)
        points.append(p3)
        # The next segment starts where this one ended.
        p0 = p3
    points = torch.tensor(points)
    # Scale normalized coordinates to pixel coordinates.
    points[:, 0] *= canvas_width
    points[:, 1] *= canvas_height
    path = pydiffvg.Path(num_control_points = num_control_points, points = points, stroke_width = torch.tensor(1.0), is_closed = False)
    shapes.append(path)
    # One group per path: no fill, random 4-component stroke color
    # (presumably RGBA; the render below reads channel 3 as alpha).
    path_group = pydiffvg.ShapeGroup(shape_ids = torch.tensor([len(shapes) - 1]), fill_color = None, stroke_color = torch.tensor([random.random(), random.random(), random.random(), random.random()]))
    shape_groups.append(path_group)

# Just some diffvg setup
scene_args = pydiffvg.RenderFunction.serialize_scene(\
    canvas_width, canvas_height, shapes, shape_groups)
render = pydiffvg.RenderFunction.apply
# NOTE(review): the positional arguments "2, 2, 0, None" are presumably
# samples-per-pixel in x/y, the seed and a background image; confirm against diffvg.
img = render(canvas_width, canvas_height, 2, 2, 0, None, *scene_args)

# Collect the tensors to optimize: control points, stroke widths and stroke colors.
points_vars = []
stroke_width_vars = []
color_vars = []
for path in shapes:
    path.points.requires_grad = True
    points_vars.append(path.points)
    path.stroke_width.requires_grad = True
    stroke_width_vars.append(path.stroke_width)
for group in shape_groups:
    group.stroke_color.requires_grad = True
    color_vars.append(group.stroke_color)

# Optimizers
# Separate optimizers so each parameter family gets its own learning rate.
points_optim = torch.optim.Adam(points_vars, lr=1.0)
width_optim = torch.optim.Adam(stroke_width_vars, lr=0.1)
color_optim = torch.optim.Adam(color_vars, lr=0.01)

# Run the main optimization loop
for t in range(args.num_iter):

    # Anneal learning rate (makes videos look cleaner)
    if t == int(args.num_iter * 0.5):
        for g in points_optim.param_groups:
            g['lr'] = 0.4
    if t == int(args.num_iter * 0.75):
        for g in points_optim.param_groups:
            g['lr'] = 0.1

    points_optim.zero_grad()
    width_optim.zero_grad()
    color_optim.zero_grad()
    # Re-serialize the scene every step because the shape tensors change.
    scene_args = pydiffvg.RenderFunction.serialize_scene(\
        canvas_width, canvas_height, shapes, shape_groups)
    img = render(canvas_width, canvas_height, 2, 2, t, None, *scene_args)
    # Composite the render over a white background using channel 3 as alpha.
    img = img[:, :, 3:4] * img[:, :, :3] + torch.ones(img.shape[0], img.shape[1], 3, device = pydiffvg.get_device()) * (1 - img[:, :, 3:4])
    # Save every 5th frame; the Video Renderer cell turns these into a video.
    if t % 5 == 0:
        pydiffvg.imwrite(img.cpu(), '/content/res/iter_{}.png'.format(int(t/5)), gamma=gamma)
    img = img[:, :, :3]
    img = img.unsqueeze(0)
    img = img.permute(0, 3, 1, 2) # NHWC -> NCHW

    # Loss: negative cosine similarity between the prompt embedding and the
    # CLIP embedding of each augmented copy of the current drawing.
    loss = 0
    NUM_AUGS = 4
    img_augs = []
    for n in range(NUM_AUGS):
        img_augs.append(augment_trans(img))
    im_batch = torch.cat(img_augs)
    image_features = model.encode_image(im_batch)
    for n in range(NUM_AUGS):
        loss -= torch.cosine_similarity(text_features, image_features[n:n+1], dim=1)
        # Optionally penalize similarity to the negative prompts (weight 0.3 each).
        if use_negative:
            loss += torch.cosine_similarity(text_features_neg1, image_features[n:n+1], dim=1) * 0.3
            loss += torch.cosine_similarity(text_features_neg2, image_features[n:n+1], dim=1) * 0.3

    # Backpropagate the gradients.
    loss.backward()

    # Take a gradient descent step.
    points_optim.step()
    width_optim.step()
    color_optim.step()
    # Keep parameters in their valid ranges after the update.
    for path in shapes:
        path.stroke_width.data.clamp_(1.0, max_width)
    for group in shape_groups:
        group.stroke_color.data.clamp_(0.0, 1.0)

    # Every 10th iteration: show the drawing, the loss, and the five
    # vocabulary prompts closest to the first augmented image.
    if t % 10 == 0:
        show_img(img.detach().cpu().numpy()[0])
        # show_img(torch.cat([img.detach(), img_aug.detach()], axis=3).cpu().numpy()[0])
        print('render loss:', loss.item())
        print('iteration:', t)
        with torch.no_grad():
            im_norm = image_features / image_features.norm(dim=-1, keepdim=True)
            noun_norm = nouns_features / nouns_features.norm(dim=-1, keepdim=True)
            similarity = (100.0 * im_norm @ noun_norm.T).softmax(dim=-1)
            values, indices = similarity[0].topk(5)
            print("\nTop predictions:\n")
            for value, index in zip(values, indices):
                print(f"{nouns[index]:>16s}: {100 * value.item():.2f}%")
最後に生成プロセスの動画を作成します。作成した動画は以下に保存されます。/content/res/out_longer.mp4
(ベジェ曲線調整)、/content/res/out_strokes_longer.mp4
(ストローク)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
#@title Video Renderer {vertical-output: true}

from subprocess import call

# Render a picture with each stroke.
# Frame i contains the first i+1 strokes, so the sequence shows the final
# drawing being built up one stroke at a time. `t` is whatever value the
# optimizer cell's loop left behind.
with torch.no_grad():
    for i in range(args.num_paths):
        print(i)
        upto = i + 1
        scene_args = pydiffvg.RenderFunction.serialize_scene(
            canvas_width, canvas_height, shapes[:upto], shape_groups[:upto])
        img = render(canvas_width, canvas_height, 2, 2, t, None, *scene_args)
        # Composite over a white background using channel 3 as alpha.
        alpha = img[:, :, 3:4]
        white = torch.ones(img.shape[0], img.shape[1], 3, device = pydiffvg.get_device())
        img = alpha * img[:, :, :3] + white * (1 - alpha)
        pydiffvg.imwrite(img.cpu(), '/content/res/stroke_{}.png'.format(i), gamma=gamma)

print("ffmpeging")

# Convert the intermediate renderings to a video.
for frame_pattern, video_path in (
    ("/content/res/iter_%d.png", "/content/res/out.mp4"),
    ("/content/res/stroke_%d.png", "/content/res/out_strokes.mp4"),
):
    call(["ffmpeg", "-y", "-framerate", "60", "-i", frame_pattern, "-vb", "20M", video_path])

# Filter graph: trim the first 2 seconds as `hold`, concatenate it after the
# full clip, then overlay the original clip on the extended one.
hold_filter = "[0]trim=0:2[hold];[0][hold]concat[extended];[extended][0]overlay"
for short_path, longer_path in (
    ("/content/res/out.mp4", "/content/res/out_longer.mp4"),
    ("/content/res/out_strokes.mp4", "/content/res/out_strokes_longer.mp4"),
):
    call(["ffmpeg", "-y", "-i", short_path, "-filter_complex", hold_filter, longer_path])

# Show both extended videos inline.
for longer_path in ("/content/res/out_longer.mp4", "/content/res/out_strokes_longer.mp4"):
    display(mvp.ipython_display(longer_path))
これは、「ベジェ曲線調整」の方の動画です。
それでは、違うテキストでいくつか生成してみましょう。
prompt = "Watercolor painting of a woman holding a parasol."
(傘をさした女性の水彩画)
prompt = "Watercolor painting of a woman riding a bicycle."
(自転車に乗っている女性の水彩画)
では、また。