# playground.py (forked from gordicaleksa/pytorch-deepdream)
"""
This file serves as a playground for understanding some of the concepts used
in the development of the DeepDream algorithm.
"""
import time
import os
import enum

import numpy as np
import scipy.ndimage as nd
import matplotlib.pyplot as plt
import torch
import cv2 as cv
from torchvision import transforms

from utils.constants import *
import utils.utils as utils
import utils.video_utils as video_utils
from models.definitions.vggs import Vgg16


# Note: don't use scipy.ndimage - it's way slower than OpenCV
def understand_frame_transform():
    """
    Pick different transform matrices here and see what they do.
    """
    height, width, num_channels = [500, 500, 3]
    s = 0.05

    # Create a white square on a black background
    img = np.zeros((height, width, num_channels))
    img[100:400, 100:400] = 1.0
    img_center = (width / 2, height / 2)

    # Translation
    tx, ty = [10, 5]
    translation_matrix = np.asarray([[1., 0., tx],
                                     [0., 1., ty],
                                     [0., 0., 1.]])

    # Rotation
    deg = 10  # rotation in degrees
    theta = (deg / 180) * np.pi  # convert to radians
    origin_rotation_matrix = np.asarray([[np.cos(theta), -np.sin(theta), 0],
                                         [np.sin(theta), np.cos(theta), 0],
                                         [0., 0., 1.]])

    # Does a similar thing as the matrix above but returns a 2x3 matrix, so just append the last row
    rotation_matrix = cv.getRotationMatrix2D(img_center, deg, scale=1.09)
    full_rotation_matrix = np.vstack([rotation_matrix, np.asarray([0., 0., 1.])])

    # Affine
    pts1 = np.float32([[50, 50], [200, 50], [50, 200]])
    pts2 = np.float32([[10, 100], [200, 50], [100, 250]])
    affine_matrix = cv.getAffineTransform(pts1, pts2)
    full_affine_matrix = np.vstack([affine_matrix, np.asarray([0., 0., 1.])])

    # Perspective
    pts1 = np.float32([[56, 65], [368, 52], [28, 387], [389, 390]])
    pts2 = np.float32([[0, 0], [300, 0], [0, 300], [300, 300]])
    perspective_matrix = cv.getPerspectiveTransform(pts1, pts2)  # this one gives a 3x3 transform matrix directly

    # This one was originally used and it represents the diagonal values of the 3x3 projective matrix.
    # You can't use it with OpenCV in this form (see the sketch right after this function).
    zoom_matrix_simple = np.asarray([1 - s, 1 - s, 1])

    ts = time.time()  # start perf timer
    transformed_img = img
    for i in range(10):
        # transformed_img = nd.affine_transform(transformed_img, zoom_matrix_simple, [height * s / 2, width * s / 2, 0], order=1)
        transformed_img = cv.warpPerspective(transformed_img, full_rotation_matrix, (width, height))
        plt.imshow(np.hstack([img, transformed_img])); plt.show()

    # And this, kids, is why you should use OpenCV
    # nd.affine_transform: ~720 ms
    # cv.warpPerspective:  ~17 ms
    print(f'{(time.time() - ts) * 1000} ms')  # result readout
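

# A minimal sketch (not part of the original file) of how the diagonal values in zoom_matrix_simple above could be
# turned into a full 3x3 matrix that cv.warpPerspective accepts: zoom about the image center by composing
# translate -> scale -> translate back. scipy's affine_transform maps output coords to input coords, so its diag(1 - s)
# corresponds to a forward zoom factor of 1 / (1 - s) here. The helper name build_zoom_matrix is made up for illustration.
def build_zoom_matrix(img_center, s):
    cx, cy = img_center
    to_origin = np.asarray([[1., 0., -cx],
                            [0., 1., -cy],
                            [0., 0., 1.]])
    scale = np.asarray([[1. / (1 - s), 0., 0.],
                        [0., 1. / (1 - s), 0.],
                        [0., 0., 1.]])
    back_to_center = np.asarray([[1., 0., cx],
                                 [0., 1., cy],
                                 [0., 0., 1.]])
    # Composite 3x3 matrix, usable e.g. as cv.warpPerspective(img, build_zoom_matrix((width / 2, height / 2), s), (width, height))
    return back_to_center @ scale @ to_origin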


def understand_blend():
    img1 = utils.load_image(os.path.join(INPUT_DATA_PATH, 'figures.jpg'), (500, 500))
    img2 = utils.load_image(os.path.join(INPUT_DATA_PATH, 'cloud.jpg'), (500, 500))

    for alpha in np.arange(0, 1.2, 0.2):
        blend = img1 + alpha * (img2 - img1)  # this is how PIL's blend works: simple linear interpolation (an OpenCV equivalent is sketched right after this function)
        plt.imshow(blend)
        plt.show()
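

# For comparison, a small sketch (not in the original file) of the same linear interpolation done with OpenCV:
# cv.addWeighted(src1, alpha, src2, beta, gamma) computes src1 * alpha + src2 * beta + gamma, so the weights
# (1 - alpha, alpha) reproduce the blend above. PIL's Image.blend(img1, img2, alpha) computes the same thing.
# The function name blend_with_opencv is made up for illustration.
def blend_with_opencv(img1, img2, alpha=0.5):
    return cv.addWeighted(img1, 1 - alpha, img2, alpha, 0.0)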


def understand_pytorch_gradients():
    """
    This builds up a computational graph in PyTorch the same way a neural network does, and it's enough to understand
    why dst.diff[:] = dst.data (used in the original repo) is equivalent to MSE loss with sum reduction divided by 2.
    Most implementations use some form of MSE or L2 loss, so it's worth understanding the equivalence.

    I found this blog super useful for understanding how PyTorch's automatic differentiation engine works:
    https://blog.paperspace.com/pytorch-101-understanding-graphs-and-automatic-differentiation/
    """
    def print_tensor_summary(t):  # helper function
        print(f'data={t.data}')
        print(f'requires_grad={t.requires_grad}')
        print(f'grad={t.grad}')
        print(f'grad_fn={t.grad_fn}')
        print(f'is_leaf={t.is_leaf}')

    x = torch.tensor([[-2.0, 1.0], [1.0, 3.0]], requires_grad=True)  # think of x as the input image
    y = x + 2  # some random processing builds up the computational graph in PyTorch
    z = y * y * 3

    z.backward(z)  # this one is equivalent to the commented-out expression below

    # z is a matrix z = [[z11, z12], [z21, z22]], so MSE loss with sum reduction (against zeros) divided by 2 gives:
    # out = (z11^2 + z12^2 + z21^2 + z22^2) / 2 -> so dL/dz11 = z11, and similarly for z12, z21, z22.
    # That means the grad of the z11 node gets populated with exactly the z11 value (that's dL/dz11),
    # which is why z.backward(z) works: the argument passed to backward() is the gradient of the loss w.r.t. z.
    # A plain backward() implicitly passes torch.tensor(1.) as that argument,
    # because dL/dL = 1 (the derivative of the loss with respect to itself equals 1).

    # out = torch.nn.MSELoss(reduction='sum')(z, torch.zeros_like(z)) / 2
    # out.backward()

    print_tensor_summary(x)  # try both variants and you'll see that the grad field of x ("the image") is the same

    # On calling backward(), gradients are populated only for the nodes which have both requires_grad=True and is_leaf=True.
    # Here only x is a leaf with requires_grad set to True. Print the other tensors and you'll see that their grad is None.
    # The backward graph is created dynamically during the forward pass.
    # The graph consists of Function objects (the .grad_fn attribute of each torch.Tensor is an entry point into this graph).
    # The Function class has 2 important member functions: 1) forward and 2) backward, called during the forward pass/backprop.
    # Take your time to understand this - it's actually really easy once it sinks in.
    # (A numerical check of the backward(z) <-> MSE/2 equivalence is sketched right after this function.)
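

# A quick numerical sanity check of the equivalence described above (a sketch, not part of the original file):
# build the same graph twice, backprop once with z.backward(z) and once with MSE(sum)/2 against zeros,
# and compare the gradients that land on the leaf tensor. The function name is made up for illustration.
def verify_backward_equivalence():
    x1 = torch.tensor([[-2.0, 1.0], [1.0, 3.0]], requires_grad=True)
    z1 = (x1 + 2) * (x1 + 2) * 3
    z1.backward(z1)  # gradient seed = z itself, i.e. dL/dz for L = sum(z^2) / 2

    x2 = torch.tensor([[-2.0, 1.0], [1.0, 3.0]], requires_grad=True)
    z2 = (x2 + 2) * (x2 + 2) * 3
    loss = torch.nn.MSELoss(reduction='sum')(z2, torch.zeros_like(z2)) / 2
    loss.backward()

    print(torch.allclose(x1.grad, x2.grad))  # expected: True - both paths populate the same grad on the "image"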


def deep_dream_simple(img_path, dump_path):
    """
    Contains the gist of the DeepDream algorithm - takes 5 minutes to write down if you know what you're doing,
    otherwise it literally takes 15 minutes. Most of the "code" here is comments.

    No support for: spatial shifting (aka jitter - sketched right after this function), octaves/image pyramid,
    clipping, gradient smoothing, etc.
    """
    img = utils.load_image(img_path, target_shape=500)  # load numpy, [0, 1] image

    # Normalize the image - VGG16, and torchvision models in general, were trained on images normalized like this,
    # so they learned to work with this particular distribution
    img = (img - IMAGENET_MEAN_1) / IMAGENET_STD_1

    # Transform into a PyTorch tensor, send to GPU and add a dummy batch dimension. Models expect it, and GPUs are
    # highly parallel computing machines, so in general we'd like to process multiple images all at once.
    # shape = (1, 3, H, W)
    img_tensor = transforms.ToTensor()(img).to(DEVICE).unsqueeze(0)
    img_tensor.requires_grad = True  # set this to True so that PyTorch will start calculating gradients for img_tensor

    model = Vgg16(requires_grad=False).to(DEVICE)  # instantiate VGG16 and send it to the GPU

    n_iterations = 10
    learning_rate = 0.3

    for _ in range(n_iterations):
        out = model(img_tensor)
        activations = out.relu4_3  # pick out the particular feature maps (aka activations) that you're interested in
        activations.backward(activations)  # whatever the biggest activation values are, make them even bigger

        img_tensor_grad = img_tensor.grad.data
        smooth_grads = img_tensor_grad / torch.std(img_tensor_grad)  # normalize the gradient's scale so the step size stays stable
        img_tensor.data += learning_rate * smooth_grads  # gradient ascent

        img_tensor.grad.data.zero_()  # clear the gradients, otherwise they would get accumulated

    # Send the PyTorch tensor back to the CPU, detach it from the computational graph, convert to numpy
    # and make it channel-last format again (calling ToTensor converted it to channel-first format)
    img = np.moveaxis(img_tensor.to('cpu').detach().numpy()[0], 0, 2)
    img = (img * IMAGENET_STD_1) + IMAGENET_MEAN_1  # de-normalize
    img = (np.clip(img, 0., 1.) * 255).astype(np.uint8)

    cv.imwrite(dump_path, img[:, :, ::-1])  # ::-1 because OpenCV expects BGR (and not RGB) format...
    print(f'Saved naive deep dream image to {os.path.relpath(dump_path)}')
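

# One of the missing pieces listed in the docstring above is spatial shifting (jitter). A minimal sketch of the usual
# trick (this helper is illustrative only, not part of the repo): randomly roll the image tensor before the forward
# pass and roll it back by the same amount after the gradient ascent step, so patterns don't lock onto a fixed pixel grid.
def roll_and_unroll_example(img_tensor, max_shift=32):
    h_shift, w_shift = np.random.randint(-max_shift, max_shift + 1, size=2)
    rolled = torch.roll(img_tensor, shifts=(int(h_shift), int(w_shift)), dims=(2, 3))
    # ... the forward pass + gradient ascent step would operate on `rolled` here ...
    return torch.roll(rolled, shifts=(int(-h_shift), int(-w_shift)), dims=(2, 3))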


class PLAYGROUND(enum.Enum):
    GEOMETRIC_TRANSFORMS = 0
    BLEND = 1
    PT_GRADIENTS = 2
    DEEPDREAM_NAIVE = 3
    CREATE_GIF = 4


if __name__ == "__main__":
    # Pick the function you want to play with here
    playground_fn = PLAYGROUND.DEEPDREAM_NAIVE

    if playground_fn == PLAYGROUND.GEOMETRIC_TRANSFORMS:
        understand_frame_transform()
    elif playground_fn == PLAYGROUND.BLEND:
        understand_blend()
    elif playground_fn == PLAYGROUND.PT_GRADIENTS:
        understand_pytorch_gradients()
    elif playground_fn == PLAYGROUND.DEEPDREAM_NAIVE:
        img_path = os.path.join(INPUT_DATA_PATH, 'figures.jpg')
        dump_path = os.path.join(OUT_IMAGES_PATH, 'simple.jpg')
        deep_dream_simple(img_path, dump_path)
    elif playground_fn == PLAYGROUND.CREATE_GIF:
        # Change this to a directory where you've saved the frames of interest
        input_frames_dir = os.path.join(DATA_DIR_PATH, 'input')
        video_utils.create_gif(input_frames_dir, os.path.join(OUT_GIF_PATH, 'default.gif'))
    else:
        raise Exception(f'{playground_fn} not supported!')