CIS565-Fall-2016 · itoupeter · Nov 13, 2016 · Nov 16, 2016 · Dec 4, 2016
diff --git a/README.md b/README.md
@@ -3,13 +3,24 @@ Vulkan Flocking: compute and shading in one pipeline!
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 6**
 
-* (TODO) YOUR NAME HERE
-  Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Liang Peng
+* Tested on: **54.0.2840.87 m (64-bit)** on
+  Windows 10, i7-6700HQ @ 2.6GHz 8GB, GTX 960 (Personal Laptop)
 
-  ### (TODO: Your README)
+## Overview
 
-  Include screenshots, analysis, etc. (Remember, this is public, so don't put
-  anything here that you don't want to share with the world.)
+![](img/1.gif)
+
+![](img/2.gif)
+
+## Analysis
+
+* Vulkan expects explicit descriptors when generating pipelines and commands because it needs to know how data such as vertex attributes (position, normal, texcoord, index, etc.) is stored in GPU memory and bound to uniforms and variables in shader code, so that the vertex, compute, and pixel (fragment) shaders get the correct data for computation or shading and can pass data on to the next stage in the pipeline.
+* A situation in which the processing algorithm is the same but the input data sources are different, such as rendering a textured cube and a textured sphere with different color maps.
+* Problems to keep in mind when using multiple Vulkan queues
+  * Synchronization. Use fences to make sure a queue has finished processing before other queues that depend on it proceed.
+  * Race conditions. Use a mutex to avoid simultaneous read/write operations at the same address in the same buffer.
+* Avoiding the time and memory cost of copying the output data of the compute stage to the input of the render stage.
 
 ### Credits
 

diff --git a/data/shaders/computeparticles/particle.comp b/data/shaders/computeparticles/particle.comp
@@ -24,7 +24,7 @@ layout(std140, binding = 1) buffer ParticlesB
    Particle particlesB[ ];
 };
 
-layout (local_size_x = 16, local_size_y = 16) in;
+layout (local_size_x = 256, local_size_y = 1) in;
 
 // LOOK: rule weights and distances, as well as particle count, based off uniforms.
 // The deltaT here has to be updated every frame to account for changes in
@@ -43,10 +43,10 @@ layout (binding = 2) uniform UBO
 
 void main()
 {
-		// LOOK: This is very similar to a CUDA kernel.
-		// Right now, the compute shader only advects the particles with their
-		// velocity and handles wrap-around.
-		// TODO: implement flocking behavior.
+	// LOOK: This is very similar to a CUDA kernel.
+	// Right now, the compute shader only advects the particles with their
+	// velocity and handles wrap-around.
+	// TODO: implement flocking behavior.
 
     // Current SSBO index
     uint index = gl_GlobalInvocationID.x;
@@ -55,23 +55,61 @@ void main()
 		return;
 
     // Read position and velocity
-		vec2 vPos = particlesA[index].pos.xy;
-    vec2 vVel = particlesA[index].vel.xy;
+	vec2 vPos = particlesA[index].pos;
+    vec2 vVel = particlesA[index].vel;
 
-		// clamp velocity for a more pleasing simulation.
-		vVel = normalize(vVel) * clamp(length(vVel), 0.0, 0.1);
+	// flocking calculation
+	{
+		vec2 centerOfMass = vec2(0.);
+		vec2 averageVelocity = vec2(0.);
+		int neighbors1 = 0;
+		int neighbors3 = 0;
 
-		// kinematic update
-		vPos += vVel * ubo.deltaT;
+		for (int i = 0; i < ubo.particleCount; ++i) {
+			// skip self
+			if (i == index) continue;
 
-    // Wrap around boundary
-		if (vPos.x < -1.0) vPos.x = 1.0;
-		if (vPos.x > 1.0) vPos.x = -1.0;
-		if (vPos.y < -1.0) vPos.y = 1.0;
-		if (vPos.y > 1.0) vPos.y = -1.0;
+			float dist = distance(particlesA[i].pos, vPos);
+
+			// cohesion
+			if (dist < ubo.rule1Distance) {
+				centerOfMass += particlesA[i].pos;
+				neighbors1 += 1;
+			}
+
+			// separation
+			if (dist < ubo.rule2Distance) {
+				vVel += (vPos - particlesA[i].pos) * ubo.rule2Scale;
+			}
+
+			// velocity adaptation
+			if (dist < ubo.rule3Distance) {
+				averageVelocity += particlesA[i].vel;
+				neighbors3 += 1;
+			}
+		}
 
-    particlesB[index].pos.xy = vPos;
+		if (neighbors1 > 0) {
+			vVel += (centerOfMass / neighbors1 - vPos) * ubo.rule1Scale;
+		}
+		if (neighbors3 > 0) {
+			vVel += averageVelocity / neighbors3 * ubo.rule3Scale;;
+		}
+	}
+
+	// clamp velocity for a more pleasing simulation.
+	vVel = normalize(vVel) * clamp(length(vVel), 0.0, 0.1);
+
+	// kinematic update
+	vPos += vVel * ubo.deltaT;
+
+    // Wrap around boundary
+	if (vPos.x < -1.0) vPos.x = 1.0;
+	if (vPos.x > 1.0) vPos.x = -1.0;
+	if (vPos.y < -1.0) vPos.y = 1.0;
+	if (vPos.y > 1.0) vPos.y = -1.0;
 
-    // Write back
-    particlesB[index].vel.xy = vVel;
+	// Write back
+    particlesB[index].pos = vPos;
+    particlesB[index].vel = vVel;
 }
diff --git a/data/shaders/computeparticles/particle.comp.spv b/data/shaders/computeparticles/particle.comp.spv
diff --git a/img/1.gif b/img/1.gif
diff --git a/img/2.gif b/img/2.gif
diff --git a/vulkanBoids/vulkanBoids.cpp b/vulkanBoids/vulkanBoids.cpp
@@ -17,6 +17,7 @@
 #include <assert.h>
 #include <vector>
 #include <random>
+#include <algorithm>
 
 #define GLM_FORCE_RADIANS
 #define GLM_FORCE_DEPTH_ZERO_TO_ONE
@@ -33,7 +34,7 @@
 // LOOK: constants for the boids algorithm. These will be passed to the GPU compute part of the assignment
 // using a Uniform Buffer. These parameters should yield a stable and pleasing simulation for an
 // implementation based off the code here: http://studio.sketchpad.cc/sp/pad/view/ro.9cbgCRcgbPOI6/rev.23
-#define RULE1DISTANCE 0.1f // cohesion
+#define RULE1DISTANCE .1f // cohesion
 #define RULE2DISTANCE 0.05f // separation
 #define RULE3DISTANCE 0.05f // alignment
 #define RULE1SCALE 0.02f
@@ -48,7 +49,7 @@ class VulkanExample : public VulkanExampleBase
 	bool animate = true;
 
 	// LOOK: this struct contains descriptions of how the vertex buffer should be interpreted by
-	// a strictly graphics pipeline. 
+	// a strictly graphics pipeline.
 	struct {
 		// inputState encapsulates bindingDescriptions and  attributeDescriptions
 		VkPipelineVertexInputStateCreateInfo inputState;
@@ -158,6 +159,7 @@ class VulkanExample : public VulkanExampleBase
 		{
 			particle.pos = glm::vec2(rDistribution(rGenerator), rDistribution(rGenerator));
 			// TODO: add randomized velocities with a slight scale here, something like 0.1f.
+			particle.vel = glm::vec2(rDistribution(rGenerator), rDistribution(rGenerator)) * .1f;
 		}
 
 		VkDeviceSize storageBufferSize = particleBuffer.size() * sizeof(Particle);
@@ -235,7 +237,7 @@ class VulkanExample : public VulkanExampleBase
 		vertices.attributeDescriptions[0] =
 			vkTools::initializers::vertexInputAttributeDescription(
 			VERTEX_BUFFER_BIND_ID,
-			0, // corresponds to `layout (location = 0) in` in particle.vert 
+			0, // corresponds to `layout (location = 0) in` in particle.vert
 			VK_FORMAT_R32G32_SFLOAT, // what kind of data? vec2
 			offsetof(Particle, pos)); // offset into each Particle struct
 		// Location 1 : Velocity
@@ -244,7 +246,7 @@ class VulkanExample : public VulkanExampleBase
 			VERTEX_BUFFER_BIND_ID,
 			1,
 			VK_FORMAT_R32G32_SFLOAT,
-			offsetof(Particle, pos)); // TODO: change this so that we can color the particles based on velocity.
+			offsetof(Particle, vel)); // TODO: change this so that we can color the particles based on velocity.
 
 		// vertices.inputState encapsulates everything we need for these particular buffers to
 		// interface with the graphics pipeline.
@@ -540,13 +542,34 @@ class VulkanExample : public VulkanExampleBase
 			compute.descriptorSets[0],
 			VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
 			2,
-			&compute.uniformBuffer.descriptor)
+			&compute.uniformBuffer.descriptor),
 
 			// TODO: write the second descriptorSet, using the top for reference.
 			// We want the descriptorSets to be used for flip-flopping:
 			// on one frame, we use one descriptorSet with the compute pass,
 			// on the next frame, we use the other.
 			// What has to be different about how the second descriptorSet is written here?
+
+			// Binding 0 : Particle position storage buffer
+			vkTools::initializers::writeDescriptorSet(
+			compute.descriptorSets[1], // LOOK: which descriptor set to write to?
+			VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+			0, // LOOK: which binding in the descriptor set Layout?
+			&compute.storageBufferB.descriptor), // LOOK: which SSBO?
+
+			// Binding 1 : Particle position storage buffer
+			vkTools::initializers::writeDescriptorSet(
+			compute.descriptorSets[1],
+			VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+			1,
+			&compute.storageBufferA.descriptor),
+
+			// Binding 2 : Uniform buffer
+			vkTools::initializers::writeDescriptorSet(
+			compute.descriptorSets[1],
+			VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+			2,
+			&compute.uniformBuffer.descriptor)
 		};
 
 		vkUpdateDescriptorSets(device, static_cast<uint32_t>(computeWriteDescriptorSets.size()), computeWriteDescriptorSets.data(), 0, NULL);
@@ -590,6 +613,8 @@ class VulkanExample : public VulkanExampleBase
 		// We also want to flip what SSBO we draw with in the next
 		// pass through the graphics pipeline.
 		// Feel free to use std::swap here. You should need it twice.
+		std::swap(compute.descriptorSets[0], compute.descriptorSets[1]);
+		std::swap(compute.storageBufferA, compute.storageBufferB);
 	}
 
 	// Record command buffers for drawing using the graphics pipeline
@@ -671,7 +696,7 @@ class VulkanExample : public VulkanExampleBase
 		bufferBarrier.size = compute.storageBufferA.descriptor.range;
 		bufferBarrier.srcAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;						// Vertex shader invocations have finished reading from the buffer
 		bufferBarrier.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT;								// Compute shader wants to write to the buffer
-		
+
 		// Compute and graphics queue may have different queue families (see VulkanDevice::createLogicalDevice)
 		// For the barrier to work across different queues, we need to set their family indices
 		bufferBarrier.srcQueueFamilyIndex = vulkanDevice->queueFamilyIndices.graphics;			// Required as compute and graphics queue may have different families
@@ -694,7 +719,7 @@ class VulkanExample : public VulkanExampleBase
 		vkCmdBindDescriptorSets(compute.commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipelineLayout, 0, 1, compute.descriptorSets, 0, 0);
 
 		// Record a dispatch of the compute job
-		vkCmdDispatch(compute.commandBuffer, PARTICLE_COUNT / 16, 1, 1);
+		vkCmdDispatch(compute.commandBuffer, PARTICLE_COUNT / 256, 1, 1);
 
 		// Add memory barrier to ensure that compute shader has finished writing to the buffer
 		// Without this the (rendering) vertex shader may display incomplete results (partial data from last frame)