diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8e3eb28..5cbf9a1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,3 +1,4 @@
+message("ROOT")
 cmake_minimum_required(VERSION 3.0)
 
 project(cis565_rasterizer)
diff --git a/Capture.PNG b/Capture.PNG
new file mode 100644
index 0000000..b9a766c
Binary files /dev/null and b/Capture.PNG differ
diff --git a/MilkTruck.PNG b/MilkTruck.PNG
new file mode 100644
index 0000000..dd969d4
Binary files /dev/null and b/MilkTruck.PNG differ
diff --git a/README.md b/README.md
index cad1abd..26c8be3 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,103 @@
 CUDA Rasterizer
 ===============
 
-[CLICK ME FOR INSTRUCTION OF THIS PROJECT](./INSTRUCTION.md)
-
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Ethan Brooks
+* Tested on: Windows 7, Intel(R) Xeon(R), GeForce GTX 1070 8GB (SIG Lab)
+
+### Summary
+For this project we implemented a graphics rasterizer. Like path-tracing, rasterization is a method for converting a specification of points in 3d space into a 2d image. Whereas a path-tracer simulates the movement of light rays through space, a rasterizer uses matrix transforms to project 3d objectives onto the screen. Also, instead of representing objects as Platonic solids as in the path tracer, a rasterizer decomposes objects into smaller "primitives", usually triangles.
+
+Our basic pipeline is as follows:
+
+ Vertex Assembly -> Vertex Transform -> Primitive Assembly -> Rasterization -> Fragment Shading
+
+It's worth noting that while most rasterizer pipelines look something like this, there are variations from one to the next.
+
+## Vertex Assembly
+The objects that a rasterizer uses require assembly. Initially, a GLTF loader iterates through various "meshes" or collections of associated points and fill buffers with
+
+- positions (in model space) of vertices
+- normals (also in model space)
+- pointers to shared texture images
+- coordinates into these shared texture images
+- vertices (associated with triangles in the next step)
+- indices for associating vertices with primitives
+
+Vertex assembly uses common indices to fill all the values associated with each vertex struct. These include:
+- positions
+- normals
+- texture coordinates
+
+## Vertex Transform
+Our rasterizer essentially combines this step with the previous, since we transform vertex positions while assigning them to vertex structs. Transformation is actually not a single step.
+
+Positions start in world space, in which the origin is at an arbitrary global position.
+
+Next they are transformed into view space, where the origin is at the camera. This space is primarily used during vertex shading, when positions relative to the viewer and relative to light sources are taken into account.
+
+Next they are transformed into "clip" space, in which object are projected onto the 2d plane of the screen, but parts that extend past the edge of the screen have not yet been clipped. This space is sometimes also called NDC space for "Normalized Device Coordinate" space. Finally, we scale this space so that the origin is at the lower left corner of the screen and a unit corresponds to a pixel.
+
+## Primitive Assembly
+This step simple associates vertices with triangles (or whatever primitive is being used). The indices mentioned earlier map each vertex to its parent primitive.
+
+## Rasterization
+This step actually accounts for the bulk of the code, although the tasks it performs are seemingly trivial: rasterization takes on two challenges: coverage and occlusion.
+
+# Coverage
+Coverage is mapping the vertices of a primitive to pixels that fall within the area of the primitive. We use the AABB method, where we search within the smallest possible bounding box that surrounds a primitive. Specifically, we scan from the upper left of the box and stop at the lower right, testing each pixel to see if it falls within the primitive and assigning a fragment to it if it does.
+
+# Occlusion
+Points in space should only be rendered if other objects don't obstruct them. To check for occlusion we use "depth testing." We use a "Z Buffer" with size proportional to the number of pixels (we scale up in the case of supersampling). For each sample of the screen, we update the Z buffer with an integer that corresponds to the depth of the closest point with larger values corresponding to greater distances. Usually we measure these depths from 0 to INT_MAX. Once all the depths have been updated, we iterate back through the sampled points and only assign fragments where the depth of the fragment corresponds to the depth in the Z buffer -- this indicates that the fragment was closer than any other at that sample point.
+
+# Fragment Shading
+During this part, we shade the base color of pixels to reflect material properties or lighting. We used a standard technique: Blinn-Phong shading, in which the intensity of the lighting is proportional to the angle between the surface normal and the mean of light angle and view angle. Thus light is brightest when a viewer views an object head on with a light source at the camera. Surfaces are dark, essentially when an object is lit from behind.
+
+## Additional Features
+# Texture Mapping.
+In order to apply more complex color patterns, our rasterizer gives each primitive a pointer to a texture image. A texture image looks a bit like a smashed version of the object. Each vertex is assigned a "texture coordinate" that points to the spot in the texture image where the object gets its base color. In order to assign colors to fragments which are usually between vertices (not exactly at them) we simply interpolate the texture coordinates of all three vertices using barycentric coordinates.
+
+Barycentric coordinates associated with a triangle (as in our case) have values that are proportional to a points nearness to each vertex of the triangle. For example, if a point is colocated with the third vertex, its barycentric coordinate would be (0, 0, 1). Moreover, a point falls inside a triangle only if all three of its barycentric coordinates are in the range [0, 1].
+
+Interpolation over barycentric coordinates simply involves weighting the contribution of each vertex by the value of the barycentric coordinate associated with it.
+
+Here is an image of textures applied to a duck:
+
+![alt text] (https://github.com/lobachevzky/Project4-CUDA-Rasterizer/blob/master/duckAA32-2.PNG)
+
+and to a milk truck:
+![alt text] (https://github.com/lobachevzky/Project4-CUDA-Rasterizer/blob/master/milktruckAA32.PNG)
+
+The main performance cost of texture mapping is the requirement to repeatedly access global memory, both for texture coordinates and the texture image itself. However, an arbitrarily complex texture can be used with only minor additional cost in memory.
+
+Since the texture coordinates of the three vertices are repeatedly accessed by the pixels that fall within them, this is a feature that would strongly benefit from the use of shared memory.
+
+# Antialiasing
+Unlike previous efforts, we this time used randomization to perform antialiasing. Antialiasing is a process wherein the value assigned to a pixel is actually the average of several colors calculated from points within the pixel. These points are called "samples" and the technique of taking multiple samples per pixel is known as "supersampling." The result may be seen below:
+
+This is the duck with antialiasing (x32):
+
+![alt text] (https://github.com/lobachevzky/Project4-CUDA-Rasterizer/blob/master/duckAA32.PNG)
+
+And this is the duck without:
+
+![alt text] (https://github.com/lobachevzky/Project4-CUDA-Rasterizer/blob/master/duckAA1.PNG)
+
+In many ways the contrast is clearest in the case of geometric objects, especially when viewed at an oblique angle:
+
+With antialiasing:
+
+![alt text](https://github.com/lobachevzky/Project4-CUDA-Rasterizer/blob/master/checkerboardAA32.PNG)
 
-### (TODO: Your README)
+Without:
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+![alt text](https://github.com/lobachevzky/Project4-CUDA-Rasterizer/blob/master/checkerboardAA1.PNG)
 
+Like texture mapping, antialiasing comes with a performance cost -- probably an even more significant one, actually. In general runtime scales with the number of samples taken per pixel as demonstrated by this chart:
 
-### Credits
+![alt text] (https://github.com/lobachevzky/Project4-CUDA-Rasterizer/blob/master/antialiasing-profile.PNG)
 
-* [tinygltfloader](https://github.com/syoyo/tinygltfloader) by [@soyoyo](https://github.com/syoyo)
-* [glTF Sample Models](https://github.com/KhronosGroup/glTF/blob/master/sampleModels/README.md)
+Two major memory optimizations include:
+1. Only sampling at edges, this the effects of aliasing are really only observable there.
+2. Directly averaging colors in place in the fragment buffer, instead of increasing the size of the fragment buffer, assigning separate samples to separate indices and subsequently averaging. This proved tricky, since multiple threads would have to access the same index in the fragment buffer simultaneously leading to race conditions.
diff --git a/antialiasing-profile.PNG b/antialiasing-profile.PNG
new file mode 100644
index 0000000..b4b04c8
Binary files /dev/null and b/antialiasing-profile.PNG differ
diff --git a/box1.png b/box1.png
new file mode 100644
index 0000000..79bb017
Binary files /dev/null and b/box1.png differ
diff --git a/box2.png b/box2.png
new file mode 100644
index 0000000..2c02a49
Binary files /dev/null and b/box2.png differ
diff --git a/boxAA1.PNG b/boxAA1.PNG
new file mode 100644
index 0000000..a46fc12
Binary files /dev/null and b/boxAA1.PNG differ
diff --git a/boxAA32.PNG b/boxAA32.PNG
new file mode 100644
index 0000000..624fa3f
Binary files /dev/null and b/boxAA32.PNG differ
diff --git a/checkerboard.PNG b/checkerboard.PNG
new file mode 100644
index 0000000..4157525
Binary files /dev/null and b/checkerboard.PNG differ
diff --git a/checkerboard2.PNG b/checkerboard2.PNG
new file mode 100644
index 0000000..aa9a945
Binary files /dev/null and b/checkerboard2.PNG differ
diff --git a/checkerboardAA1.PNG b/checkerboardAA1.PNG
new file mode 100644
index 0000000..def7c53
Binary files /dev/null and b/checkerboardAA1.PNG differ
diff --git a/checkerboardAA32.PNG b/checkerboardAA32.PNG
new file mode 100644
index 0000000..575a50b
Binary files /dev/null and b/checkerboardAA32.PNG differ
diff --git a/debug.txt.txt b/debug.txt.txt
new file mode 100644
index 0000000..d60c3ee
--- /dev/null
+++ b/debug.txt.txt
@@ -0,0 +1,16 @@
+
+bcCoord=0.007811,0.009287,0.982902
+texCoord[0]=0.000100,0.999900
+texCoord[1]=0.999900,0.999900
+texCoord[2]=0.999900,0.000100
+weighted texCoord=0.992090,0.017195
+rescaled coord=535.728760,9.285239
+color=0.905882,0.905882,0.905882
+
+bcCoord=0.006082,0.009879,0.984039
+texCoord[0]=0.000100,0.999900
+texCoord[1]=0.999900,0.999900
+texCoord[2]=0.999900,0.000100
+weighted texCoord=0.993819,0.016058
+rescaled coord=536.662292,8.671115
+color=0.000000,0.000000,0.000000
diff --git a/duck2.PNG b/duck2.PNG
new file mode 100644
index 0000000..8576cae
Binary files /dev/null and b/duck2.PNG differ
diff --git a/duckAA1.PNG b/duckAA1.PNG
new file mode 100644
index 0000000..75f9bc9
Binary files /dev/null and b/duckAA1.PNG differ
diff --git a/duckAA32-2.PNG b/duckAA32-2.PNG
new file mode 100644
index 0000000..81e375e
Binary files /dev/null and b/duckAA32-2.PNG differ
diff --git a/duckAA32.PNG b/duckAA32.PNG
new file mode 100644
index 0000000..192aea9
Binary files /dev/null and b/duckAA32.PNG differ
diff --git a/external/include/glm/CMakeLists.txt b/external/include/glm/CMakeLists.txt
index aaeae62..b8d652a 100755
--- a/external/include/glm/CMakeLists.txt
+++ b/external/include/glm/CMakeLists.txt
@@ -1,3 +1,4 @@
+message("GLM")
 set(NAME glm_dummy)
 
 file(GLOB ROOT_SOURCE *.cpp)
diff --git a/gltfs/checkerboard.gltf b/gltfs/checkerboard.gltf
new file mode 100644
index 0000000..a333738
--- /dev/null
+++ b/gltfs/checkerboard.gltf
@@ -0,0 +1,301 @@
+{
+  "accessors": {
+    "accessor_index_0": {
+      "bufferView": "bufferView_1",
+      "byteOffset": 0,
+      "byteStride": 0,
+      "componentType": 5123,
+      "count": 6,
+      "type": "SCALAR",
+      "min": [
+        0
+      ],
+      "max": [
+        3
+      ]
+    },
+    "accessor_position": {
+      "bufferView": "bufferView_0",
+      "byteOffset": 0,
+      "byteStride": 0,
+      "componentType": 5126,
+      "count": 4,
+      "min": [
+        -1,
+        0,
+        -1
+      ],
+      "max": [
+        1,
+        0,
+        1
+      ],
+      "type": "VEC3"
+    },
+    "accessor_uv": {
+      "bufferView": "bufferView_0",
+      "byteOffset": 48,
+      "byteStride": 0,
+      "componentType": 5126,
+      "count": 4,
+      "type": "VEC2",
+      "min": [
+        0.00009999999747378752,
+        0.00009999999747378752
+      ],
+      "max": [
+        0.9998999834060669,
+        0.9998999834060669
+      ]
+    },
+    "accessor_normal_generated": {
+      "bufferView": "bufferView_0",
+      "byteOffset": 80,
+      "byteStride": 0,
+      "componentType": 5126,
+      "count": 4,
+      "type": "VEC3",
+      "min": [
+        0,
+        1,
+        0
+      ],
+      "max": [
+        0,
+        1,
+        0
+      ]
+    }
+  },
+  "asset": {
+    "generator": "OBJ2GLTF",
+    "premultipliedAlpha": true,
+    "profile": {
+      "api": "WebGL",
+      "version": "1.0"
+    },
+    "version": "1.0"
+  },
+  "buffers": {
+    "buffer_0": {
+      "type": "arraybuffer",
+      "byteLength": 140,
+      "uri": "data:application/octet-stream;base64,AACAvwAAAAAAAIA/AACAPwAAAAAAAIA/AACAPwAAAAAAAIC/AACAvwAAAAAAAIC/F7fROHL5fz9y+X8/cvl/P3L5fz8Xt9E4F7fROBe30TgAAAAAAACAPwAAAAAAAAAAAACAPwAAAAAAAAAAAACAPwAAAAAAAAAAAACAPwAAAAAAAAEAAgAAAAIAAwA="
+    }
+  },
+  "bufferViews": {
+    "bufferView_0": {
+      "buffer": "buffer_0",
+      "byteLength": 128,
+      "byteOffset": 0,
+      "target": 34962
+    },
+    "bufferView_1": {
+      "buffer": "buffer_0",
+      "byteLength": 12,
+      "byteOffset": 128,
+      "target": 34963
+    }
+  },
+  "images": {
+    "Checkered": {
+      "uri": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAhwAAAIcCAIAAAAynOArAAAACXBIWXMAAAsTAAALEwEAmpwYAAALjUlEQVR42u3ZMU5jZxuGYY5tJtiWEGnRrGNSjKZmD6np2Uf2geQGiWWkQyILCQVgG5njvxhpVnAX/3GuawevnuLWp2/4559/lsvl2cTNZrO///77zz///P3336d+yx9//PHXX3+tViujGMUoRpmWi4uLxWq1OoFV5vP5zzT++++/U7/l7e1ttVqt12ujGMUoRpmW1Wo1m/oNAPz/EBUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAzGI2m83n86mfcQIn/LLb7YZhMIpRjGKUyXl/fx82m81yuTyBY8ZxfH19PR6PUz/kcDhcXl6en58bxShGMcq0fHx8DCewx0/39/c3NzfjOE79kO12e3d39/j4aBSjGMUo07Lf7xdXV1cvLy8nsMrxeBzH8fPz8wRuGYYTib1RjGKU/9Qo4zj6qAcgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAjKgAkBEVADKiAkBGVADIiAoAmcW3b9/e3t6mfsZutzscDrvd7ng8Tv2W7Xb79evX79+/G8UoRjHKtOz3++H5+Xm5XE79kmEYnp6eNpvNbDb5t9f19fXt7e3FxYVRjGIUo0zLly9fFuv1erVaTf2S+Xy+WCweHh6mfsjZ2dmPHz+Wy+V6vTaKUYxilGlZrVb+VADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJAZjGbzebz+dTPOIETftntdsMwGMUoRjHK5Ly/vw+bzWa5XJ7AMeM4vr6+Ho/HqR9yOBwuLy/Pz8+NYhSjGGVaPj4+hhPY46f7+/ubm5txHKd+yHa7vbu7e3x8NIpRjGKUadnv94urq6uXl5cTWOV4PI7j+Pn5eQK3DMOJxN4oRjHKf2qUcRx91AOQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAzOLbt29vb29TP2O32x0Oh91udzwep37Ldrv9+vXr9+/fjWIUoxhlWvb7/fD8/LxcLqd+yTAMT09Pm81mNpv82+v6+vr29vbi4sIoRjGKUably5cvi/V6vVqtpn7JfD5fLBYPDw9TP+Ts7OzHjx/L5XK9XhvFKEYxyrSsVit/KgBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgs5jNZvP5fOpnnMAJv+x2u2EYjGIUoxhlct7f34fNZrNcLk/gmHEcX19fj8fj1A85HA6Xl5fn5+dGMYpRjDItHx8fwwns8dP9/f3Nzc04jlM/ZLvd3t3dPT4+GsUoRjHKtOz3+8XV1dXLy8sJrHI8Hsdx/Pz8PIFbhuFEYm8UoxjlPzXKOI4+6gHIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJAZvHt27e3t7epn7Hb7Q6Hw263Ox6PU79lu91+/fr1+/fvRjGKUYwyLfv9fnh+fl4ul1O/ZBiGp6enzWYzm03+7XV9fX17e3txcWEUoxjFKNPy5cuXxXq9Xq1WU79kPp8vFouHh4epH3J2dvbjx4/lcrler41iFKMYZVpWq5U/FQAyogJARlQAyIgKABlRASAjKgBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQWcxms/l8PvUzTuCEX3a73TAMRjGKUYwyOe/v78Nms1kulydwzDiOr6+vx+Nx6occDofLy8vz83OjGMUoRpmWj4+P4QT2+On+/v7m5mYcx6kfst1u7+7uHh8fjWIUoxhlWvb7/eLq6url5eUEVjkej+M4fn5+nsAtw3AisTeKUYzynxplHEcf9QBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgIyoAZEQFgIyoAJARFQAyogJARlQAyIgKABlRASAjKgBkRAWAjKgAkBEVADKiAkBGVADIiAoAGVEBICMqAGREBYCMqACQERUAMqICQEZUAMiICgAZUQEgs7i4uFgul1M/Y7vdfnx8fHx8jOM49Vv2+/1vv/1mFKMYxShTHOV/51nv6/az2UUAAAAASUVORK5CYII="
+    }
+  },
+  "materials": {
+    "material_Material.001": {
+      "name": "Material.001",
+      "extensions": {},
+      "values": {
+        "ambient": [
+          0,
+          0,
+          0,
+          1
+        ],
+        "diffuse": "texture_Checkered",
+        "emission": [
+          0,
+          0,
+          0,
+          1
+        ],
+        "specular": [
+          0.5,
+          0.5,
+          0.5,
+          1
+        ],
+        "shininess": 96.078431,
+        "transparency": 1
+      },
+      "technique": "technique0"
+    }
+  },
+  "meshes": {
+    "mesh_checkerboard": {
+      "name": "checkerboard",
+      "primitives": [
+        {
+          "attributes": {
+            "POSITION": "accessor_position",
+            "TEXCOORD_0": "accessor_uv",
+            "NORMAL": "accessor_normal_generated"
+          },
+          "indices": "accessor_index_0",
+          "material": "material_Material.001",
+          "mode": 4
+        }
+      ]
+    }
+  },
+  "nodes": {
+    "rootNode": {
+      "children": [],
+      "meshes": [
+        "mesh_checkerboard"
+      ],
+      "matrix": [
+        1,
+        0,
+        0,
+        0,
+        0,
+        1,
+        0,
+        0,
+        0,
+        0,
+        1,
+        0,
+        0,
+        0,
+        0,
+        1
+      ]
+    }
+  },
+  "samplers": {
+    "sampler_0": {
+      "magFilter": 9729,
+      "minFilter": 9986,
+      "wrapS": 10497,
+      "wrapT": 10497
+    }
+  },
+  "scene": "scene_checkerboard",
+  "scenes": {
+    "scene_checkerboard": {
+      "nodes": [
+        "rootNode"
+      ]
+    }
+  },
+  "textures": {
+    "texture_Checkered": {
+      "format": 6407,
+      "internalFormat": 6407,
+      "sampler": "sampler_0",
+      "source": "Checkered",
+      "target": 3553,
+      "type": 5121
+    }
+  },
+  "extensionsUsed": [],
+  "animations": {},
+  "cameras": {},
+  "techniques": {
+    "technique0": {
+      "attributes": {
+        "a_position": "position",
+        "a_normal": "normal",
+        "a_texcoord_0": "texcoord_0"
+      },
+      "parameters": {
+        "modelViewMatrix": {
+          "semantic": "MODELVIEW",
+          "type": 35676
+        },
+        "projectionMatrix": {
+          "semantic": "PROJECTION",
+          "type": 35676
+        },
+        "normalMatrix": {
+          "semantic": "MODELVIEWINVERSETRANSPOSE",
+          "type": 35675
+        },
+        "ambient": {
+          "type": 35666
+        },
+        "diffuse": {
+          "type": 35678
+        },
+        "emission": {
+          "type": 35666
+        },
+        "specular": {
+          "type": 35666
+        },
+        "shininess": {
+          "type": 5126
+        },
+        "transparency": {
+          "type": 5126
+        },
+        "position": {
+          "semantic": "POSITION",
+          "type": 35665
+        },
+        "normal": {
+          "semantic": "NORMAL",
+          "type": 35665
+        },
+        "texcoord_0": {
+          "semantic": "TEXCOORD_0",
+          "type": 35664
+        }
+      },
+      "program": "program0",
+      "states": {
+        "enable": [
+          2884,
+          2929
+        ]
+      },
+      "uniforms": {
+        "u_modelViewMatrix": "modelViewMatrix",
+        "u_projectionMatrix": "projectionMatrix",
+        "u_normalMatrix": "normalMatrix",
+        "u_ambient": "ambient",
+        "u_diffuse": "diffuse",
+        "u_emission": "emission",
+        "u_specular": "specular",
+        "u_shininess": "shininess",
+        "u_transparency": "transparency"
+      }
+    }
+  },
+  "programs": {
+    "program0": {
+      "attributes": [
+        "a_position",
+        "a_normal",
+        "a_texcoord_0"
+      ],
+      "fragmentShader": "fragmentShader0",
+      "vertexShader": "vertexShader0"
+    }
+  },
+  "shaders": {
+    "vertexShader0": {
+      "type": 35633,
+      "uri": "data:text/plain;base64,cHJlY2lzaW9uIGhpZ2hwIGZsb2F0Owp1bmlmb3JtIG1hdDQgdV9tb2RlbFZpZXdNYXRyaXg7CnVuaWZvcm0gbWF0NCB1X3Byb2plY3Rpb25NYXRyaXg7CnVuaWZvcm0gbWF0MyB1X25vcm1hbE1hdHJpeDsKYXR0cmlidXRlIHZlYzMgYV9wb3NpdGlvbjsKdmFyeWluZyB2ZWMzIHZfcG9zaXRpb25FQzsKYXR0cmlidXRlIHZlYzMgYV9ub3JtYWw7CnZhcnlpbmcgdmVjMyB2X25vcm1hbDsKYXR0cmlidXRlIHZlYzIgYV90ZXhjb29yZF8wOwp2YXJ5aW5nIHZlYzIgdl90ZXhjb29yZF8wOwp2b2lkIG1haW4odm9pZCkgewogIHZlYzQgcG9zID0gdV9tb2RlbFZpZXdNYXRyaXggKiB2ZWM0KGFfcG9zaXRpb24sMS4wKTsKICB2X3Bvc2l0aW9uRUMgPSBwb3MueHl6OwogIGdsX1Bvc2l0aW9uID0gdV9wcm9qZWN0aW9uTWF0cml4ICogcG9zOwogIHZfbm9ybWFsID0gdV9ub3JtYWxNYXRyaXggKiBhX25vcm1hbDsKICB2X3RleGNvb3JkXzAgPSBhX3RleGNvb3JkXzA7Cn0K"
+    },
+    "fragmentShader0": {
+      "type": 35632,
+      "uri": "data:text/plain;base64,cHJlY2lzaW9uIGhpZ2hwIGZsb2F0Owp1bmlmb3JtIHZlYzQgdV9hbWJpZW50Owp1bmlmb3JtIHNhbXBsZXIyRCB1X2RpZmZ1c2U7CnVuaWZvcm0gdmVjNCB1X2VtaXNzaW9uOwp1bmlmb3JtIHZlYzQgdV9zcGVjdWxhcjsKdW5pZm9ybSBmbG9hdCB1X3NoaW5pbmVzczsKdW5pZm9ybSBmbG9hdCB1X3RyYW5zcGFyZW5jeTsKdmFyeWluZyB2ZWMzIHZfcG9zaXRpb25FQzsKdmFyeWluZyB2ZWMzIHZfbm9ybWFsOwp2YXJ5aW5nIHZlYzIgdl90ZXhjb29yZF8wOwp2b2lkIG1haW4odm9pZCkgewogIHZlYzMgbm9ybWFsID0gbm9ybWFsaXplKHZfbm9ybWFsKTsKICB2ZWM0IGRpZmZ1c2UgPSB0ZXh0dXJlMkQodV9kaWZmdXNlLCB2X3RleGNvb3JkXzApOwogIHZlYzMgZGlmZnVzZUxpZ2h0ID0gdmVjMygwLjAsIDAuMCwgMC4wKTsKICB2ZWMzIHNwZWN1bGFyID0gdV9zcGVjdWxhci5yZ2I7CiAgdmVjMyBzcGVjdWxhckxpZ2h0ID0gdmVjMygwLjAsIDAuMCwgMC4wKTsKICB2ZWMzIGVtaXNzaW9uID0gdV9lbWlzc2lvbi5yZ2I7CiAgdmVjMyBhbWJpZW50ID0gdV9hbWJpZW50LnJnYjsKICB2ZWMzIHZpZXdEaXIgPSAtbm9ybWFsaXplKHZfcG9zaXRpb25FQyk7CiAgdmVjMyBhbWJpZW50TGlnaHQgPSB2ZWMzKDAuMCwgMC4wLCAwLjApOwogIGFtYmllbnRMaWdodCArPSB2ZWMzKDAuMiwgMC4yLCAwLjIpOwogIHZlYzMgbCA9IHZlYzMoMC4wLCAwLjAsIDEuMCk7CiAgZGlmZnVzZUxpZ2h0ICs9IHZlYzMoMS4wLCAxLjAsIDEuMCkgKiBtYXgoZG90KG5vcm1hbCxsKSwgMC4pOwogIHZlYzMgaCA9IG5vcm1hbGl6ZShsICsgdmlld0Rpcik7CiAgZmxvYXQgc3BlY3VsYXJJbnRlbnNpdHkgPSBtYXgoMC4sIHBvdyhtYXgoZG90KG5vcm1hbCwgaCksIDAuKSwgdV9zaGluaW5lc3MpKTsKICBzcGVjdWxhckxpZ2h0ICs9IHZlYzMoMS4wLCAxLjAsIDEuMCkgKiBzcGVjdWxhckludGVuc2l0eTsKICB2ZWMzIGNvbG9yID0gdmVjMygwLjAsIDAuMCwgMC4wKTsKICBjb2xvciArPSBkaWZmdXNlLnJnYiAqIGRpZmZ1c2VMaWdodDsKICBjb2xvciArPSBzcGVjdWxhciAqIHNwZWN1bGFyTGlnaHQ7CiAgY29sb3IgKz0gZW1pc3Npb247CiAgY29sb3IgKz0gYW1iaWVudCAqIGFtYmllbnRMaWdodDsKICBnbF9GcmFnQ29sb3IgPSB2ZWM0KGNvbG9yICogZGlmZnVzZS5hLCBkaWZmdXNlLmEgKiB1X3RyYW5zcGFyZW5jeSk7Cn0K"
+    }
+  },
+  "skins": {},
+  "extensions": {}
+}
diff --git a/milkTruckAA1.PNG b/milkTruckAA1.PNG
new file mode 100644
index 0000000..2aa5792
Binary files /dev/null and b/milkTruckAA1.PNG differ
diff --git a/milktruckAA32.PNG b/milktruckAA32.PNG
new file mode 100644
index 0000000..74299f8
Binary files /dev/null and b/milktruckAA32.PNG differ
diff --git a/performance.xlsx b/performance.xlsx
new file mode 100644
index 0000000..a19a8ba
Binary files /dev/null and b/performance.xlsx differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a57f69f..cb7b974 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,3 +1,4 @@
+message("src")
 set(SOURCE_FILES
     "rasterize.cu"
     "rasterize.h"
diff --git a/src/main.cpp b/src/main.cpp
index a36b955..eb0e926 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -9,6 +9,7 @@
 
 
 #include "main.hpp"
+#include <chrono>
 
 #define STB_IMAGE_IMPLEMENTATION
 #define TINYGLTF_LOADER_IMPLEMENTATION
@@ -24,6 +25,7 @@ int main(int argc, char **argv) {
         return 0;
     }
 
+	cout << "Loading " << argv[1] << endl;
 	tinygltf::Scene scene;
 	tinygltf::TinyGLTFLoader loader;
 	std::string err;
@@ -119,7 +121,13 @@ void runCuda() {
 	glm::mat4 MVP = P * MV;
 
     cudaGLMapBufferObject((void **)&dptr, pbo);
+
+	using micro = std::chrono::microseconds;
+	  auto start = std::chrono::high_resolution_clock::now();
 	rasterize(dptr, MVP, MV, MV_normal);
+	  auto finish = std::chrono::high_resolution_clock::now();
+	  std::cout
+		  << std::chrono::duration_cast<micro>(finish - start).count() << endl;
     cudaGLUnmapBufferObject(pbo);
 
     frame++;
diff --git a/src/rasterize.cu b/src/rasterize.cu
index 4e3504b..e966328 100644
--- a/src/rasterize.cu
+++ b/src/rasterize.cu
@@ -9,6 +9,7 @@
 #include <cmath>
 #include <cstdio>
 #include <cuda.h>
+#include <chrono>
 #include <cuda_runtime.h>
 #include <thrust/random.h>
 #include <util/checkCUDAError.h>
@@ -17,86 +18,113 @@
 #include "rasterize.h"
 #include <glm/gtc/quaternion.hpp>
 #include <glm/gtc/matrix_transform.hpp>
+#include <glm/gtx/component_wise.hpp>
+
+#define IDx ((blockIdx.x * blockDim.x) + threadIdx.x)
+#define IDy ((blockIdx.y * blockDim.y) + threadIdx.y)
+#define MAX_DEPTH 10000.0f
+#define DEPTH_QUANTUM (float)(INT_MAX / MAX_DEPTH)
+#define getIndex(x, y, width) ((x) + (y) * (width))
+#define SAMPLES_PER_PIXEL 32
+#define AMBIENT_LIGHT 0.1
+#define TUNE_SHADE 0
+#define DEFAULT_COLOR (glm::vec3(0.5))
+
+
+#define DEBUG 1
+#define debug(...) if (DEBUG == 1) { printf (__VA_ARGS__); }
+#define debug0(...) if (DEBUG == 1 && id == 0) { printf (__VA_ARGS__); }
+#define debug1(...) if (DEBUG == 1 && id == 1) { printf (__VA_ARGS__); }
+#define debugDuck(...) if (DEBUG == 1 && id == 330033) { printf (__VA_ARGS__); }
+#define debugMinMax(...) if (DEBUG == 1 && id == 320) { printf (__VA_ARGS__); }
+#define debugDepthsId 369228
+#define debugDepths(...) if (DEBUG == 1 && id == debugDepthsId) { printf (__VA_ARGS__); }
+#define debugBox(...) if (DEBUG == 1 && id == 305194) { printf (__VA_ARGS__); }
+
+#define range(i, start, stop) for (i = start; i < stop; i++)
+#define SHOW_TEXTURE 0
+#define debug(...) if (DEBUG == 1) { printf (__VA_ARGS__); }
 
 namespace {
 
-	typedef unsigned short VertexIndex;
-	typedef glm::vec3 VertexAttributePosition;
-	typedef glm::vec3 VertexAttributeNormal;
-	typedef glm::vec2 VertexAttributeTexcoord;
-	typedef unsigned char TextureData;
-
-	typedef unsigned char BufferByte;
-
-	enum PrimitiveType{
-		Point = 1,
-		Line = 2,
-		Triangle = 3
-	};
-
-	struct VertexOut {
-		glm::vec4 pos;
-
-		// TODO: add new attributes to your VertexOut
-		// The attributes listed below might be useful, 
-		// but always feel free to modify on your own
-
-		 glm::vec3 eyePos;	// eye space position used for shading
-		 glm::vec3 eyeNor;	// eye space normal used for shading, cuz normal will go wrong after perspective transformation
-		// glm::vec3 col;
-		 glm::vec2 texcoord0;
-		 TextureData* dev_diffuseTex = NULL;
-		// int texWidth, texHeight;
-		// ...
-	};
-
-	struct Primitive {
-		PrimitiveType primitiveType = Triangle;	// C++ 11 init
-		VertexOut v[3];
-	};
-
-	struct Fragment {
-		glm::vec3 color;
-
-		// TODO: add new attributes to your Fragment
-		// The attributes listed below might be useful, 
-		// but always feel free to modify on your own
-
-		// glm::vec3 eyePos;	// eye space position used for shading
-		// glm::vec3 eyeNor;
-		// VertexAttributeTexcoord texcoord0;
-		// TextureData* dev_diffuseTex;
-		// ...
-	};
-
-	struct PrimitiveDevBufPointers {
-		int primitiveMode;	//from tinygltfloader macro
-		PrimitiveType primitiveType;
-		int numPrimitives;
-		int numIndices;
-		int numVertices;
-
-		// Vertex In, const after loaded
-		VertexIndex* dev_indices;
-		VertexAttributePosition* dev_position;
-		VertexAttributeNormal* dev_normal;
-		VertexAttributeTexcoord* dev_texcoord0;
-
-		// Materials, add more attributes when needed
-		TextureData* dev_diffuseTex;
-		// TextureData* dev_specularTex;
-		// TextureData* dev_normalTex;
-		// ...
-
-		// Vertex Out, vertex used for rasterization, this is changing every frame
-		VertexOut* dev_verticesOut;
-
-		// TODO: add more attributes when needed
-	};
+  typedef unsigned short VertexIndex;
+  typedef glm::vec3 VertexAttributePosition;
+  typedef glm::vec3 VertexAttributeNormal;
+  typedef glm::vec2 VertexAttributeTexcoord;
+  typedef unsigned char TextureData;
+
+  typedef unsigned char BufferByte;
+
+  enum PrimitiveType{
+    Point = 1,
+    Line = 2,
+    Triangle = 3
+  };
+
+  struct Vertex {
+    glm::vec4 screenPos;
+
+    // TODO: add new attributes to your VertexOut
+    // The attributes listed below might be useful,
+    // but always feel free to modify on your own
+
+    glm::vec3 viewPos;  // eye space position used for shading
+    glm::vec3 viewNorm;  // eye space normal used for shading, cuz normal will go wrong after perspective transformation
+    // glm::vec3 col;
+    VertexAttributeTexcoord texcoord0;
+  };
+
+  struct Primitive {
+    PrimitiveType primitiveType = Triangle; // C++ 11 init
+    Vertex v[3];
+    TextureData* diffuseTex = NULL;
+    glm::vec2 texRes;
+    // ...
+  };
+
+  struct Fragment {
+    glm::vec3 color;
+
+    // TODO: add new attributes to your Fragment
+    // The attributes listed below might be useful,
+    // but always feel free to modify on your own
+
+     glm::vec3 viewPos;  // eye space position used for shading
+     glm::vec3 viewNorm;
+     TextureData* diffuseTex;
+	 int numSamples;
+    // ...
+  };
+
+  struct VertexParts {
+    int primitiveMode;  //from tinygltfloader macro
+    PrimitiveType primitiveType;
+    int numPrimitives;
+    int numIndices;
+    int numVertices;
+
+    // Vertex, const after loaded
+    VertexIndex* indices;
+    VertexAttributePosition* pos;
+    VertexAttributeNormal* normal;
+    VertexAttributeTexcoord* texcoord0;
+
+    // Materials, add more attributes when needed
+    TextureData* diffuseTex;
+    // TextureData* dev_specularTex;
+    // TextureData* dev_normalTex;
+    // ...
+
+    // Vertex Out, vertex used for rasterization, this is changing every frame
+    Vertex* vertices;
+
+    // TODO: add more attributes when needed
+    glm::vec2 texRes;
+  };
 
 }
 
-static std::map<std::string, std::vector<PrimitiveDevBufPointers>> mesh2PrimitivesMap;
+static std::map<std::string, std::vector<VertexParts>> mesh2vertexParts;
 
 
 static int width = 0;
@@ -106,16 +134,15 @@ static int totalNumPrimitives = 0;
 static Primitive *dev_primitives = NULL;
 static Fragment *dev_fragmentBuffer = NULL;
 static glm::vec3 *dev_framebuffer = NULL;
-
-static int * dev_depth = NULL;	// you might need this buffer when doing depth test
+static unsigned int *dev_depth = NULL;  // you might need this buffer when doing depth test
 
 /**
  * Kernel that writes the image to the OpenGL PBO directly.
  */
-__global__ 
-void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+__global__
+void _sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) {
+	int x = IDx;
+	int y = IDy;
     int index = x + (y * w);
 
     if (x < w && y < h) {
@@ -131,53 +158,35 @@ void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) {
     }
 }
 
-/** 
-* Writes fragment colors to the framebuffer
-*/
-__global__
-void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-    int index = x + (y * w);
-
-    if (x < w && y < h) {
-        framebuffer[index] = fragmentBuffer[index].color;
-
-		// TODO: add your fragment shader code here
-
-    }
-}
-
 /**
  * Called once at the beginning of the program to allocate memory.
  */
 void rasterizeInit(int w, int h) {
-    width = w;
-    height = h;
-	cudaFree(dev_fragmentBuffer);
-	cudaMalloc(&dev_fragmentBuffer, width * height * sizeof(Fragment));
-	cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
-    cudaFree(dev_framebuffer);
-    cudaMalloc(&dev_framebuffer,   width * height * sizeof(glm::vec3));
-    cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3));
-    
-	cudaFree(dev_depth);
-	cudaMalloc(&dev_depth, width * height * sizeof(int));
-
-	checkCUDAError("rasterizeInit");
+  width = w;
+  height = h;
+  int numSamples = SAMPLES_PER_PIXEL * width * height;
+
+  cudaFree(dev_fragmentBuffer);
+  cudaMalloc(&dev_fragmentBuffer, numSamples * sizeof(Fragment));
+  cudaMemset(dev_fragmentBuffer, 0, numSamples * sizeof(Fragment));
+
+  cudaFree(dev_framebuffer);
+  cudaMalloc(&dev_framebuffer,   width * height * sizeof(glm::vec3));
+  cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3));
+
+  cudaFree(dev_depth);
+  cudaMalloc(&dev_depth, numSamples * sizeof(unsigned int));
+
+  checkCUDAError("rasterizeInit");
 }
 
 __global__
-void initDepth(int w, int h, int * depth)
+void _initDepth(int length, int *depth)
 {
-	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-
-	if (x < w && y < h)
-	{
-		int index = x + (y * w);
-		depth[index] = INT_MAX;
-	}
+  if (IDx < length * SAMPLES_PER_PIXEL)
+  {
+    depth[IDx] = INT_MAX;
+  }
 }
 
 
@@ -185,548 +194,715 @@ void initDepth(int w, int h, int * depth)
 * kern function with support for stride to sometimes replace cudaMemcpy
 * One thread is responsible for copying one component
 */
-__global__ 
-void _deviceBufferCopy(int N, BufferByte* dev_dst, const BufferByte* dev_src, int n, int byteStride, int byteOffset, int componentTypeByteSize) {
-	
-	// Attribute (vec3 position)
-	// component (3 * float)
-	// byte (4 * byte)
-
-	// id of component
-	int i = (blockIdx.x * blockDim.x) + threadIdx.x;
-
-	if (i < N) {
-		int count = i / n;
-		int offset = i - count * n;	// which component of the attribute
-
-		for (int j = 0; j < componentTypeByteSize; j++) {
-			
-			dev_dst[count * componentTypeByteSize * n 
-				+ offset * componentTypeByteSize 
-				+ j]
-
-				= 
-
-			dev_src[byteOffset 
-				+ count * (byteStride == 0 ? componentTypeByteSize * n : byteStride) 
-				+ offset * componentTypeByteSize 
-				+ j];
-		}
-	}
-	
+
+__global__
+void _deviceBufferCopy(
+int N, BufferByte* dev_dst, const BufferByte* dev_src,
+int n, int byteStride, int byteOffset, int componentTypeByteSize) {
+
+  // Attribute (vec3 position)
+  // component (3 * float)
+  // byte (4 * byte)
+
+  // id of component
+  if (IDx < N) {
+    int count = IDx / n;
+    int offset = IDx - count * n; // which component of the attribute
+
+    for (int j = 0; j < componentTypeByteSize; j++) {
+
+      dev_dst[count * componentTypeByteSize * n
+        + offset * componentTypeByteSize
+        + j]
+
+        =
+
+      dev_src[byteOffset
+        + count * (byteStride == 0 ? componentTypeByteSize * n : byteStride)
+        + offset * componentTypeByteSize
+        + j];
+    }
+  }
+
 
 }
 
 __global__
 void _nodeMatrixTransform(
-	int numVertices,
-	VertexAttributePosition* position,
-	VertexAttributeNormal* normal,
-	glm::mat4 MV, glm::mat3 MV_normal) {
-
-	// vertex id
-	int vid = (blockIdx.x * blockDim.x) + threadIdx.x;
-	if (vid < numVertices) {
-		position[vid] = glm::vec3(MV * glm::vec4(position[vid], 1.0f));
-		normal[vid] = glm::normalize(MV_normal * normal[vid]);
-	}
+  int numVertices,
+  VertexAttributePosition* position,
+  VertexAttributeNormal* normal,
+  glm::mat4 MV, glm::mat3 MV_normal) {
+
+  // vertex id
+  if (IDx < numVertices) {
+    position[IDx] = glm::vec3(MV * glm::vec4(position[IDx], 1.0f));
+    normal[IDx] = glm::normalize(MV_normal * normal[IDx]);
+  }
 }
 
 glm::mat4 getMatrixFromNodeMatrixVector(const tinygltf::Node & n) {
-	
-	glm::mat4 curMatrix(1.0);
-
-	const std::vector<double> &m = n.matrix;
-	if (m.size() > 0) {
-		// matrix, copy it
-
-		for (int i = 0; i < 4; i++) {
-			for (int j = 0; j < 4; j++) {
-				curMatrix[i][j] = (float)m.at(4 * i + j);
-			}
-		}
-	} else {
-		// no matrix, use rotation, scale, translation
-
-		if (n.translation.size() > 0) {
-			curMatrix[3][0] = n.translation[0];
-			curMatrix[3][1] = n.translation[1];
-			curMatrix[3][2] = n.translation[2];
-		}
-
-		if (n.rotation.size() > 0) {
-			glm::mat4 R;
-			glm::quat q;
-			q[0] = n.rotation[0];
-			q[1] = n.rotation[1];
-			q[2] = n.rotation[2];
-
-			R = glm::mat4_cast(q);
-			curMatrix = curMatrix * R;
-		}
-
-		if (n.scale.size() > 0) {
-			curMatrix = curMatrix * glm::scale(glm::vec3(n.scale[0], n.scale[1], n.scale[2]));
-		}
-	}
-
-	return curMatrix;
+
+  glm::mat4 curMatrix(1.0);
+
+  const std::vector<double> &m = n.matrix;
+  if (m.size() > 0) {
+    // matrix, copy it
+
+    for (int i = 0; i < 4; i++) {
+      for (int j = 0; j < 4; j++) {
+        curMatrix[i][j] = (float)m.at(4 * i + j);
+      }
+    }
+  } else {
+    // no matrix, use rotation, scale, translation
+
+    if (n.translation.size() > 0) {
+      curMatrix[3][0] = n.translation[0];
+      curMatrix[3][1] = n.translation[1];
+      curMatrix[3][2] = n.translation[2];
+    }
+
+    if (n.rotation.size() > 0) {
+      glm::mat4 R;
+      glm::quat q;
+      q[0] = n.rotation[0];
+      q[1] = n.rotation[1];
+      q[2] = n.rotation[2];
+
+      R = glm::mat4_cast(q);
+      curMatrix = curMatrix * R;
+    }
+
+    if (n.scale.size() > 0) {
+      curMatrix = curMatrix * glm::scale(glm::vec3(n.scale[0], n.scale[1], n.scale[2]));
+    }
+  }
+
+  return curMatrix;
 }
 
 void traverseNode (
-	std::map<std::string, glm::mat4> & n2m,
-	const tinygltf::Scene & scene,
-	const std::string & nodeString,
-	const glm::mat4 & parentMatrix
-	) 
+  std::map<std::string, glm::mat4> & n2m,
+  const tinygltf::Scene & scene,
+  const std::string & nodeString,
+  const glm::mat4 & parentMatrix
+  )
 {
-	const tinygltf::Node & n = scene.nodes.at(nodeString);
-	glm::mat4 M = parentMatrix * getMatrixFromNodeMatrixVector(n);
-	n2m.insert(std::pair<std::string, glm::mat4>(nodeString, M));
+  const tinygltf::Node & n = scene.nodes.at(nodeString);
+  glm::mat4 M = parentMatrix * getMatrixFromNodeMatrixVector(n);
+  n2m.insert(std::pair<std::string, glm::mat4>(nodeString, M));
 
-	auto it = n.children.begin();
-	auto itEnd = n.children.end();
+  auto it = n.children.begin();
+  auto itEnd = n.children.end();
 
-	for (; it != itEnd; ++it) {
-		traverseNode(n2m, scene, *it, M);
-	}
+  for (; it != itEnd; ++it) {
+    traverseNode(n2m, scene, *it, M);
+  }
 }
 
 void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 
-	totalNumPrimitives = 0;
+  totalNumPrimitives = 0;
+
+  std::map<std::string, BufferByte*> bufferViewDevPointers;
+
+  // 1. copy all `bufferViews` to device memory
+  {
+    std::map<std::string, tinygltf::BufferView>::const_iterator it(
+      scene.bufferViews.begin());
+    std::map<std::string, tinygltf::BufferView>::const_iterator itEnd(
+      scene.bufferViews.end());
+
+    for (; it != itEnd; it++) {
+      const std::string key = it->first;
 
-	std::map<std::string, BufferByte*> bufferViewDevPointers;
+      const tinygltf::BufferView &bufferView = it->second;
+      if (bufferView.target == 0) {
+        continue; // Unsupported bufferView.
+      }
 
-	// 1. copy all `bufferViews` to device memory
-	{
-		std::map<std::string, tinygltf::BufferView>::const_iterator it(
-			scene.bufferViews.begin());
-		std::map<std::string, tinygltf::BufferView>::const_iterator itEnd(
-			scene.bufferViews.end());
+      const tinygltf::Buffer &buffer = scene.buffers.at(bufferView.buffer);
 
-		for (; it != itEnd; it++) {
-			const std::string key = it->first;
-			const tinygltf::BufferView &bufferView = it->second;
-			if (bufferView.target == 0) {
-				continue; // Unsupported bufferView.
-			}
+      BufferByte* dev_bufferView;
+      cudaMalloc(&dev_bufferView, bufferView.byteLength);
+      cudaMemcpy(dev_bufferView, &buffer.data.front() + bufferView.byteOffset, bufferView.byteLength, cudaMemcpyHostToDevice);
 
-			const tinygltf::Buffer &buffer = scene.buffers.at(bufferView.buffer);
+      checkCUDAError("Set BufferView Device Mem");
 
-			BufferByte* dev_bufferView;
-			cudaMalloc(&dev_bufferView, bufferView.byteLength);
-			cudaMemcpy(dev_bufferView, &buffer.data.front() + bufferView.byteOffset, bufferView.byteLength, cudaMemcpyHostToDevice);
+      bufferViewDevPointers.insert(std::make_pair(key, dev_bufferView));
 
-			checkCUDAError("Set BufferView Device Mem");
+    }
+  }
+
+
+
+  // 2. for each mesh:
+  //    for each primitive:
+  //      build device buffer of indices, materail, and each attributes
+  //      and store these pointers in a map
+  {
+
+    std::map<std::string, glm::mat4> nodeString2Matrix;
+    auto rootNodeNamesList = scene.scenes.at(scene.defaultScene);
+
+    {
+      auto it = rootNodeNamesList.begin();
+      auto itEnd = rootNodeNamesList.end();
+      for (; it != itEnd; ++it) {
+        traverseNode(nodeString2Matrix, scene, *it, glm::mat4(1.0f));
+      }
+    }
+
+
+    // parse through node to access mesh
+
+    auto itNode = nodeString2Matrix.begin();
+    auto itEndNode = nodeString2Matrix.end();
+    for (; itNode != itEndNode; ++itNode) {
+
+      const tinygltf::Node & N = scene.nodes.at(itNode->first);
+      const glm::mat4 & matrix = itNode->second;
+      const glm::mat3 & matrixNormal = glm::transpose(glm::inverse(glm::mat3(matrix)));
+
+      auto itMeshName = N.meshes.begin();
+      auto itEndMeshName = N.meshes.end();
+
+      for (; itMeshName != itEndMeshName; ++itMeshName) {
+
+        const tinygltf::Mesh & mesh = scene.meshes.at(*itMeshName);
+
+        auto res = mesh2vertexParts.insert(std::pair<std::string, std::vector<VertexParts>>(mesh.name, std::vector<VertexParts>()));
+        std::vector<VertexParts> & vertexPartsVector = (res.first)->second;
+
+        // for each primitive
+        for (size_t i = 0; i < mesh.primitives.size(); i++) {
+          const tinygltf::Primitive &primitive = mesh.primitives[i];
+
+          if (primitive.indices.empty())
+            return;
+
+          // TODO: add new attributes for your PrimitiveDevBufPointers when you add new attributes
+          VertexIndex* dev_indices = NULL;
+          VertexAttributePosition* dev_position = NULL;
+          VertexAttributeNormal* dev_normal = NULL;
+          VertexAttributeTexcoord* dev_texcoord0 = NULL;
+
+          // ----------Indices-------------
+
+          const tinygltf::Accessor &indexAccessor = scene.accessors.at(primitive.indices);
+          const tinygltf::BufferView &bufferView = scene.bufferViews.at(indexAccessor.bufferView);
+          BufferByte* dev_bufferView = bufferViewDevPointers.at(indexAccessor.bufferView);
+
+          // assume type is SCALAR for indices
+          int n = 1;
+          int numIndices = indexAccessor.count;
+          int componentTypeByteSize = sizeof(VertexIndex);
+          int byteLength = numIndices * n * componentTypeByteSize;
+
+          dim3 numThreadsPerBlock(128);
+          dim3 numBlocks((numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
+          cudaMalloc(&dev_indices, byteLength);
+          _deviceBufferCopy << <numBlocks, numThreadsPerBlock >> > (
+            numIndices,
+            (BufferByte*)dev_indices,
+            dev_bufferView,
+            n,
+            indexAccessor.byteStride,
+            indexAccessor.byteOffset,
+            componentTypeByteSize);
+
+
+          checkCUDAError("Set Index Buffer");
+
+
+          // ---------Primitive Info-------
+
+          // Warning: LINE_STRIP is not supported in tinygltfloader
+          int numPrimitives;
+          PrimitiveType primitiveType;
+          switch (primitive.mode) {
+          case TINYGLTF_MODE_TRIANGLES:
+            primitiveType = PrimitiveType::Triangle;
+            numPrimitives = numIndices / 3;
+            break;
+          case TINYGLTF_MODE_TRIANGLE_STRIP:
+            primitiveType = PrimitiveType::Triangle;
+            numPrimitives = numIndices - 2;
+            break;
+          case TINYGLTF_MODE_TRIANGLE_FAN:
+            primitiveType = PrimitiveType::Triangle;
+            numPrimitives = numIndices - 2;
+            break;
+          case TINYGLTF_MODE_LINE:
+            primitiveType = PrimitiveType::Line;
+            numPrimitives = numIndices / 2;
+            break;
+          case TINYGLTF_MODE_LINE_LOOP:
+            primitiveType = PrimitiveType::Line;
+            numPrimitives = numIndices + 1;
+            break;
+          case TINYGLTF_MODE_POINTS:
+            primitiveType = PrimitiveType::Point;
+            numPrimitives = numIndices;
+            break;
+          default:
+            // output error
+            break;
+          };
+
+
+          // ----------Attributes-------------
+
+          auto it(primitive.attributes.begin());
+          auto itEnd(primitive.attributes.end());
+
+          int numVertices = 0;
+          // for each attribute
+          for (; it != itEnd; it++) {
+            const tinygltf::Accessor &accessor = scene.accessors.at(it->second);
+            const tinygltf::BufferView &bufferView = scene.bufferViews.at(accessor.bufferView);
+
+            int n = 1;
+            if (accessor.type == TINYGLTF_TYPE_SCALAR) {
+              n = 1;
+            }
+            else if (accessor.type == TINYGLTF_TYPE_VEC2) {
+              n = 2;
+            }
+            else if (accessor.type == TINYGLTF_TYPE_VEC3) {
+              n = 3;
+            }
+            else if (accessor.type == TINYGLTF_TYPE_VEC4) {
+              n = 4;
+            }
+
+            BufferByte * dev_bufferView = bufferViewDevPointers.at(accessor.bufferView);
+            BufferByte ** dev_attribute = NULL;
+
+            numVertices = accessor.count;
+            int componentTypeByteSize;
+
+            // Note: since the type of our attribute array (dev_position) is static (float32)
+            // We assume the glTF model attribute type are 5126(FLOAT) here
+
+            if (it->first.compare("POSITION") == 0) {
+              componentTypeByteSize = sizeof(VertexAttributePosition) / n;
+              dev_attribute = (BufferByte**)&dev_position;
+            }
+            else if (it->first.compare("NORMAL") == 0) {
+              componentTypeByteSize = sizeof(VertexAttributeNormal) / n;
+              dev_attribute = (BufferByte**)&dev_normal;
+            }
+            else if (it->first.compare("TEXCOORD_0") == 0) {
+              componentTypeByteSize = sizeof(VertexAttributeTexcoord) / n;
+              dev_attribute = (BufferByte**)&dev_texcoord0;
+            }
+
+            std::cout << accessor.bufferView << "  -  " << it->second << "  -  " << it->first << '\n';
+
+            dim3 numThreadsPerBlock(128);
+            dim3 numBlocks((n * numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
+            int byteLength = numVertices * n * componentTypeByteSize;
+            cudaMalloc(dev_attribute, byteLength);
+
+            _deviceBufferCopy << <numBlocks, numThreadsPerBlock >> > (
+              n * numVertices,
+              *dev_attribute,
+              dev_bufferView,
+              n,
+              accessor.byteStride,
+              accessor.byteOffset,
+              componentTypeByteSize);
+
+            std::string msg = "Set Attribute Buffer: " + it->first;
+            checkCUDAError(msg.c_str());
+          }
+
+          // malloc for Vertex
+          Vertex* dev_vertex;
+          cudaMalloc(&dev_vertex, numVertices * sizeof(Vertex));
+          checkCUDAError("Malloc VertexOut Buffer");
+
+          // ----------Materials-------------
+
+          // You can only worry about this part once you started to
+          // implement textures for your rasterizer
+          TextureData* dev_diffuseTex = NULL;
+          glm::vec2 texRes;
+          if (!primitive.material.empty()) {
+            const tinygltf::Material &mat = scene.materials.at(primitive.material);
+            printf("material.name = %s\n", mat.name.c_str());
+
+            if (mat.values.find("diffuse") != mat.values.end()) {
+              std::string diffuseTexName = mat.values.at("diffuse").string_value;
+              if (scene.textures.find(diffuseTexName) != scene.textures.end()) {
+                const tinygltf::Texture &tex = scene.textures.at(diffuseTexName);
+                if (scene.images.find(tex.source) != scene.images.end()) {
+                  const tinygltf::Image &image = scene.images.at(tex.source);
+
+                  size_t s = image.image.size() * sizeof(TextureData);
+                  cudaMalloc(&dev_diffuseTex, s);
+                  cudaMemcpy(dev_diffuseTex, &image.image.at(0), s, cudaMemcpyHostToDevice);
+
+                  // TODO: store the image size to your PrimitiveDevBufPointers
+                  texRes = glm::vec2(image.width, image.height);
+
+                  checkCUDAError("Set Texture Image data");
+                }
+              }
+            }
+
+            // TODO: write your code for other materials
+            // You may have to take a look at tinygltfloader
+            // You can also use the above code loading diffuse material as a start point
+          }
+
+
+          // ---------Node hierarchy transform--------
+          cudaDeviceSynchronize();
+
+          dim3 numBlocksNodeTransform((numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
+          _nodeMatrixTransform << <numBlocksNodeTransform, numThreadsPerBlock >> > (
+            numVertices,
+            dev_position,
+            dev_normal,
+            matrix,
+            matrixNormal);
+
+          checkCUDAError("Node hierarchy transformation");
+
+          // at the end of the for loop of primitive
+          // push dev pointers to map
+          vertexPartsVector.push_back(VertexParts{
+            primitive.mode,
+            primitiveType,
+            numPrimitives,
+            numIndices,
+            numVertices,
+
+            dev_indices,
+            dev_position,
+            dev_normal,
+            dev_texcoord0,
+
+            dev_diffuseTex,
+
+            dev_vertex,  //VertexOut
+            texRes
+          });
 
-			bufferViewDevPointers.insert(std::make_pair(key, dev_bufferView));
+          totalNumPrimitives += numPrimitives;
 
-		}
-	}
+        } // for each primitive
 
+      } // for each mesh
 
+    } // for each node
 
-	// 2. for each mesh: 
-	//		for each primitive: 
-	//			build device buffer of indices, materail, and each attributes
-	//			and store these pointers in a map
-	{
-
-		std::map<std::string, glm::mat4> nodeString2Matrix;
-		auto rootNodeNamesList = scene.scenes.at(scene.defaultScene);
-
-		{
-			auto it = rootNodeNamesList.begin();
-			auto itEnd = rootNodeNamesList.end();
-			for (; it != itEnd; ++it) {
-				traverseNode(nodeString2Matrix, scene, *it, glm::mat4(1.0f));
-			}
-		}
+  }
 
 
-		// parse through node to access mesh
-
-		auto itNode = nodeString2Matrix.begin();
-		auto itEndNode = nodeString2Matrix.end();
-		for (; itNode != itEndNode; ++itNode) {
-
-			const tinygltf::Node & N = scene.nodes.at(itNode->first);
-			const glm::mat4 & matrix = itNode->second;
-			const glm::mat3 & matrixNormal = glm::transpose(glm::inverse(glm::mat3(matrix)));
-
-			auto itMeshName = N.meshes.begin();
-			auto itEndMeshName = N.meshes.end();
-
-			for (; itMeshName != itEndMeshName; ++itMeshName) {
-
-				const tinygltf::Mesh & mesh = scene.meshes.at(*itMeshName);
-
-				auto res = mesh2PrimitivesMap.insert(std::pair<std::string, std::vector<PrimitiveDevBufPointers>>(mesh.name, std::vector<PrimitiveDevBufPointers>()));
-				std::vector<PrimitiveDevBufPointers> & primitiveVector = (res.first)->second;
-
-				// for each primitive
-				for (size_t i = 0; i < mesh.primitives.size(); i++) {
-					const tinygltf::Primitive &primitive = mesh.primitives[i];
-
-					if (primitive.indices.empty())
-						return;
-
-					// TODO: add new attributes for your PrimitiveDevBufPointers when you add new attributes
-					VertexIndex* dev_indices;
-					VertexAttributePosition* dev_position;
-					VertexAttributeNormal* dev_normal;
-					VertexAttributeTexcoord* dev_texcoord0;
-
-					// ----------Indices-------------
-
-					const tinygltf::Accessor &indexAccessor = scene.accessors.at(primitive.indices);
-					const tinygltf::BufferView &bufferView = scene.bufferViews.at(indexAccessor.bufferView);
-					BufferByte* dev_bufferView = bufferViewDevPointers.at(indexAccessor.bufferView);
-
-					// assume type is SCALAR for indices
-					int n = 1;
-					int numIndices = indexAccessor.count;
-					int componentTypeByteSize = sizeof(VertexIndex);
-					int byteLength = numIndices * n * componentTypeByteSize;
-
-					dim3 numThreadsPerBlock(128);
-					dim3 numBlocks((numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
-					cudaMalloc(&dev_indices, byteLength);
-					_deviceBufferCopy << <numBlocks, numThreadsPerBlock >> > (
-						numIndices,
-						(BufferByte*)dev_indices,
-						dev_bufferView,
-						n,
-						indexAccessor.byteStride,
-						indexAccessor.byteOffset,
-						componentTypeByteSize);
-
-
-					checkCUDAError("Set Index Buffer");
-
-
-					// ---------Primitive Info-------
-
-					// Warning: LINE_STRIP is not supported in tinygltfloader
-					int numPrimitives;
-					PrimitiveType primitiveType;
-					switch (primitive.mode) {
-					case TINYGLTF_MODE_TRIANGLES:
-						primitiveType = PrimitiveType::Triangle;
-						numPrimitives = numIndices / 3;
-						break;
-					case TINYGLTF_MODE_TRIANGLE_STRIP:
-						primitiveType = PrimitiveType::Triangle;
-						numPrimitives = numIndices - 2;
-						break;
-					case TINYGLTF_MODE_TRIANGLE_FAN:
-						primitiveType = PrimitiveType::Triangle;
-						numPrimitives = numIndices - 2;
-						break;
-					case TINYGLTF_MODE_LINE:
-						primitiveType = PrimitiveType::Line;
-						numPrimitives = numIndices / 2;
-						break;
-					case TINYGLTF_MODE_LINE_LOOP:
-						primitiveType = PrimitiveType::Line;
-						numPrimitives = numIndices + 1;
-						break;
-					case TINYGLTF_MODE_POINTS:
-						primitiveType = PrimitiveType::Point;
-						numPrimitives = numIndices;
-						break;
-					default:
-						// output error
-						break;
-					};
-
-
-					// ----------Attributes-------------
-
-					auto it(primitive.attributes.begin());
-					auto itEnd(primitive.attributes.end());
-
-					int numVertices = 0;
-					// for each attribute
-					for (; it != itEnd; it++) {
-						const tinygltf::Accessor &accessor = scene.accessors.at(it->second);
-						const tinygltf::BufferView &bufferView = scene.bufferViews.at(accessor.bufferView);
-
-						int n = 1;
-						if (accessor.type == TINYGLTF_TYPE_SCALAR) {
-							n = 1;
-						}
-						else if (accessor.type == TINYGLTF_TYPE_VEC2) {
-							n = 2;
-						}
-						else if (accessor.type == TINYGLTF_TYPE_VEC3) {
-							n = 3;
-						}
-						else if (accessor.type == TINYGLTF_TYPE_VEC4) {
-							n = 4;
-						}
-
-						BufferByte * dev_bufferView = bufferViewDevPointers.at(accessor.bufferView);
-						BufferByte ** dev_attribute = NULL;
-
-						numVertices = accessor.count;
-						int componentTypeByteSize;
-
-						// Note: since the type of our attribute array (dev_position) is static (float32)
-						// We assume the glTF model attribute type are 5126(FLOAT) here
-
-						if (it->first.compare("POSITION") == 0) {
-							componentTypeByteSize = sizeof(VertexAttributePosition) / n;
-							dev_attribute = (BufferByte**)&dev_position;
-						}
-						else if (it->first.compare("NORMAL") == 0) {
-							componentTypeByteSize = sizeof(VertexAttributeNormal) / n;
-							dev_attribute = (BufferByte**)&dev_normal;
-						}
-						else if (it->first.compare("TEXCOORD_0") == 0) {
-							componentTypeByteSize = sizeof(VertexAttributeTexcoord) / n;
-							dev_attribute = (BufferByte**)&dev_texcoord0;
-						}
-
-						std::cout << accessor.bufferView << "  -  " << it->second << "  -  " << it->first << '\n';
-
-						dim3 numThreadsPerBlock(128);
-						dim3 numBlocks((n * numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
-						int byteLength = numVertices * n * componentTypeByteSize;
-						cudaMalloc(dev_attribute, byteLength);
-
-						_deviceBufferCopy << <numBlocks, numThreadsPerBlock >> > (
-							n * numVertices,
-							*dev_attribute,
-							dev_bufferView,
-							n,
-							accessor.byteStride,
-							accessor.byteOffset,
-							componentTypeByteSize);
-
-						std::string msg = "Set Attribute Buffer: " + it->first;
-						checkCUDAError(msg.c_str());
-					}
-
-					// malloc for VertexOut
-					VertexOut* dev_vertexOut;
-					cudaMalloc(&dev_vertexOut, numVertices * sizeof(VertexOut));
-					checkCUDAError("Malloc VertexOut Buffer");
-
-					// ----------Materials-------------
-
-					// You can only worry about this part once you started to 
-					// implement textures for your rasterizer
-					TextureData* dev_diffuseTex = NULL;
-					if (!primitive.material.empty()) {
-						const tinygltf::Material &mat = scene.materials.at(primitive.material);
-						printf("material.name = %s\n", mat.name.c_str());
-
-						if (mat.values.find("diffuse") != mat.values.end()) {
-							std::string diffuseTexName = mat.values.at("diffuse").string_value;
-							if (scene.textures.find(diffuseTexName) != scene.textures.end()) {
-								const tinygltf::Texture &tex = scene.textures.at(diffuseTexName);
-								if (scene.images.find(tex.source) != scene.images.end()) {
-									const tinygltf::Image &image = scene.images.at(tex.source);
-
-									size_t s = image.image.size() * sizeof(TextureData);
-									cudaMalloc(&dev_diffuseTex, s);
-									cudaMemcpy(dev_diffuseTex, &image.image.at(0), s, cudaMemcpyHostToDevice);
-									
-									// TODO: store the image size to your PrimitiveDevBufPointers
-									// image.width;
-									// image.height;
-
-									checkCUDAError("Set Texture Image data");
-								}
-							}
-						}
-
-						// TODO: write your code for other materails
-						// You may have to take a look at tinygltfloader
-						// You can also use the above code loading diffuse material as a start point 
-					}
-
-
-					// ---------Node hierarchy transform--------
-					cudaDeviceSynchronize();
-					
-					dim3 numBlocksNodeTransform((numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
-					_nodeMatrixTransform << <numBlocksNodeTransform, numThreadsPerBlock >> > (
-						numVertices,
-						dev_position,
-						dev_normal,
-						matrix,
-						matrixNormal);
-
-					checkCUDAError("Node hierarchy transformation");
-
-					// at the end of the for loop of primitive
-					// push dev pointers to map
-					primitiveVector.push_back(PrimitiveDevBufPointers{
-						primitive.mode,
-						primitiveType,
-						numPrimitives,
-						numIndices,
-						numVertices,
-
-						dev_indices,
-						dev_position,
-						dev_normal,
-						dev_texcoord0,
-
-						dev_diffuseTex,
-
-						dev_vertexOut	//VertexOut
-					});
-
-					totalNumPrimitives += numPrimitives;
-
-				} // for each primitive
-
-			} // for each mesh
-
-		} // for each node
-
-	}
-	
-
-	// 3. Malloc for dev_primitives
-	{
-		cudaMalloc(&dev_primitives, totalNumPrimitives * sizeof(Primitive));
-	}
-	
-
-	// Finally, cudaFree raw dev_bufferViews
-	{
-
-		std::map<std::string, BufferByte*>::const_iterator it(bufferViewDevPointers.begin());
-		std::map<std::string, BufferByte*>::const_iterator itEnd(bufferViewDevPointers.end());
-			
-			//bufferViewDevPointers
-
-		for (; it != itEnd; it++) {
-			cudaFree(it->second);
-		}
-
-		checkCUDAError("Free BufferView Device Mem");
-	}
+  // 3. Malloc for dev_primitives
+  {
+    cudaMalloc(&dev_primitives, totalNumPrimitives * sizeof(Primitive));
+  }
+
+
+  // Finally, cudaFree raw dev_bufferViews
+  {
+
+    std::map<std::string, BufferByte*>::const_iterator it(bufferViewDevPointers.begin());
+    std::map<std::string, BufferByte*>::const_iterator itEnd(bufferViewDevPointers.end());
+
+      //bufferViewDevPointers
+
+    for (; it != itEnd; it++) {
+      cudaFree(it->second);
+    }
+
+    checkCUDAError("Free BufferView Device Mem");
+  }
 
 
 }
 
+//////////////// PIPELINE CODE ////////////////
 
 
-__global__ 
+__global__
 void _vertexTransformAndAssembly(
-	int numVertices, 
-	PrimitiveDevBufPointers primitive, 
-	glm::mat4 MVP, glm::mat4 MV, glm::mat3 MV_normal, 
-	int width, int height) {
-
-	// vertex id
-	int vid = (blockIdx.x * blockDim.x) + threadIdx.x;
-	if (vid < numVertices) {
-
-		// TODO: Apply vertex transformation here
-		// Multiply the MVP matrix for each vertex position, this will transform everything into clipping space
-		// Then divide the pos by its w element to transform into NDC space
-		// Finally transform x and y to viewport space
-
-		// TODO: Apply vertex assembly here
-		// Assemble all attribute arraies into the primitive array
-		
-	}
+  const int numVertices,
+  VertexParts vertexParts,
+  const glm::mat4 MVP, const glm::mat4 MV, const glm::mat3 MV_normal,
+  const int width, const int height) {
+
+  // vertex id
+  if (IDx >= numVertices) return;
+
+  Vertex &vertex = vertexParts.vertices[IDx];
+
+  // Multiply the MVP matrix for each vertex position, this will transform everything into clipping space
+  // Then divide the pos by its w element to transform into NDC space
+  // Finally transform x and y to viewport space
+  // TODO: Apply vertex transformation here
+  glm::vec4 modelPos = glm::vec4(vertexParts.pos[IDx], 1); // this is in model space
+  vertex.viewNorm = glm::vec3(MV_normal * vertexParts.normal[IDx]);
+  vertex.texcoord0 = vertexParts.texcoord0[IDx];
+  vertex.viewPos = glm::vec3(MV * modelPos);
+  glm::vec4 clipPos(MVP * modelPos);
+  glm::vec4 screenDims(width, height, 1, 1);
+  vertex.screenPos = screenDims * (clipPos / clipPos.w + glm::vec4(1, 1, 1, 0)) / 2.0f;
+
+  // Assemble all attribute arrays into the primitive array
 }
 
 
 
 static int curPrimitiveBeginId = 0;
 
-__global__ 
-void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_primitives, PrimitiveDevBufPointers primitive) {
-
-	// index id
-	int iid = (blockIdx.x * blockDim.x) + threadIdx.x;
-
-	if (iid < numIndices) {
+// START HERE: figure out where to put texture info
 
-		// TODO: uncomment the following code for a start
-		// This is primitive assembly for triangles
+__global__
+void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* primitives, VertexParts vertexParts) {
+
+  // index id
+  if (IDx < numIndices) {
+
+    // TODO: uncomment the following code for a start
+//This is primitive assembly for triangles
+
+if (vertexParts.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
+  int n_vertices = vertexParts.primitiveType;
+  int prim_id = IDx / n_vertices + curPrimitiveBeginId;
+  Primitive &primitive = primitives[prim_id];
+  primitive.v[IDx % n_vertices]
+    = vertexParts.vertices[vertexParts.indices[IDx]];
+  primitive.diffuseTex = vertexParts.diffuseTex;
+  primitive.texRes = vertexParts.texRes;
+}
+// TODO: other primitive types (point, line)
+  }
 
-		//int pid;	// id for cur primitives vector
-		//if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
-		//	pid = iid / (int)primitive.primitiveType;
-		//	dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
-		//		= primitive.dev_verticesOut[primitive.dev_indices[iid]];
-		//}
+}
 
+__device__
+unsigned int getFragmentDepth(glm::vec3 bcCoord, glm::vec3 tri[3]) {
+  // get depth of fragment represented as an integer
+  int i;
+  float depth = 0;
+
+  // interpolate vertex depths
+  range(i, 0, 3) {
+    depth += bcCoord[i] * tri[i].z;
+  }
+  if (depth > 1) {
+    return INT_MAX;
+  }
+  else if (depth < 0) {
+    return 0;
+  }
+  else {
+    return depth * INT_MAX;
+  };
+}
 
-		// TODO: other primitive types (point, line)
-	}
-	
+__device__
+ void atomicAddVec3(glm::vec3 &vec1, glm::vec3 vec2) {
+  int i;
+  range(i, 0, 3) {
+    atomicAdd(&vec1[i], vec2[i]);
+  }
 }
 
+__global__
+void _rasterize(int n_primitives, int height, int width,
+const Primitive *primitives, unsigned int *depths, Fragment *fragments) {
+  if (IDx >= n_primitives) return;
+
+  int id = IDx;
+
+  int i, y, x, offset;
+  Primitive primitive = primitives[IDx];
+  glm::vec3 tri[3];
+  range(i, 0, 3) {
+    // get coordinates of tri points
+    tri[i] = glm::vec3(primitive.v[i].screenPos);
+  }
+  thrust::uniform_real_distribution<float> u01(0, 1);
+  AABB aabb = getAABBForTriangle(tri);
+  range(y, aabb.min.y, aabb.max.y) {
+    range(x, aabb.min.x, aabb.max.x) {
+      thrust::default_random_engine seed(getIndex(x, y, width));
+      range(offset, 0, SAMPLES_PER_PIXEL) {
+        int index = SAMPLES_PER_PIXEL * getIndex(width - x, height - y, width) + offset;
+        int id = index;
+        glm::vec2 screenPos = glm::vec2(x + u01(seed), y + u01(seed));
+
+        // determine if screenPos is inside polygon
+        glm::vec3 barycentricCoord = calculateBarycentricCoordinate(tri, screenPos);
+
+        if (isBarycentricCoordInBounds(barycentricCoord)) {
+          unsigned int depth = getFragmentDepth(barycentricCoord, tri);
+
+          // assign fragEyePos.z to dev_depth[i] iff it is smaller
+          // (fragment is closer to camera)
+          atomicMin(depths + index, depth);
+        }
+      }
+    }
+  }
+
+  __syncthreads(); // wait for all depths to be updated
+
+
+  range(y, aabb.min.y, aabb.max.y) {
+    range(x, aabb.min.x, aabb.max.x) {
+      thrust::default_random_engine seed(getIndex(x, y, width));
+      range(offset, 0, SAMPLES_PER_PIXEL) {
+
+        int index = SAMPLES_PER_PIXEL *  getIndex(width - x, height - y, width) + offset;
+        glm::vec2 screenPos = glm::vec2(x + u01(seed), y + u01(seed));
+
+        // determine if screenPos is inside polygon
+        glm::vec3 barycentricCoord = calculateBarycentricCoordinate(tri, screenPos);
+        unsigned int depth = getFragmentDepth(barycentricCoord, tri);
+
+        bool condition = isBarycentricCoordInBounds(barycentricCoord) && depth == depths[index];
+        if (condition) {
+
+          // if the sample is not occluded
+          Fragment &fragment = fragments[index];
+
+
+          // interpolate texcoord and viewPos and texnorm
+          fragment.viewPos = glm::vec3(0);
+          fragment.viewNorm = glm::vec3(0);
+          glm::vec2 texcoord(0);
+          float texWeightNorm = 0;
+          range(i, 0, 3) {
+            float weight = barycentricCoord[i];
+            Vertex v = primitive.v[i];
+
+            fragment.viewNorm += weight * v.viewNorm;
+            fragment.viewPos += weight * v.viewPos;
+
+            float texWeight = weight / (v.viewPos.z + EPSILON);
+            texcoord += texWeight * v.texcoord0;
+            texWeightNorm += texWeight;
+          }
+
+          // get the color using texcoord
+          texcoord /= (texWeightNorm + EPSILON);
+          glm::vec2 texRes = primitive.texRes;
+          glm::vec2 scaledCoord = texcoord * glm::vec2(texRes.x, texRes.y);
+          int tid = 3 * getIndex((int)scaledCoord.x, (int)scaledCoord.y, texRes.x);
+          TextureData *tex = primitive.diffuseTex;
+          if (tex) {
+            fragment.color = glm::vec3(tex[tid + 0], tex[tid + 1], tex[tid + 2]) / 255.0f;
+          }
+          else {
+            fragment.color = DEFAULT_COLOR;
+          }
+        }
+      }
+    }
+  }
+}
 
+/**
+* Writes fragment colors to the framebuffer
+*/
+__global__
+void _render(int w, int h, const Fragment *fragmentBuffer, glm::vec3 *framebuffer) {
+  if (IDx >= w || IDy >= h) return;
+  int index = getIndex(IDx, IDy, w);
+  int offset;
+  range(offset, 0, SAMPLES_PER_PIXEL) {
+    int sampleId = SAMPLES_PER_PIXEL * index + offset;
+    Fragment frag = fragmentBuffer[sampleId];
+    glm::vec3 lightPos(0);
+    glm::vec3 L = glm::normalize(glm::vec3(0, -1, -1));//lightPos - frag.viewPos);
+    glm::vec3 V = glm::normalize(-frag.viewPos);
+    glm::vec3 H = glm::normalize(L + V);
+    float intensity = saturate(glm::dot(frag.viewNorm, H) + 0.2) + AMBIENT_LIGHT;
+    if (TUNE_SHADE) {
+      intensity = intensity > 0.5 ? 1 : AMBIENT_LIGHT;
+    }
+    framebuffer[index] += intensity * frag.color / (float)SAMPLES_PER_PIXEL;
+  }
+}
 
 /**
  * Perform rasterization.
  */
 void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal) {
-    int sideLength2d = 8;
-    dim3 blockSize2d(sideLength2d, sideLength2d);
+    int sideLength = 8;
+    dim3 blockSize2d(sideLength, sideLength);
     dim3 blockCount2d((width  - 1) / blockSize2d.x + 1,
-		(height - 1) / blockSize2d.y + 1);
-
-	// Execute your rasterization pipeline here
-	// (See README for rasterization pipeline outline.)
-
-	// Vertex Process & primitive assembly
-	{
-		curPrimitiveBeginId = 0;
-		dim3 numThreadsPerBlock(128);
-
-		auto it = mesh2PrimitivesMap.begin();
-		auto itEnd = mesh2PrimitivesMap.end();
-
-		for (; it != itEnd; ++it) {
-			auto p = (it->second).begin();	// each primitive
-			auto pEnd = (it->second).end();
-			for (; p != pEnd; ++p) {
-				dim3 numBlocksForVertices((p->numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
-				dim3 numBlocksForIndices((p->numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
-
-				_vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >(p->numVertices, *p, MVP, MV, MV_normal, width, height);
-				checkCUDAError("Vertex Processing");
-				cudaDeviceSynchronize();
-				_primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> >
-					(p->numIndices, 
-					curPrimitiveBeginId, 
-					dev_primitives, 
-					*p);
-				checkCUDAError("Primitive Assembly");
-
-				curPrimitiveBeginId += p->numPrimitives;
-			}
-		}
-
-		checkCUDAError("Vertex Processing and Primitive Assembly");
-	}
-	
-	cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
-	initDepth << <blockCount2d, blockSize2d >> >(width, height, dev_depth);
-	
-	// TODO: rasterize
-
-
-
-    // Copy depthbuffer colors into framebuffer
-	render << <blockCount2d, blockSize2d >> >(width, height, dev_fragmentBuffer, dev_framebuffer);
-	checkCUDAError("fragment shader");
-    // Copy framebuffer into OpenGL buffer for OpenGL previewing
-    sendImageToPBO<<<blockCount2d, blockSize2d>>>(pbo, width, height, dev_framebuffer);
-    checkCUDAError("copy render result to pbo");
+    (height - 1) / blockSize2d.y + 1);
+
+  // Execute your rasterization pipeline here
+  cudaMemset(dev_primitives, 0, width * height * sizeof(Primitive));
+  cudaMemset(dev_fragmentBuffer, 0, SAMPLES_PER_PIXEL * width * height * sizeof(Fragment));
+
+  // (See README for rasterization pipeline outline.)
+
+  // Vertex Process & primitive assembly
+
+  dim3 numThreadsPerBlock(128);
+  curPrimitiveBeginId = 0;
+  {
+    auto it = mesh2vertexParts.begin();
+    auto itEnd = mesh2vertexParts.end();
+
+    for (; it != itEnd; ++it) {
+      auto parts = (it->second).begin();  // each primitive
+      auto partsEnd = (it->second).end();
+      for (; parts != partsEnd; ++parts) {
+        dim3 numBlocksForVertices((parts->numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
+        dim3 numBlocksForIndices((parts->numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
+
+        _vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >
+          (parts->numVertices,
+          *parts,
+          MVP, MV, MV_normal,
+          width, height);
+        checkCUDAError("Vertex Transform and Assembly");
+        cudaDeviceSynchronize();
+
+        _primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> >
+          (parts->numIndices,
+          curPrimitiveBeginId,
+          dev_primitives,
+          *parts);
+        checkCUDAError("Primitive Assembly");
+
+        curPrimitiveBeginId += parts->numPrimitives;
+      }
+    }
+
+    checkCUDAError("Vertex Processing and Primitive Assembly");
+  }
+
+  //_initDepth << <blockSize, sampleBlockSize2d >> >(numSamples, dev_depth);
+  cudaMemset(dev_depth, 0xff, SAMPLES_PER_PIXEL * width * height * sizeof(dev_depth[0]));
+  cudaMemset(dev_framebuffer, 0, width * height * sizeof(dev_framebuffer[0]));
+
+  // TODO: rasterize
+
+  dim3 blockSize = totalNumPrimitives / numThreadsPerBlock.x + 1;
+  _rasterize<< <blockSize, numThreadsPerBlock>> >
+    (totalNumPrimitives, height, width,
+    dev_primitives, dev_depth, dev_fragmentBuffer);
+  checkCUDAError("rasterizer");
+
+  // Copy depthbuffer colors into framebuffer
+  _render << <blockCount2d, blockSize2d >> >(width, height, dev_fragmentBuffer, dev_framebuffer);
+  checkCUDAError("fragment shader");
+  // Copy framebuffer into OpenGL buffer for OpenGL previewing
+  _sendImageToPBO<<<blockCount2d, blockSize2d>>>(pbo, width, height, dev_framebuffer);
+  checkCUDAError("copy render result to pbo");
 }
 
 /**
@@ -736,36 +912,35 @@ void rasterizeFree() {
 
     // deconstruct primitives attribute/indices device buffer
 
-	auto it(mesh2PrimitivesMap.begin());
-	auto itEnd(mesh2PrimitivesMap.end());
-	for (; it != itEnd; ++it) {
-		for (auto p = it->second.begin(); p != it->second.end(); ++p) {
-			cudaFree(p->dev_indices);
-			cudaFree(p->dev_position);
-			cudaFree(p->dev_normal);
-			cudaFree(p->dev_texcoord0);
-			cudaFree(p->dev_diffuseTex);
+  auto it(mesh2vertexParts.begin());
+  auto itEnd(mesh2vertexParts.end());
+  for (; it != itEnd; ++it) {
+    for (auto p = it->second.begin(); p != it->second.end(); ++p) {
+      cudaFree(p->indices);
+      cudaFree(p->pos);
+      cudaFree(p->normal);
+      cudaFree(p->texcoord0);
+      cudaFree(p->diffuseTex);
+      cudaFree(p->vertices);
 
-			cudaFree(p->dev_verticesOut);
 
-			
-			//TODO: release other attributes and materials
-		}
-	}
+      //TODO: release other attributes and materials
+    }
+  }
 
-	////////////
+  ////////////
 
-    cudaFree(dev_primitives);
-    dev_primitives = NULL;
+  cudaFree(dev_primitives);
+  dev_primitives = NULL;
 
-	cudaFree(dev_fragmentBuffer);
-	dev_fragmentBuffer = NULL;
+  cudaFree(dev_fragmentBuffer);
+  dev_fragmentBuffer = NULL;
 
-    cudaFree(dev_framebuffer);
-    dev_framebuffer = NULL;
+  cudaFree(dev_framebuffer);
+  dev_framebuffer = NULL;
 
-	cudaFree(dev_depth);
-	dev_depth = NULL;
+  cudaFree(dev_depth);
+  dev_depth = NULL;
 
-    checkCUDAError("rasterize Free");
+  checkCUDAError("rasterize Free");
 }
diff --git a/src/rasterizeTools.h b/src/rasterizeTools.h
index 46c701e..fe5eeb8 100644
--- a/src/rasterizeTools.h
+++ b/src/rasterizeTools.h
@@ -1,4 +1,4 @@
-/**
+﻿/**
  * @file      rasterizeTools.h
  * @brief     Tools/utility functions for rasterization.
  * @authors   Yining Karl Li
@@ -12,6 +12,8 @@
 #include <glm/glm.hpp>
 #include <util/utilityCore.hpp>
 
+#define range(i, start, stop) for (i = start; i < stop; i++)
+
 struct AABB {
     glm::vec3 min;
     glm::vec3 max;
@@ -70,7 +72,7 @@ float calculateBarycentricCoordinateValue(glm::vec2 a, glm::vec2 b, glm::vec2 c,
  * Calculate barycentric coordinates.
  */
 __host__ __device__ static
-glm::vec3 calculateBarycentricCoordinate(const glm::vec3 tri[3], glm::vec2 point) {
+glm::vec3 calculateBarycentricCoordinate(const glm::vec3 tri[3], const glm::vec2 point) {
     float beta  = calculateBarycentricCoordinateValue(glm::vec2(tri[0].x, tri[0].y), point, glm::vec2(tri[2].x, tri[2].y), tri);
     float gamma = calculateBarycentricCoordinateValue(glm::vec2(tri[0].x, tri[0].y), glm::vec2(tri[1].x, tri[1].y), point, tri);
     float alpha = 1.0 - beta - gamma;
@@ -99,3 +101,18 @@ float getZAtCoordinate(const glm::vec3 barycentricCoord, const glm::vec3 tri[3])
            + barycentricCoord.y * tri[1].z
            + barycentricCoord.z * tri[2].z);
 }
+
+__host__ __device__ 
+glm::vec3 interpolateVec3(const glm::vec3 tri[3], const glm::vec3 bcCoord) {
+  int i;
+  glm::vec3 val;
+  range(i, 0, 3) {
+    val += bcCoord[i] * tri[i];
+  }
+  return val;
+}
+
+__host__ __device__ 
+bool approxEq(float a, float b) {
+	return a + EPSILON > b && a - EPSILON < b;
+}
diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt
index c995fae..b53ad72 100644
--- a/util/CMakeLists.txt
+++ b/util/CMakeLists.txt
@@ -1,3 +1,4 @@
+message("util")
 set(SOURCE_FILES
     "utilityCore.hpp"
     "utilityCore.cpp"