CSCI-4830-002-2014 · ianks · Nov 17, 2014 · Nov 17, 2014 · Nov 17, 2014 · Nov 17, 2014
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+node_modules/*
+data/*
diff --git a/README.md b/README.md
@@ -4,75 +4,141 @@
 
 ## Challenge 1
 
-[Insert Screenshot]
+![screenshot](https://www.dropbox.com/s/b4holedxkt5u92k/Screenshot%202014-11-16%2014.19.11.png?dl=1)
 
 ## Challenge 2
 
-[Explain what's interesting]
+What immediately struck me as interesting was that seemingly all entries had
+a `parent_id`. Looking into it more, I noticed that they are prefixed with
+either a `t1` or a `t3`. `t1` means the entry is a reply to another comment,
+while `t3` means it is a top-level comment on the link itself. With this, you
+can form a tree of comments, with the root being `link`. The interesting thing
+about this is one can filter out 'noise' of commenters by using this data.
 
 ## Challenge 3
 
-[Explain possible Insights]
+Theoretically, this dataset could tell you almost *anything* regarding
+Redditors' social opinions on issues. This comes with a caveat; we would need
+near infinite computational power and a huge advance in NLP techniques.
+Barring that, we can actually set our bar a little lower and analyze what are
+the hottest topics of all time, and we can categorize them using information
+about the subreddit, the content of the title, and the number of likes. We could
+then construct graphs or do fancy statistical analyses to see if there is any
+'truth' behind our hypotheses.
 
 ## Challenge 4
 
-[What it would tell you about the Reddit Community]
+I think we can find some interesting insights about the popularity of certain
+topics in the Reddit community. From that information, we can gain insight into
+the relevance of issues with the 'internet generation'. We could compare the
+topics seen on Reddit to topics seen on say, Fox News. We can see how Redditors
+are fundamentally different from other internet communities.
 
 ## Challenge 5
 
-[Link to Code or pasted code]
-[Answer]
+After spending 3 hours attempting to deal with asynchronous Javascript, I jumped
+off of a bridge. I mean, I wasn't able to complete this challenge. If you can
+please explain to me how the hell to somehow wrestle lines 28-35 into something
+that can return an actual result, that would be huge! (micha.js)
 
 ## Challenge 6
 
-[What does this change about our analysis?]
+This would bias our data towards commenters that, on average, receive a higher
+amount of upvotes per post. At first glance, it is hard to know whether the
+frequency of comments is positively or negatively correlated with the number
+of upvotes one receives, but it would have to be a factor to rule out.
 
 ## Challenge 7
 
-[How would you change your conclusions?]
+It would be affected by filtering out the results of people that were not
+getting more than 10 upvotes. This does not necessarily affect the data,
+but if the number of upvotes per post were correlated with the frequency
+of posts this could cause bias in our data.
 
 ## Challenge 8
 
-[Bias in answer]
+It would change if there were a correlation between freq. of posts and the
+number of upvotes in a post. We would have a smaller confidence interval
+for our results.
 
 ## Challenge 9
 
-[Other Biases]
+It may be biased due to the time period the data is from. The data
+only covers a period of 15 days which is a long time in the world of the
+internet. There might have been overlapping topics during this time period.
 
 ## Challenge 10
 
-[How may you try and prove the bias]
+- Time period of 15 days
+- Topic overlap
+- Bots
+- Level of moderation in forum
+- Change in Reddit policy over the time period
 
 # Yelp and Weather 
 
 ## Challenge 1
 
-[Screenshot your query and a result]
+```javascript
+db.precipitation.aggregate([
+  {$match: {date: {$regex: /20100410.*/}}},
+  {$group: {_id: null, prec: {$sum: "$hpcp"}}}
+])
+
+// { "result" : [ { "_id" : null, "prec" : 62 } ], "ok" : 1 }
+```
+
+![Screenshot your query and a result](https://www.dropbox.com/s/ep7c8t3k0uslyo0/Screenshot%202014-11-16%2023.23.16.png?dl=1)
 
 ## Challenge 2
 
 [Query snippet]
+
+```javascript
+db.normals.aggregate([{$match: {"DATE": {$regex: /20100425.*/}}},{$group: {_id: null, wind: {$avg: "$HLY-WIND-AVGSPD"}}}])
+```
+
 [Answer]
+92.708 mph
 
 ## Challenge 3
 
 [Query snippet]
+
+```javascript
+db.businesses.aggregate([
+  {$match: {city: "Madison"}},
+  {$group: {_id: null, sumReviews: {$sum: "$review_count"}}}
+])
+```
+
 [Answer]
+34410
 
 ## Challenge 4
 
 [Query snippet]
+
+```javascript
+db.businesses.aggregate([
+  {$match: {city: "Las Vegas"}},
+  {$group: {_id: null, sumReviews: {$sum: "$review_count"}}}
+])
+```
+
 [Answer]
+577550
 
 ## Challenge 5
 
 [Query snippet]
-[Answer]
 
-## Challenge 6 [BONUS]
+```javascript
+db.businesses.aggregate([
+  {$match: {city: "Phoenix"}},
+  {$group: {_id: null, sumReviews: {$sum: "$review_count"}}}
+])
+```
 
-[Code]
 [Answer]
-
-
-
+200089
diff --git a/micha.js b/micha.js
@@ -0,0 +1,52 @@
+var MongoClient = require("mongodb").MongoClient;
+var _ = require("lodash");
+var fs = require('fs');
+var async = require('async');
+
+var url = "mongodb://104.236.191.166:27017/bigdata";
+
+// Get the top 50 subreddits
+var subreddits = fs.readFileSync('top-50-subreddits.txt')
+  .toString()
+  .split("\n");
+
+subreddits.pop();
+
+// Parse it into array of JSON objects
+var subQuery = _.map(subreddits, function(sub){
+  return {'subreddit': sub}
+});
+
+// Use connect method to connect to the Server
+MongoClient.connect(url, function(err, db) {
+  if (err) throw err;
+
+  var collection = db.collection("reddit");
+  var counter = 0;
+
+  async.map(subQuery,
+    function(item){
+      collection.distinct("author", item, function(err, authors){
+        counter++;
+        console.log("Finished query: " + counter);
+
+        var key = item['subreddit'];
+        return { key: authors };
+      })
+    },
+
+    function(err, result){
+      if (err)
+        console.log(err);
+      else{
+        console.log("Preparing to write to file...");
+
+        fs.writeFile("michaAggregate.json", result, function() {
+          console.log("The file was saved!");
+        });
+      }
+
+      db.close();
+    }
+  );
+});
diff --git a/package.json b/package.json
@@ -0,0 +1,27 @@
+{
+  "name": "challenge-week-12",
+  "version": "1.0.0",
+  "description": "# Reddit Data Challenges",
+  "main": "index.js",
+  "directories": {
+    "example": "examples"
+  },
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/CSCI-4830-002-2014/challenge-week-12"
+  },
+  "author": "",
+  "license": "ISC",
+  "bugs": {
+    "url": "https://github.com/CSCI-4830-002-2014/challenge-week-12/issues"
+  },
+  "homepage": "https://github.com/CSCI-4830-002-2014/challenge-week-12",
+  "dependencies": {
+    "async": "^0.9.0",
+    "lodash": "^2.4.1",
+    "mongodb": "^1.4.20"
+  }
+}
diff --git a/top-50-subreddits.txt b/top-50-subreddits.txt
@@ -0,0 +1,50 @@
+funny
+AdviceAnimals
+pics
+aww
+WTF
+todayilearned
+videos
+gifs
+leagueoflegends
+gaming
+gonewild
+wow
+AskReddit
+pcmasterrace
+worldnews
+TrollXChromosomes
+trees
+news
+chan
+reactiongifs
+mildlyinteresting
+politics
+Showerthoughts
+soccer
+nba
+TheLastAirbender
+Jokes
+DestinyTheGame
+movies
+technology
+woahdude
+DotA
+cats
+RealGirls
+Android
+nsfw
+gentlemanboners
+atheism
+pokemon
+ImGoingToHellForThis
+KotakuInAction
+TumblrInAction
+fatpeoplehate
+explainlikeimfive
+Games
+IAmA
+science
+smashbros
+space
+cringepics