diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..60baa9c --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +data/* diff --git a/README.md b/README.md index bcddb79..2c6a589 100644 --- a/README.md +++ b/README.md @@ -4,70 +4,82 @@ ## Challenge 1 -[Insert Screenshot] +[image](reddit/ch1.png) ## Challenge 2 -[Explain what's interesting] +I find it interesting that with each comment, the flair and flair text is stored in the json object. This seems like it could provide some interesting information about a commenter, it might even be interesting to analyze how these change over time for a single user. When I did a findOne(), I got back a flair_text of "Do you even quest, bro? [scaper since 2004] . . . " which is a pretty gamer nerdy tag, and could provide interesting insights. ## Challenge 3 -[Explain possible Insights] +The possibilities with this dataset are huge because of the sheer magnitude of data. You could learn about how people comment, do sentiment analysis, do some sort of intelligence analysis based upon sentence structure used in different subreddits, or try and create the perfect comment (a fusion of the most upvoted). ## Challenge 4 -[What it would tell you about the Reddit Community] +As mentioned above, you could learn a lot with NLP to learn how the Reddit community types, how supportive they are of each other, or how many people are just trolls. With some complex analysis you could figure out what they are interested in by using the most upvoted comments, there are so many options. ## Challenge 5 -[Link to Code or pasted code] -[Answer] +[Link to Code or pasted code](reddit/mostSimilar.js) +I was having some issues with javascript and could not get the concurrency issues fixed in time. Unfortunately I am not sure, sorry! ## Challenge 6 -[What does this change about our analysis?] +This would imply that only popular comments were analyzed. This would skew the way I was attempting to count all the upvotes. 
Not counting a bunch of minimally liked comments would skew towards smaller communities with more active commenters. ## Challenge 7 -[How would you change your conclusions?] +My conclusion based on my (not very strong anyway) technique actually probably wouldn't change. Since they all have ~ the same comments, a larger number of ups likely corresponds to a few comments that are very popular versus the case discussed above. This is really only true that it wouldn't change because of the way I executed my analysis, generally this is a huge omission and is important to re-evaluate. ## Challenge 8 -[Bias in answer] +Only basing similarities on upvotes creates a bias of only caring about popularity of comments in subreddits which ignores content, literacy, etc. This biases towards closer communities and will make them seem similar if they are tight knit. ## Challenge 9 -[Other Biases] + * Only popular subreddits included + * No private subreddits included + * Filtered for bad words + * Generally the reddit community is heavily slanted towards tech so the data should not be applied to anything but the reddit community ## Challenge 10 -[How may you try and prove the bias] +Proving the bias that only comments with 10 or more upvotes were included would be quite simple. Doing a count of those matching less than 10 vs. 10 or greater should be 0 for the first and the count of all the comments for the second. If not 0 you could do probabilistic analysis to determine if there is any skew in either direction. # Yelp and Weather ## Challenge 1 - -[Screenshot your query and a result] +db.prec.aggregate([{"$match":{"DATE":{$regex:/20100425/}}},{"$group":{"_id":null,"total_precip":{"$sum":"$HPCP"}}}]) +{ "_id" : null, "total_precip" : 62 } +62/100ths of an in. 
+[image](weather/cp1.png) ## Challenge 2 -[Query snippet] -[Answer] +> db.norm.aggregate([{"$match":{"DATE":{$regex:/20100425/}}},{"$group":{"_id":null,"Avg Wind Speed: ":{"$avg":"$HLY-WIND-AVGSPD"}}}]) +{ "_id" : null, "Avg Wind Speed: " : 92.70833333333333 } +9.27 mph. ## Challenge 3 -[Query snippet] -[Answer] +> db.all.aggregate([{"$match":{"type":"business","city":"Madison"}},{"$group":{" +_id":null,"Reviews in Madison: ":{"$sum":"$review_count"}}}]) +{ "_id" : null, "Reviews in Madison: " : 34410 } +34410 reviews ## Challenge 4 -[Query snippet] -[Answer] +> db.all.aggregate([{"$match":{"type":"business","city":"Las Vegas"}},{"$group": +{"_id":null,"Reviews in Vegas: ":{"$sum":"$review_count"}}}]) +{ "_id" : null, "Reviews in Vegas: " : 577550 } +577,550 reviews ## Challenge 5 -[Query snippet] -[Answer] +> db.all.aggregate([{"$match":{"type":"business","city":"Pheonix"}},{"$group":{" +_id":null,"Reviews in Pheonix: ":{"$sum":"$review_count"}}}]) +{ "_id" : null, "Reviews in Pheonix: " : 16 } +16 reviews ## Challenge 6 [BONUS] diff --git a/reddit/ch1.png b/reddit/ch1.png new file mode 100644 index 0000000..f7614b9 Binary files /dev/null and b/reddit/ch1.png differ diff --git a/reddit/mostSimilar.js b/reddit/mostSimilar.js new file mode 100644 index 0000000..9eb0805 --- /dev/null +++ b/reddit/mostSimilar.js @@ -0,0 +1,63 @@ +var express = require('express'); +var mongoskin = require('mongoskin'); + +var db = mongoskin.db('mongodb://198.199.113.194/bigdata', {safe:true}) +var app = express(); + +// view engine setup +app.set('view engine', 'ejs'); + +app.get('/', function(req, res) { + res.render('index'); +}) + +var MongoClient = require('mongodb').MongoClient + , format = require('util').format; + + +//Most popular reddits from http://redditlist.com/sfw +sr = ['funny', 'pics', 'AskReddit','todayilearned','worldnews','science','IAmI','videos','gaming','movies','Music','aww','technology', + 
'bestof','WTF','AdviceAnimals','news','gifs','askscience','explainlikeimfive','EarthPort','books','television','politics','LifeProTips', + 'sports','atheism', 'mildlyinteresting','DIY','Fitness','food','space','Jokes','Showerthoughts','photoshopbattles','tifu','GetMotivated','nottheonion', + 'InternetIsBeautiful','history','dataisbeautiful','Futurology','gadgets','listentothis','nosleep','Documentaries','personalfinance', + 'philosophy','creepy','Art'] +var out = []; + +var perRedditLimit = 1; +var ups = 0; +function run(){ + for (var i = 0; i < sr.length-1 ; i++){ + MongoClient.connect('mongodb://127.0.0.1/reddit', function (err, db) { + if (err) { + throw err; + } else { + //console.log("successfully connected to the database"); + var collection = db.collection('all'); + var options = { + "limit": perRedditLimit//, + //"skip": 10, + } + var cursor = collection.find({"subreddit":sr[i]}, options).toArray(function(err, res){ + if (err) { + throw err; + } + else{ + + console.log(res); + out.push(res); + ups = ups + res[0].ups + console.log(res[0].subreddit + " " + String(ups)) + } + ups = 0; + db.close(); + }); + } + }); + + } +} + +run(function(out) { console.log(out);}) + +app.listen(3000); +console.log('app is listening at localhost:3000');