diff --git a/.gitignore b/.gitignore index fee2179..8fb9827 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,8 @@ node_modules/ +dist/ # IDE .idea # config -config.js +config.js \ No newline at end of file diff --git a/client/index.html b/client/index.html index 60e5034..b6263ad 100644 --- a/client/index.html +++ b/client/index.html @@ -12,6 +12,6 @@
- + diff --git a/client/src/index.js b/client/src/index.js index 9b1d738..0195f83 100644 --- a/client/src/index.js +++ b/client/src/index.js @@ -1,5 +1,5 @@ var echarts = require('echarts'); -var socket = require('socket.io-client')('http://localhost:3001'); +var socket = require('socket.io-client')('http://localhost:8080'); var $ = require('jquery'); var myChart = echarts.init(document.getElementById('main')); var echartParser = require('./echartParser'); diff --git a/gulpfile.js b/gulpfile.js new file mode 100644 index 0000000..b46f860 --- /dev/null +++ b/gulpfile.js @@ -0,0 +1,23 @@ +"use strict" + +const gulp = require('gulp') +const babel = require('gulp-babel') +const sourcemaps = require('gulp-sourcemaps') +const plumber = require('gulp-plumber') + +gulp.task('babel', () => { + return gulp.src('src/*.js') + .pipe(plumber()) + .pipe(sourcemaps.init()) + .pipe(babel({ + presets: ['es2015'] + })) + .pipe(sourcemaps.write('.')) + .pipe(gulp.dest('dist')) +}) + +gulp.task('watch', () => { + gulp.watch('src/*.js', ['babel']) +}) + +gulp.task('default', ['watch']) \ No newline at end of file diff --git a/index.js b/index.js index 25ce7a9..ef0f658 100644 --- a/index.js +++ b/index.js @@ -1,26 +1,25 @@ -var Spider = require('./src/Spider'); -var express = require('express'); -var bodyParser = require('body-parser'); +const express = require('express') +const bodyParser = require('body-parser') +const http = require('http') +const Spider = require('./dist/Spider') -var app = express(); -var server = require('http').createServer(app); -var io = require('socket.io')(server); -io.on('connection', function(socket) { - socket.on('fetch start', function(data) { - Spider(data.url, socket); - }); -}); -server.listen(3001); +const app = express() +const server = http.createServer(app) +const io = require('socket.io')(server) +app.use(bodyParser()) +app.use(express.static('./client')) -app.use(bodyParser());// WARNING -app.use('/js', express.static('./client/build')); -app.use('/css', express.static('./client/build')); +app.get('/', (req, res) => { + res.sendFile(__dirname + '/client/index.html') +}) -app.get('/', function(req, res) { - res.sendFile(__dirname + '/client/index.html'); -}); +io.on('connection', socket => { + socket.on('fetch start', data => { + Spider(data.url, socket) + }) +}) -app.listen(3000,function(){ - console.log('server start at 127.0.0.1:%s',this.address().port) -}); +server.listen(3001) + +app.listen(8080) diff --git a/package.json b/package.json index eefec3a..8e380dd 100644 --- a/package.json +++ b/package.json @@ -9,10 +9,18 @@ "express": "^4.13.4", "gexf": "^0.2.5", "request": "^2.69.0", - "socket.io": "^1.4.5" + "socket.io": "^1.4.5", + "tracer": "^0.8.3" + }, + "devDependencies": { + "babel-preset-es2015": "^6.5.0", + "gulp": "^3.9.1", + "gulp-babel": "^6.1.2", + "gulp-plumber": "^1.1.0", + "gulp-sourcemaps": "^1.6.0" }, - "devDependencies": {}, "scripts": { + "start": "node app", "test": "echo \"Error: no test specified\" && exit 1" }, "author": "", diff --git a/src/Spider.js b/src/Spider.js index a1957e1..83e9fe0 100644 --- a/src/Spider.js +++ b/src/Spider.js @@ -1,96 +1,113 @@ -var fetchFollwerOrFollwee = require('./fetchFollwerOrFollwee'); -var getUser = require('./getUser'); -var Promise = require('bluebird'); -var config = require('../config'); -module.exports = Spider; - -function Spider(userPageUrl, socket) { - socket.emit('notice', '抓取用户信息......'); - return getUser(userPageUrl) - .then(function(user) { - socket.emit('notice', '抓取用户信息成功'); - socket.emit('get user', user); - return getFriends(user, socket); - }) - .then(function(myFriends) { - return Promise.map(myFriends, function(myFriend) { - return getUser(myFriend.url); - }, { concurrency: config.concurrency ? config.concurrency : 3 }); - }) - .then(function(myFriends) { - var input = []; - myFriends.forEach(function(friend) { - input.push({ - "user": friend, - "sameFriends": [] - }) - }); - socket.emit('data', input); - - console.log(myFriends); - return Promise.map(myFriends, function(myFriend) { - return searchSameFriend(myFriend, myFriends, socket); - }, { concurrency: config.concurrency ? config.concurrency : 3 }); - }) - .then(function(result) { - var data = result; - socket.emit('data', data); - - }) - .catch(function(err) { - console.log(err); - }) +"use strict" + +import Promise from 'bluebird' +import tracer from 'tracer' +import fetchFollwerOrFollwee from './fetchFollwerOrFollwee' +import getUser from './getUser' +import config from '../config' + +const logger = tracer.colorConsole() + +const Spider = (userPageUrl, socket) => { + const concurrency = config.concurrency ? config.concurrency : 3 + + socket.emit('notice', '抓取用户信息......') + + return getUser(userPageUrl) + .then(function (user) { + socket.emit('notice', '抓取用户信息成功') + socket.emit('get user', user) + + return getFriends(user, socket) + }) + .then(function (myFriends) { + return Promise.map(myFriends, myFriend => getUser(myFriend.url), {concurrency}) + }) + .then(function (myFriends) { + let input = [] + + myFriends.forEach(friend => { + input.push({ + user: friend, + sameFriends: [], + }) + }) + + socket.emit('data', input) + + // debug + logger.log(myFriends) + + return Promise.map(myFriends, myFriend => searchSameFriend(myFriend, myFriends, socket), {concurrency}) + }) + .then(function (data) { + socket.emit('data', data) + }) + .catch(function (err) { + // debug + logger.error(err) + }) +} +const getFriends = (user, socket) => { + const options1 = { + isFollowees: true, + user, + } + const options2 = {user} + const works = [fetchFollwerOrFollwee(options1, socket), fetchFollwerOrFollwee(options2, socket)] + + return Promise.all(works) + .then(function(result) { + const [followees, followers] = result + let friends = [] + + followers.forEach(follower => { + followees.forEach(followee => { + if (follower.hash_id === followee.hash_id) { + friends.push(follower) + } + }) + }) + + return friends + }) } +const searchSameFriend = (aFriend, myFriends, socket) => { + socket.emit("notice", "searchSameFriend with " + aFriend.name + "......") + + // debug + logger.log("searchSameFriend with " + aFriend.name + "......") + + return getFriends(aFriend, socket) + .then(function(targetFriends) { + let sameFriends = [] + + // debug + logger.log('counting for ' + aFriend.name + '......') + logger.log("\n\n==============\n Same Friends with " + aFriend.name + "\n") + + targetFriends.forEach(targetFriend => { + myFriends.forEach(myFriend => { + if (targetFriend.hash_id === myFriend.hash_id) { + sameFriends.push(targetFriend) + } + }) + }) + socket.emit('same friend', { + hash_id: aFriend.hash_id, + sameFriends: sameFriends + }) + // debug + logger.log(sameFriends) + logger.log("\n\n") -function getFriends(user, socket) { - var works = [fetchFollwerOrFollwee({ - isFollowees: true, - user: user - }, socket), fetchFollwerOrFollwee({ - user: user - }, socket)]; - return Promise.all(works).then(function(result) { - var followees = result[0]; - var followers = result[1]; - var friends = []; - followers.forEach(function(follower) { - followees.forEach(function(followee) { - if (follower.hash_id === followee.hash_id) { - friends.push(follower); - } - }); - }); - return friends; - }); + return { + user: aFriend, + sameFriends, + } + }) } -function searchSameFriend(aFriend, myFriends, socket) { - socket.emit("notice", "searchSameFriend with " + aFriend.name + "......"); - console.log("searchSameFriend with " + aFriend.name + "......"); - return getFriends(aFriend, socket) - .then(function(targetFriends) { - var sameFriends = []; - console.log('counting for ' + aFriend.name + '......') - targetFriends.forEach(function(targetFriend) { - myFriends.forEach(function(myFriend) { - if (targetFriend.hash_id === myFriend.hash_id) { - sameFriends.push(targetFriend); - } - }) - }) - console.log("\n\n==============\n Same Friends with " + aFriend.name + "\n"); - socket.emit('same friend', { - hash_id: aFriend.hash_id, - sameFriends: sameFriends - }) - console.log(sameFriends); - console.log("\n\n"); - - return { - user: aFriend, - sameFriends: sameFriends - }; - }) -} \ No newline at end of file +module.exports = Spider diff --git a/src/fetchFollwerOrFollwee.js b/src/fetchFollwerOrFollwee.js index 82024ab..b23b725 100644 --- a/src/fetchFollwerOrFollwee.js +++ b/src/fetchFollwerOrFollwee.js @@ -1,92 +1,94 @@ -var request = require('request'); -var Promise = require('bluebird'); -var config = require('../config'); - -var fetchFollwerOrFollwee = function(options, socket) { - var user = options.user; - var isFollowees = options.isFollowees; - var grounpAmount = isFollowees ? Math.ceil(user.followeeAmount / 20) : Math.ceil(user.followerAmount / 20); - var offsets = []; - for (var i = 0; i < grounpAmount; i++) { - offsets.push(i * 20); - } - return Promise.map(offsets, function(offset) { - return getFollwerOrFollwee(user, offset, isFollowees, socket); - }, { concurrency: config.concurrency ? config.concurrency : 3 }).then(function(array) { - var result = []; - array.forEach(function(item) { - result = result.concat(item); - }); - return result; - }) -} +"use strict" -function getFollwerOrFollwee(user, offset, isFollowees, socket) { - socket.emit('notice','开始抓取 ' + user.name + ' 的第 ' + offset + '-' + (offset + 20) + ' 位' + (isFollowees ? '关注的人' : '关注者')); - console.log('开始抓取 ' + user.name + ' 的第 ' + offset + '-' + (offset + 20) + ' 位' + (isFollowees ? '关注的人' : '关注者')); - var params = "{\"offset\":{{counter}},\"order_by\":\"created\",\"hash_id\":\"{{hash_id}}\"}".replace(/{{counter}}/, offset).replace(/{{hash_id}}/, user.hash_id); - // console.log(params); - return new Promise(function(resolve, reject) { - request({ - method: 'POST', - url: isFollowees ? 'https://www.zhihu.com/node/ProfileFolloweesListV2' : 'https://www.zhihu.com/node/ProfileFollowersListV2', - form: { - method: "next", - params: params, - _xsrf: config._xsrf - }, - headers: { - 'cookie': config.cookie, - 'content-type': 'application/x-www-form-urlencoded; charset=UTF-8', - 'cache-control': 'no-cache', - 'x-requested-with': 'XMLHttpRequest' - }, - timeout: 1500 - }, function(err, res, body) { - var tmp = []; - try { - if (body) { - tmp = JSON.parse(body).msg.map(parseCard); - } else { - throw ('Body is undefined'); - } - } catch (e) { - console.log("\n======ERROR======"); - console.log(e, body); - console.log("======ERROR======\n"); - } - if (err) { - if (err.code == 'ETIMEDOUT' || err.code == 'ESOCKETTIMEDOUT') { - resolve(getFollwerOrFollwee(user, offset, isFollowees, socket)); - } else { - reject(err) - } - } else { - resolve(tmp); - } - }) - }) -} +import request from 'request' +import Promise from 'bluebird' +import tracer from 'tracer' +import config from '../config' + +const logger = tracer.colorConsole() + +const parseCard = text => { + const re1 = /data-id=\"(\S*)\"/g + const re2 = /

.*>(.*)<\/a><\/h2>/g + const re3 = /href=\"(https:\/\/www\.zhihu\.com\/people\/\S*)\"/g + let result = {} + + re1.exec(text) + result.hash_id = RegExp.$1 + + re2.exec(text) + result.name = RegExp.$1 + + re3.exec(text) + result.url = RegExp.$1 -function parseCard(text) { - var result = {}; - var re1 = /data-id=\"(\S*)\"/g; - var re2 = /

.*>(.*)<\/a><\/h2>/g - var re3 = /href=\"(https:\/\/www\.zhihu\.com\/people\/\S*)\"/g; - re1.exec(text); - result.hash_id = RegExp.$1; - re2.exec(text); - result.name = RegExp.$1; - re3.exec(text); - result.url = RegExp.$1; - return result; + return result } +const fetchFollwerOrFollwee = (options, socket) => { + let user = options.user + let isFollowees = options.isFollowees + let grounpAmount = isFollowees ? Math.ceil(user.followeeAmount / 20) : Math.ceil(user.followerAmount / 20) + let offsets = [] + let concurrency = config.concurrency ? config.concurrency : 3 -function consoleLog(x) { - console.log(x); - return x; + for (let i = 0; i < grounpAmount; i++) { + offsets.push(i * 20); + } + + return Promise.map(offsets, offset => getFollwerOrFollwee(user, offset, isFollowees, socket), {concurrency}) + .then(function(array) { + let result = [] + + array.forEach(item => { + result = result.concat(item) + }) + + return result + }) } +const getFollwerOrFollwee = (user, offset, isFollowees, socket) => { + const params = "{\"offset\":{{counter}},\"order_by\":\"created\",\"hash_id\":\"{{hash_id}}\"}".replace(/{{counter}}/, offset).replace(/{{hash_id}}/, user.hash_id) + const options = { + method: 'POST', + url: isFollowees ? 'https://www.zhihu.com/node/ProfileFolloweesListV2' : 'https://www.zhihu.com/node/ProfileFollowersListV2', + form: { + method: "next", + params: params, + _xsrf: config._xsrf, + }, + headers: { + 'cookie': config.cookie, + 'content-type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'cache-control': 'no-cache', + 'x-requested-with': 'XMLHttpRequest', + }, + timeout: 1500, + } + + socket.emit('notice','开始抓取 ' + user.name + ' 的第 ' + offset + '-' + (offset + 20) + ' 位' + (isFollowees ? '关注的人' : '关注者')) + // debug + logger.log('开始抓取 ' + user.name + ' 的第 ' + offset + '-' + (offset + 20) + ' 位' + (isFollowees ? '关注的人' : '关注者')) + return new Promise((resolve, reject) => { + request(options, (err, res, body) => { + let tmp = [] + + if (body) { + tmp = JSON.parse(body).msg.map(parseCard) + } + + if (err) { + if (err.code == 'ETIMEDOUT' || err.code == 'ESOCKETTIMEDOUT') { + resolve(getFollwerOrFollwee(user, offset, isFollowees, socket)) + } else { + reject(err) + } + } else { + resolve(tmp) + } + }) + }) +} -module.exports = fetchFollwerOrFollwee; +module.exports = fetchFollwerOrFollwee diff --git a/src/getUser.js b/src/getUser.js index 8d393cb..17a68ca 100644 --- a/src/getUser.js +++ b/src/getUser.js @@ -1,45 +1,52 @@ -var request = require('request'); -var Promise = require('bluebird'); -var config = require('../config'); - -function getUser(userPageUrl) { - return new Promise(function(resolve, reject) { - request({ - method: 'GET', - url: userPageUrl, - headers: { - 'cookie': config.cookie - } - }, function(err, res, body) { - if (err) { - reject(err); - } else { - resolve(parse(body)); - } - }) - }); -} +"use strict" + +import request from 'request' +import Promise from 'bluebird' +import tracer from 'tracer' +import config from '../config' + +const logger = tracer.colorConsole() -function parse(html) { - var user = {}; +const parse = html => { + const reg1 = /data-name=\"current_people\">\[.*\"(\S*)\"\]<\/script>/g + const reg2 = /关注了<\/span>
\n(\d*)/g + const reg3 = /关注者<\/span>
\n(\d*)/g + const reg4 = / (.*) - 知乎<\/title>/g + //var reg4 = /<a class=\"name\" href=\"\/people\/.*\">(.*)<\/a>/g; + let user = {} - var reg1 = /data-name=\"current_people\">\[.*\"(\S*)\"\]<\/script>/g; - reg1.exec(html); - user.hash_id = RegExp.$1; + reg1.exec(html) + user.hash_id = RegExp.$1 - var reg2 = /关注了<\/span><br \/>\n<strong>(\d*)/g; - reg2.exec(html); - user.followeeAmount = parseInt(RegExp.$1); + reg2.exec(html) + user.followeeAmount = parseInt(RegExp.$1) - var reg3 = /关注者<\/span><br \/>\n<strong>(\d*)/g; - reg3.exec(html); - user.followerAmount = parseInt(RegExp.$1); + reg3.exec(html) + user.followerAmount = parseInt(RegExp.$1) + + reg4.exec(html) + user.name = RegExp.$1 + + return user +} +const getUser = userPageUrl => { + return new Promise((resolve, reject) => { + const options = { + method: 'GET', + url: userPageUrl, + headers: { + cookie: config.cookie + }, + } - //var reg4 = /<a class=\"name\" href=\"\/people\/.*\">(.*)<\/a>/g; - var reg4 = /<title> (.*) - 知乎<\/title>/g - reg4.exec(html); - user.name = RegExp.$1; - return user; + request(options, (err, res, body) => { + if (err) { + reject(err) + } else { + resolve(parse(body)) + } + }) + }) } -module.exports = getUser; +module.exports = getUser