forked from TheaMorin/us-to-uk-json-steno
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
109 lines (99 loc) · 3.37 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import fs from 'fs'
import { parseString } from 'xml2js'
// All queries in XML, one query per blank-line-separated chunk.
// The separator group MUST be non-capturing: a capturing group in
// String.prototype.split() injects the captured separators ('\n', '\r\n')
// into the result array, polluting xmlQueries with bogus entries.
const xmlQueries =
  fs.readFileSync('./assets/queries.txt', 'utf8')
    .split(/(?:\r\n|\n){2}/g)
// Array of stop words (strings). Trim each line so Windows line endings
// ('\r') don't leak into the words, and drop empty lines (e.g. trailing
// newline at EOF) so membership tests aren't poisoned.
const stopWords =
  fs.readFileSync('./assets/stopwords.txt', 'utf8')
    .split('\n')
    .map(word => word.trim())
    .filter(word => word)
// Array of { time: 'string', tweet: 'string' } parsed from tab-separated
// lines; entries with no tweet text (malformed or empty lines) are dropped.
const tweets =
  fs.readFileSync('./assets/tweets.txt', 'utf8')
    .split('\n')
    .map(line => line.split('\t'))
    .map(([ time, tweet ]) => ({ time, tweet }))
    .filter(tweet => tweet.tweet)
console.info(`Loaded ${tweets.length} tweets, ${stopWords.length} stop words,
and ${xmlQueries.length} lines in the queries file.`)
// Do everything in here: parse queries, filter tweets, build the inverted
// index, and write the four JSON artifacts to ./assets.
async function main () {
  // Parse all the XML strings. new Promise is the correct adapter here
  // because xml2js's parseString exposes a callback API.
  const queryPromises =
    xmlQueries.map(query =>
      new Promise((resolve, reject) => {
        parseString(query, (err, result) => {
          // `return` so resolve(undefined) doesn't also fire after a
          // rejection (the original fell through to resolve on error).
          if (err) return reject(err)
          resolve(result)
        })
      })
    )
  // O(1) membership checks; Array#indexOf inside the per-token filter
  // was an accidental O(tokens × stopWords) scan.
  const stopWordSet = new Set(stopWords)
  // Tokenization and stop word removal: keep letters and spaces only,
  // lowercase, split on single spaces, drop empties and stop words.
  const filterSentence = sentence => sentence.trim()
    .replace(/[^a-zA-Z ]/g, '').toLowerCase()
    .split(' ')
    .filter(word => word && !stopWordSet.has(word))
  console.info('Processing queries from XML to JSON, removing stop words…')
  const queries =
    (await Promise.all(queryPromises)) // parsing...
      .filter(x => x) // gets rid of null entries
      .map(x => x.top) // remove the <top> tag
      .map(({ num, title, querytime, querytweettime }) => (
        // Get rid of spacing for all the properties.
        // num[0] looks like " Number: MB048 " — substring(10) skips the
        // " Number: " prefix, parseInt reads the digits after "MB".
        { num: parseInt(num[0].trim().substring(10), 10) // MB048
        , tokens: filterSentence(title[0])
        , time: querytime[0].trim()
        , tweetTime: querytweettime[0].trim()
        }
      ))
  console.info(`There are ${queries.length} valid queries.`)
  console.info('Filtering stop words from tweets…')
  // { time, tweet: [tokens] } — tweet becomes an array of filtered words.
  const filteredTweets =
    tweets.map(({ time, tweet }) => (
      { time
      , tweet: filterSentence(tweet)
      }
    ))
  console.info('Building index for tweet vocabulary…')
  // Inverted index: word -> { tweet id (time): occurrence count }.
  const tokens =
    filteredTweets.reduce((index, { time, tweet }) => {
      tweet.forEach(word => {
        if (!index[word]) {
          index[word] = { [time]: 1 }
        } else if (index[word][time]) {
          index[word][time] += 1
        } else {
          index[word][time] = 1
        }
      })
      return index
    }, {})
  console.info(`Vocabulary has ${Object.keys(tokens).length} words`)
  // Array of { document id (tweettime): num words } — document lengths
  // for scoring (e.g. TF-IDF normalization).
  const wordsInTweets =
    filteredTweets.reduce((twitterWords, { time, tweet }) => {
      twitterWords[time] = tweet.length
      return twitterWords
    }, {})
  console.log('Writing files…')
  fs.writeFile('./assets/queries.json', JSON.stringify(queries), 'utf8', e => {
    if (e) throw e
    console.log('Successfully wrote queries.json')
  })
  fs.writeFile('./assets/index.json', JSON.stringify(tokens), 'utf8', err => {
    if (err) throw err
    console.log('Successfully wrote index.json')
  })
  fs.writeFile('./assets/tweets.json', JSON.stringify(tweets), 'utf8', err => {
    if (err) throw err
    console.log('Successfully wrote tweets.json')
  })
  fs.writeFile('./assets/words_per_tweet.json',
    JSON.stringify(wordsInTweets), 'utf8', err => {
      if (err) throw err
      console.log('Successfully wrote words_per_tweet.json')
    })
}
main()