-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Evangelos Lamprou <[email protected]>
- Loading branch information
Showing
17 changed files
with
588 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
inputs | ||
outputs | ||
|
||
node_modules |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#!/usr/bin/env bash
# Install the system packages, pandoc and the npm modules needed by the
# scripts in ./scripts. Relies on dpkg/apt-get and on sudo.

set -e

# 7zip
pkgs=(p7zip-full curl wget nodejs unzip npm)
if ! dpkg -s "${pkgs[@]}" >/dev/null 2>&1; then
  sudo apt-get install -y "${pkgs[@]}"
  echo 'Packages Installed'
fi

if ! dpkg -s pandoc >/dev/null 2>&1; then
  # since pandoc v.2.2.1 does not support arm64, we use v.3.5
  pandoc_deb="pandoc-3.5-1-$(dpkg --print-architecture).deb"
  # -O pins the output name: without it, a stale copy in the current
  # directory makes wget save to "<name>.1" and the old file gets installed.
  wget -O "./${pandoc_deb}" \
    "https://github.com/jgm/pandoc/releases/download/3.5/${pandoc_deb}"
  # Remove the downloaded package even when installation fails; with set -e
  # a bare failing dpkg would exit before the cleanup.
  if ! sudo dpkg -i "./${pandoc_deb}"; then
    rm -f -- "./${pandoc_deb}"
    exit 1
  fi
  rm -- "./${pandoc_deb}"
fi

# NOTE(review): nodejs is also part of the apt package list above, so by the
# time this check runs it is normally already installed and this branch is
# skipped. Confirm which Node.js source is actually intended.
if ! dpkg -s nodejs >/dev/null 2>&1; then
  # node version 18+ does not need external npm
  curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash -
  sudo apt-get install -y nodejs
fi

# Quoted so that a checkout path containing spaces still resolves.
cd "$(dirname "$0")/scripts" || exit 1
npm install
# Install the npm packages
npm install natural
cd -
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
a5c0042f76e0680586ae707dd62c9f3e |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
db8e559c78464ffc3fdf5aeede7a3b8a |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
f1caa64726ed848a67a117a6d9879a51 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#!/bin/bash
# Download and unpack the Wikipedia input data for the web-index benchmark.
#
# Usage: <this script> [--small]
#   --small   fetch the 1000-entry dataset instead of the full one
#
# Environment:
#   BENCH_TOP      repository root (default: git rev-parse --show-toplevel)
#   RESOURCES_DIR  parent directory; "/inputs" is always appended to it
#                  (default: $BENCH_TOP/web-index)

BENCH_TOP=${BENCH_TOP:-$(git rev-parse --show-toplevel)}
RESOURCES_DIR=${RESOURCES_DIR:-$BENCH_TOP/web-index}/inputs

# Create the destination before copying into it: if the directory does not
# exist yet, cp would create a regular file named "inputs" instead, and the
# directory could then no longer be created.
mkdir -p "$RESOURCES_DIR" || exit 1

cp stopwords.txt "$RESOURCES_DIR" # TODO: Grab this from the atlas server

base_url="https://atlas-group.cs.brown.edu/data/wikipedia"

# Select archive name, index name, remote directory and log label once, so
# the download and extraction steps below are shared by both dataset sizes.
if [[ "$1" == "--small" ]]; then
  # 1000 entries
  archive="$RESOURCES_DIR/wikipedia-small.tar.gz"
  index_file="$RESOURCES_DIR/index_small.txt"
  remote_dir="$base_url/input_small"
  label="small dataset."
else
  # full dataset
  archive="$RESOURCES_DIR/wikipedia.tar.gz"
  index_file="$RESOURCES_DIR/index.txt"
  remote_dir="$base_url/input"
  label="full dataset. Caution!! Extracted size >200GB"
fi

if [[ ! -f "$archive" ]]; then
  echo "Downloading the $label"
  # NOTE(review): --no-check-certificate disables TLS verification of the
  # download host; kept as in the original, but worth revisiting.
  wget -O "$archive" "$remote_dir/articles.tar.gz" --no-check-certificate
  wget -O "$index_file" "$remote_dir/index.txt" --no-check-certificate
fi

if [[ ! -d "$RESOURCES_DIR/articles" ]]; then
  echo "Extracting the $label"
  tar -xf "$archive" -C "$RESOURCES_DIR"
else
  echo "Did not extract data because of existing data."
  echo "Please rm -r $RESOURCES_DIR/articles manually and rerun this script."
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#!/usr/bin/env bash
# Run the n-gram pipeline (scripts/ngrams.sh) over the downloaded articles,
# print its wall-clock time and then its exit status.
#
# Usage: <this script> [--small]
#   --small   use inputs/index_small.txt instead of inputs/index.txt

# Everything below uses paths relative to this script; stop if the cd fails
# so that ./tmp and ./outputs are never created in an unrelated directory.
cd "$(dirname "$0")" || exit 1

directory_path="inputs/articles"

if [[ ! -d "$directory_path" ]]; then
  echo "Error: Directory does not exist."
  exit 1
fi

# ensure a local ./tmp directory exists for sorting
mkdir -p ./tmp
export TMPDIR=$PWD/tmp

INPUTS="$PWD/inputs"
OUTPUT_BASE="$PWD/outputs/ngrams"

# "$*" joins all arguments into a single string, so the flag is recognised
# at any position on the command line.
if [[ "$*" == *"--small"* ]]; then
  export INPUT_FILE="$INPUTS/index_small.txt"
else
  export INPUT_FILE="$INPUTS/index.txt"
fi

mkdir -p "$OUTPUT_BASE"

echo "web-index"
# $SHELL is left unquoted on purpose so a value carrying extra arguments
# still word-splits the way it did before.
time $SHELL ./scripts/ngrams.sh "$OUTPUT_BASE"
echo $?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# Read one token per line on stdin and print every pair of adjacent lines,
# tab-separated, on stdout (line i followed by line i+1).
#
# The input is duplicated with tee: one copy loses its last line (sed '$d'),
# the other loses its first line (tail -n +2), and paste joins them row by row.

# Keep the FIFOs in a private temporary directory. Fixed names in the current
# directory collide when two instances run side by side or when a previous
# run was interrupted and left its FIFOs behind.
fifo_dir=$(mktemp -d) || exit 1
trap 'rm -rf -- "$fifo_dir"' EXIT
s2="$fifo_dir/s2"
s3="$fifo_dir/s3"
mkfifo "$s2" "$s3" || exit 1

sed '$d' "$s2" > "$s3" &
tee "$s2" |
  tail -n +2 |
  paste "$s3" -
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Read one HTML file path per line on stdin and print each file's content
# converted to plain text: iconv transliterates to ASCII (dropping what it
# cannot convert), then pandoc renders the HTML as plain text.
while IFS= read -r line
do
  # The path is quoted and passed as a redirection. An unquoted, empty $line
  # used to leave cat without arguments, so it consumed the rest of the path
  # list from the loop's stdin; paths with spaces were split into pieces.
  iconv -c -t ascii//TRANSLIT < "$line" |
    pandoc +RTS -K64m -RTS --from html --to plain --quiet
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
#!/usr/bin/env node
// TODO: use node's URL to parse and emit a URL in normal form
// URL validation as a stream transformer
//
// Contains code by Diego Perini, as compared in
// http://mathiasbynens.be/demo/url-regex
//
// Notes on possible differences from a standard/generic validation:
//
// - utf-8 char class take in consideration the full Unicode range
// - TLDs have been made mandatory so single names like "localhost" fails
// - protocols have been restricted to ftp, http and https only as requested

// Anchored whole-string URL validator. It is defined here but not referenced
// by the line handler below, which uses nregex() instead.
var re_weburl = new RegExp(
  "^" +
    // protocol identifier (optional)
    // short syntax // still required
    "(?:(?:(?:https?|ftp):)?\\/\\/)" +
    // user:pass BasicAuth (optional)
    "(?:\\S+(?::\\S*)?@)?" +
    "(?:" +
      // IP address exclusion
      // private & local networks
      "(?!(?:10|127)(?:\\.\\d{1,3}){3})" +
      "(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" +
      "(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" +
      // IP address dotted notation octets
      // excludes loopback network 0.0.0.0
      // excludes reserved space >= 224.0.0.0
      // excludes network & broadcast addresses
      // (first & last IP address of each class)
      "(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" +
      "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" +
      "(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" +
    "|" +
      // host & domain names, may end with dot
      // can be replaced by a shortest alternative
      // (?![-_])(?:[-\\w\\u00a1-\\uffff]{0,63}[^-_]\\.)+
      "(?:" +
        "(?:" +
          "[a-z0-9\\u00a1-\\uffff]" +
          "[a-z0-9\\u00a1-\\uffff_-]{0,62}" +
        ")?" +
        "[a-z0-9\\u00a1-\\uffff]\\." +
      ")+" +
      // TLD identifier name, may end with dot
      "(?:[a-z\\u00a1-\\uffff]{2,}\\.?)" +
    ")" +
    // port number (optional)
    "(?::\\d{2,5})?" +
    // resource path (optional)
    "(?:[/?#]\\S*)?" +
  "$", "i"
);

// Build a URL-matching RegExp.
//
// options.strict (default true): when true, a "//" (optionally preceded by a
//   scheme) or a leading "www." is required and any TLD of two or more
//   letters is accepted; when false, the "//" part becomes optional and the
//   TLD must be one of the entries exported by ./tlds.
// options.exact: when truthy, returns an anchored, case-insensitive regex for
//   validating a whole string; otherwise returns a global, case-insensitive
//   regex for finding every URL inside a longer text.
let nregex = options => {
  options = {
    strict: true,
    ...options
  };

  const tlds = require('./tlds');
  const v4 = '(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)(?:\\.(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)){3}';
  const protocol = `(?:(?:[a-z]+:)?//)${options.strict ? '' : '?'}`;
  const auth = '(?:\\S+(?::\\S*)?@)?';
  const ip = v4;
  const host = '(?:(?:[a-z\\u00a1-\\uffff0-9][-_]*)*[a-z\\u00a1-\\uffff0-9]+)';
  const domain = '(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*';
  // Longest TLDs first so the alternation prefers the longest match. The
  // list is copied before sorting so the array owned by ./tlds is not
  // reordered in place.
  const tldBody = options.strict
    ? '(?:[a-z\\u00a1-\\uffff]{2,})'
    : `(?:${[...tlds].sort((a, b) => b.length - a.length).join('|')})`;
  const tld = `(?:\\.${tldBody})\\.?`;
  const port = '(?::\\d{2,5})?';
  const path = '(?:[/?#][^\\s"]*)?';
  const regex = `(?:${protocol}|www\\.)${auth}(?:localhost|${ip}|${host}${domain}${tld})${port}${path}`;

  return options.exact ? new RegExp(`(?:^${regex}$)`, 'i') : new RegExp(regex, 'ig');
};

var readline = require('readline');

var rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  terminal: false
});

// Compiled on the first input line and reused for every later line instead
// of being rebuilt per line. Reusing a global regex with String#match is
// safe: match() starts from index 0 on every call.
let urlPattern = null;

// For each input line, print every URL found in it, one per output line.
rl.on('line', function (line) {
  if (urlPattern === null) {
    urlPattern = nregex();
  }
  const matches = line.match(urlPattern);
  if (matches) {
    for (const url of matches) {
      console.log(url);
    }
  } else {
    // NOTE(review): lines without any URL emit this literal placeholder;
    // kept for output compatibility, but confirm whether it is intended.
    console.log("pizza");
  }
});

// console.log('foo http://github.com bar //google.com'.match(nregex()));
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
# Compute 1-, 2- and 3-gram frequency tables for the articles listed in
# $INPUT_FILE (one path per line, relative to the articles directory).
#
# Usage: <this script> OUTPUT_DIR
#   Writes OUTPUT_DIR/1-grams.txt, 2-grams.txt and 3-grams.txt, each sorted
#   by descending count.
#
# Reads INPUT_FILE from the environment; exports TEST_BASE, SCRIPT_DIR,
# WEB_INDEX_DIR and WIKI.
REPO_TOP=$(git rev-parse --show-toplevel)
export TEST_BASE="$REPO_TOP/web-index"
export SCRIPT_DIR="$TEST_BASE/scripts"
export WEB_INDEX_DIR="$TEST_BASE/inputs"
export WIKI="$TEST_BASE/inputs/articles"

cd "$(dirname "$0")" || exit 1

output_base="$1"

# One FIFO per n-gram size; the shared word stream is fanned out to them.
rm -f {1,2,3}grams
mkfifo {1,2,3}grams

extract_text="$SCRIPT_DIR/extract_text.sh"
bigrams_aux="$SCRIPT_DIR/bigrams_aux.sh"
trigrams_aux="$SCRIPT_DIR/trigrams_aux.sh"

# Producer: article paths -> plain text -> lowercase words, one per line,
# minus stopwords -> stemmer output, duplicated into the three FIFOs.
cat "$INPUT_FILE" |
  sed "s#^#$WIKI/#" |
  "$extract_text" |
  tr -cs A-Za-z '\n' |
  tr A-Z a-z |
  grep -vwFf "$WEB_INDEX_DIR/stopwords.txt" |
  "$SCRIPT_DIR/stem-words.js" |
  tee 3grams 2grams 1grams > /dev/null &

cat 1grams |
  sort |
  uniq -c |
  sort -rn > "$output_base/1-grams.txt" &

cat 2grams |
  tr -cs A-Za-z '\n' |
  tr A-Z a-z |
  "$bigrams_aux" |
  sort |
  uniq -c |
  sort -rn > "$output_base/2-grams.txt" &

cat 3grams |
  tr -cs A-Za-z '\n' |
  tr A-Z a-z |
  "$trigrams_aux" |
  sort |
  uniq -c |
  sort -rn > "$output_base/3-grams.txt"

# The 1-gram and 2-gram pipelines run in the background and may still be
# sorting or writing when the 3-gram pipeline finishes; wait for them so the
# output files are complete before the script cleans up and exits.
wait

rm -f {1,2,3}grams
Oops, something went wrong.