
Commit 46477a0: Fix web-index
Signed-off-by: Evangelos Lamprou <[email protected]>
vagos committed Nov 11, 2024
1 parent f4cf478 commit 46477a0
Showing 17 changed files with 588 additions and 1 deletion.
4 changes: 4 additions & 0 deletions web-index/.gitignore
@@ -0,0 +1,4 @@
inputs
outputs

node_modules
28 changes: 28 additions & 0 deletions web-index/deps.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env bash

set -e
# core dependencies: 7zip, curl/wget, unzip, and the Node.js toolchain
pkgs='p7zip-full curl wget nodejs unzip npm'
if ! dpkg -s $pkgs >/dev/null 2>&1 ; then
    sudo apt-get install $pkgs -y
    echo 'Packages Installed'
fi

if ! dpkg -s pandoc > /dev/null 2>&1 ; then
    # the distro's pandoc (v2.2.1) ships no arm64 build, so install v3.5 from upstream
    arch=$(dpkg --print-architecture)
    wget "https://github.com/jgm/pandoc/releases/download/3.5/pandoc-3.5-1-${arch}.deb"
    sudo dpkg -i "./pandoc-3.5-1-${arch}.deb"
    rm "./pandoc-3.5-1-${arch}.deb"
fi

if ! dpkg -s nodejs > /dev/null 2>&1 ; then
    # Node.js 18+ bundles npm, so no separate npm package is needed
    curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash -
    sudo apt-get install -y nodejs
fi

cd "$(dirname "$0")/scripts" || exit 1
# Install the npm packages
npm install
npm install natural
cd -
1 change: 1 addition & 0 deletions web-index/hashes/1-grams.txt.small.hash
@@ -0,0 +1 @@
a5c0042f76e0680586ae707dd62c9f3e
1 change: 1 addition & 0 deletions web-index/hashes/2-grams.txt.small.hash
@@ -0,0 +1 @@
db8e559c78464ffc3fdf5aeede7a3b8a
1 change: 1 addition & 0 deletions web-index/hashes/3-grams.txt.small.hash
@@ -0,0 +1 @@
f1caa64726ed848a67a117a6d9879a51
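For context: each .hash file above holds a 32-hex-digit digest, consistent with MD5. A minimal verification sketch for the small run (the paths and the use of md5sum are assumptions, not part of the commit):

    # from web-index/, after run.sh --small completes
    for n in 1 2 3; do
        echo "$(cat hashes/${n}-grams.txt.small.hash)  outputs/ngrams/${n}-grams.txt" | md5sum -c -
    done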
39 changes: 39 additions & 0 deletions web-index/input.sh
@@ -0,0 +1,39 @@
#!/bin/bash

BENCH_TOP=${BENCH_TOP:-$(git rev-parse --show-toplevel)}
RESOURCES_DIR=${RESOURCES_DIR:-$BENCH_TOP/web-index}/inputs

mkdir -p "$RESOURCES_DIR"

cp stopwords.txt "$RESOURCES_DIR" # TODO: Grab this from the atlas server

if [ "$1" = "--small" ]; then
if [[ ! -f "$RESOURCES_DIR/wikipedia-small.tar.gz" ]]; then
# 1000 entries
echo "Downloading the small dataset."
wget -O $RESOURCES_DIR/wikipedia-small.tar.gz https://atlas-group.cs.brown.edu/data/wikipedia/input_small/articles.tar.gz --no-check-certificate
wget -O $RESOURCES_DIR/index_small.txt https://atlas-group.cs.brown.edu/data/wikipedia/input_small/index.txt --no-check-certificate
fi
else
if [[ ! -f "$RESOURCES_DIR/wikipedia.tar.gz" ]]; then
# full dataset
echo "Downloading the full dataset. Caution!! Extracted size >200GB"
wget -O $RESOURCES_DIR/wikipedia.tar.gz https://atlas-group.cs.brown.edu/data/wikipedia/input/articles.tar.gz --no-check-certificate
wget -O $RESOURCES_DIR/index.txt https://atlas-group.cs.brown.edu/data/wikipedia/input/index.txt --no-check-certificate
fi
fi

if [[ ! -d "$RESOURCES_DIR/articles" ]]; then
    if [ "$1" = "--small" ]; then
        # 1000 entries
        echo "Extracting the small dataset."
        tar -xf "$RESOURCES_DIR/wikipedia-small.tar.gz" -C "$RESOURCES_DIR"
    else
        # full dataset
        echo "Extracting the full dataset. Caution: extracted size exceeds 200GB."
        tar -xf "$RESOURCES_DIR/wikipedia.tar.gz" -C "$RESOURCES_DIR"
    fi
else
    echo "Skipping extraction: $RESOURCES_DIR/articles already exists."
    echo "Remove it with rm -r $RESOURCES_DIR/articles and rerun this script to re-extract."
fi
29 changes: 29 additions & 0 deletions web-index/run.sh
@@ -0,0 +1,29 @@
#!/usr/bin/env bash

cd "$(dirname "$0")"

directory_path="inputs/articles"

if [ ! -d "$directory_path" ]; then
    echo "Error: directory $directory_path does not exist. Run input.sh first." >&2
    exit 1
fi

# ensure a local ./tmp directory exists for sorting
mkdir -p ./tmp
export TMPDIR=$PWD/tmp

INPUTS="$PWD/inputs"
OUTPUT_BASE="$PWD/outputs/ngrams"

if [[ "$@" == *"--small"* ]]; then
export INPUT_FILE="$INPUTS/index_small.txt"
else
export INPUT_FILE="$INPUTS/index.txt"
fi

mkdir -p "$OUTPUT_BASE"

echo "web-index"
time "$SHELL" ./scripts/ngrams.sh "$OUTPUT_BASE"
echo $?
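For orientation, an assumed end-to-end sequence inferred from the scripts in this commit (not stated anywhere in it):

    ./deps.sh          # install system and npm dependencies
    ./input.sh --small # download and extract the small dataset
    ./run.sh --small   # build the index; results land in outputs/ngrams/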
9 changes: 9 additions & 0 deletions web-index/scripts/bigrams_aux.sh
@@ -0,0 +1,9 @@
#!/bin/bash
# Pair every word with its successor: fan the stream into two fifos,
# drop the last word from one copy (sed '$d') and the first from the
# other (tail -n +2), then paste the two streams side by side.
mkfifo s2 s3 2> /dev/null

sed '$d' s2 > s3 &
tee s2 |
tail -n +2 |
paste s3 -
rm -f s2 s3
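To make the fifo pairing concrete, an equivalent fifo-free sketch over a hypothetical three-word input:

    printf 'a\nb\nc\n' > words
    paste <(sed '$d' words) <(tail -n +2 words)
    # a   b
    # b   c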
6 changes: 6 additions & 0 deletions web-index/scripts/extract_text.sh
@@ -0,0 +1,6 @@
#!/bin/bash
# Read article paths from stdin; convert each HTML file to plain ASCII text.
while read -r line
do
    cat "$line" |
    iconv -c -t ascii//TRANSLIT |
    pandoc +RTS -K64m -RTS --from html --to plain --quiet
done
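A hedged example of feeding this script, mirroring the sed "s#^#$WIKI/#" stage in ngrams.sh below (the file names are illustrative):

    printf '%s\n' inputs/articles/page1.html inputs/articles/page2.html | ./extract_text.sh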
100 changes: 100 additions & 0 deletions web-index/scripts/grep-url.js
@@ -0,0 +1,100 @@
#!/usr/bin/env node
// TODO: use node's URL to parse and emit a URL in normal form
// URL validation as a stream transformer
//
// Contains code by Diego Perini, as compared in
// http://mathiasbynens.be/demo/url-regex
//
// Notes on possible differences from a standard/generic validation:
//
// - the utf-8 char class takes the full Unicode range into consideration
// - TLDs have been made mandatory, so single names like "localhost" fail
// - protocols have been restricted to ftp, http and https only as requested

// NOTE: re_weburl is kept for reference only; the stream below uses nregex().
var re_weburl = new RegExp(
"^" +
// protocol identifier (optional)
// short syntax // still required
"(?:(?:(?:https?|ftp):)?\\/\\/)" +
// user:pass BasicAuth (optional)
"(?:\\S+(?::\\S*)?@)?" +
"(?:" +
// IP address exclusion
// private & local networks
"(?!(?:10|127)(?:\\.\\d{1,3}){3})" +
"(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" +
"(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" +
// IP address dotted notation octets
// excludes loopback network 0.0.0.0
// excludes reserved space >= 224.0.0.0
// excludes network & broadcast addresses
// (first & last IP address of each class)
"(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" +
"(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" +
"(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" +
"|" +
// host & domain names, may end with dot
// can be replaced by a shortest alternative
// (?![-_])(?:[-\\w\\u00a1-\\uffff]{0,63}[^-_]\\.)+
"(?:" +
"(?:" +
"[a-z0-9\\u00a1-\\uffff]" +
"[a-z0-9\\u00a1-\\uffff_-]{0,62}" +
")?" +
"[a-z0-9\\u00a1-\\uffff]\\." +
")+" +
// TLD identifier name, may end with dot
"(?:[a-z\\u00a1-\\uffff]{2,}\\.?)" +
")" +
// port number (optional)
"(?::\\d{2,5})?" +
// resource path (optional)
"(?:[/?#]\\S*)?" +
"$", "i"
);

let nregex = options => {
    options = {
        strict: true,
        ...options
    };

    const tlds = require('./tlds');
    const v4 = '(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)(?:\\.(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)){3}';
    const protocol = `(?:(?:[a-z]+:)?//)${options.strict ? '' : '?'}`;
    const auth = '(?:\\S+(?::\\S*)?@)?';
    const ip = v4;
    const host = '(?:(?:[a-z\\u00a1-\\uffff0-9][-_]*)*[a-z\\u00a1-\\uffff0-9]+)';
    const domain = '(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*';
    const tld = `(?:\\.${options.strict ? '(?:[a-z\\u00a1-\\uffff]{2,})' : `(?:${tlds.sort((a, b) => b.length - a.length).join('|')})`})\\.?`;
    const port = '(?::\\d{2,5})?';
    const path = '(?:[/?#][^\\s"]*)?';
    const regex = `(?:${protocol}|www\\.)${auth}(?:localhost|${ip}|${host}${domain}${tld})${port}${path}`;

    return options.exact ? new RegExp(`(?:^${regex}$)`, 'i') : new RegExp(regex, 'ig');
};

var readline = require('readline');

var rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
    terminal: false
});

rl.on('line', function (line) {
    let r = line.match(nregex());
    if (r) {
        console.log(r.join('\n'));
    }
});

// console.log('foo http://github.com bar //google.com'.match(nregex()));
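A sketch of stand-alone use (this script is not referenced by the ngrams.sh shown below, the input line is illustrative, and the sibling tlds module it requires is assumed present):

    echo 'see https://example.com and //github.com/jgm/pandoc' | node scripts/grep-url.js
    # https://example.com
    # //github.com/jgm/pandoc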
48 changes: 48 additions & 0 deletions web-index/scripts/ngrams.sh
@@ -0,0 +1,48 @@
#!/bin/bash
REPO_TOP=$(git rev-parse --show-toplevel)
export TEST_BASE=$REPO_TOP/web-index
export SCRIPT_DIR="$TEST_BASE"/scripts
export WEB_INDEX_DIR="$TEST_BASE"/inputs
export WIKI="$TEST_BASE"/inputs/articles

cd "$(dirname "$0")" || exit 1

output_base="$1"

rm -f {1,2,3}grams
mkfifo {1,2,3}grams

extract_text="$SCRIPT_DIR/extract_text.sh"
bigrams_aux="$SCRIPT_DIR/bigrams_aux.sh"
trigrams_aux="$SCRIPT_DIR/trigrams_aux.sh"

cat "$INPUT_FILE" |
sed "s#^#$WIKI/#" |
"$extract_text" |
tr -cs A-Za-z '\n' |
tr A-Z a-z |
grep -vwFf "$WEB_INDEX_DIR/stopwords.txt" |
"$SCRIPT_DIR/stem-words.js" |
tee 3grams 2grams 1grams > /dev/null &

cat 1grams |
sort |
uniq -c |
sort -rn > "$output_base/1-grams.txt" &

cat 2grams |
tr -cs A-Za-z '\n' |
tr A-Z a-z |
"$bigrams_aux" |
sort |
uniq -c |
sort -rn > "$output_base/2-grams.txt" &

cat 3grams |
tr -cs A-Za-z '\n' |
tr A-Z a-z |
"$trigrams_aux" |
sort |
uniq -c |
sort -rn > "$output_base/3-grams.txt"

rm -f {1,2,3}grams
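For orientation, a hedged direct invocation from web-index/ (run.sh normally provides these; the paths are assumptions):

    mkdir -p "$PWD/outputs/ngrams"
    export INPUT_FILE="$PWD/inputs/index_small.txt"
    ./scripts/ngrams.sh "$PWD/outputs/ngrams"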
