
Commit 46477a0: Fix web-index
Signed-off-by: Evangelos Lamprou <[email protected]>
vagos committed Nov 11, 2024
1 parent f4cf478 commit 46477a0
Showing 17 changed files with 588 additions and 1 deletion.
4 changes: 4 additions & 0 deletions web-index/.gitignore
@@ -0,0 +1,4 @@
inputs
outputs

node_modules
28 changes: 28 additions & 0 deletions web-index/deps.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env bash

set -e
# core dependencies: 7zip, curl/wget, unzip, and the Node.js toolchain
pkgs='p7zip-full curl wget nodejs unzip npm'
if ! dpkg -s $pkgs >/dev/null 2>&1 ; then
    sudo apt-get install $pkgs -y
    echo 'Packages Installed'
fi

if ! dpkg -s pandoc > /dev/null 2>&1 ; then
    # the distro's pandoc (v2.2.1) ships no arm64 build, so install v3.5 from upstream
    arch=$(dpkg --print-architecture)
    wget "https://github.com/jgm/pandoc/releases/download/3.5/pandoc-3.5-1-${arch}.deb"
    sudo dpkg -i "./pandoc-3.5-1-${arch}.deb"
    rm "./pandoc-3.5-1-${arch}.deb"
fi

if ! dpkg -s nodejs > /dev/null 2>&1 ; then
    # Node.js 18+ bundles npm, so no separate npm package is needed
    curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash -
    sudo apt-get install -y nodejs
fi

cd "$(dirname "$0")/scripts" || exit 1
# Install the npm packages
npm install
npm install natural
cd -
1 change: 1 addition & 0 deletions web-index/hashes/1-grams.txt.small.hash
@@ -0,0 +1 @@
a5c0042f76e0680586ae707dd62c9f3e
1 change: 1 addition & 0 deletions web-index/hashes/2-grams.txt.small.hash
@@ -0,0 +1 @@
db8e559c78464ffc3fdf5aeede7a3b8a
1 change: 1 addition & 0 deletions web-index/hashes/3-grams.txt.small.hash
@@ -0,0 +1 @@
f1caa64726ed848a67a117a6d9879a51
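For context: each .hash file above holds a 32-hex-digit digest, consistent with MD5. A minimal verification sketch for the small run (the paths and the use of md5sum are assumptions, not part of the commit):

    # from web-index/, after run.sh --small completes
    for n in 1 2 3; do
        echo "$(cat hashes/${n}-grams.txt.small.hash)  outputs/ngrams/${n}-grams.txt" | md5sum -c -
    done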
39 changes: 39 additions & 0 deletions web-index/input.sh
@@ -0,0 +1,39 @@
#!/bin/bash

BENCH_TOP=${BENCH_TOP:-$(git rev-parse --show-toplevel)}
RESOURCES_DIR=${RESOURCES_DIR:-$BENCH_TOP/web-index}/inputs

mkdir -p "$RESOURCES_DIR"

cp stopwords.txt "$RESOURCES_DIR" # TODO: Grab this from the atlas server

if [ "$1" = "--small" ]; then
if [[ ! -f "$RESOURCES_DIR/wikipedia-small.tar.gz" ]]; then
# 1000 entries
echo "Downloading the small dataset."
wget -O $RESOURCES_DIR/wikipedia-small.tar.gz https://atlas-group.cs.brown.edu/data/wikipedia/input_small/articles.tar.gz --no-check-certificate
wget -O $RESOURCES_DIR/index_small.txt https://atlas-group.cs.brown.edu/data/wikipedia/input_small/index.txt --no-check-certificate
fi
else
if [[ ! -f "$RESOURCES_DIR/wikipedia.tar.gz" ]]; then
# full dataset
echo "Downloading the full dataset. Caution!! Extracted size >200GB"
wget -O $RESOURCES_DIR/wikipedia.tar.gz https://atlas-group.cs.brown.edu/data/wikipedia/input/articles.tar.gz --no-check-certificate
wget -O $RESOURCES_DIR/index.txt https://atlas-group.cs.brown.edu/data/wikipedia/input/index.txt --no-check-certificate
fi
fi

if [[ ! -d "$RESOURCES_DIR/articles" ]]; then
    if [ "$1" = "--small" ]; then
        # 1000 entries
        echo "Extracting the small dataset."
        tar -xf "$RESOURCES_DIR/wikipedia-small.tar.gz" -C "$RESOURCES_DIR"
    else
        # full dataset
        echo "Extracting the full dataset. Caution: extracted size exceeds 200GB."
        tar -xf "$RESOURCES_DIR/wikipedia.tar.gz" -C "$RESOURCES_DIR"
    fi
else
    echo "Skipping extraction: $RESOURCES_DIR/articles already exists."
    echo "Remove it with rm -r $RESOURCES_DIR/articles and rerun this script to re-extract."
fi
29 changes: 29 additions & 0 deletions web-index/run.sh
@@ -0,0 +1,29 @@
#!/usr/bin/env bash

cd "$(dirname "$0")"

directory_path="inputs/articles"

if [ ! -d "$directory_path" ]; then
    echo "Error: directory $directory_path does not exist. Run input.sh first." >&2
    exit 1
fi

# ensure a local ./tmp directory exists for sorting
mkdir -p ./tmp
export TMPDIR=$PWD/tmp

INPUTS="$PWD/inputs"
OUTPUT_BASE="$PWD/outputs/ngrams"

if [[ "$@" == *"--small"* ]]; then
export INPUT_FILE="$INPUTS/index_small.txt"
else
export INPUT_FILE="$INPUTS/index.txt"
fi

mkdir -p "$OUTPUT_BASE"

echo "web-index"
time "$SHELL" ./scripts/ngrams.sh "$OUTPUT_BASE"
echo $?
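For orientation, an assumed end-to-end sequence inferred from the scripts in this commit (not stated anywhere in it):

    ./deps.sh          # install system and npm dependencies
    ./input.sh --small # download and extract the small dataset
    ./run.sh --small   # build the index; results land in outputs/ngrams/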
9 changes: 9 additions & 0 deletions web-index/scripts/bigrams_aux.sh
@@ -0,0 +1,9 @@
#!/bin/bash
# Pair every word with its successor: fan the stream into two fifos,
# drop the last word from one copy (sed '$d') and the first from the
# other (tail -n +2), then paste the two streams side by side.
mkfifo s2 s3 2> /dev/null

sed '$d' s2 > s3 &
tee s2 |
tail -n +2 |
paste s3 -
rm -f s2 s3
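To make the fifo pairing concrete, an equivalent fifo-free sketch over a hypothetical three-word input:

    printf 'a\nb\nc\n' > words
    paste <(sed '$d' words) <(tail -n +2 words)
    # a   b
    # b   c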
6 changes: 6 additions & 0 deletions web-index/scripts/extract_text.sh
@@ -0,0 +1,6 @@
#!/bin/bash
# Read article paths from stdin; convert each HTML file to plain ASCII text.
while read -r line
do
    cat "$line" |
    iconv -c -t ascii//TRANSLIT |
    pandoc +RTS -K64m -RTS --from html --to plain --quiet
done
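A hedged example of feeding this script, mirroring the sed "s#^#$WIKI/#" stage in ngrams.sh below (the file names are illustrative):

    printf '%s\n' inputs/articles/page1.html inputs/articles/page2.html | ./extract_text.sh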
100 changes: 100 additions & 0 deletions web-index/scripts/grep-url.js
@@ -0,0 +1,100 @@
#!/usr/bin/env node
// TODO: use node's URL to parse and emit a URL in normal form
// URL validation as a stream transformer
//
// Contains code by Diego Perini, as compared in
// http://mathiasbynens.be/demo/url-regex
//
// Notes on possible differences from a standard/generic validation:
//
// - the utf-8 char class takes the full Unicode range into consideration
// - TLDs have been made mandatory, so single names like "localhost" fail
// - protocols have been restricted to ftp, http and https only as requested

// NOTE: re_weburl is kept for reference only; the stream below uses nregex().
var re_weburl = new RegExp(
"^" +
// protocol identifier (optional)
// short syntax // still required
"(?:(?:(?:https?|ftp):)?\\/\\/)" +
// user:pass BasicAuth (optional)
"(?:\\S+(?::\\S*)?@)?" +
"(?:" +
// IP address exclusion
// private & local networks
"(?!(?:10|127)(?:\\.\\d{1,3}){3})" +
"(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" +
"(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" +
// IP address dotted notation octets
// excludes loopback network 0.0.0.0
// excludes reserved space >= 224.0.0.0
// excludes network & broadcast addresses
// (first & last IP address of each class)
"(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" +
"(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" +
"(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" +
"|" +
// host & domain names, may end with dot
// can be replaced by a shortest alternative
// (?![-_])(?:[-\\w\\u00a1-\\uffff]{0,63}[^-_]\\.)+
"(?:" +
"(?:" +
"[a-z0-9\\u00a1-\\uffff]" +
"[a-z0-9\\u00a1-\\uffff_-]{0,62}" +
")?" +
"[a-z0-9\\u00a1-\\uffff]\\." +
")+" +
// TLD identifier name, may end with dot
"(?:[a-z\\u00a1-\\uffff]{2,}\\.?)" +
")" +
// port number (optional)
"(?::\\d{2,5})?" +
// resource path (optional)
"(?:[/?#]\\S*)?" +
"$", "i"
);

let nregex = options => {
    options = {
        strict: true,
        ...options
    };

    const tlds = require('./tlds');
    const v4 = '(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)(?:\\.(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)){3}';
    const protocol = `(?:(?:[a-z]+:)?//)${options.strict ? '' : '?'}`;
    const auth = '(?:\\S+(?::\\S*)?@)?';
    const ip = v4;
    const host = '(?:(?:[a-z\\u00a1-\\uffff0-9][-_]*)*[a-z\\u00a1-\\uffff0-9]+)';
    const domain = '(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*';
    const tld = `(?:\\.${options.strict ? '(?:[a-z\\u00a1-\\uffff]{2,})' : `(?:${tlds.sort((a, b) => b.length - a.length).join('|')})`})\\.?`;
    const port = '(?::\\d{2,5})?';
    const path = '(?:[/?#][^\\s"]*)?';
    const regex = `(?:${protocol}|www\\.)${auth}(?:localhost|${ip}|${host}${domain}${tld})${port}${path}`;

    return options.exact ? new RegExp(`(?:^${regex}$)`, 'i') : new RegExp(regex, 'ig');
};

var readline = require('readline');

var rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
    terminal: false
});

rl.on('line', function (line) {
    let r = line.match(nregex());
    if (r) {
        console.log(r.join('\n'));
    }
});

// console.log('foo http://github.com bar //google.com'.match(nregex()));
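A sketch of stand-alone use (this script is not referenced by the ngrams.sh shown below, the input line is illustrative, and the sibling tlds module it requires is assumed present):

    echo 'see https://example.com and //github.com/jgm/pandoc' | node scripts/grep-url.js
    # https://example.com
    # //github.com/jgm/pandoc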
48 changes: 48 additions & 0 deletions web-index/scripts/ngrams.sh
@@ -0,0 +1,48 @@
#!/bin/bash
REPO_TOP=$(git rev-parse --show-toplevel)
export TEST_BASE=$REPO_TOP/web-index
export SCRIPT_DIR="$TEST_BASE"/scripts
export WEB_INDEX_DIR="$TEST_BASE"/inputs
export WIKI="$TEST_BASE"/inputs/articles

cd "$(dirname "$0")" || exit 1

output_base="$1"

rm -f {1,2,3}grams
mkfifo {1,2,3}grams

extract_text="$SCRIPT_DIR/extract_text.sh"
bigrams_aux="$SCRIPT_DIR/bigrams_aux.sh"
trigrams_aux="$SCRIPT_DIR/trigrams_aux.sh"

cat "$INPUT_FILE" |
sed "s#^#$WIKI/#" |
"$extract_text" |
tr -cs A-Za-z '\n' |
tr A-Z a-z |
grep -vwFf "$WEB_INDEX_DIR/stopwords.txt" |
"$SCRIPT_DIR/stem-words.js" |
tee 3grams 2grams 1grams > /dev/null &

cat 1grams |
sort |
uniq -c |
sort -rn > "$output_base/1-grams.txt" &

cat 2grams |
tr -cs A-Za-z '\n' |
tr A-Z a-z |
"$bigrams_aux" |
sort |
uniq -c |
sort -rn > "$output_base/2-grams.txt" &

cat 3grams |
tr -cs A-Za-z '\n' |
tr A-Z a-z |
"$trigrams_aux" |
sort |
uniq -c |
sort -rn > "$output_base/3-grams.txt"

rm -f {1,2,3}grams
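For orientation, a hedged direct invocation from web-index/ (run.sh normally provides these; the paths are assumptions):

    mkdir -p "$PWD/outputs/ngrams"
    export INPUT_FILE="$PWD/inputs/index_small.txt"
    ./scripts/ngrams.sh "$PWD/outputs/ngrams"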
