diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ec0e40a2f..bfd753ae6 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -12,7 +12,7 @@ jobs: strategy: fail-fast: false matrix: - benchmark: [nlp, file-enc, unix50, log-analysis, max-temp, uniq-ips, media-conv, sklearn, covid-mts, riker, oneliners, web-index] + benchmark: [nlp, file-enc, unix50, log-analysis, max-temp, uniq-ips, media-conv, sklearn, covid-mts, riker, oneliners] steps: - name: Checkout code diff --git a/web-index/bigrams_aux.sh b/web-index/bigrams_aux.sh deleted file mode 100755 index b9ae0f37d..000000000 --- a/web-index/bigrams_aux.sh +++ /dev/null @@ -1,9 +0,0 @@ -( mkfifo s2 > /dev/null ) ; -( mkfifo s3 > /dev/null ) ; - -sed '$d' s2 > s3 & -tee s2 | - tail +2 | - paste s3 - -rm s2 -rm s3 diff --git a/web-index/cleanup.sh b/web-index/cleanup.sh deleted file mode 100755 index 00f844973..000000000 --- a/web-index/cleanup.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -rm -rf tmp articles node_modules output *grams *grams.txt *index*.txt *tar.gz \ No newline at end of file diff --git a/web-index/deps.sh b/web-index/deps.sh deleted file mode 100755 index 3a933b852..000000000 --- a/web-index/deps.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash -set -e -# 7zip -pkgs='p7zip-full curl wget nodejs unzip npm' -if ! dpkg -s $pkgs >/dev/null 2>&1 ; then - sudo apt-get install $pkgs -y - echo 'Packages Installed' -fi - -if ! dpkg -s pandoc > /dev/null 2>&1 ; then - # since pandoc v.2.2.1 does not support arm64, we use v.3.5 - wget https://github.com/jgm/pandoc/releases/download/3.5/pandoc-3.5-1-$(dpkg --print-architecture).deb - sudo dpkg -i ./pandoc-3.5-1-$(dpkg --print-architecture).deb - rm ./pandoc-3.5-1-$(dpkg --print-architecture).deb -fi - -if ! dpkg -s nodejs > /dev/null 2>&1 ; then - # node version 18+ does not need external npm - curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash - - sudo apt-get install -y nodejs -fi - -npm install -# Install the npm packages -npm install natural diff --git a/web-index/extract_text.sh b/web-index/extract_text.sh deleted file mode 100755 index 81e5b4c84..000000000 --- a/web-index/extract_text.sh +++ /dev/null @@ -1,6 +0,0 @@ -while read -r line -do - cat $line | - iconv -c -t ascii//TRANSLIT | - pandoc +RTS -K64m -RTS --from html --to plain --quiet -done diff --git a/web-index/grep-url.js b/web-index/grep-url.js deleted file mode 100755 index 7585a5c63..000000000 --- a/web-index/grep-url.js +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env node -// TODO: use node's URL to parse and emit a URL in normal form -// URL validation as a stream transformer -// -// Contains code by Diego Perini, as compared in -// http://mathiasbynens.be/demo/url-regex -// -// Notes on possible differences from a standard/generic validation: -// -// - utf-8 char class take in consideration the full Unicode range -// - TLDs have been made mandatory so single names like "localhost" fails -// - protocols have been restricted to ftp, http and https only as requested - -var re_weburl = new RegExp( - "^" + - // protocol identifier (optional) - // short syntax // still required - "(?:(?:(?:https?|ftp):)?\\/\\/)" + - // user:pass BasicAuth (optional) - "(?:\\S+(?::\\S*)?@)?" + - "(?:" + - // IP address exclusion - // private & local networks - "(?!(?:10|127)(?:\\.\\d{1,3}){3})" + - "(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" + - "(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" + - // IP address dotted notation octets - // excludes loopback network 0.0.0.0 - // excludes reserved space >= 224.0.0.0 - // excludes network & broadcast addresses - // (first & last IP address of each class) - "(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" + - "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" + - "(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" + - "|" + - // host & domain names, may end with dot - // can be replaced by a shortest alternative - // (?![-_])(?:[-\\w\\u00a1-\\uffff]{0,63}[^-_]\\.)+ - "(?:" + - "(?:" + - "[a-z0-9\\u00a1-\\uffff]" + - "[a-z0-9\\u00a1-\\uffff_-]{0,62}" + - ")?" + - "[a-z0-9\\u00a1-\\uffff]\\." + - ")+" + - // TLD identifier name, may end with dot - "(?:[a-z\\u00a1-\\uffff]{2,}\\.?)" + - ")" + - // port number (optional) - "(?::\\d{2,5})?" + - // resource path (optional) - "(?:[/?#]\\S*)?" + - "$", "i" - ); - - let nregex = options => { - options = { - strict: true, - ...options - }; - - const tlds = require('./tlds'); - const v4 = '(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)(?:\\.(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)){3}'; - const protocol = `(?:(?:[a-z]+:)?//)${options.strict ? '' : '?'}`; - const auth = '(?:\\S+(?::\\S*)?@)?'; - const ip = v4; - const host = '(?:(?:[a-z\\u00a1-\\uffff0-9][-_]*)*[a-z\\u00a1-\\uffff0-9]+)'; - const domain = '(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*'; - const tld = `(?:\\.${options.strict ? '(?:[a-z\\u00a1-\\uffff]{2,})' : `(?:${tlds.sort((a, b) => b.length - a.length).join('|')})`})\\.?`; - const port = '(?::\\d{2,5})?'; - const path = '(?:[/?#][^\\s"]*)?'; - const regex = `(?:${protocol}|www\\.)${auth}(?:localhost|${ip}|${host}${domain}${tld})${port}${path}`; - - return options.exact ? new RegExp(`(?:^${regex}$)`, 'i') : new RegExp(regex, 'ig'); - }; - - var readline = require('readline'); - - var rl = readline.createInterface({ - input: process.stdin, - output: process.stdout, - terminal: false - }); - - rl.on('line', function (line) { - let r = line.match(nregex()); - if (r) { - for (let i = 0; i < r.length; i++) { - //console.error(i);// (r[i]); - console.log(r[i]); - }; - } else { - console.log("pizza"); - } - // if (r) { - // console.log(r.join('\n')); - // } - }); - - // console.log('foo http://github.com bar //google.com'.match(nregex())); \ No newline at end of file diff --git a/web-index/hashes/1-grams.txt.small.hash b/web-index/hashes/1-grams.txt.small.hash deleted file mode 100644 index c840d1150..000000000 --- a/web-index/hashes/1-grams.txt.small.hash +++ /dev/null @@ -1 +0,0 @@ -b7006f6d425233137811f16eeb6ca668 diff --git a/web-index/hashes/2-grams.txt.small.hash b/web-index/hashes/2-grams.txt.small.hash deleted file mode 100644 index a73de048d..000000000 --- a/web-index/hashes/2-grams.txt.small.hash +++ /dev/null @@ -1 +0,0 @@ -a48e86700b02c50651e8d4b09a73170c diff --git a/web-index/hashes/3-grams.txt.small.hash b/web-index/hashes/3-grams.txt.small.hash deleted file mode 100644 index c23454341..000000000 --- a/web-index/hashes/3-grams.txt.small.hash +++ /dev/null @@ -1 +0,0 @@ -73310ad60a0d2d50d805901c481a5dbc diff --git a/web-index/input.sh b/web-index/input.sh deleted file mode 100755 index 6f4d53097..000000000 --- a/web-index/input.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -BENCH_TOP=${BENCH_TOP:-$(git rev-parse --show-toplevel)} -RESOURCES_DIR=${RESOURCES_DIR:-$BENCH_TOP/web-index} - -mkdir -p $RESOURCES_DIR - -if [ "$1" = "--small" ]; then - if [[ ! -f "$RESOURCES_DIR/wikipedia-small.tar.gz" ]]; then - # 1000 entries - echo "Downloading the small dataset." - wget -O $RESOURCES_DIR/wikipedia-small.tar.gz https://atlas-group.cs.brown.edu/data/wikipedia/input_small/articles.tar.gz --no-check-certificate - wget -O $RESOURCES_DIR/index_small.txt https://atlas-group.cs.brown.edu/data/wikipedia/input_small/index.txt --no-check-certificate - fi -else - if [[ ! -f "$RESOURCES_DIR/wikipedia.tar.gz" ]]; then - # full dataset - echo "Downloading the full dataset. Caution!! Extracted size >200GB" - wget -O $RESOURCES_DIR/wikipedia.tar.gz https://atlas-group.cs.brown.edu/data/wikipedia/input/articles.tar.gz --no-check-certificate - wget -O $RESOURCES_DIR/index.txt https://atlas-group.cs.brown.edu/data/wikipedia/input/index.txt --no-check-certificate - fi -fi - -if [[ ! -d "$RESOURCES_DIR/articles" ]]; then - if [ "$1" = "--small" ]; then - # 1000 entries - echo "Extracting the small dataset." - tar -xf $RESOURCES_DIR/wikipedia-small.tar.gz -C $RESOURCES_DIR - else - # full dataset - echo "Extracting the full dataset. Caution!! Extracted size >200GB" - tar -xf $RESOURCES_DIR/wikipedia.tar.gz -C $RESOURCES_DIR - fi -else - echo "Did not extract data because of existing data." - echo "Please rm -r $RESOURCES_DIR/articles manually and rerun this script." -fi - -echo "Data is ready." diff --git a/web-index/input/dependencies.sh b/web-index/input/dependencies.sh new file mode 100755 index 000000000..8df2a11dd --- /dev/null +++ b/web-index/input/dependencies.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +# 7zip +pkgs='p7zip-full curl wget nodejs' +if ! dpkg -s $pkgs >/dev/null 2>&1 ; then + sudo apt-get install $pkgs -y + echo 'Packages Installed' +fi + +if ! dpkg -s pandoc > /dev/null 2>&1 ; then + # pandoc v.2.2.1 + wget https://github.com/jgm/pandoc/releases/download/2.2.1/pandoc-2.2.1-1-$(dpkg --print-architecture).deb + sudo dpkg -i ./pandoc-2.2.1-1-$(dpkg --print-architecture).deb + rm ./pandoc-2.2.1-1-$(dpkg --print-architecture).deb +fi + +if ! dpkg -s nodejs > /dev/null 2>&1 ; then + # node version 18+ does not need external npm + curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash - + sudo apt-get install -y nodejs +fi + +if [ ! -d node_modules ]; then + npm install +fi \ No newline at end of file diff --git a/web-index/input/generte_index.sh b/web-index/input/generte_index.sh new file mode 100755 index 000000000..d39725de9 --- /dev/null +++ b/web-index/input/generte_index.sh @@ -0,0 +1,24 @@ +#!/bin/bash +if [ $# -eq 0 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Directory path is the first argument +directory_path=$1 + +# Check if the directory exists +if [ ! -d "$directory_path" ]; then + echo "Error: Directory does not exist." + exit 1 +fi + +# Ensure a local ./tmp directory exists for sorting +mkdir -p ./tmp +export TMPDIR=./tmp + +# Find all files, remove prefix, sort them, and write to a text file +find "$directory_path" -type f | sed 's|./wikipedia/en/articles/||' | sort > index.txt + +echo "File paths have been saved to all_files_paths.txt" + diff --git a/web-index/input/input.sh b/web-index/input/input.sh new file mode 100755 index 000000000..317dd17cc --- /dev/null +++ b/web-index/input/input.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +#set -e + +wiki_archive="https://dumps.wikimedia.org/other/static_html_dumps/current/en/wikipedia-en-html.tar.7z" +BENCH_TOP=${BENCH_TOP:-$(git rev-parse --show-toplevel)} + +# . "$BENCH_TOP/scripts/utils.sh" +sudo apt-get install unzip + +[ "$1" = "-c" ] && rm -rf en/ *.7z *.tar 500.txt 1000.txt full small + +setup_dataset() { + rm -rf ../1-grams.txt ../2-grams.txt + + ## Downloading the dataset needs to happen for both small and large + if [[ ! -d ./en ]]; then + # wget $wiki_archive || eexit "cannot fetch wikipedia" + # 7za x wikipedia-en-html.tar.7z + tar -xvf wikipedia-en-html.tar + wget http://ndr.md/data/wikipedia/index.txt # || eexit "cannot fetch wiki indices" + # It is actually OK if we don't have this index since we download the 500/1000 below + fi + + if [ "$1" = "--small" ]; then + # 500 entries + wget http://pac-n4.csail.mit.edu:81/pash_data/small/web-index.small.zip + unzip web-index.small.zip + mv small/500.txt . + rm -rf small web-index.small.zip + elif [ "$1" = "--full" ]; then + the default full + 1000 entries + wget http://pac-n4.csail.mit.edu:81/pash_data/full/web-index.full.zip + unzip web-index.full.zip + mv full/1000.txt . + rm -rf full web-index.full.zip + fi +} + +setup_dataset $1 \ No newline at end of file diff --git a/web-index/package.json b/web-index/input/package.json similarity index 100% rename from web-index/package.json rename to web-index/input/package.json diff --git a/web-index/inputs/cleanup.sh b/web-index/inputs/cleanup.sh new file mode 100644 index 000000000..e69de29bb diff --git a/web-index/inputs/dependencies.sh b/web-index/inputs/dependencies.sh new file mode 100644 index 000000000..e69de29bb diff --git a/web-index/inputs/input.sh b/web-index/inputs/input.sh new file mode 100644 index 000000000..e69de29bb diff --git a/web-index/inputs/run.sh b/web-index/inputs/run.sh new file mode 100644 index 000000000..e69de29bb diff --git a/web-index/inputs/verify.sh b/web-index/inputs/verify.sh new file mode 100644 index 000000000..e69de29bb diff --git a/web-index/move_articles.sh b/web-index/move_articles.sh deleted file mode 100755 index f1bd89bde..000000000 --- a/web-index/move_articles.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# Define the base directory -base_directory="input1000" - -# Check if the base directory exists -if [ ! -d "$base_directory" ]; then - echo "Base directory does not exist: $base_directory" - exit 1 -fi - -# Navigate to the base directory -cd "$base_directory" - -# Create a tar archive of the en/articles directory -tar -czvf en_articles.tar.gz en/articles - -echo "Archive created: $(pwd)/en_articles.tar.gz" diff --git a/web-index/p1.sh b/web-index/p1.sh new file mode 100755 index 000000000..a0a1a628e --- /dev/null +++ b/web-index/p1.sh @@ -0,0 +1,18 @@ +#!/bin/bash +PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} +WIKI=${WIKI:-$PASH_TOP/web-index} + +export WIKI +# Squash all HTML for each URL into a single line, streaming fashion +# It also prefixes with the URL + +page_per_line () { + cat "$WIKI/$0" | tr -d "\n\r" | tr -d '\n' | sed -e '/.$/a\' +} + +export -f page_per_line + +# xargs: +# add `-t` for debugging +cat $WIKI/input/index.txt | xargs -0 -d '\n' -n 1 bash -c 'page_per_line "$@"' + diff --git a/web-index/p2.sh b/web-index/p2.sh new file mode 100755 index 000000000..1390b29a5 --- /dev/null +++ b/web-index/p2.sh @@ -0,0 +1,13 @@ +#!/bin/bash +PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} +WIKI=${WIKI:-$PASH_TOP/web-index} + +cat $WIKI/input/index.txt | +sed "s#^#$WIKI#" | +iconv -c -t ascii//TRANSLIT | +pandoc +RTS -K64m -RTS --from html --to plain --quiet | +tr -cs A-Za-z '\n' | +tr A-Z a-z | +grep -vwFf $WIKI/stopwords.txt | +$WIKI/stem-words.js + diff --git a/web-index/run.sh b/web-index/run.sh deleted file mode 100755 index ad1e862dc..000000000 --- a/web-index/run.sh +++ /dev/null @@ -1,114 +0,0 @@ -#!/bin/sh - -cd "$(dirname "$0")" - -directory_path="articles" - -if [ ! -d "$directory_path" ]; then - echo "Error: Directory does not exist." - exit 1 -fi - -# ensure a local ./tmp directory exists for sorting -mkdir -p ./tmp -export TMPDIR=./tmp - -# find all files, remove prefix, sort them, and write to a text file -find "$directory_path" -type f | sed 's|./wikipedia/en/articles/||' | sort > index.txt - -echo "File paths have been saved to all_files_paths.txt" - - -# default values -WINDOW=5 -TARGET="sh-only" -LOG="enable" -INPUT="small" - -# parse arguments -while [ "$#" -gt 0 ]; do - case $1 in - --window) WINDOW="$2"; shift ;; - --target) TARGET="$2"; shift ;; - --log) LOG="$2"; shift ;; - --input) INPUT="$2"; shift ;; - esac - shift -done - -# setz environment variables -TEST_BASE=$(dirname "$(realpath "$0")") -echo "TEST_BASE: $TEST_BASE" -LOCAL_NAME=$(basename "$TEST_BASE") -echo "LOCAL_NAME: $LOCAL_NAME" -OUTPUT_BASE="$TEST_BASE/output/$LOCAL_NAME" - -if [ "$INPUT" = "small" ]; then - export INPUT_FILE="$TEST_BASE/index_small.txt" -elif [ "$INPUT" = "full" ]; then - export INPUT_FILE="$TEST_BASE/index.txt" -fi -export WEB_INDEX_DIR="$TEST_BASE" -export SCRIPT_DIR="$TEST_BASE" -export WIKI="$TEST_BASE/articles" - -do_cleanup() { - echo "Cleaning up..." - rm -f *grams *-grams.txt -} - -do_run() { - output_base=$1 - start_time=$(date +%s) - - echo "Running integrated script" - - mkfifo {1,2,3}grams - - extract_text="$SCRIPT_DIR/extract_text.sh" - bigrams_aux="$SCRIPT_DIR/bigrams_aux.sh" - trigrams_aux="$SCRIPT_DIR/trigrams_aux.sh" - - cat "$INPUT_FILE" | - sed "s#^#$WIKI/#" | head | - $extract_text | - tr -cs A-Za-z '\n' | - tr A-Z a-z | - grep -vwFf "$WEB_INDEX_DIR/stopwords.txt" | - "$WEB_INDEX_DIR/stem-words.js" | - tee 3grams 2grams 1grams - - cat 1grams | - sort | - uniq -c | - sort -rn > 1-grams.txt - - cat 2grams | - tr -cs A-Za-z '\n' | - tr A-Z a-z | - $bigrams_aux | - sort | - uniq -c | - sort -rn > 2-grams.txt - - cat 3grams | - tr -cs A-Za-z '\n' | - tr A-Z a-z | - $trigrams_aux | - sort | - uniq -c | - sort -rn > 3-grams.txt - - rm -f {1,2,3}grams - - end_time=$(date +%s) - duration=$((end_time - start_time)) - echo "$duration" > "$output_base/sh_time" -} - -# create output directory -mkdir -p "$OUTPUT_BASE" - -# run the integrated process -do_cleanup -do_run "$OUTPUT_BASE" diff --git a/web-index/tlds.js b/web-index/tlds.js index 74e3fa0d3..d0bb9042e 100644 --- a/web-index/tlds.js +++ b/web-index/tlds.js @@ -1,1544 +1,1543 @@ module.exports = [ - "aaa", - "aarp", - "abarth", - "abb", - "abbott", - "abbvie", - "abc", - "able", - "abogado", - "abudhabi", - "ac", - "academy", - "accenture", - "accountant", - "accountants", - "aco", - "active", - "actor", - "ad", - "adac", - "ads", - "adult", - "ae", - "aeg", - "aero", - "aetna", - "af", - "afamilycompany", - "afl", - "africa", - "ag", - "agakhan", - "agency", - "ai", - "aig", - "aigo", - "airbus", - "airforce", - "airtel", - "akdn", - "al", - "alfaromeo", - "alibaba", - "alipay", - "allfinanz", - "allstate", - "ally", - "alsace", - "alstom", - "am", - "americanexpress", - "americanfamily", - "amex", - "amfam", - "amica", - "amsterdam", - "analytics", - "android", - "anquan", - "anz", - "ao", - "aol", - "apartments", - "app", - "apple", - "aq", - "aquarelle", - "ar", - "arab", - "aramco", - "archi", - "army", - "arpa", - "art", - "arte", - "as", - "asda", - "asia", - "associates", - "at", - "athleta", - "attorney", - "au", - "auction", - "audi", - "audible", - "audio", - "auspost", - "author", - "auto", - "autos", - "avianca", - "aw", - "aws", - "ax", - "axa", - "az", - "azure", - "ba", - "baby", - "baidu", - "banamex", - "bananarepublic", - "band", - "bank", - "bar", - "barcelona", - "barclaycard", - "barclays", - "barefoot", - "bargains", - "baseball", - "basketball", - "bauhaus", - "bayern", - "bb", - "bbc", - "bbt", - "bbva", - "bcg", - "bcn", - "bd", - "be", - "beats", - "beauty", - "beer", - "bentley", - "berlin", - "best", - "bestbuy", - "bet", - "bf", - "bg", - "bh", - "bharti", - "bi", - "bible", - "bid", - "bike", - "bing", - "bingo", - "bio", - "biz", - "bj", - "black", - "blackfriday", - "blanco", - "blockbuster", - "blog", - "bloomberg", - "blue", - "bm", - "bms", - "bmw", - "bn", - "bnl", - "bnpparibas", - "bo", - "boats", - "boehringer", - "bofa", - "bom", - "bond", - "boo", - "book", - "booking", - "bosch", - "bostik", - "boston", - "bot", - "boutique", - "box", - "br", - "bradesco", - "bridgestone", - "broadway", - "broker", - "brother", - "brussels", - "bs", - "bt", - "budapest", - "bugatti", - "build", - "builders", - "business", - "buy", - "buzz", - "bv", - "bw", - "by", - "bz", - "bzh", - "ca", - "cab", - "cafe", - "cal", - "call", - "calvinklein", - "cam", - "camera", - "camp", - "cancerresearch", - "canon", - "capetown", - "capital", - "capitalone", - "car", - "caravan", - "cards", - "care", - "career", - "careers", - "cars", - "cartier", - "casa", - "case", - "caseih", - "cash", - "casino", - "cat", - "catering", - "catholic", - "cba", - "cbn", - "cbre", - "cbs", - "cc", - "cd", - "ceb", - "center", - "ceo", - "cern", - "cf", - "cfa", - "cfd", - "cg", - "ch", - "chanel", - "channel", - "charity", - "chase", - "chat", - "cheap", - "chintai", - "christmas", - "chrome", - "chrysler", - "church", - "ci", - "cipriani", - "circle", - "cisco", - "citadel", - "citi", - "citic", - "city", - "cityeats", - "ck", - "cl", - "claims", - "cleaning", - "click", - "clinic", - "clinique", - "clothing", - "cloud", - "club", - "clubmed", - "cm", - "cn", - "co", - "coach", - "codes", - "coffee", - "college", - "cologne", - "com", - "comcast", - "commbank", - "community", - "company", - "compare", - "computer", - "comsec", - "condos", - "construction", - "consulting", - "contact", - "contractors", - "cooking", - "cookingchannel", - "cool", - "coop", - "corsica", - "country", - "coupon", - "coupons", - "courses", - "cr", - "credit", - "creditcard", - "creditunion", - "cricket", - "crown", - "crs", - "cruise", - "cruises", - "csc", - "cu", - "cuisinella", - "cv", - "cw", - "cx", - "cy", - "cymru", - "cyou", - "cz", - "dabur", - "dad", - "dance", - "data", - "date", - "dating", - "datsun", - "day", - "dclk", - "dds", - "de", - "deal", - "dealer", - "deals", - "degree", - "delivery", - "dell", - "deloitte", - "delta", - "democrat", - "dental", - "dentist", - "desi", - "design", - "dev", - "dhl", - "diamonds", - "diet", - "digital", - "direct", - "directory", - "discount", - "discover", - "dish", - "diy", - "dj", - "dk", - "dm", - "dnp", - "do", - "docs", - "doctor", - "dodge", - "dog", - "doha", - "domains", - "dot", - "download", - "drive", - "dtv", - "dubai", - "duck", - "dunlop", - "duns", - "dupont", - "durban", - "dvag", - "dvr", - "dz", - "earth", - "eat", - "ec", - "eco", - "edeka", - "edu", - "education", - "ee", - "eg", - "email", - "emerck", - "energy", - "engineer", - "engineering", - "enterprises", - "epost", - "epson", - "equipment", - "er", - "ericsson", - "erni", - "es", - "esq", - "estate", - "esurance", - "et", - "etisalat", - "eu", - "eurovision", - "eus", - "events", - "everbank", - "exchange", - "expert", - "exposed", - "express", - "extraspace", - "fage", - "fail", - "fairwinds", - "faith", - "family", - "fan", - "fans", - "farm", - "farmers", - "fashion", - "fast", - "fedex", - "feedback", - "ferrari", - "ferrero", - "fi", - "fiat", - "fidelity", - "fido", - "film", - "final", - "finance", - "financial", - "fire", - "firestone", - "firmdale", - "fish", - "fishing", - "fit", - "fitness", - "fj", - "fk", - "flickr", - "flights", - "flir", - "florist", - "flowers", - "fly", - "fm", - "fo", - "foo", - "food", - "foodnetwork", - "football", - "ford", - "forex", - "forsale", - "forum", - "foundation", - "fox", - "fr", - "free", - "fresenius", - "frl", - "frogans", - "frontdoor", - "frontier", - "ftr", - "fujitsu", - "fujixerox", - "fun", - "fund", - "furniture", - "futbol", - "fyi", - "ga", - "gal", - "gallery", - "gallo", - "gallup", - "game", - "games", - "gap", - "garden", - "gb", - "gbiz", - "gd", - "gdn", - "ge", - "gea", - "gent", - "genting", - "george", - "gf", - "gg", - "ggee", - "gh", - "gi", - "gift", - "gifts", - "gives", - "giving", - "gl", - "glade", - "glass", - "gle", - "global", - "globo", - "gm", - "gmail", - "gmbh", - "gmo", - "gmx", - "gn", - "godaddy", - "gold", - "goldpoint", - "golf", - "goo", - "goodhands", - "goodyear", - "goog", - "google", - "gop", - "got", - "gov", - "gp", - "gq", - "gr", - "grainger", - "graphics", - "gratis", - "green", - "gripe", - "grocery", - "group", - "gs", - "gt", - "gu", - "guardian", - "gucci", - "guge", - "guide", - "guitars", - "guru", - "gw", - "gy", - "hair", - "hamburg", - "hangout", - "haus", - "hbo", - "hdfc", - "hdfcbank", - "health", - "healthcare", - "help", - "helsinki", - "here", - "hermes", - "hgtv", - "hiphop", - "hisamitsu", - "hitachi", - "hiv", - "hk", - "hkt", - "hm", - "hn", - "hockey", - "holdings", - "holiday", - "homedepot", - "homegoods", - "homes", - "homesense", - "honda", - "honeywell", - "horse", - "hospital", - "host", - "hosting", - "hot", - "hoteles", - "hotels", - "hotmail", - "house", - "how", - "hr", - "hsbc", - "ht", - "hu", - "hughes", - "hyatt", - "hyundai", - "ibm", - "icbc", - "ice", - "icu", - "id", - "ie", - "ieee", - "ifm", - "ikano", - "il", - "im", - "imamat", - "imdb", - "immo", - "immobilien", - "in", - "inc", - "industries", - "infiniti", - "info", - "ing", - "ink", - "institute", - "insurance", - "insure", - "int", - "intel", - "international", - "intuit", - "investments", - "io", - "ipiranga", - "iq", - "ir", - "irish", - "is", - "iselect", - "ismaili", - "ist", - "istanbul", - "it", - "itau", - "itv", - "iveco", - "jaguar", - "java", - "jcb", - "jcp", - "je", - "jeep", - "jetzt", - "jewelry", - "jio", - "jlc", - "jll", - "jm", - "jmp", - "jnj", - "jo", - "jobs", - "joburg", - "jot", - "joy", - "jp", - "jpmorgan", - "jprs", - "juegos", - "juniper", - "kaufen", - "kddi", - "ke", - "kerryhotels", - "kerrylogistics", - "kerryproperties", - "kfh", - "kg", - "kh", - "ki", - "kia", - "kim", - "kinder", - "kindle", - "kitchen", - "kiwi", - "km", - "kn", - "koeln", - "komatsu", - "kosher", - "kp", - "kpmg", - "kpn", - "kr", - "krd", - "kred", - "kuokgroup", - "kw", - "ky", - "kyoto", - "kz", - "la", - "lacaixa", - "ladbrokes", - "lamborghini", - "lamer", - "lancaster", - "lancia", - "lancome", - "land", - "landrover", - "lanxess", - "lasalle", - "lat", - "latino", - "latrobe", - "law", - "lawyer", - "lb", - "lc", - "lds", - "lease", - "leclerc", - "lefrak", - "legal", - "lego", - "lexus", - "lgbt", - "li", - "liaison", - "lidl", - "life", - "lifeinsurance", - "lifestyle", - "lighting", - "like", - "lilly", - "limited", - "limo", - "lincoln", - "linde", - "link", - "lipsy", - "live", - "living", - "lixil", - "lk", - "llc", - "loan", - "loans", - "locker", - "locus", - "loft", - "lol", - "london", - "lotte", - "lotto", - "love", - "lpl", - "lplfinancial", - "lr", - "ls", - "lt", - "ltd", - "ltda", - "lu", - "lundbeck", - "lupin", - "luxe", - "luxury", - "lv", - "ly", - "ma", - "macys", - "madrid", - "maif", - "maison", - "makeup", - "man", - "management", - "mango", - "map", - "market", - "marketing", - "markets", - "marriott", - "marshalls", - "maserati", - "mattel", - "mba", - "mc", - "mckinsey", - "md", - "me", - "med", - "media", - "meet", - "melbourne", - "meme", - "memorial", - "men", - "menu", - "merckmsd", - "metlife", - "mg", - "mh", - "miami", - "microsoft", - "mil", - "mini", - "mint", - "mit", - "mitsubishi", - "mk", - "ml", - "mlb", - "mls", - "mm", - "mma", - "mn", - "mo", - "mobi", - "mobile", - "mobily", - "moda", - "moe", - "moi", - "mom", - "monash", - "money", - "monster", - "mopar", - "mormon", - "mortgage", - "moscow", - "moto", - "motorcycles", - "mov", - "movie", - "movistar", - "mp", - "mq", - "mr", - "ms", - "msd", - "mt", - "mtn", - "mtr", - "mu", - "museum", - "mutual", - "mv", - "mw", - "mx", - "my", - "mz", - "na", - "nab", - "nadex", - "nagoya", - "name", - "nationwide", - "natura", - "navy", - "nba", - "nc", - "ne", - "nec", - "net", - "netbank", - "netflix", - "network", - "neustar", - "new", - "newholland", - "news", - "next", - "nextdirect", - "nexus", - "nf", - "nfl", - "ng", - "ngo", - "nhk", - "ni", - "nico", - "nike", - "nikon", - "ninja", - "nissan", - "nissay", - "nl", - "no", - "nokia", - "northwesternmutual", - "norton", - "now", - "nowruz", - "nowtv", - "np", - "nr", - "nra", - "nrw", - "ntt", - "nu", - "nyc", - "nz", - "obi", - "observer", - "off", - "office", - "okinawa", - "olayan", - "olayangroup", - "oldnavy", - "ollo", - "om", - "omega", - "one", - "ong", - "onl", - "online", - "onyourside", - "ooo", - "open", - "oracle", - "orange", - "org", - "organic", - "origins", - "osaka", - "otsuka", - "ott", - "ovh", - "pa", - "page", - "panasonic", - "panerai", - "paris", - "pars", - "partners", - "parts", - "party", - "passagens", - "pay", - "pccw", - "pe", - "pet", - "pf", - "pfizer", - "pg", - "ph", - "pharmacy", - "phd", - "philips", - "phone", - "photo", - "photography", - "photos", - "physio", - "piaget", - "pics", - "pictet", - "pictures", - "pid", - "pin", - "ping", - "pink", - "pioneer", - "pizza", - "pk", - "pl", - "place", - "play", - "playstation", - "plumbing", - "plus", - "pm", - "pn", - "pnc", - "pohl", - "poker", - "politie", - "porn", - "post", - "pr", - "pramerica", - "praxi", - "press", - "prime", - "pro", - "prod", - "productions", - "prof", - "progressive", - "promo", - "properties", - "property", - "protection", - "pru", - "prudential", - "ps", - "pt", - "pub", - "pw", - "pwc", - "py", - "qa", - "qpon", - "quebec", - "quest", - "qvc", - "racing", - "radio", - "raid", - "re", - "read", - "realestate", - "realtor", - "realty", - "recipes", - "red", - "redstone", - "redumbrella", - "rehab", - "reise", - "reisen", - "reit", - "reliance", - "ren", - "rent", - "rentals", - "repair", - "report", - "republican", - "rest", - "restaurant", - "review", - "reviews", - "rexroth", - "rich", - "richardli", - "ricoh", - "rightathome", - "ril", - "rio", - "rip", - "rmit", - "ro", - "rocher", - "rocks", - "rodeo", - "rogers", - "room", - "rs", - "rsvp", - "ru", - "rugby", - "ruhr", - "run", - "rw", - "rwe", - "ryukyu", - "sa", - "saarland", - "safe", - "safety", - "sakura", - "sale", - "salon", - "samsclub", - "samsung", - "sandvik", - "sandvikcoromant", - "sanofi", - "sap", - "sarl", - "sas", - "save", - "saxo", - "sb", - "sbi", - "sbs", - "sc", - "sca", - "scb", - "schaeffler", - "schmidt", - "scholarships", - "school", - "schule", - "schwarz", - "science", - "scjohnson", - "scor", - "scot", - "sd", - "se", - "search", - "seat", - "secure", - "security", - "seek", - "select", - "sener", - "services", - "ses", - "seven", - "sew", - "sex", - "sexy", - "sfr", - "sg", - "sh", - "shangrila", - "sharp", - "shaw", - "shell", - "shia", - "shiksha", - "shoes", - "shop", - "shopping", - "shouji", - "show", - "showtime", - "shriram", - "si", - "silk", - "sina", - "singles", - "site", - "sj", - "sk", - "ski", - "skin", - "sky", - "skype", - "sl", - "sling", - "sm", - "smart", - "smile", - "sn", - "sncf", - "so", - "soccer", - "social", - "softbank", - "software", - "sohu", - "solar", - "solutions", - "song", - "sony", - "soy", - "space", - "spiegel", - "sport", - "spot", - "spreadbetting", - "sr", - "srl", - "srt", - "st", - "stada", - "staples", - "star", - "starhub", - "statebank", - "statefarm", - "statoil", - "stc", - "stcgroup", - "stockholm", - "storage", - "store", - "stream", - "studio", - "study", - "style", - "su", - "sucks", - "supplies", - "supply", - "support", - "surf", - "surgery", - "suzuki", - "sv", - "swatch", - "swiftcover", - "swiss", - "sx", - "sy", - "sydney", - "symantec", - "systems", - "sz", - "tab", - "taipei", - "talk", - "taobao", - "target", - "tatamotors", - "tatar", - "tattoo", - "tax", - "taxi", - "tc", - "tci", - "td", - "tdk", - "team", - "tech", - "technology", - "tel", - "telecity", - "telefonica", - "temasek", - "tennis", - "teva", - "tf", - "tg", - "th", - "thd", - "theater", - "theatre", - "tiaa", - "tickets", - "tienda", - "tiffany", - "tips", - "tires", - "tirol", - "tj", - "tjmaxx", - "tjx", - "tk", - "tkmaxx", - "tl", - "tm", - "tmall", - "tn", - "to", - "today", - "tokyo", - "tools", - "top", - "toray", - "toshiba", - "total", - "tours", - "town", - "toyota", - "toys", - "tr", - "trade", - "trading", - "training", - "travel", - "travelchannel", - "travelers", - "travelersinsurance", - "trust", - "trv", - "tt", - "tube", - "tui", - "tunes", - "tushu", - "tv", - "tvs", - "tw", - "tz", - "ua", - "ubank", - "ubs", - "uconnect", - "ug", - "uk", - "unicom", - "university", - "uno", - "uol", - "ups", - "us", - "uy", - "uz", - "va", - "vacations", - "vana", - "vanguard", - "vc", - "ve", - "vegas", - "ventures", - "verisign", - "versicherung", - "vet", - "vg", - "vi", - "viajes", - "video", - "vig", - "viking", - "villas", - "vin", - "vip", - "virgin", - "visa", - "vision", - "vista", - "vistaprint", - "viva", - "vivo", - "vlaanderen", - "vn", - "vodka", - "volkswagen", - "volvo", - "vote", - "voting", - "voto", - "voyage", - "vu", - "vuelos", - "wales", - "walmart", - "walter", - "wang", - "wanggou", - "warman", - "watch", - "watches", - "weather", - "weatherchannel", - "webcam", - "weber", - "website", - "wed", - "wedding", - "weibo", - "weir", - "wf", - "whoswho", - "wien", - "wiki", - "williamhill", - "win", - "windows", - "wine", - "winners", - "wme", - "wolterskluwer", - "woodside", - "work", - "works", - "world", - "wow", - "ws", - "wtc", - "wtf", - "xbox", - "xerox", - "xfinity", - "xihuan", - "xin", - "कॉम", // xn--11b4c3d - "セール", // xn--1ck2e1b - "佛山", // xn--1qqw23a - "ಭಾರತ", // xn--2scrj9c - "慈善", // xn--30rr7y - "集团", // xn--3bst00m - "在线", // xn--3ds443g - "한국", // xn--3e0b707e - "ଭାରତ", // xn--3hcrj9c - "大众汽车", // xn--3oq18vl8pn36a - "点看", // xn--3pxu8k - "คอม", // xn--42c2d9a - "ভাৰত", // xn--45br5cyl - "ভারত", // xn--45brj9c - "八卦", // xn--45q11c - "موقع", // xn--4gbrim - "বাংলা", // xn--54b7fta0cc - "公益", // xn--55qw42g - "公司", // xn--55qx5d - "香格里拉", // xn--5su34j936bgsg - "网站", // xn--5tzm5g - "移动", // xn--6frz82g - "我爱你", // xn--6qq986b3xl - "москва", // xn--80adxhks - "қаз", // xn--80ao21a - "католик", // xn--80aqecdr1a - "онлайн", // xn--80asehdb - "сайт", // xn--80aswg - "联通", // xn--8y0a063a - "срб", // xn--90a3ac - "бг", // xn--90ae - "бел", // xn--90ais - "קום", // xn--9dbq2a - "时尚", // xn--9et52u - "微博", // xn--9krt00a - "淡马锡", // xn--b4w605ferd - "ファッション", // xn--bck1b9a5dre4c - "орг", // xn--c1avg - "नेट", // xn--c2br7g - "ストア", // xn--cck2b3b - "삼성", // xn--cg4bki - "சிங்கப்பூர்", // xn--clchc0ea0b2g2a9gcd - "商标", // xn--czr694b - "商店", // xn--czrs0t - "商城", // xn--czru2d - "дети", // xn--d1acj3b - "мкд", // xn--d1alf - "ею", // xn--e1a4c - "ポイント", // xn--eckvdtc9d - "新闻", // xn--efvy88h - "工行", // xn--estv75g - "家電", // xn--fct429k - "كوم", // xn--fhbei - "中文网", // xn--fiq228c5hs - "中信", // xn--fiq64b - "中国", // xn--fiqs8s - "中國", // xn--fiqz9s - "娱乐", // xn--fjq720a - "谷歌", // xn--flw351e - "భారత్", // xn--fpcrj9c3d - "ලංකා", // xn--fzc2c9e2c - "電訊盈科", // xn--fzys8d69uvgm - "购物", // xn--g2xx48c - "クラウド", // xn--gckr3f0f - "ભારત", // xn--gecrj9c - "通販", // xn--gk3at1e - "भारतम्", // xn--h2breg3eve - "भारत", // xn--h2brj9c - "भारोत", // xn--h2brj9c8c - "网店", // xn--hxt814e - "संगठन", // xn--i1b6b1a6a2e - "餐厅", // xn--imr513n - "网络", // xn--io0a7i - "ком", // xn--j1aef - "укр", // xn--j1amh - "香港", // xn--j6w193g - "诺基亚", // xn--jlq61u9w7b - "食品", // xn--jvr189m - "飞利浦", // xn--kcrx77d1x4a - "台湾", // xn--kprw13d - "台灣", // xn--kpry57d - "手表", // xn--kpu716f - "手机", // xn--kput3i - "мон", // xn--l1acc - "الجزائر", // xn--lgbbat1ad8j - "عمان", // xn--mgb9awbf - "ارامكو", // xn--mgba3a3ejt - "ایران", // xn--mgba3a4f16a - "العليان", // xn--mgba7c0bbn0a - "اتصالات", // xn--mgbaakc7dvf - "امارات", // xn--mgbaam7a8h - "بازار", // xn--mgbab2bd - "پاکستان", // xn--mgbai9azgqp6j - "الاردن", // xn--mgbayh7gpa - "موبايلي", // xn--mgbb9fbpob - "بارت", // xn--mgbbh1a - "بھارت", // xn--mgbbh1a71e - "المغرب", // xn--mgbc0a9azcg - "ابوظبي", // xn--mgbca7dzdo - "السعودية", // xn--mgberp4a5d4ar - "ڀارت", // xn--mgbgu82a - "كاثوليك", // xn--mgbi4ecexp - "سودان", // xn--mgbpl2fh - "همراه", // xn--mgbt3dhd - "عراق", // xn--mgbtx2b - "مليسيا", // xn--mgbx4cd0ab - "澳門", // xn--mix891f - "닷컴", // xn--mk1bu44c - "政府", // xn--mxtq1m - "شبكة", // xn--ngbc5azd - "بيتك", // xn--ngbe9e0a - "عرب", // xn--ngbrx - "გე", // xn--node - "机构", // xn--nqv7f - "组织机构", // xn--nqv7fs00ema - "健康", // xn--nyqy26a - "ไทย", // xn--o3cw4h - "سورية", // xn--ogbpf8fl - "招聘", // xn--otu796d - "рус", // xn--p1acf - "рф", // xn--p1ai - "珠宝", // xn--pbt977c - "تونس", // xn--pgbs0dh - "大拿", // xn--pssy2u - "みんな", // xn--q9jyb4c - "グーグル", // xn--qcka1pmc - "ελ", // xn--qxam - "世界", // xn--rhqv96g - "書籍", // xn--rovu88b - "ഭാരതം", // xn--rvc1e0am3e - "ਭਾਰਤ", // xn--s9brj9c - "网址", // xn--ses554g - "닷넷", // xn--t60b56a - "コム", // xn--tckwe - "天主教", // xn--tiq49xqyj - "游戏", // xn--unup4y - "vermögensberater", // xn--vermgensberater-ctb - "vermögensberatung", // xn--vermgensberatung-pwb - "企业", // xn--vhquv - "信息", // xn--vuq861b - "嘉里大酒店", // xn--w4r85el8fhu5dnra - "嘉里", // xn--w4rs40l - "مصر", // xn--wgbh1c - "قطر", // xn--wgbl6a - "广东", // xn--xhq521b - "இலங்கை", // xn--xkc2al3hye2a - "இந்தியா", // xn--xkc2dl3a5ee0h - "հայ", // xn--y9a3aq - "新加坡", // xn--yfro4i67o - "فلسطين", // xn--ygbi2ammx - "政务", // xn--zfr164b - "xxx", - "xyz", - "yachts", - "yahoo", - "yamaxun", - "yandex", - "ye", - "yodobashi", - "yoga", - "yokohama", - "you", - "youtube", - "yt", - "yun", - "za", - "zappos", - "zara", - "zero", - "zip", - "zippo", - "zm", - "zone", - "zuerich", - "zw" - ]; - \ No newline at end of file + "aaa", + "aarp", + "abarth", + "abb", + "abbott", + "abbvie", + "abc", + "able", + "abogado", + "abudhabi", + "ac", + "academy", + "accenture", + "accountant", + "accountants", + "aco", + "active", + "actor", + "ad", + "adac", + "ads", + "adult", + "ae", + "aeg", + "aero", + "aetna", + "af", + "afamilycompany", + "afl", + "africa", + "ag", + "agakhan", + "agency", + "ai", + "aig", + "aigo", + "airbus", + "airforce", + "airtel", + "akdn", + "al", + "alfaromeo", + "alibaba", + "alipay", + "allfinanz", + "allstate", + "ally", + "alsace", + "alstom", + "am", + "americanexpress", + "americanfamily", + "amex", + "amfam", + "amica", + "amsterdam", + "analytics", + "android", + "anquan", + "anz", + "ao", + "aol", + "apartments", + "app", + "apple", + "aq", + "aquarelle", + "ar", + "arab", + "aramco", + "archi", + "army", + "arpa", + "art", + "arte", + "as", + "asda", + "asia", + "associates", + "at", + "athleta", + "attorney", + "au", + "auction", + "audi", + "audible", + "audio", + "auspost", + "author", + "auto", + "autos", + "avianca", + "aw", + "aws", + "ax", + "axa", + "az", + "azure", + "ba", + "baby", + "baidu", + "banamex", + "bananarepublic", + "band", + "bank", + "bar", + "barcelona", + "barclaycard", + "barclays", + "barefoot", + "bargains", + "baseball", + "basketball", + "bauhaus", + "bayern", + "bb", + "bbc", + "bbt", + "bbva", + "bcg", + "bcn", + "bd", + "be", + "beats", + "beauty", + "beer", + "bentley", + "berlin", + "best", + "bestbuy", + "bet", + "bf", + "bg", + "bh", + "bharti", + "bi", + "bible", + "bid", + "bike", + "bing", + "bingo", + "bio", + "biz", + "bj", + "black", + "blackfriday", + "blanco", + "blockbuster", + "blog", + "bloomberg", + "blue", + "bm", + "bms", + "bmw", + "bn", + "bnl", + "bnpparibas", + "bo", + "boats", + "boehringer", + "bofa", + "bom", + "bond", + "boo", + "book", + "booking", + "bosch", + "bostik", + "boston", + "bot", + "boutique", + "box", + "br", + "bradesco", + "bridgestone", + "broadway", + "broker", + "brother", + "brussels", + "bs", + "bt", + "budapest", + "bugatti", + "build", + "builders", + "business", + "buy", + "buzz", + "bv", + "bw", + "by", + "bz", + "bzh", + "ca", + "cab", + "cafe", + "cal", + "call", + "calvinklein", + "cam", + "camera", + "camp", + "cancerresearch", + "canon", + "capetown", + "capital", + "capitalone", + "car", + "caravan", + "cards", + "care", + "career", + "careers", + "cars", + "cartier", + "casa", + "case", + "caseih", + "cash", + "casino", + "cat", + "catering", + "catholic", + "cba", + "cbn", + "cbre", + "cbs", + "cc", + "cd", + "ceb", + "center", + "ceo", + "cern", + "cf", + "cfa", + "cfd", + "cg", + "ch", + "chanel", + "channel", + "charity", + "chase", + "chat", + "cheap", + "chintai", + "christmas", + "chrome", + "chrysler", + "church", + "ci", + "cipriani", + "circle", + "cisco", + "citadel", + "citi", + "citic", + "city", + "cityeats", + "ck", + "cl", + "claims", + "cleaning", + "click", + "clinic", + "clinique", + "clothing", + "cloud", + "club", + "clubmed", + "cm", + "cn", + "co", + "coach", + "codes", + "coffee", + "college", + "cologne", + "com", + "comcast", + "commbank", + "community", + "company", + "compare", + "computer", + "comsec", + "condos", + "construction", + "consulting", + "contact", + "contractors", + "cooking", + "cookingchannel", + "cool", + "coop", + "corsica", + "country", + "coupon", + "coupons", + "courses", + "cr", + "credit", + "creditcard", + "creditunion", + "cricket", + "crown", + "crs", + "cruise", + "cruises", + "csc", + "cu", + "cuisinella", + "cv", + "cw", + "cx", + "cy", + "cymru", + "cyou", + "cz", + "dabur", + "dad", + "dance", + "data", + "date", + "dating", + "datsun", + "day", + "dclk", + "dds", + "de", + "deal", + "dealer", + "deals", + "degree", + "delivery", + "dell", + "deloitte", + "delta", + "democrat", + "dental", + "dentist", + "desi", + "design", + "dev", + "dhl", + "diamonds", + "diet", + "digital", + "direct", + "directory", + "discount", + "discover", + "dish", + "diy", + "dj", + "dk", + "dm", + "dnp", + "do", + "docs", + "doctor", + "dodge", + "dog", + "doha", + "domains", + "dot", + "download", + "drive", + "dtv", + "dubai", + "duck", + "dunlop", + "duns", + "dupont", + "durban", + "dvag", + "dvr", + "dz", + "earth", + "eat", + "ec", + "eco", + "edeka", + "edu", + "education", + "ee", + "eg", + "email", + "emerck", + "energy", + "engineer", + "engineering", + "enterprises", + "epost", + "epson", + "equipment", + "er", + "ericsson", + "erni", + "es", + "esq", + "estate", + "esurance", + "et", + "etisalat", + "eu", + "eurovision", + "eus", + "events", + "everbank", + "exchange", + "expert", + "exposed", + "express", + "extraspace", + "fage", + "fail", + "fairwinds", + "faith", + "family", + "fan", + "fans", + "farm", + "farmers", + "fashion", + "fast", + "fedex", + "feedback", + "ferrari", + "ferrero", + "fi", + "fiat", + "fidelity", + "fido", + "film", + "final", + "finance", + "financial", + "fire", + "firestone", + "firmdale", + "fish", + "fishing", + "fit", + "fitness", + "fj", + "fk", + "flickr", + "flights", + "flir", + "florist", + "flowers", + "fly", + "fm", + "fo", + "foo", + "food", + "foodnetwork", + "football", + "ford", + "forex", + "forsale", + "forum", + "foundation", + "fox", + "fr", + "free", + "fresenius", + "frl", + "frogans", + "frontdoor", + "frontier", + "ftr", + "fujitsu", + "fujixerox", + "fun", + "fund", + "furniture", + "futbol", + "fyi", + "ga", + "gal", + "gallery", + "gallo", + "gallup", + "game", + "games", + "gap", + "garden", + "gb", + "gbiz", + "gd", + "gdn", + "ge", + "gea", + "gent", + "genting", + "george", + "gf", + "gg", + "ggee", + "gh", + "gi", + "gift", + "gifts", + "gives", + "giving", + "gl", + "glade", + "glass", + "gle", + "global", + "globo", + "gm", + "gmail", + "gmbh", + "gmo", + "gmx", + "gn", + "godaddy", + "gold", + "goldpoint", + "golf", + "goo", + "goodhands", + "goodyear", + "goog", + "google", + "gop", + "got", + "gov", + "gp", + "gq", + "gr", + "grainger", + "graphics", + "gratis", + "green", + "gripe", + "grocery", + "group", + "gs", + "gt", + "gu", + "guardian", + "gucci", + "guge", + "guide", + "guitars", + "guru", + "gw", + "gy", + "hair", + "hamburg", + "hangout", + "haus", + "hbo", + "hdfc", + "hdfcbank", + "health", + "healthcare", + "help", + "helsinki", + "here", + "hermes", + "hgtv", + "hiphop", + "hisamitsu", + "hitachi", + "hiv", + "hk", + "hkt", + "hm", + "hn", + "hockey", + "holdings", + "holiday", + "homedepot", + "homegoods", + "homes", + "homesense", + "honda", + "honeywell", + "horse", + "hospital", + "host", + "hosting", + "hot", + "hoteles", + "hotels", + "hotmail", + "house", + "how", + "hr", + "hsbc", + "ht", + "hu", + "hughes", + "hyatt", + "hyundai", + "ibm", + "icbc", + "ice", + "icu", + "id", + "ie", + "ieee", + "ifm", + "ikano", + "il", + "im", + "imamat", + "imdb", + "immo", + "immobilien", + "in", + "inc", + "industries", + "infiniti", + "info", + "ing", + "ink", + "institute", + "insurance", + "insure", + "int", + "intel", + "international", + "intuit", + "investments", + "io", + "ipiranga", + "iq", + "ir", + "irish", + "is", + "iselect", + "ismaili", + "ist", + "istanbul", + "it", + "itau", + "itv", + "iveco", + "jaguar", + "java", + "jcb", + "jcp", + "je", + "jeep", + "jetzt", + "jewelry", + "jio", + "jlc", + "jll", + "jm", + "jmp", + "jnj", + "jo", + "jobs", + "joburg", + "jot", + "joy", + "jp", + "jpmorgan", + "jprs", + "juegos", + "juniper", + "kaufen", + "kddi", + "ke", + "kerryhotels", + "kerrylogistics", + "kerryproperties", + "kfh", + "kg", + "kh", + "ki", + "kia", + "kim", + "kinder", + "kindle", + "kitchen", + "kiwi", + "km", + "kn", + "koeln", + "komatsu", + "kosher", + "kp", + "kpmg", + "kpn", + "kr", + "krd", + "kred", + "kuokgroup", + "kw", + "ky", + "kyoto", + "kz", + "la", + "lacaixa", + "ladbrokes", + "lamborghini", + "lamer", + "lancaster", + "lancia", + "lancome", + "land", + "landrover", + "lanxess", + "lasalle", + "lat", + "latino", + "latrobe", + "law", + "lawyer", + "lb", + "lc", + "lds", + "lease", + "leclerc", + "lefrak", + "legal", + "lego", + "lexus", + "lgbt", + "li", + "liaison", + "lidl", + "life", + "lifeinsurance", + "lifestyle", + "lighting", + "like", + "lilly", + "limited", + "limo", + "lincoln", + "linde", + "link", + "lipsy", + "live", + "living", + "lixil", + "lk", + "llc", + "loan", + "loans", + "locker", + "locus", + "loft", + "lol", + "london", + "lotte", + "lotto", + "love", + "lpl", + "lplfinancial", + "lr", + "ls", + "lt", + "ltd", + "ltda", + "lu", + "lundbeck", + "lupin", + "luxe", + "luxury", + "lv", + "ly", + "ma", + "macys", + "madrid", + "maif", + "maison", + "makeup", + "man", + "management", + "mango", + "map", + "market", + "marketing", + "markets", + "marriott", + "marshalls", + "maserati", + "mattel", + "mba", + "mc", + "mckinsey", + "md", + "me", + "med", + "media", + "meet", + "melbourne", + "meme", + "memorial", + "men", + "menu", + "merckmsd", + "metlife", + "mg", + "mh", + "miami", + "microsoft", + "mil", + "mini", + "mint", + "mit", + "mitsubishi", + "mk", + "ml", + "mlb", + "mls", + "mm", + "mma", + "mn", + "mo", + "mobi", + "mobile", + "mobily", + "moda", + "moe", + "moi", + "mom", + "monash", + "money", + "monster", + "mopar", + "mormon", + "mortgage", + "moscow", + "moto", + "motorcycles", + "mov", + "movie", + "movistar", + "mp", + "mq", + "mr", + "ms", + "msd", + "mt", + "mtn", + "mtr", + "mu", + "museum", + "mutual", + "mv", + "mw", + "mx", + "my", + "mz", + "na", + "nab", + "nadex", + "nagoya", + "name", + "nationwide", + "natura", + "navy", + "nba", + "nc", + "ne", + "nec", + "net", + "netbank", + "netflix", + "network", + "neustar", + "new", + "newholland", + "news", + "next", + "nextdirect", + "nexus", + "nf", + "nfl", + "ng", + "ngo", + "nhk", + "ni", + "nico", + "nike", + "nikon", + "ninja", + "nissan", + "nissay", + "nl", + "no", + "nokia", + "northwesternmutual", + "norton", + "now", + "nowruz", + "nowtv", + "np", + "nr", + "nra", + "nrw", + "ntt", + "nu", + "nyc", + "nz", + "obi", + "observer", + "off", + "office", + "okinawa", + "olayan", + "olayangroup", + "oldnavy", + "ollo", + "om", + "omega", + "one", + "ong", + "onl", + "online", + "onyourside", + "ooo", + "open", + "oracle", + "orange", + "org", + "organic", + "origins", + "osaka", + "otsuka", + "ott", + "ovh", + "pa", + "page", + "panasonic", + "panerai", + "paris", + "pars", + "partners", + "parts", + "party", + "passagens", + "pay", + "pccw", + "pe", + "pet", + "pf", + "pfizer", + "pg", + "ph", + "pharmacy", + "phd", + "philips", + "phone", + "photo", + "photography", + "photos", + "physio", + "piaget", + "pics", + "pictet", + "pictures", + "pid", + "pin", + "ping", + "pink", + "pioneer", + "pizza", + "pk", + "pl", + "place", + "play", + "playstation", + "plumbing", + "plus", + "pm", + "pn", + "pnc", + "pohl", + "poker", + "politie", + "porn", + "post", + "pr", + "pramerica", + "praxi", + "press", + "prime", + "pro", + "prod", + "productions", + "prof", + "progressive", + "promo", + "properties", + "property", + "protection", + "pru", + "prudential", + "ps", + "pt", + "pub", + "pw", + "pwc", + "py", + "qa", + "qpon", + "quebec", + "quest", + "qvc", + "racing", + "radio", + "raid", + "re", + "read", + "realestate", + "realtor", + "realty", + "recipes", + "red", + "redstone", + "redumbrella", + "rehab", + "reise", + "reisen", + "reit", + "reliance", + "ren", + "rent", + "rentals", + "repair", + "report", + "republican", + "rest", + "restaurant", + "review", + "reviews", + "rexroth", + "rich", + "richardli", + "ricoh", + "rightathome", + "ril", + "rio", + "rip", + "rmit", + "ro", + "rocher", + "rocks", + "rodeo", + "rogers", + "room", + "rs", + "rsvp", + "ru", + "rugby", + "ruhr", + "run", + "rw", + "rwe", + "ryukyu", + "sa", + "saarland", + "safe", + "safety", + "sakura", + "sale", + "salon", + "samsclub", + "samsung", + "sandvik", + "sandvikcoromant", + "sanofi", + "sap", + "sarl", + "sas", + "save", + "saxo", + "sb", + "sbi", + "sbs", + "sc", + "sca", + "scb", + "schaeffler", + "schmidt", + "scholarships", + "school", + "schule", + "schwarz", + "science", + "scjohnson", + "scor", + "scot", + "sd", + "se", + "search", + "seat", + "secure", + "security", + "seek", + "select", + "sener", + "services", + "ses", + "seven", + "sew", + "sex", + "sexy", + "sfr", + "sg", + "sh", + "shangrila", + "sharp", + "shaw", + "shell", + "shia", + "shiksha", + "shoes", + "shop", + "shopping", + "shouji", + "show", + "showtime", + "shriram", + "si", + "silk", + "sina", + "singles", + "site", + "sj", + "sk", + "ski", + "skin", + "sky", + "skype", + "sl", + "sling", + "sm", + "smart", + "smile", + "sn", + "sncf", + "so", + "soccer", + "social", + "softbank", + "software", + "sohu", + "solar", + "solutions", + "song", + "sony", + "soy", + "space", + "spiegel", + "sport", + "spot", + "spreadbetting", + "sr", + "srl", + "srt", + "st", + "stada", + "staples", + "star", + "starhub", + "statebank", + "statefarm", + "statoil", + "stc", + "stcgroup", + "stockholm", + "storage", + "store", + "stream", + "studio", + "study", + "style", + "su", + "sucks", + "supplies", + "supply", + "support", + "surf", + "surgery", + "suzuki", + "sv", + "swatch", + "swiftcover", + "swiss", + "sx", + "sy", + "sydney", + "symantec", + "systems", + "sz", + "tab", + "taipei", + "talk", + "taobao", + "target", + "tatamotors", + "tatar", + "tattoo", + "tax", + "taxi", + "tc", + "tci", + "td", + "tdk", + "team", + "tech", + "technology", + "tel", + "telecity", + "telefonica", + "temasek", + "tennis", + "teva", + "tf", + "tg", + "th", + "thd", + "theater", + "theatre", + "tiaa", + "tickets", + "tienda", + "tiffany", + "tips", + "tires", + "tirol", + "tj", + "tjmaxx", + "tjx", + "tk", + "tkmaxx", + "tl", + "tm", + "tmall", + "tn", + "to", + "today", + "tokyo", + "tools", + "top", + "toray", + "toshiba", + "total", + "tours", + "town", + "toyota", + "toys", + "tr", + "trade", + "trading", + "training", + "travel", + "travelchannel", + "travelers", + "travelersinsurance", + "trust", + "trv", + "tt", + "tube", + "tui", + "tunes", + "tushu", + "tv", + "tvs", + "tw", + "tz", + "ua", + "ubank", + "ubs", + "uconnect", + "ug", + "uk", + "unicom", + "university", + "uno", + "uol", + "ups", + "us", + "uy", + "uz", + "va", + "vacations", + "vana", + "vanguard", + "vc", + "ve", + "vegas", + "ventures", + "verisign", + "versicherung", + "vet", + "vg", + "vi", + "viajes", + "video", + "vig", + "viking", + "villas", + "vin", + "vip", + "virgin", + "visa", + "vision", + "vista", + "vistaprint", + "viva", + "vivo", + "vlaanderen", + "vn", + "vodka", + "volkswagen", + "volvo", + "vote", + "voting", + "voto", + "voyage", + "vu", + "vuelos", + "wales", + "walmart", + "walter", + "wang", + "wanggou", + "warman", + "watch", + "watches", + "weather", + "weatherchannel", + "webcam", + "weber", + "website", + "wed", + "wedding", + "weibo", + "weir", + "wf", + "whoswho", + "wien", + "wiki", + "williamhill", + "win", + "windows", + "wine", + "winners", + "wme", + "wolterskluwer", + "woodside", + "work", + "works", + "world", + "wow", + "ws", + "wtc", + "wtf", + "xbox", + "xerox", + "xfinity", + "xihuan", + "xin", + "कॉम", // xn--11b4c3d + "セール", // xn--1ck2e1b + "佛山", // xn--1qqw23a + "ಭಾರತ", // xn--2scrj9c + "慈善", // xn--30rr7y + "集团", // xn--3bst00m + "在线", // xn--3ds443g + "한국", // xn--3e0b707e + "ଭାରତ", // xn--3hcrj9c + "大众汽车", // xn--3oq18vl8pn36a + "点看", // xn--3pxu8k + "คอม", // xn--42c2d9a + "ভাৰত", // xn--45br5cyl + "ভারত", // xn--45brj9c + "八卦", // xn--45q11c + "موقع", // xn--4gbrim + "বাংলা", // xn--54b7fta0cc + "公益", // xn--55qw42g + "公司", // xn--55qx5d + "香格里拉", // xn--5su34j936bgsg + "网站", // xn--5tzm5g + "移动", // xn--6frz82g + "我爱你", // xn--6qq986b3xl + "москва", // xn--80adxhks + "қаз", // xn--80ao21a + "католик", // xn--80aqecdr1a + "онлайн", // xn--80asehdb + "сайт", // xn--80aswg + "联通", // xn--8y0a063a + "срб", // xn--90a3ac + "бг", // xn--90ae + "бел", // xn--90ais + "קום", // xn--9dbq2a + "时尚", // xn--9et52u + "微博", // xn--9krt00a + "淡马锡", // xn--b4w605ferd + "ファッション", // xn--bck1b9a5dre4c + "орг", // xn--c1avg + "नेट", // xn--c2br7g + "ストア", // xn--cck2b3b + "삼성", // xn--cg4bki + "சிங்கப்பூர்", // xn--clchc0ea0b2g2a9gcd + "商标", // xn--czr694b + "商店", // xn--czrs0t + "商城", // xn--czru2d + "дети", // xn--d1acj3b + "мкд", // xn--d1alf + "ею", // xn--e1a4c + "ポイント", // xn--eckvdtc9d + "新闻", // xn--efvy88h + "工行", // xn--estv75g + "家電", // xn--fct429k + "كوم", // xn--fhbei + "中文网", // xn--fiq228c5hs + "中信", // xn--fiq64b + "中国", // xn--fiqs8s + "中國", // xn--fiqz9s + "娱乐", // xn--fjq720a + "谷歌", // xn--flw351e + "భారత్", // xn--fpcrj9c3d + "ලංකා", // xn--fzc2c9e2c + "電訊盈科", // xn--fzys8d69uvgm + "购物", // xn--g2xx48c + "クラウド", // xn--gckr3f0f + "ભારત", // xn--gecrj9c + "通販", // xn--gk3at1e + "भारतम्", // xn--h2breg3eve + "भारत", // xn--h2brj9c + "भारोत", // xn--h2brj9c8c + "网店", // xn--hxt814e + "संगठन", // xn--i1b6b1a6a2e + "餐厅", // xn--imr513n + "网络", // xn--io0a7i + "ком", // xn--j1aef + "укр", // xn--j1amh + "香港", // xn--j6w193g + "诺基亚", // xn--jlq61u9w7b + "食品", // xn--jvr189m + "飞利浦", // xn--kcrx77d1x4a + "台湾", // xn--kprw13d + "台灣", // xn--kpry57d + "手表", // xn--kpu716f + "手机", // xn--kput3i + "мон", // xn--l1acc + "الجزائر", // xn--lgbbat1ad8j + "عمان", // xn--mgb9awbf + "ارامكو", // xn--mgba3a3ejt + "ایران", // xn--mgba3a4f16a + "العليان", // xn--mgba7c0bbn0a + "اتصالات", // xn--mgbaakc7dvf + "امارات", // xn--mgbaam7a8h + "بازار", // xn--mgbab2bd + "پاکستان", // xn--mgbai9azgqp6j + "الاردن", // xn--mgbayh7gpa + "موبايلي", // xn--mgbb9fbpob + "بارت", // xn--mgbbh1a + "بھارت", // xn--mgbbh1a71e + "المغرب", // xn--mgbc0a9azcg + "ابوظبي", // xn--mgbca7dzdo + "السعودية", // xn--mgberp4a5d4ar + "ڀارت", // xn--mgbgu82a + "كاثوليك", // xn--mgbi4ecexp + "سودان", // xn--mgbpl2fh + "همراه", // xn--mgbt3dhd + "عراق", // xn--mgbtx2b + "مليسيا", // xn--mgbx4cd0ab + "澳門", // xn--mix891f + "닷컴", // xn--mk1bu44c + "政府", // xn--mxtq1m + "شبكة", // xn--ngbc5azd + "بيتك", // xn--ngbe9e0a + "عرب", // xn--ngbrx + "გე", // xn--node + "机构", // xn--nqv7f + "组织机构", // xn--nqv7fs00ema + "健康", // xn--nyqy26a + "ไทย", // xn--o3cw4h + "سورية", // xn--ogbpf8fl + "招聘", // xn--otu796d + "рус", // xn--p1acf + "рф", // xn--p1ai + "珠宝", // xn--pbt977c + "تونس", // xn--pgbs0dh + "大拿", // xn--pssy2u + "みんな", // xn--q9jyb4c + "グーグル", // xn--qcka1pmc + "ελ", // xn--qxam + "世界", // xn--rhqv96g + "書籍", // xn--rovu88b + "ഭാരതം", // xn--rvc1e0am3e + "ਭਾਰਤ", // xn--s9brj9c + "网址", // xn--ses554g + "닷넷", // xn--t60b56a + "コム", // xn--tckwe + "天主教", // xn--tiq49xqyj + "游戏", // xn--unup4y + "vermögensberater", // xn--vermgensberater-ctb + "vermögensberatung", // xn--vermgensberatung-pwb + "企业", // xn--vhquv + "信息", // xn--vuq861b + "嘉里大酒店", // xn--w4r85el8fhu5dnra + "嘉里", // xn--w4rs40l + "مصر", // xn--wgbh1c + "قطر", // xn--wgbl6a + "广东", // xn--xhq521b + "இலங்கை", // xn--xkc2al3hye2a + "இந்தியா", // xn--xkc2dl3a5ee0h + "հայ", // xn--y9a3aq + "新加坡", // xn--yfro4i67o + "فلسطين", // xn--ygbi2ammx + "政务", // xn--zfr164b + "xxx", + "xyz", + "yachts", + "yahoo", + "yamaxun", + "yandex", + "ye", + "yodobashi", + "yoga", + "yokohama", + "you", + "youtube", + "yt", + "yun", + "za", + "zappos", + "zara", + "zero", + "zip", + "zippo", + "zm", + "zone", + "zuerich", + "zw" +]; diff --git a/web-index/trigrams_aux.sh b/web-index/trigrams_aux.sh deleted file mode 100755 index 99ccc5b42..000000000 --- a/web-index/trigrams_aux.sh +++ /dev/null @@ -1,16 +0,0 @@ -s2=$(mktemp -u) -s3=$(mktemp -u) - -mkfifo $s2 $s3 - -tee $s2 | - tail +2 | - paste $s2 - | - tee $s3 | - cut -f 1 | - tail +3 | - paste $s3 - | - sed "\$d" | - sed "\$d" - -rm $s2 $s3 \ No newline at end of file diff --git a/web-index/urls.txt b/web-index/urls.txt new file mode 100644 index 000000000..5920081a4 --- /dev/null +++ b/web-index/urls.txt @@ -0,0 +1,2 @@ + +http://nikos.vasilak.is diff --git a/web-index/verify.sh b/web-index/verify.sh deleted file mode 100755 index 4a77d7664..000000000 --- a/web-index/verify.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -REPO_TOP=$(git rev-parse --show-toplevel) -eval_dir="${REPO_TOP}/web-index" -hashes_dir="${eval_dir}/hashes" - -# create hashes directory if it does not exist -if [ ! -d "${hashes_dir}" ]; then - mkdir "${hashes_dir}" -fi - -suffix=".full" -if [[ "$@" == *"--small"* ]]; then - suffix=".small" -fi - -if [[ "$@" == *"--generate"* ]]; then - # generate hashes and store in hashes directory for all *grams.txt files - for file in $(find ${eval_dir} -name "*grams.txt"); do - echo "Generating hash for ${file}" - hash=$(md5sum ${file} | cut -d ' ' -f 1) - echo "Hash: ${hash}" - echo "${hash}" > "${hashes_dir}/$(basename ${file})${suffix}.hash" - done - exit 0 -fi - -# verify hashes for all *grams.txt files -for file in $(find ${eval_dir} -name "*grams.txt"); do - hash=$(md5sum ${file} | cut -d ' ' -f 1) - expected_hash=$(cat "${hashes_dir}/$(basename ${file})${suffix}.hash") - if [[ "${hash}" != "${expected_hash}" ]]; then - exit 1 - fi -done -exit 0 \ No newline at end of file diff --git a/web-index/web-index-aux.sh b/web-index/web-index-aux.sh new file mode 100644 index 000000000..cb6fd403e --- /dev/null +++ b/web-index/web-index-aux.sh @@ -0,0 +1,141 @@ +mkfifo {1,2,3}grams + +bigrams_aux() +{ + ( mkfifo s2 > /dev/null ) ; + ( mkfifo s3 > /dev/null ) ; + + sed '$d' s2 > s3 & + tee s2 | + tail +2 | + paste s3 - + rm s2 + rm s3 +} + +bigram_aux_map() +{ + IN=$1 + OUT=$2 + AUX_HEAD=$3 + AUX_TAIL=$4 + + s2=$(mktemp -u) + aux1=$(mktemp -u) + aux2=$(mktemp -u) + aux3=$(mktemp -u) + temp=$(mktemp -u) + + mkfifo $s2 + mkfifo $aux1 + mkfifo $aux2 + mkfifo $aux3 + + ## New way of doing it using an intermediate file. This is slow + ## but doesn't deadlock + cat $IN > $temp + + sed '$d' $temp > $aux3 & + cat $temp | head -n 1 > $AUX_HEAD & + cat $temp | tail -n 1 > $AUX_TAIL & + cat $temp | tail +2 | paste $aux3 - > $OUT & + + wait + + rm $temp + rm $s2 + rm $aux1 + rm $aux2 + rm $aux3 +} + +bigram_aux_reduce() +{ + IN1=$1 + AUX_HEAD1=$2 + AUX_TAIL1=$3 + IN2=$4 + AUX_HEAD2=$5 + AUX_TAIL2=$6 + OUT=$7 + AUX_HEAD_OUT=$8 + AUX_TAIL_OUT=$9 + + temp=$(mktemp -u) + + mkfifo $temp + + cat $AUX_HEAD1 > $AUX_HEAD_OUT & + cat $AUX_TAIL2 > $AUX_TAIL_OUT & + paste $AUX_TAIL1 $AUX_HEAD2 > $temp & + cat $IN1 $temp $IN2 > $OUT & + + wait + + rm $temp +} + + +trigrams_aux() +{ + s2=$(mktemp -u) + s3=$(mktemp -u) + + mkfifo $s2 $s3 + + tee $s2 | + tail +2 | + paste $s2 - | + tee $s3 | + cut -f 1 | + tail +3 | + paste $s3 - | + sed "\$d" | + sed "\$d" + + rm $s2 $s3 +} + + +extract_text() +{ + while read -r line + do + cat $line | + iconv -c -t ascii//TRANSLIT | + pandoc +RTS -K64m -RTS --from html --to plain --quiet + done +} + + +cat $IN | + sed "s#^#$WIKI#" | + extract_text | + tr -cs A-Za-z '\n' | + tr A-Z a-z | + grep -vwFf $WEB_INDEX_DIR/stopwords.txt | + $WEB_INDEX_DIR/stem-words.js | + tee 3grams 2grams 1grams > /dev/null & + +cat 1grams | + sort | + uniq -c | + sort -rn > 1-grams.txt & + +cat 2grams | + tr -cs A-Za-z '\n' | + tr A-Z a-z | + bigrams_aux | + sort | + uniq -c | + sort -rn > 2-grams.txt & + +cat 3grams | + tr -cs A-Za-z '\n' | + tr A-Z a-z | + trigrams_aux | + sort | + uniq -c | + sort -rn # > 3-grams.txt + +rm {1,2,3}grams diff --git a/web-index/web-index.sh b/web-index/web-index.sh new file mode 100755 index 000000000..a7004e7c4 --- /dev/null +++ b/web-index/web-index.sh @@ -0,0 +1,149 @@ +#!/bin/bash +PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} +IN=${IN:-$PASH_TOP/web-index/input/index.txt} +WEB_INDEX_DIR=${WEB_INDEX_DIR:-$PASH_TOP/web-index/input} +WIKI=${WIKI:-$PASH_TOP/web-index/} + +mkfifo {1,2,3}grams + +bigrams_aux() +{ + ( mkfifo s2 > /dev/null ) ; + ( mkfifo s3 > /dev/null ) ; + + sed '$d' s2 > s3 & + tee s2 | + tail +2 | + paste s3 - + rm s2 + rm s3 +} + +bigram_aux_map() +{ + IN=$1 + OUT=$2 + AUX_HEAD=$3 + AUX_TAIL=$4 + + s2=$(mktemp -u) + aux1=$(mktemp -u) + aux2=$(mktemp -u) + aux3=$(mktemp -u) + temp=$(mktemp -u) + + mkfifo $s2 + mkfifo $aux1 + mkfifo $aux2 + mkfifo $aux3 + + ## New way of doing it using an intermediate file. This is slow + ## but doesn't deadlock + cat $IN > $temp + + sed '$d' $temp > $aux3 & + cat $temp | head -n 1 > $AUX_HEAD & + cat $temp | tail -n 1 > $AUX_TAIL & + cat $temp | tail +2 | paste $aux3 - > $OUT & + + wait + + rm $temp + rm $s2 + rm $aux1 + rm $aux2 + rm $aux3 +} + +bigram_aux_reduce() +{ + IN1=$1 + AUX_HEAD1=$2 + AUX_TAIL1=$3 + IN2=$4 + AUX_HEAD2=$5 + AUX_TAIL2=$6 + OUT=$7 + AUX_HEAD_OUT=$8 + AUX_TAIL_OUT=$9 + + temp=$(mktemp -u) + + mkfifo $temp + + cat $AUX_HEAD1 > $AUX_HEAD_OUT & + cat $AUX_TAIL2 > $AUX_TAIL_OUT & + paste $AUX_TAIL1 $AUX_HEAD2 > $temp & + cat $IN1 $temp $IN2 > $OUT & + + wait + + rm $temp +} + + +trigrams_aux() +{ + s2=$(mktemp -u) + s3=$(mktemp -u) + + mkfifo $s2 $s3 + + tee $s2 | + tail +2 | + paste $s2 - | + tee $s3 | + cut -f 1 | + tail +3 | + paste $s3 - | + sed "\$d" | + sed "\$d" + + rm $s2 $s3 +} + + +extract_text() +{ + while read -r line + do + cat $line | + iconv -c -t ascii//TRANSLIT | + pandoc +RTS -K64m -RTS --from html --to plain --quiet + done +} + +export -f extract_text + +head $IN | + sed "s#^#$WIKI#" | + extract_text | + tr -cs A-Za-z '\n' | + tr A-Z a-z | + grep -vwFf $WIKI/stopwords.txt | + $WIKI/stem-words.js | + tee 3grams 2grams 1grams > /dev/null & + +cat 1grams | + sort | + uniq -c | + sort -rn > 1-grams.txt & + +cat 2grams | + tr -cs A-Za-z '\n' | + tr A-Z a-z | + bigrams_aux | + sort | + uniq -c | + sort -rn > 2-grams.txt & + +cat 3grams | + tr -cs A-Za-z '\n' | + tr A-Z a-z | + trigrams_aux | + sort | + uniq -c | + sort -rn > 3-grams.txt & + +# rm -f {1,2,3}grams {1,2,3}-grams.txt s2 s3 +