Skip to content

Commit

Permalink
Revert "just web-index (#31)" (#35)
Browse files Browse the repository at this point in the history
This reverts commit 938769f.
  • Loading branch information
vagos authored Nov 11, 2024
1 parent 938769f commit f4cf478
Show file tree
Hide file tree
Showing 29 changed files with 1,956 additions and 1,912 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
strategy:
fail-fast: false
matrix:
benchmark: [nlp, file-enc, unix50, log-analysis, max-temp, uniq-ips, media-conv, sklearn, covid-mts, riker, oneliners, web-index]
benchmark: [nlp, file-enc, unix50, log-analysis, max-temp, uniq-ips, media-conv, sklearn, covid-mts, riker, oneliners]

steps:
- name: Checkout code
Expand Down
9 changes: 0 additions & 9 deletions web-index/bigrams_aux.sh

This file was deleted.

3 changes: 0 additions & 3 deletions web-index/cleanup.sh

This file was deleted.

25 changes: 0 additions & 25 deletions web-index/deps.sh

This file was deleted.

6 changes: 0 additions & 6 deletions web-index/extract_text.sh

This file was deleted.

100 changes: 0 additions & 100 deletions web-index/grep-url.js

This file was deleted.

1 change: 0 additions & 1 deletion web-index/hashes/1-grams.txt.small.hash

This file was deleted.

1 change: 0 additions & 1 deletion web-index/hashes/2-grams.txt.small.hash

This file was deleted.

1 change: 0 additions & 1 deletion web-index/hashes/3-grams.txt.small.hash

This file was deleted.

39 changes: 0 additions & 39 deletions web-index/input.sh

This file was deleted.

25 changes: 25 additions & 0 deletions web-index/input/dependencies.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
# Install dependencies for the web-index benchmark:
# 7zip tooling, curl/wget, pandoc 2.2.1, Node.js 18+, and local npm modules.

# Base packages. $pkgs is deliberately unquoted: dpkg -s and apt-get install
# need the list to word-split into separate package arguments.
pkgs='p7zip-full curl wget nodejs'
if ! dpkg -s $pkgs >/dev/null 2>&1 ; then
  sudo apt-get install $pkgs -y
  echo 'Packages Installed'
fi

if ! dpkg -s pandoc > /dev/null 2>&1 ; then
  # pandoc v.2.2.1 — fetch the .deb matching this machine's architecture.
  # Hoisted: $(dpkg --print-architecture) was previously invoked three times.
  arch=$(dpkg --print-architecture)
  deb="pandoc-2.2.1-1-${arch}.deb"
  wget "https://github.com/jgm/pandoc/releases/download/2.2.1/${deb}"
  sudo dpkg -i "./${deb}"
  rm "./${deb}"
fi

if ! dpkg -s nodejs > /dev/null 2>&1 ; then
  # node version 18+ does not need external npm
  curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash -
  sudo apt-get install -y nodejs
fi

# Install the project's npm modules once (used by stem-words.js etc.).
if [ ! -d node_modules ]; then
  npm install
fi
24 changes: 24 additions & 0 deletions web-index/input/generte_index.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
# Build a sorted index of every file under a given directory.
# Usage: $0 <directory_path>
# Output: index.txt in the current directory, one path per line.

if [ $# -eq 0 ]; then
  echo "Usage: $0 <directory_path>"
  exit 1
fi

# Directory path is the first argument
directory_path=$1

# Check if the directory exists
if [ ! -d "$directory_path" ]; then
  echo "Error: Directory does not exist."
  exit 1
fi

# Ensure a local ./tmp directory exists so sort(1) has scratch space
# even when the system temp dir is small.
mkdir -p ./tmp
export TMPDIR=./tmp

# Find all files, remove prefix, sort them, and write to a text file.
# NOTE(review): the sed prefix is hard-coded and only strips anything when
# <directory_path> is ./wikipedia/en/articles — confirm against callers.
find "$directory_path" -type f | sed 's|./wikipedia/en/articles/||' | sort > index.txt

# Bug fix: the message previously claimed all_files_paths.txt, but the
# listing is written to index.txt (see the redirection above).
echo "File paths have been saved to index.txt"
41 changes: 41 additions & 0 deletions web-index/input/input.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash
# Download and set up the wikipedia dataset for the web-index benchmark.
# Usage: input.sh [-c | --small | --full]
#   -c       remove all downloaded artifacts
#   --small  500-entry input list
#   --full   1000-entry input list (the full dataset)

#set -e

wiki_archive="https://dumps.wikimedia.org/other/static_html_dumps/current/en/wikipedia-en-html.tar.7z"
BENCH_TOP=${BENCH_TOP:-$(git rev-parse --show-toplevel)}

# . "$BENCH_TOP/scripts/utils.sh"
sudo apt-get install unzip

[ "$1" = "-c" ] && rm -rf en/ *.7z *.tar 500.txt 1000.txt full small

setup_dataset() {
  rm -rf ../1-grams.txt ../2-grams.txt

  ## Downloading the dataset needs to happen for both small and large
  if [[ ! -d ./en ]]; then
    # wget $wiki_archive || eexit "cannot fetch wikipedia"
    # 7za x wikipedia-en-html.tar.7z
    tar -xvf wikipedia-en-html.tar
    wget http://ndr.md/data/wikipedia/index.txt # || eexit "cannot fetch wiki indices"
    # It is actually OK if we don't have this index since we download the 500/1000 below
  fi

  if [ "$1" = "--small" ]; then
    # 500 entries
    wget http://pac-n4.csail.mit.edu:81/pash_data/small/web-index.small.zip
    unzip web-index.small.zip
    mv small/500.txt .
    rm -rf small web-index.small.zip
  elif [ "$1" = "--full" ]; then
    # Bug fix: the next two lines were bare words (executed as commands,
    # causing "command not found" errors) — they are descriptive text:
    # the default full
    # 1000 entries
    wget http://pac-n4.csail.mit.edu:81/pash_data/full/web-index.full.zip
    unzip web-index.full.zip
    mv full/1000.txt .
    rm -rf full web-index.full.zip
  fi
}

setup_dataset $1
File renamed without changes.
Empty file added web-index/inputs/cleanup.sh
Empty file.
Empty file.
Empty file added web-index/inputs/input.sh
Empty file.
Empty file added web-index/inputs/run.sh
Empty file.
Empty file added web-index/inputs/verify.sh
Empty file.
18 changes: 0 additions & 18 deletions web-index/move_articles.sh

This file was deleted.

18 changes: 18 additions & 0 deletions web-index/p1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
# p1: squash all HTML for each article into a single line, streaming fashion.
PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)}
WIKI=${WIKI:-$PASH_TOP/web-index}

export WIKI

# Flatten one article (relative path in $1) onto a single output line.
page_per_line () {
  cat "$WIKI/$1" | tr -d "\n\r" | tr -d '\n' | sed -e '/.$/a\'
}

export -f page_per_line

# One bash invocation per line of index.txt.
# Bug fix: the original combined xargs -0 with -d '\n' (mutually exclusive
# delimiter options) and relied on the path landing in $0 of `bash -c`,
# which page_per_line then read via "$WIKI/$0". Pass the path explicitly
# as $1 with '_' as a placeholder $0, delimiting on newlines only.
# add `-t` to xargs for debugging
cat "$WIKI/input/index.txt" | xargs -d '\n' -n 1 bash -c 'page_per_line "$1"' _
13 changes: 13 additions & 0 deletions web-index/p2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# p2: produce a lowercase, stopword-free, stemmed word stream for indexing.
PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)}
WIKI=${WIKI:-$PASH_TOP/web-index}

# NOTE(review): this pipeline feeds the *list of paths* (each line of
# index.txt prefixed with $WIKI) directly into iconv/pandoc — i.e. pandoc
# converts the path text itself, not the articles' contents. If the intent
# is to process each article's HTML (as p1.sh does), a stage that reads the
# files (e.g. `xargs cat`) appears to be missing — confirm against callers.
cat $WIKI/input/index.txt |
sed "s#^#$WIKI#" |                # prefix each line with the web-index root
iconv -c -t ascii//TRANSLIT |     # transliterate to ASCII; -c drops unconvertible chars
pandoc +RTS -K64m -RTS --from html --to plain --quiet |   # HTML -> plain text (64M GHC stack)
tr -cs A-Za-z '\n' |              # one word per line: squeeze non-letters into newlines
tr A-Z a-z |                      # lowercase everything
grep -vwFf $WIKI/stopwords.txt |  # drop stopwords (fixed-string, whole-word match)
$WIKI/stem-words.js               # stem each word (node script)

Loading

0 comments on commit f4cf478

Please sign in to comment.