-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This reverts commit 938769f.
- Loading branch information
Showing
29 changed files
with
1,956 additions
and
1,912 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
#!/usr/bin/env bash
# Installs the dependencies this benchmark needs on a Debian/Ubuntu host:
#   1. base tools from apt,
#   2. pandoc 2.2.1 from its GitHub release .deb,
#   3. Node.js from the NodeSource 18.x repository,
#   4. the npm modules for the current directory.
# Each step is skipped when dpkg (or the node_modules directory) shows it is
# already satisfied.

# Base tools (p7zip-full provides 7zip).
# nodejs is intentionally NOT in this list: installing the distro package here
# would make the NodeSource 18.x step below a no-op, and that step is what
# guarantees an npm-bundling Node version for `npm install`.
pkgs=(p7zip-full curl wget)
if ! dpkg -s "${pkgs[@]}" >/dev/null 2>&1; then
  # Only report success when apt-get actually succeeded.
  if sudo apt-get install -y "${pkgs[@]}"; then
    echo 'Packages Installed'
  else
    echo 'Package installation failed' >&2
  fi
fi

if ! dpkg -s pandoc >/dev/null 2>&1; then
  # pandoc v.2.2.1
  # Resolve the architecture-specific file name once instead of three times.
  pandoc_deb="pandoc-2.2.1-1-$(dpkg --print-architecture).deb"
  if wget "https://github.com/jgm/pandoc/releases/download/2.2.1/${pandoc_deb}"; then
    sudo dpkg -i "./${pandoc_deb}"
  else
    echo "Failed to download ${pandoc_deb}" >&2
  fi
  # -f: the file may not exist when the download failed.
  rm -f "./${pandoc_deb}"
fi

if ! dpkg -s nodejs >/dev/null 2>&1; then
  # node version 18+ does not need external npm
  curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash -
  sudo apt-get install -y nodejs
fi

if [ ! -d node_modules ]; then
  npm install
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
#!/bin/bash
# Lists every regular file below <directory_path>, strips the
# "./wikipedia/en/articles/" prefix from each path, sorts the result and
# writes it to ./index.txt.
#
# Usage: <script> <directory_path>
# Exits with status 1 when the argument is missing or is not a directory.
if [ $# -eq 0 ]; then
  echo "Usage: $0 <directory_path>" >&2
  exit 1
fi

# Directory path is the first argument
directory_path=$1

# Check if the directory exists
if [ ! -d "$directory_path" ]; then
  echo "Error: Directory does not exist." >&2
  exit 1
fi

# Ensure a local ./tmp directory exists for sorting
# (TMPDIR is exported so that sort places its temporary files there).
mkdir -p ./tmp
export TMPDIR=./tmp

# Find all files, remove prefix, sort them, and write to a text file
find "$directory_path" -type f | sed 's|./wikipedia/en/articles/||' | sort > index.txt

# Report the file that was actually written above.
echo "File paths have been saved to index.txt"
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
#!/bin/bash
# Prepares the Wikipedia input data for the web-index benchmark in the
# current directory.
#   -c   remove previously downloaded/extracted data before continuing

#set -e

# Upstream archive location; only referenced by the commented-out download
# step inside setup_dataset.
wiki_archive="https://dumps.wikimedia.org/other/static_html_dumps/current/en/wikipedia-en-html.tar.7z"
BENCH_TOP=${BENCH_TOP:-$(git rev-parse --show-toplevel)}

# . "$BENCH_TOP/scripts/utils.sh"
# -y keeps the install non-interactive so an unattended run does not block
# on apt's confirmation prompt.
sudo apt-get install -y unzip

# Clean-up mode: drop the extracted tree, archives and index files.
if [ "$1" = "-c" ]; then
  rm -rf en/ *.7z *.tar 500.txt 1000.txt full small
fi
|
||
# Fetches the benchmark inputs into the current directory.
# Arguments:
#   $1 - "--small" fetches the 500-entry index (500.txt);
#        "--full" fetches the 1000-entry index (1000.txt);
#        any other value fetches neither index.
# Side effects: removes ../1-grams.txt and ../2-grams.txt; when ./en is
# missing, extracts wikipedia-en-html.tar and downloads index.txt.
setup_dataset() {
  rm -rf ../1-grams.txt ../2-grams.txt

  ## Downloading the dataset needs to happen for both small and large
  if [[ ! -d ./en ]]; then
    # wget $wiki_archive || eexit "cannot fetch wikipedia"
    # 7za x wikipedia-en-html.tar.7z
    # NOTE(review): with the two steps above disabled, the tar archive must
    # already be present in the current directory.
    tar -xvf wikipedia-en-html.tar
    wget http://ndr.md/data/wikipedia/index.txt # || eexit "cannot fetch wiki indices"
    # It is actually OK if we don't have this index since we download the 500/1000 below
  fi

  if [ "$1" = "--small" ]; then
    # 500 entries
    wget http://pac-n4.csail.mit.edu:81/pash_data/small/web-index.small.zip
    unzip web-index.small.zip
    mv small/500.txt .
    rm -rf small web-index.small.zip
  elif [ "$1" = "--full" ]; then
    # the default full
    # 1000 entries
    wget http://pac-n4.csail.mit.edu:81/pash_data/full/web-index.full.zip
    unzip web-index.full.zip
    mv full/1000.txt .
    rm -rf full web-index.full.zip
  fi
}
|
||
# Forward the script's first argument (the dataset mode) as a single word.
setup_dataset "$1"
File renamed without changes.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
#!/bin/bash
# An explicit PASH_TOP wins; otherwise fall back to the top level of the
# enclosing git checkout.
PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)}
# WIKI may be preset by the caller; by default it is $PASH_TOP/web-index.
WIKI=${WIKI:-$PASH_TOP/web-index}

# Exported so the child bash processes started via xargs can read it.
export WIKI
# Squash all HTML of one page into a single line, streaming fashion.
# NOTE(review): an earlier comment said the line is also prefixed with the
# URL; the code below emits only the page contents.
# Globals:
#   WIKI (read) - directory the page path is resolved against
# Arguments:
#   $1 - page path relative to $WIKI. When omitted, $0 is used instead, which
#        is where `bash -c '...' ARG` places its first trailing argument.
# Outputs:
#   the page contents with every LF and CR removed
page_per_line () {
  local page=${1:-$0}
  # One tr pass removes both \n and \r. The sed expression appends a final
  # newline when the stream lacks one (GNU sed behaviour).
  tr -d '\n\r' < "$WIKI/$page" | sed -e '/.$/a\'
}
|
||
# Make the function visible to the bash processes that xargs starts.
export -f page_per_line

# xargs:
# add `-t` for debugging
# One index line per invocation (-d '\n' -n 1). xargs appends the line after
# the -c script, so bash binds it to $0; it is forwarded explicitly to the
# function rather than relying on the function reading $0 on its own.
xargs -d '\n' -n 1 bash -c 'page_per_line "$0"' < "$WIKI/input/index.txt"
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#!/bin/bash
# Streams $WIKI/input/index.txt through a text-normalisation pipeline and
# writes the result to stdout.
PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)}
WIKI=${WIKI:-$PASH_TOP/web-index}

# Stages, in order:
#   sed    - prefix every index line with $WIKI
#   iconv  - convert to ASCII, transliterating and dropping what cannot be
#            converted (-c)
#   pandoc - read the stream as HTML and render it as plain text
#   tr     - turn every run of non-letters into a newline (one word per line)
#   tr     - lowercase
#   grep   - drop lines that match an entry of stopwords.txt as a whole word
#   stem-words.js - project script defined elsewhere; presumably stems each
#            word (confirm against that file)
sed "s#^#$WIKI#" "$WIKI/input/index.txt" |
  iconv -c -t ascii//TRANSLIT |
  pandoc +RTS -K64m -RTS --from html --to plain --quiet |
  tr -cs A-Za-z '\n' |
  tr A-Z a-z |
  grep -vwFf "$WIKI/stopwords.txt" |
  "$WIKI/stem-words.js"
|
Oops, something went wrong.