forked from ThinkBigAnalytics/scalding-workshop
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathrun11.sh
executable file
·80 lines (75 loc) · 2.51 KB
/
run11.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env bash
# Run just the HadoopTwitter11 script, which requires Hadoop to be installed.
# Note that an alternative is to use Scalding's scald.rb script with just the
# "script" file, e.g.,:
# $SCALDING_HOME/scripts/scald.rb --hdfs-local --host $host \
# src/main/scala/HadoopTwitter11.scala \
# --input data/twitter/tweets.tsv \
# --uniques output/unique-languages \
# --count-star output/count-star \
# --count-star-100 output/count-star-100
usage() {
cat <<EOF
usage $0 [-h|--help] [--host host] [--hdfs|--local|--print] [--root rootpath]
where:
--host host is the JobTracker host (default: localhost)
--root rootpath is the parent directory for the input and output (default ".")
--hdfs run in a Hadoop cluster. You'll need to copy the input files
into HDFS and create the output directory and you might want to
use the --root path argument to specify a "root" (or parent)
directory for both. E.g.., use the following commands, where
we'll also assume you'll use "--root /user/me/twitter":
hdfs -mkdir /user/me/twitter/data
hdfs -mkdir /user/me/twitter/output
hdfs -put data/twitter /user/me/twitter/data
--local Use Cascading's local mode that bypasses MR altogether.
(default)
--print Print what you'll do, but don't do anything
EOF
}
assembly=$(ls target/scala-2.10/scalding-workshop*.jar 2> /dev/null)
if [ $? -ne 0 ]
then
echo "$0: The target/scala-2.10/scalding-workshop-X.Y.Z.jar hasn't been built yet."
echo " Run 'sbt assembly' then try again."
exit 1
fi
script=HadoopTwitter11
host=localhost
file_system_arg=--local
root=
args=()
while [ $# -ne 0 ]
do
case $1 in
-h|--help)
usage
exit 0
;;
--root)
shift
root="$1/"
;;
--host)
shift
host=$1
;;
--hdfs|--local|--print)
file_system_arg=$1
;;
*)
args[${#args[@]}]="$1"
;;
esac
shift
done
runit() {
$@ hadoop jar $assembly $script \
$file_system_arg --host $host "${args[@]}" --scalaversion 2.10 \
--input ${root}data/twitter/tweets.tsv \
--uniques ${root}output/unique-languages \
--count-star ${root}output/count-star \
--count-star-100 ${root}output/count-star-100
}
runit echo running
test -z "$NOOP" && runit