Skip to content

Commit

Permalink
[EGGO-30] Use Crunch-based MR partitioner, from
Browse files Browse the repository at this point in the history
https://github.com/tomwhite/adam-partitioning, while Spark version
is being debugged.
  • Loading branch information
tomwhite committed Apr 21, 2015
1 parent 4f0a26f commit 5f18147
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 12 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ export HADOOP_HOME=~/sw/hadoop-2.5.1/
export SPARK_HOME=~/sw/spark-1.3.0-bin-hadoop2.4/
export SPARK_MASTER_URL=local
export STREAMING_JAR=$HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-2.5.1.jar
export ADAM_PARTITIONING_JAR=~/workspace/adam-partitioning~/workspace/adam-partitioning/target/adam-partitioning-0.0.1-SNAPSHOT-job.jar
export PATH=$PATH:$HADOOP_HOME/bin
```

Expand Down
1 change: 1 addition & 0 deletions eggo-ec2-variables.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,5 @@ source /root/spark-ec2/ec2-variables.sh
export SPARK_MASTER="$MASTERS"
export SPARK_MASTER_URL="spark://$SPARK_MASTER:7077"
export STREAMING_JAR=$HADOOP_HOME/contrib/streaming/hadoop-streaming-1.0.4.jar
export ADAM_PARTITIONING_JAR=/root/adam-partitioning/adam-partitioning-0.0.1-SNAPSHOT-job.jar
export PATH=$PATH:$HADOOP_HOME/bin
26 changes: 14 additions & 12 deletions eggo/dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,12 +317,13 @@ def requires(self):
allowed_file_formats=self.allowed_file_formats)

def run(self):
adam_cmd = ('{adam_home}/bin/adam-submit --master {spark_master_url} partition'
' -partition_strategy_file {partition_strategy_file}'
' {source} {target}').format(
adam_home=os.environ['ADAM_HOME'],
spark_master_url=os.environ['SPARK_MASTER_URL'],
partition_strategy_file='genotypes-partition-strategy.json',
adam_cmd = ('{hadoop_home}/bin/hadoop jar {adam_partitioning_jar}'
' CrunchPartitionTool -D mapreduce.job.reduces={parallelism}'
' {partition_strategy_file} {source} {target}').format(
hadoop_home=os.environ['HADOOP_HOME'],
adam_partitioning_jar=os.environ['ADAM_PARTITIONING_JAR'],
parallelism=1,
partition_strategy_file='genotypes-partition-strategy',
source=target_s3n_url(ToastConfig().config['name'],
edition=self.source_edition),
target=target_s3n_url(ToastConfig().config['name'],
Expand Down Expand Up @@ -351,12 +352,13 @@ def requires(self):
allowed_file_formats=self.allowed_file_formats)

def run(self):
adam_cmd = ('{adam_home}/bin/adam-submit --master {spark_master_url} partition'
' -partition_strategy_file {partition_strategy_file}'
' {source} {target}').format(
adam_home=os.environ['ADAM_HOME'],
spark_master_url=os.environ['SPARK_MASTER_URL'],
partition_strategy_file='genotypes-flat-partition-strategy.json',
adam_cmd = ('{hadoop_home}/bin/hadoop jar {adam_partitioning_jar}'
' CrunchPartitionTool -D mapreduce.job.reduces={parallelism}'
' {partition_strategy_file} {source} {target}').format(
hadoop_home=os.environ['HADOOP_HOME'],
adam_partitioning_jar=os.environ['ADAM_PARTITIONING_JAR'],
parallelism=1,
partition_strategy_file='flat-genotypes-partition-strategy',
source=target_s3n_url(ToastConfig().config['name'],
edition=self.source_edition),
target=target_s3n_url(ToastConfig().config['name'],
Expand Down
6 changes: 6 additions & 0 deletions eggo/fabric_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ def _install_adam():
run('mvn clean package -DskipTests')


def _install_adam_partitioning():
run('mkdir -p /root/adam-partitioning')
with cd('/root/adam-partitioning'):
run('wget https://github.com/tomwhite/adam-partitioning/raw/master/lib/adam-partitioning-0.0.1-SNAPSHOT-job.jar')


def _install_eggo(fork='bigdatagenomics', branch='master'):
# check out eggo
with cd('~'):
Expand Down

0 comments on commit 5f18147

Please sign in to comment.