From bdb7ad8f641468e0ded3cd8d5a1d7d358da90301 Mon Sep 17 00:00:00 2001 From: Tom White Date: Thu, 16 Apr 2015 18:23:13 +0100 Subject: [PATCH 1/6] [EGGO-30] Generate partitioned data. --- eggo/dag.py | 41 +++++++++++++++++++++++++++++++ genotypes-partition-strategy.json | 4 +++ test/registry/test-genotypes.json | 2 +- 3 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 genotypes-partition-strategy.json diff --git a/eggo/dag.py b/eggo/dag.py index 05f855f..753c9f4 100644 --- a/eggo/dag.py +++ b/eggo/dag.py @@ -305,6 +305,39 @@ def output(self): return S3FlagTarget( target_s3_url(ToastConfig().config['name'], edition=self.edition)) +class ADAMPartitionTask(Task): + + adam_command = Parameter() + allowed_file_formats = Parameter() + source_edition = 'basic' + edition = 'locuspart' + + def requires(self): + return ADAMBasicTask(adam_command=self.adam_command, + allowed_file_formats=self.allowed_file_formats) + + def run(self): + adam_cmd = ('{adam_home}/bin/adam-submit --master {spark_master_url} partition' + ' -partition_strategy_file {partition_strategy_file}' + ' {source} {target}').format( + adam_home=os.environ['ADAM_HOME'], + spark_master_url=os.environ['SPARK_MASTER_URL'], + partition_strategy_file='genotypes-partition-strategy.json', + source=target_s3n_url(ToastConfig().config['name'], + edition=self.source_edition), + target=target_s3n_url(ToastConfig().config['name'], + edition=self.edition)) + p = Popen(adam_cmd, shell=True) + p.wait() + + if p.returncode == 0: + create_SUCCESS_file(target_s3_url(ToastConfig().config['name'], + edition=self.edition)) + + def output(self): + return S3FlagTarget(target_s3_url(ToastConfig().config['name'], + edition=self.edition)) + class VCF2ADAMTask(Task): @@ -313,6 +346,8 @@ def requires(self): allowed_file_formats=['vcf']) flat = ADAMFlattenTask(adam_command='vcf2adam', allowed_file_formats=['vcf']) + locuspart = ADAMPartitionTask(adam_command='vcf2adam', + allowed_file_formats=['vcf']) dependencies = [basic] conf = ToastConfig().config editions = conf['editions'] if 'editions' in conf else [] @@ -321,6 +356,8 @@ def requires(self): pass # included by default elif edition == 'flat': dependencies.append(flat) + elif edition == 'locuspart': + dependencies.append(locuspart) return dependencies def run(self): @@ -337,6 +374,8 @@ def requires(self): allowed_file_formats=['sam', 'bam']) flat = ADAMFlattenTask(adam_command='transform', allowed_file_formats=['sam', 'bam']) + locuspart = ADAMPartitionTask(adam_command='transform', + allowed_file_formats=['sam', 'bam']) dependencies = [basic] conf = ToastConfig().config editions = conf['editions'] if 'editions' in conf else [] @@ -345,5 +384,7 @@ def requires(self): pass # included by default elif edition == 'flat': dependencies.append(flat) + elif edition == 'locuspart': + dependencies.append(locuspart) return dependencies diff --git a/genotypes-partition-strategy.json b/genotypes-partition-strategy.json new file mode 100644 index 0000000..a85255d --- /dev/null +++ b/genotypes-partition-strategy.json @@ -0,0 +1,4 @@ +[ + { "type": "identity", "source": "variant.contig.contigName", "name": "chr" }, + { "type": "range", "source": "variant.start", "name": "pos", "range": 10000 } +] \ No newline at end of file diff --git a/test/registry/test-genotypes.json b/test/registry/test-genotypes.json index 5d73066..6192cfc 100644 --- a/test/registry/test-genotypes.json +++ b/test/registry/test-genotypes.json @@ -2,7 +2,7 @@ "name": "test-genotypes", "title": "Test 1000 Genomes Project VCF data", "dag": "VCF2ADAMTask", - "editions": ["basic", "flat"], + "editions": ["basic", "locuspart"], "sources": [ {"format": "vcf", "compression": true, "url": "https://github.com/bigdatagenomics/eggo/raw/master/test/resources/chr22.small.vcf.gz"} ] From ce6e2152d2f859d6e5c23fef82d5d4bc7015afde Mon Sep 17 00:00:00 2001 From: Tom White Date: Fri, 17 Apr 2015 09:54:15 +0100 Subject: [PATCH 2/6] [EGGO-30] Generate flattened, partitioned data. --- eggo/dag.py | 44 +++++++++++++++++++++++++- genotypes-flat-partition-strategy.json | 4 +++ genotypes-partition-strategy.json | 2 +- test/registry/test-genotypes.json | 2 +- 4 files changed, 49 insertions(+), 3 deletions(-) create mode 100644 genotypes-flat-partition-strategy.json diff --git a/eggo/dag.py b/eggo/dag.py index 753c9f4..baae10b 100644 --- a/eggo/dag.py +++ b/eggo/dag.py @@ -339,6 +339,40 @@ def output(self): edition=self.edition)) +class ADAMFlattenPartitionTask(Task): + + adam_command = Parameter() + allowed_file_formats = Parameter() + source_edition = 'flat' + edition = 'flat_locuspart' + + def requires(self): + return ADAMBasicTask(adam_command=self.adam_command, + allowed_file_formats=self.allowed_file_formats) + + def run(self): + adam_cmd = ('{adam_home}/bin/adam-submit --master {spark_master_url} partition' + ' -partition_strategy_file {partition_strategy_file}' + ' {source} {target}').format( + adam_home=os.environ['ADAM_HOME'], + spark_master_url=os.environ['SPARK_MASTER_URL'], + partition_strategy_file='genotypes-flat-partition-strategy.json', + source=target_s3n_url(ToastConfig().config['name'], + edition=self.source_edition), + target=target_s3n_url(ToastConfig().config['name'], + edition=self.edition)) + p = Popen(adam_cmd, shell=True) + p.wait() + + if p.returncode == 0: + create_SUCCESS_file(target_s3_url(ToastConfig().config['name'], + edition=self.edition)) + + def output(self): + return S3FlagTarget(target_s3_url(ToastConfig().config['name'], + edition=self.edition)) + + class VCF2ADAMTask(Task): def requires(self): @@ -347,7 +381,9 @@ def requires(self): flat = ADAMFlattenTask(adam_command='vcf2adam', allowed_file_formats=['vcf']) locuspart = ADAMPartitionTask(adam_command='vcf2adam', - allowed_file_formats=['vcf']) + allowed_file_formats=['vcf']) + flat_locuspart = ADAMFlattenPartitionTask(adam_command='vcf2adam', + allowed_file_formats=['vcf']) dependencies = [basic] conf = ToastConfig().config editions = conf['editions'] if 'editions' in conf else [] @@ -358,6 +394,8 @@ def requires(self): dependencies.append(flat) elif edition == 'locuspart': dependencies.append(locuspart) + elif edition == 'flat_locuspart': + dependencies.append(flat_locuspart) return dependencies def run(self): @@ -376,6 +414,8 @@ def requires(self): allowed_file_formats=['sam', 'bam']) locuspart = ADAMPartitionTask(adam_command='transform', allowed_file_formats=['sam', 'bam']) + flat_locuspart = ADAMFlattenPartitionTask(adam_command='transform', + allowed_file_formats=['sam', 'bam']) dependencies = [basic] conf = ToastConfig().config editions = conf['editions'] if 'editions' in conf else [] @@ -386,5 +426,7 @@ def requires(self): dependencies.append(flat) elif edition == 'locuspart': dependencies.append(locuspart) + elif edition == 'flat_locuspart': + dependencies.append(flat_locuspart) return dependencies diff --git a/genotypes-flat-partition-strategy.json b/genotypes-flat-partition-strategy.json new file mode 100644 index 0000000..1aa5950 --- /dev/null +++ b/genotypes-flat-partition-strategy.json @@ -0,0 +1,4 @@ +[ + { "type": "identity", "source": "variant__contig__contigName", "name": "chr" }, + { "type": "range", "source": "variant__start", "name": "pos", "range": 1000000 } +] \ No newline at end of file diff --git a/genotypes-partition-strategy.json b/genotypes-partition-strategy.json index a85255d..14be592 100644 --- a/genotypes-partition-strategy.json +++ b/genotypes-partition-strategy.json @@ -1,4 +1,4 @@ [ { "type": "identity", "source": "variant.contig.contigName", "name": "chr" }, - { "type": "range", "source": "variant.start", "name": "pos", "range": 10000 } + { "type": "range", "source": "variant.start", "name": "pos", "range": 1000000 } ] \ No newline at end of file diff --git a/test/registry/test-genotypes.json b/test/registry/test-genotypes.json index 6192cfc..41af00b 100644 --- a/test/registry/test-genotypes.json +++ b/test/registry/test-genotypes.json @@ -2,7 +2,7 @@ "name": "test-genotypes", "title": "Test 1000 Genomes Project VCF data", "dag": "VCF2ADAMTask", - "editions": ["basic", "locuspart"], + "editions": ["basic", "flat", "locuspart", "flat_locuspart"], "sources": [ {"format": "vcf", "compression": true, "url": "https://github.com/bigdatagenomics/eggo/raw/master/test/resources/chr22.small.vcf.gz"} ] From 0366ec201ce1a0b9ab0493deff092b5e2af00bbe Mon Sep 17 00:00:00 2001 From: Tom White Date: Tue, 21 Apr 2015 17:56:18 +0100 Subject: [PATCH 3/6] [EGGO-30] Use Crunch-based MR partitioner, from https://github.com/tomwhite/adam-partitioning, while Spark version is being debugged. --- README.md | 1 + eggo-ec2-variables.sh | 1 + eggo/dag.py | 26 ++++++++++++++------------ eggo/fabric_util.py | 6 ++++++ 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 320ea44..bba9c87 100644 --- a/README.md +++ b/README.md @@ -137,6 +137,7 @@ export HADOOP_HOME=~/sw/hadoop-2.5.1/ export SPARK_HOME=~/sw/spark-1.3.0-bin-hadoop2.4/ export SPARK_MASTER_URL=local export STREAMING_JAR=$HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-2.5.1.jar +export ADAM_PARTITIONING_JAR=~/workspace/adam-partitioning~/workspace/adam-partitioning/target/adam-partitioning-0.0.1-SNAPSHOT-job.jar export PATH=$PATH:$HADOOP_HOME/bin ``` diff --git a/eggo-ec2-variables.sh b/eggo-ec2-variables.sh index 8e5621d..68c0737 100644 --- a/eggo-ec2-variables.sh +++ b/eggo-ec2-variables.sh @@ -22,4 +22,5 @@ source /root/spark-ec2/ec2-variables.sh export SPARK_MASTER="$MASTERS" export SPARK_MASTER_URL="spark://$SPARK_MASTER:7077" export STREAMING_JAR=$HADOOP_HOME/contrib/streaming/hadoop-streaming-1.0.4.jar +export ADAM_PARTITIONING_JAR=/root/adam-partitioning/adam-partitioning-0.0.1-SNAPSHOT-job.jar export PATH=$PATH:$HADOOP_HOME/bin diff --git a/eggo/dag.py b/eggo/dag.py index baae10b..c74cf8f 100644 --- a/eggo/dag.py +++ b/eggo/dag.py @@ -317,12 +317,13 @@ def requires(self): allowed_file_formats=self.allowed_file_formats) def run(self): - adam_cmd = ('{adam_home}/bin/adam-submit --master {spark_master_url} partition' - ' -partition_strategy_file {partition_strategy_file}' - ' {source} {target}').format( - adam_home=os.environ['ADAM_HOME'], - spark_master_url=os.environ['SPARK_MASTER_URL'], - partition_strategy_file='genotypes-partition-strategy.json', + adam_cmd = ('{hadoop_home}/bin/hadoop jar {adam_partitioning_jar}' + ' CrunchPartitionTool -D mapreduce.job.reduces={parallelism}' + ' {partition_strategy_file} {source} {target}').format( + hadoop_home=os.environ['HADOOP_HOME'], + adam_partitioning_jar=os.environ['ADAM_PARTITIONING_JAR'], + parallelism=1, + partition_strategy_file='genotypes-partition-strategy', source=target_s3n_url(ToastConfig().config['name'], edition=self.source_edition), target=target_s3n_url(ToastConfig().config['name'], @@ -351,12 +352,13 @@ def requires(self): allowed_file_formats=self.allowed_file_formats) def run(self): - adam_cmd = ('{adam_home}/bin/adam-submit --master {spark_master_url} partition' - ' -partition_strategy_file {partition_strategy_file}' - ' {source} {target}').format( - adam_home=os.environ['ADAM_HOME'], - spark_master_url=os.environ['SPARK_MASTER_URL'], - partition_strategy_file='genotypes-flat-partition-strategy.json', + adam_cmd = ('{hadoop_home}/bin/hadoop jar {adam_partitioning_jar}' + ' CrunchPartitionTool -D mapreduce.job.reduces={parallelism}' + ' {partition_strategy_file} {source} {target}').format( + hadoop_home=os.environ['HADOOP_HOME'], + adam_partitioning_jar=os.environ['ADAM_PARTITIONING_JAR'], + parallelism=1, + partition_strategy_file='flat-genotypes-partition-strategy', source=target_s3n_url(ToastConfig().config['name'], edition=self.source_edition), target=target_s3n_url(ToastConfig().config['name'], diff --git a/eggo/fabric_util.py b/eggo/fabric_util.py index 08f0f9f..f32c136 100644 --- a/eggo/fabric_util.py +++ b/eggo/fabric_util.py @@ -99,6 +99,12 @@ def _install_adam(): run('mvn clean package -DskipTests') +def _install_adam_partitioning(): + run('mkdir -p /root/adam-partitioning') + with cd('/root/adam-partitioning'): + run('wget https://github.com/tomwhite/adam-partitioning/raw/master/lib/adam-partitioning-0.0.1-SNAPSHOT-job.jar') + + def _install_eggo(fork='bigdatagenomics', branch='master'): # check out eggo with cd('~'): From d5f2d9897a0de6f6eb50a1616a59f49c564d1cef Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 22 Apr 2015 11:33:20 +0100 Subject: [PATCH 4/6] [EGGO-30] Add support for partitioning BAM/SAM. --- eggo/dag.py | 18 ++++++++++++------ genotypes-flat-partition-strategy.json | 4 ---- genotypes-partition-strategy.json | 4 ---- test/registry/test-1kg-genotypes-subset.json | 9 +++++++++ test/registry/test-alignments.json | 4 ++-- 5 files changed, 23 insertions(+), 16 deletions(-) delete mode 100644 genotypes-flat-partition-strategy.json delete mode 100644 genotypes-partition-strategy.json create mode 100644 test/registry/test-1kg-genotypes-subset.json diff --git a/eggo/dag.py b/eggo/dag.py index c74cf8f..300fe15 100644 --- a/eggo/dag.py +++ b/eggo/dag.py @@ -309,6 +309,7 @@ class ADAMPartitionTask(Task): adam_command = Parameter() allowed_file_formats = Parameter() + partition_strategy_file = Parameter() source_edition = 'basic' edition = 'locuspart' @@ -323,7 +324,7 @@ def run(self): hadoop_home=os.environ['HADOOP_HOME'], adam_partitioning_jar=os.environ['ADAM_PARTITIONING_JAR'], parallelism=1, - partition_strategy_file='genotypes-partition-strategy', + partition_strategy_file=self.partition_strategy_file, source=target_s3n_url(ToastConfig().config['name'], edition=self.source_edition), target=target_s3n_url(ToastConfig().config['name'], @@ -344,6 +345,7 @@ class ADAMFlattenPartitionTask(Task): adam_command = Parameter() allowed_file_formats = Parameter() + partition_strategy_file = Parameter() source_edition = 'flat' edition = 'flat_locuspart' @@ -358,7 +360,7 @@ def run(self): hadoop_home=os.environ['HADOOP_HOME'], adam_partitioning_jar=os.environ['ADAM_PARTITIONING_JAR'], parallelism=1, - partition_strategy_file='flat-genotypes-partition-strategy', + partition_strategy_file=self.partition_strategy_file, source=target_s3n_url(ToastConfig().config['name'], edition=self.source_edition), target=target_s3n_url(ToastConfig().config['name'], @@ -383,9 +385,11 @@ def requires(self): flat = ADAMFlattenTask(adam_command='vcf2adam', allowed_file_formats=['vcf']) locuspart = ADAMPartitionTask(adam_command='vcf2adam', - allowed_file_formats=['vcf']) + allowed_file_formats=['vcf'], + partition_strategy_file='genotypes-partition-strategy') flat_locuspart = ADAMFlattenPartitionTask(adam_command='vcf2adam', - allowed_file_formats=['vcf']) + allowed_file_formats=['vcf'], + partition_strategy_file='flat-genotypes-partition-strategy') dependencies = [basic] conf = ToastConfig().config editions = conf['editions'] if 'editions' in conf else [] @@ -415,9 +419,11 @@ def requires(self): flat = ADAMFlattenTask(adam_command='transform', allowed_file_formats=['sam', 'bam']) locuspart = ADAMPartitionTask(adam_command='transform', - allowed_file_formats=['sam', 'bam']) + allowed_file_formats=['sam', 'bam'], + partition_strategy_file='alignments-partition-strategy') flat_locuspart = ADAMFlattenPartitionTask(adam_command='transform', - allowed_file_formats=['sam', 'bam']) + allowed_file_formats=['sam', 'bam'], + partition_strategy_file='flat-alignments-partition-strategy') dependencies = [basic] conf = ToastConfig().config editions = conf['editions'] if 'editions' in conf else [] diff --git a/genotypes-flat-partition-strategy.json b/genotypes-flat-partition-strategy.json deleted file mode 100644 index 1aa5950..0000000 --- a/genotypes-flat-partition-strategy.json +++ /dev/null @@ -1,4 +0,0 @@ -[ - { "type": "identity", "source": "variant__contig__contigName", "name": "chr" }, - { "type": "range", "source": "variant__start", "name": "pos", "range": 1000000 } -] \ No newline at end of file diff --git a/genotypes-partition-strategy.json b/genotypes-partition-strategy.json deleted file mode 100644 index 14be592..0000000 --- a/genotypes-partition-strategy.json +++ /dev/null @@ -1,4 +0,0 @@ -[ - { "type": "identity", "source": "variant.contig.contigName", "name": "chr" }, - { "type": "range", "source": "variant.start", "name": "pos", "range": 1000000 } -] \ No newline at end of file diff --git a/test/registry/test-1kg-genotypes-subset.json b/test/registry/test-1kg-genotypes-subset.json new file mode 100644 index 0000000..5961989 --- /dev/null +++ b/test/registry/test-1kg-genotypes-subset.json @@ -0,0 +1,9 @@ +{ + "name": "test-1kg-genotypes-subset", + "title": "Test 1000 Genomes Project VCF data", + "dag": "VCF2ADAMTask", + "editions": ["basic", "flat", "locuspart", "flat_locuspart"], + "sources": [ + {"format": "vcf", "compression": true, "url": "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20110521/ALL.chr22.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz"} + ] +} diff --git a/test/registry/test-alignments.json b/test/registry/test-alignments.json index cdf85fb..7084702 100644 --- a/test/registry/test-alignments.json +++ b/test/registry/test-alignments.json @@ -2,8 +2,8 @@ "name": "test-alignments", "title": "Test SAM data", "dag": "BAM2ADAMTask", - "editions": ["basic", "flat"], + "editions": ["basic", "flat", "locuspart", "flat_locuspart"], "sources": [ - {"format": "sam", "compression": false, "url": "https://raw.githubusercontent.com/bigdatagenomics/adam/master/adam-core/src/test/resources/reads12.sam"} + {"format": "sam", "compression": false, "url": "https://github.com/bigdatagenomics/eggo/raw/master/test/resources/small.sam"} ] } From 1cad68c36f17006b2da5cbb838b867f655291b82 Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 22 Apr 2015 12:26:31 +0100 Subject: [PATCH 5/6] Add hint for parallelism in registry files. --- eggo/dag.py | 76 ++++++++------------ test/registry/test-1kg-genotypes-subset.json | 1 + test/registry/test-alignments.json | 1 + test/registry/test-genotypes.json | 1 + 4 files changed, 31 insertions(+), 48 deletions(-) diff --git a/eggo/dag.py b/eggo/dag.py index 300fe15..a64aa98 100644 --- a/eggo/dag.py +++ b/eggo/dag.py @@ -309,45 +309,10 @@ class ADAMPartitionTask(Task): adam_command = Parameter() allowed_file_formats = Parameter() + source_edition = Parameter() + edition = Parameter() partition_strategy_file = Parameter() - source_edition = 'basic' - edition = 'locuspart' - - def requires(self): - return ADAMBasicTask(adam_command=self.adam_command, - allowed_file_formats=self.allowed_file_formats) - - def run(self): - adam_cmd = ('{hadoop_home}/bin/hadoop jar {adam_partitioning_jar}' - ' CrunchPartitionTool -D mapreduce.job.reduces={parallelism}' - ' {partition_strategy_file} {source} {target}').format( - hadoop_home=os.environ['HADOOP_HOME'], - adam_partitioning_jar=os.environ['ADAM_PARTITIONING_JAR'], - parallelism=1, - partition_strategy_file=self.partition_strategy_file, - source=target_s3n_url(ToastConfig().config['name'], - edition=self.source_edition), - target=target_s3n_url(ToastConfig().config['name'], - edition=self.edition)) - p = Popen(adam_cmd, shell=True) - p.wait() - - if p.returncode == 0: - create_SUCCESS_file(target_s3_url(ToastConfig().config['name'], - edition=self.edition)) - - def output(self): - return S3FlagTarget(target_s3_url(ToastConfig().config['name'], - edition=self.edition)) - - -class ADAMFlattenPartitionTask(Task): - - adam_command = Parameter() - allowed_file_formats = Parameter() - partition_strategy_file = Parameter() - source_edition = 'flat' - edition = 'flat_locuspart' + parallelism = Parameter() def requires(self): return ADAMBasicTask(adam_command=self.adam_command, @@ -359,7 +324,7 @@ def run(self): ' {partition_strategy_file} {source} {target}').format( hadoop_home=os.environ['HADOOP_HOME'], adam_partitioning_jar=os.environ['ADAM_PARTITIONING_JAR'], - parallelism=1, + parallelism=self.parallelism, partition_strategy_file=self.partition_strategy_file, source=target_s3n_url(ToastConfig().config['name'], edition=self.source_edition), @@ -376,20 +341,27 @@ def output(self): return S3FlagTarget(target_s3_url(ToastConfig().config['name'], edition=self.edition)) - class VCF2ADAMTask(Task): def requires(self): + conf = ToastConfig().config + parallelism = conf['numPartitionsHint'] if 'numPartitionsHint' in conf else 1 basic = ADAMBasicTask(adam_command='vcf2adam', allowed_file_formats=['vcf']) flat = ADAMFlattenTask(adam_command='vcf2adam', allowed_file_formats=['vcf']) locuspart = ADAMPartitionTask(adam_command='vcf2adam', allowed_file_formats=['vcf'], - partition_strategy_file='genotypes-partition-strategy') - flat_locuspart = ADAMFlattenPartitionTask(adam_command='vcf2adam', - allowed_file_formats=['vcf'], - partition_strategy_file='flat-genotypes-partition-strategy') + source_edition='basic', + edition='locuspart', + partition_strategy_file='genotypes-partition-strategy', + parallelism=parallelism) + flat_locuspart = ADAMPartitionTask(adam_command='vcf2adam', + allowed_file_formats=['vcf'], + source_edition='flat', + edition='flat_locuspart', + partition_strategy_file='flat-genotypes-partition-strategy', + parallelism=parallelism) dependencies = [basic] conf = ToastConfig().config editions = conf['editions'] if 'editions' in conf else [] @@ -414,16 +386,24 @@ def output(self): class BAM2ADAMTask(Task): def requires(self): + conf = ToastConfig().config + parallelism = conf['numPartitionsHint'] if 'numPartitionsHint' in conf else 1 basic = ADAMBasicTask(adam_command='transform', allowed_file_formats=['sam', 'bam']) flat = ADAMFlattenTask(adam_command='transform', allowed_file_formats=['sam', 'bam']) locuspart = ADAMPartitionTask(adam_command='transform', allowed_file_formats=['sam', 'bam'], - partition_strategy_file='alignments-partition-strategy') - flat_locuspart = ADAMFlattenPartitionTask(adam_command='transform', - allowed_file_formats=['sam', 'bam'], - partition_strategy_file='flat-alignments-partition-strategy') + source_edition='basic', + edition='locuspart', + partition_strategy_file='alignments-partition-strategy', + parallelism=parallelism) + flat_locuspart = ADAMPartitionTask(adam_command='transform', + allowed_file_formats=['sam', 'bam'], + source_edition='flat', + edition='flat_locuspart', + partition_strategy_file='flat-alignments-partition-strategy', + parallelism=parallelism) dependencies = [basic] conf = ToastConfig().config editions = conf['editions'] if 'editions' in conf else [] diff --git a/test/registry/test-1kg-genotypes-subset.json b/test/registry/test-1kg-genotypes-subset.json index 5961989..d6276bf 100644 --- a/test/registry/test-1kg-genotypes-subset.json +++ b/test/registry/test-1kg-genotypes-subset.json @@ -3,6 +3,7 @@ "title": "Test 1000 Genomes Project VCF data", "dag": "VCF2ADAMTask", "editions": ["basic", "flat", "locuspart", "flat_locuspart"], + "numPartitionsHint": 36, "sources": [ {"format": "vcf", "compression": true, "url": "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20110521/ALL.chr22.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz"} ] diff --git a/test/registry/test-alignments.json b/test/registry/test-alignments.json index 7084702..5e15d64 100644 --- a/test/registry/test-alignments.json +++ b/test/registry/test-alignments.json @@ -3,6 +3,7 @@ "title": "Test SAM data", "dag": "BAM2ADAMTask", "editions": ["basic", "flat", "locuspart", "flat_locuspart"], + "numPartitionsHint": 1, "sources": [ {"format": "sam", "compression": false, "url": "https://github.com/bigdatagenomics/eggo/raw/master/test/resources/small.sam"} ] diff --git a/test/registry/test-genotypes.json b/test/registry/test-genotypes.json index 41af00b..3e28717 100644 --- a/test/registry/test-genotypes.json +++ b/test/registry/test-genotypes.json @@ -3,6 +3,7 @@ "title": "Test 1000 Genomes Project VCF data", "dag": "VCF2ADAMTask", "editions": ["basic", "flat", "locuspart", "flat_locuspart"], + "numPartitionsHint": 1, "sources": [ {"format": "vcf", "compression": true, "url": "https://github.com/bigdatagenomics/eggo/raw/master/test/resources/chr22.small.vcf.gz"} ] From 1e236173d5b1c8507659986cf36a63c769e2e8d8 Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 22 Apr 2015 14:17:11 +0100 Subject: [PATCH 6/6] Fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bba9c87..e2f4a55 100644 --- a/README.md +++ b/README.md @@ -137,7 +137,7 @@ export HADOOP_HOME=~/sw/hadoop-2.5.1/ export SPARK_HOME=~/sw/spark-1.3.0-bin-hadoop2.4/ export SPARK_MASTER_URL=local export STREAMING_JAR=$HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-2.5.1.jar -export ADAM_PARTITIONING_JAR=~/workspace/adam-partitioning~/workspace/adam-partitioning/target/adam-partitioning-0.0.1-SNAPSHOT-job.jar +export ADAM_PARTITIONING_JAR=~/workspace/adam-partitioning/target/adam-partitioning-0.0.1-SNAPSHOT-job.jar export PATH=$PATH:$HADOOP_HOME/bin ```