From bdb7ad8f641468e0ded3cd8d5a1d7d358da90301 Mon Sep 17 00:00:00 2001
From: Tom White <tom@cloudera.com>
Date: Thu, 16 Apr 2015 18:23:13 +0100
Subject: [PATCH 1/6] [EGGO-30] Generate partitioned data.

---
 eggo/dag.py                       | 41 +++++++++++++++++++++++++++++++
 genotypes-partition-strategy.json |  4 +++
 test/registry/test-genotypes.json |  2 +-
 3 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 genotypes-partition-strategy.json

diff --git a/eggo/dag.py b/eggo/dag.py
index 05f855f..753c9f4 100644
--- a/eggo/dag.py
+++ b/eggo/dag.py
@@ -305,6 +305,39 @@ def output(self):
         return S3FlagTarget(
             target_s3_url(ToastConfig().config['name'], edition=self.edition))
 
+class ADAMPartitionTask(Task):
+
+    adam_command = Parameter()
+    allowed_file_formats = Parameter()
+    source_edition = 'basic'
+    edition = 'locuspart'
+
+    def requires(self):
+        return ADAMBasicTask(adam_command=self.adam_command,
+                             allowed_file_formats=self.allowed_file_formats)
+
+    def run(self):
+        adam_cmd = ('{adam_home}/bin/adam-submit --master {spark_master_url} partition'
+                    ' -partition_strategy_file {partition_strategy_file}'
+                    ' {source} {target}').format(
+            adam_home=os.environ['ADAM_HOME'],
+            spark_master_url=os.environ['SPARK_MASTER_URL'],
+            partition_strategy_file='genotypes-partition-strategy.json',
+            source=target_s3n_url(ToastConfig().config['name'],
+                                  edition=self.source_edition),
+            target=target_s3n_url(ToastConfig().config['name'],
+                                  edition=self.edition))
+        p = Popen(adam_cmd, shell=True)
+        p.wait()
+
+        if p.returncode == 0:
+            create_SUCCESS_file(target_s3_url(ToastConfig().config['name'],
+                                              edition=self.edition))
+
+    def output(self):
+        return S3FlagTarget(target_s3_url(ToastConfig().config['name'],
+                                          edition=self.edition))
+
 
 class VCF2ADAMTask(Task):
 
@@ -313,6 +346,8 @@ def requires(self):
                               allowed_file_formats=['vcf'])
         flat = ADAMFlattenTask(adam_command='vcf2adam',
                                allowed_file_formats=['vcf'])
+        locuspart = ADAMPartitionTask(adam_command='vcf2adam',
+                               allowed_file_formats=['vcf'])
         dependencies = [basic]
         conf = ToastConfig().config
         editions = conf['editions'] if 'editions' in conf else []
@@ -321,6 +356,8 @@ def requires(self):
                 pass # included by default
             elif edition == 'flat':
                 dependencies.append(flat)
+            elif edition == 'locuspart':
+                dependencies.append(locuspart)
         return dependencies
 
     def run(self):
@@ -337,6 +374,8 @@ def requires(self):
                               allowed_file_formats=['sam', 'bam'])
         flat = ADAMFlattenTask(adam_command='transform',
                                allowed_file_formats=['sam', 'bam'])
+        locuspart = ADAMPartitionTask(adam_command='transform',
+                                      allowed_file_formats=['sam', 'bam'])
         dependencies = [basic]
         conf = ToastConfig().config
         editions = conf['editions'] if 'editions' in conf else []
@@ -345,5 +384,7 @@ def requires(self):
                 pass # included by default
             elif edition == 'flat':
                 dependencies.append(flat)
+            elif edition == 'locuspart':
+                dependencies.append(locuspart)
         return dependencies
 
diff --git a/genotypes-partition-strategy.json b/genotypes-partition-strategy.json
new file mode 100644
index 0000000..a85255d
--- /dev/null
+++ b/genotypes-partition-strategy.json
@@ -0,0 +1,4 @@
+[
+  { "type": "identity", "source": "variant.contig.contigName", "name": "chr" },
+  { "type": "range", "source": "variant.start", "name": "pos", "range": 10000 }
+]
\ No newline at end of file
diff --git a/test/registry/test-genotypes.json b/test/registry/test-genotypes.json
index 5d73066..6192cfc 100644
--- a/test/registry/test-genotypes.json
+++ b/test/registry/test-genotypes.json
@@ -2,7 +2,7 @@
     "name": "test-genotypes",
     "title": "Test 1000 Genomes Project VCF data",
     "dag": "VCF2ADAMTask",
-    "editions": ["basic", "flat"],
+    "editions": ["basic", "locuspart"],
     "sources": [
         {"format": "vcf", "compression": true, "url": "https://github.com/bigdatagenomics/eggo/raw/master/test/resources/chr22.small.vcf.gz"}
     ]

From ce6e2152d2f859d6e5c23fef82d5d4bc7015afde Mon Sep 17 00:00:00 2001
From: Tom White <tom@cloudera.com>
Date: Fri, 17 Apr 2015 09:54:15 +0100
Subject: [PATCH 2/6] [EGGO-30] Generate flattened, partitioned data.

---
 eggo/dag.py                            | 44 +++++++++++++++++++++++++-
 genotypes-flat-partition-strategy.json |  4 +++
 genotypes-partition-strategy.json      |  2 +-
 test/registry/test-genotypes.json      |  2 +-
 4 files changed, 49 insertions(+), 3 deletions(-)
 create mode 100644 genotypes-flat-partition-strategy.json

diff --git a/eggo/dag.py b/eggo/dag.py
index 753c9f4..baae10b 100644
--- a/eggo/dag.py
+++ b/eggo/dag.py
@@ -339,6 +339,40 @@ def output(self):
                                           edition=self.edition))
 
 
+class ADAMFlattenPartitionTask(Task):
+
+    adam_command = Parameter()
+    allowed_file_formats = Parameter()
+    source_edition = 'flat'
+    edition = 'flat_locuspart'
+
+    def requires(self):
+        return ADAMBasicTask(adam_command=self.adam_command,
+                             allowed_file_formats=self.allowed_file_formats)
+
+    def run(self):
+        adam_cmd = ('{adam_home}/bin/adam-submit --master {spark_master_url} partition'
+                    ' -partition_strategy_file {partition_strategy_file}'
+                    ' {source} {target}').format(
+            adam_home=os.environ['ADAM_HOME'],
+            spark_master_url=os.environ['SPARK_MASTER_URL'],
+            partition_strategy_file='genotypes-flat-partition-strategy.json',
+            source=target_s3n_url(ToastConfig().config['name'],
+                                  edition=self.source_edition),
+            target=target_s3n_url(ToastConfig().config['name'],
+                                  edition=self.edition))
+        p = Popen(adam_cmd, shell=True)
+        p.wait()
+
+        if p.returncode == 0:
+            create_SUCCESS_file(target_s3_url(ToastConfig().config['name'],
+                                              edition=self.edition))
+
+    def output(self):
+        return S3FlagTarget(target_s3_url(ToastConfig().config['name'],
+                                          edition=self.edition))
+
+
 class VCF2ADAMTask(Task):
 
     def requires(self):
@@ -347,7 +381,9 @@ def requires(self):
         flat = ADAMFlattenTask(adam_command='vcf2adam',
                                allowed_file_formats=['vcf'])
         locuspart = ADAMPartitionTask(adam_command='vcf2adam',
-                               allowed_file_formats=['vcf'])
+                                      allowed_file_formats=['vcf'])
+        flat_locuspart = ADAMFlattenPartitionTask(adam_command='vcf2adam',
+                                                  allowed_file_formats=['vcf'])
         dependencies = [basic]
         conf = ToastConfig().config
         editions = conf['editions'] if 'editions' in conf else []
@@ -358,6 +394,8 @@ def requires(self):
                 dependencies.append(flat)
             elif edition == 'locuspart':
                 dependencies.append(locuspart)
+            elif edition == 'flat_locuspart':
+                dependencies.append(flat_locuspart)
         return dependencies
 
     def run(self):
@@ -376,6 +414,8 @@ def requires(self):
                                allowed_file_formats=['sam', 'bam'])
         locuspart = ADAMPartitionTask(adam_command='transform',
                                       allowed_file_formats=['sam', 'bam'])
+        flat_locuspart = ADAMFlattenPartitionTask(adam_command='transform',
+                                                  allowed_file_formats=['sam', 'bam'])
         dependencies = [basic]
         conf = ToastConfig().config
         editions = conf['editions'] if 'editions' in conf else []
@@ -386,5 +426,7 @@ def requires(self):
                 dependencies.append(flat)
             elif edition == 'locuspart':
                 dependencies.append(locuspart)
+            elif edition == 'flat_locuspart':
+                dependencies.append(flat_locuspart)
         return dependencies
 
diff --git a/genotypes-flat-partition-strategy.json b/genotypes-flat-partition-strategy.json
new file mode 100644
index 0000000..1aa5950
--- /dev/null
+++ b/genotypes-flat-partition-strategy.json
@@ -0,0 +1,4 @@
+[
+  { "type": "identity", "source": "variant__contig__contigName", "name": "chr" },
+  { "type": "range", "source": "variant__start", "name": "pos", "range": 1000000 }
+]
\ No newline at end of file
diff --git a/genotypes-partition-strategy.json b/genotypes-partition-strategy.json
index a85255d..14be592 100644
--- a/genotypes-partition-strategy.json
+++ b/genotypes-partition-strategy.json
@@ -1,4 +1,4 @@
 [
   { "type": "identity", "source": "variant.contig.contigName", "name": "chr" },
-  { "type": "range", "source": "variant.start", "name": "pos", "range": 10000 }
+  { "type": "range", "source": "variant.start", "name": "pos", "range": 1000000 }
 ]
\ No newline at end of file
diff --git a/test/registry/test-genotypes.json b/test/registry/test-genotypes.json
index 6192cfc..41af00b 100644
--- a/test/registry/test-genotypes.json
+++ b/test/registry/test-genotypes.json
@@ -2,7 +2,7 @@
     "name": "test-genotypes",
     "title": "Test 1000 Genomes Project VCF data",
     "dag": "VCF2ADAMTask",
-    "editions": ["basic", "locuspart"],
+    "editions": ["basic", "flat", "locuspart", "flat_locuspart"],
     "sources": [
         {"format": "vcf", "compression": true, "url": "https://github.com/bigdatagenomics/eggo/raw/master/test/resources/chr22.small.vcf.gz"}
     ]

From 0366ec201ce1a0b9ab0493deff092b5e2af00bbe Mon Sep 17 00:00:00 2001
From: Tom White <tom@cloudera.com>
Date: Tue, 21 Apr 2015 17:56:18 +0100
Subject: [PATCH 3/6] [EGGO-30] Use Crunch-based MR partitioner, from
 https://github.com/tomwhite/adam-partitioning, while Spark version is being
 debugged.

---
 README.md             |  1 +
 eggo-ec2-variables.sh |  1 +
 eggo/dag.py           | 26 ++++++++++++++------------
 eggo/fabric_util.py   |  6 ++++++
 4 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 320ea44..bba9c87 100644
--- a/README.md
+++ b/README.md
@@ -137,6 +137,7 @@ export HADOOP_HOME=~/sw/hadoop-2.5.1/
 export SPARK_HOME=~/sw/spark-1.3.0-bin-hadoop2.4/
 export SPARK_MASTER_URL=local
 export STREAMING_JAR=$HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-2.5.1.jar
+export ADAM_PARTITIONING_JAR=~/workspace/adam-partitioning~/workspace/adam-partitioning/target/adam-partitioning-0.0.1-SNAPSHOT-job.jar
 export PATH=$PATH:$HADOOP_HOME/bin
 ```
 
diff --git a/eggo-ec2-variables.sh b/eggo-ec2-variables.sh
index 8e5621d..68c0737 100644
--- a/eggo-ec2-variables.sh
+++ b/eggo-ec2-variables.sh
@@ -22,4 +22,5 @@ source /root/spark-ec2/ec2-variables.sh
 export SPARK_MASTER="$MASTERS"
 export SPARK_MASTER_URL="spark://$SPARK_MASTER:7077"
 export STREAMING_JAR=$HADOOP_HOME/contrib/streaming/hadoop-streaming-1.0.4.jar
+export ADAM_PARTITIONING_JAR=/root/adam-partitioning/adam-partitioning-0.0.1-SNAPSHOT-job.jar
 export PATH=$PATH:$HADOOP_HOME/bin
diff --git a/eggo/dag.py b/eggo/dag.py
index baae10b..c74cf8f 100644
--- a/eggo/dag.py
+++ b/eggo/dag.py
@@ -317,12 +317,13 @@ def requires(self):
                              allowed_file_formats=self.allowed_file_formats)
 
     def run(self):
-        adam_cmd = ('{adam_home}/bin/adam-submit --master {spark_master_url} partition'
-                    ' -partition_strategy_file {partition_strategy_file}'
-                    ' {source} {target}').format(
-            adam_home=os.environ['ADAM_HOME'],
-            spark_master_url=os.environ['SPARK_MASTER_URL'],
-            partition_strategy_file='genotypes-partition-strategy.json',
+        adam_cmd = ('{hadoop_home}/bin/hadoop jar {adam_partitioning_jar}'
+                    ' CrunchPartitionTool -D mapreduce.job.reduces={parallelism}'
+                    ' {partition_strategy_file} {source} {target}').format(
+            hadoop_home=os.environ['HADOOP_HOME'],
+            adam_partitioning_jar=os.environ['ADAM_PARTITIONING_JAR'],
+            parallelism=1,
+            partition_strategy_file='genotypes-partition-strategy',
             source=target_s3n_url(ToastConfig().config['name'],
                                   edition=self.source_edition),
             target=target_s3n_url(ToastConfig().config['name'],
@@ -351,12 +352,13 @@ def requires(self):
                              allowed_file_formats=self.allowed_file_formats)
 
     def run(self):
-        adam_cmd = ('{adam_home}/bin/adam-submit --master {spark_master_url} partition'
-                    ' -partition_strategy_file {partition_strategy_file}'
-                    ' {source} {target}').format(
-            adam_home=os.environ['ADAM_HOME'],
-            spark_master_url=os.environ['SPARK_MASTER_URL'],
-            partition_strategy_file='genotypes-flat-partition-strategy.json',
+        adam_cmd = ('{hadoop_home}/bin/hadoop jar {adam_partitioning_jar}'
+                    ' CrunchPartitionTool -D mapreduce.job.reduces={parallelism}'
+                    ' {partition_strategy_file} {source} {target}').format(
+            hadoop_home=os.environ['HADOOP_HOME'],
+            adam_partitioning_jar=os.environ['ADAM_PARTITIONING_JAR'],
+            parallelism=1,
+            partition_strategy_file='flat-genotypes-partition-strategy',
             source=target_s3n_url(ToastConfig().config['name'],
                                   edition=self.source_edition),
             target=target_s3n_url(ToastConfig().config['name'],
diff --git a/eggo/fabric_util.py b/eggo/fabric_util.py
index 08f0f9f..f32c136 100644
--- a/eggo/fabric_util.py
+++ b/eggo/fabric_util.py
@@ -99,6 +99,12 @@ def _install_adam():
             run('mvn clean package -DskipTests')
 
 
+def _install_adam_partitioning():
+    run('mkdir -p /root/adam-partitioning')
+    with cd('/root/adam-partitioning'):
+        run('wget https://github.com/tomwhite/adam-partitioning/raw/master/lib/adam-partitioning-0.0.1-SNAPSHOT-job.jar')
+
+
 def _install_eggo(fork='bigdatagenomics', branch='master'):
     # check out eggo
     with cd('~'):

From d5f2d9897a0de6f6eb50a1616a59f49c564d1cef Mon Sep 17 00:00:00 2001
From: Tom White <tom@cloudera.com>
Date: Wed, 22 Apr 2015 11:33:20 +0100
Subject: [PATCH 4/6] [EGGO-30] Add support for partitioning BAM/SAM.

---
 eggo/dag.py                                  | 18 ++++++++++++------
 genotypes-flat-partition-strategy.json       |  4 ----
 genotypes-partition-strategy.json            |  4 ----
 test/registry/test-1kg-genotypes-subset.json |  9 +++++++++
 test/registry/test-alignments.json           |  4 ++--
 5 files changed, 23 insertions(+), 16 deletions(-)
 delete mode 100644 genotypes-flat-partition-strategy.json
 delete mode 100644 genotypes-partition-strategy.json
 create mode 100644 test/registry/test-1kg-genotypes-subset.json

diff --git a/eggo/dag.py b/eggo/dag.py
index c74cf8f..300fe15 100644
--- a/eggo/dag.py
+++ b/eggo/dag.py
@@ -309,6 +309,7 @@ class ADAMPartitionTask(Task):
 
     adam_command = Parameter()
     allowed_file_formats = Parameter()
+    partition_strategy_file = Parameter()
     source_edition = 'basic'
     edition = 'locuspart'
 
@@ -323,7 +324,7 @@ def run(self):
             hadoop_home=os.environ['HADOOP_HOME'],
             adam_partitioning_jar=os.environ['ADAM_PARTITIONING_JAR'],
             parallelism=1,
-            partition_strategy_file='genotypes-partition-strategy',
+            partition_strategy_file=self.partition_strategy_file,
             source=target_s3n_url(ToastConfig().config['name'],
                                   edition=self.source_edition),
             target=target_s3n_url(ToastConfig().config['name'],
@@ -344,6 +345,7 @@ class ADAMFlattenPartitionTask(Task):
 
     adam_command = Parameter()
     allowed_file_formats = Parameter()
+    partition_strategy_file = Parameter()
     source_edition = 'flat'
     edition = 'flat_locuspart'
 
@@ -358,7 +360,7 @@ def run(self):
             hadoop_home=os.environ['HADOOP_HOME'],
             adam_partitioning_jar=os.environ['ADAM_PARTITIONING_JAR'],
             parallelism=1,
-            partition_strategy_file='flat-genotypes-partition-strategy',
+            partition_strategy_file=self.partition_strategy_file,
             source=target_s3n_url(ToastConfig().config['name'],
                                   edition=self.source_edition),
             target=target_s3n_url(ToastConfig().config['name'],
@@ -383,9 +385,11 @@ def requires(self):
         flat = ADAMFlattenTask(adam_command='vcf2adam',
                                allowed_file_formats=['vcf'])
         locuspart = ADAMPartitionTask(adam_command='vcf2adam',
-                                      allowed_file_formats=['vcf'])
+                                      allowed_file_formats=['vcf'],
+                                      partition_strategy_file='genotypes-partition-strategy')
         flat_locuspart = ADAMFlattenPartitionTask(adam_command='vcf2adam',
-                                                  allowed_file_formats=['vcf'])
+                                                  allowed_file_formats=['vcf'],
+                                                  partition_strategy_file='flat-genotypes-partition-strategy')
         dependencies = [basic]
         conf = ToastConfig().config
         editions = conf['editions'] if 'editions' in conf else []
@@ -415,9 +419,11 @@ def requires(self):
         flat = ADAMFlattenTask(adam_command='transform',
                                allowed_file_formats=['sam', 'bam'])
         locuspart = ADAMPartitionTask(adam_command='transform',
-                                      allowed_file_formats=['sam', 'bam'])
+                                      allowed_file_formats=['sam', 'bam'],
+                                      partition_strategy_file='alignments-partition-strategy')
         flat_locuspart = ADAMFlattenPartitionTask(adam_command='transform',
-                                                  allowed_file_formats=['sam', 'bam'])
+                                                  allowed_file_formats=['sam', 'bam'],
+                                                  partition_strategy_file='flat-alignments-partition-strategy')
         dependencies = [basic]
         conf = ToastConfig().config
         editions = conf['editions'] if 'editions' in conf else []
diff --git a/genotypes-flat-partition-strategy.json b/genotypes-flat-partition-strategy.json
deleted file mode 100644
index 1aa5950..0000000
--- a/genotypes-flat-partition-strategy.json
+++ /dev/null
@@ -1,4 +0,0 @@
-[
-  { "type": "identity", "source": "variant__contig__contigName", "name": "chr" },
-  { "type": "range", "source": "variant__start", "name": "pos", "range": 1000000 }
-]
\ No newline at end of file
diff --git a/genotypes-partition-strategy.json b/genotypes-partition-strategy.json
deleted file mode 100644
index 14be592..0000000
--- a/genotypes-partition-strategy.json
+++ /dev/null
@@ -1,4 +0,0 @@
-[
-  { "type": "identity", "source": "variant.contig.contigName", "name": "chr" },
-  { "type": "range", "source": "variant.start", "name": "pos", "range": 1000000 }
-]
\ No newline at end of file
diff --git a/test/registry/test-1kg-genotypes-subset.json b/test/registry/test-1kg-genotypes-subset.json
new file mode 100644
index 0000000..5961989
--- /dev/null
+++ b/test/registry/test-1kg-genotypes-subset.json
@@ -0,0 +1,9 @@
+{
+    "name": "test-1kg-genotypes-subset",
+    "title": "Test 1000 Genomes Project VCF data",
+    "dag": "VCF2ADAMTask",
+    "editions": ["basic", "flat", "locuspart", "flat_locuspart"],
+    "sources": [
+      {"format": "vcf", "compression": true, "url": "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20110521/ALL.chr22.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz"}
+    ]
+}
diff --git a/test/registry/test-alignments.json b/test/registry/test-alignments.json
index cdf85fb..7084702 100644
--- a/test/registry/test-alignments.json
+++ b/test/registry/test-alignments.json
@@ -2,8 +2,8 @@
     "name": "test-alignments",
     "title": "Test SAM data",
     "dag": "BAM2ADAMTask",
-    "editions": ["basic", "flat"],
+    "editions": ["basic", "flat", "locuspart", "flat_locuspart"],
     "sources": [
-        {"format": "sam", "compression": false, "url": "https://raw.githubusercontent.com/bigdatagenomics/adam/master/adam-core/src/test/resources/reads12.sam"}
+        {"format": "sam", "compression": false, "url": "https://github.com/bigdatagenomics/eggo/raw/master/test/resources/small.sam"}
     ]
 }

From 1cad68c36f17006b2da5cbb838b867f655291b82 Mon Sep 17 00:00:00 2001
From: Tom White <tom@cloudera.com>
Date: Wed, 22 Apr 2015 12:26:31 +0100
Subject: [PATCH 5/6] Add hint for parallelism in registry files.

---
 eggo/dag.py                                  | 76 ++++++++------------
 test/registry/test-1kg-genotypes-subset.json |  1 +
 test/registry/test-alignments.json           |  1 +
 test/registry/test-genotypes.json            |  1 +
 4 files changed, 31 insertions(+), 48 deletions(-)

diff --git a/eggo/dag.py b/eggo/dag.py
index 300fe15..a64aa98 100644
--- a/eggo/dag.py
+++ b/eggo/dag.py
@@ -309,45 +309,10 @@ class ADAMPartitionTask(Task):
 
     adam_command = Parameter()
     allowed_file_formats = Parameter()
+    source_edition = Parameter()
+    edition = Parameter()
     partition_strategy_file = Parameter()
-    source_edition = 'basic'
-    edition = 'locuspart'
-
-    def requires(self):
-        return ADAMBasicTask(adam_command=self.adam_command,
-                             allowed_file_formats=self.allowed_file_formats)
-
-    def run(self):
-        adam_cmd = ('{hadoop_home}/bin/hadoop jar {adam_partitioning_jar}'
-                    ' CrunchPartitionTool -D mapreduce.job.reduces={parallelism}'
-                    ' {partition_strategy_file} {source} {target}').format(
-            hadoop_home=os.environ['HADOOP_HOME'],
-            adam_partitioning_jar=os.environ['ADAM_PARTITIONING_JAR'],
-            parallelism=1,
-            partition_strategy_file=self.partition_strategy_file,
-            source=target_s3n_url(ToastConfig().config['name'],
-                                  edition=self.source_edition),
-            target=target_s3n_url(ToastConfig().config['name'],
-                                  edition=self.edition))
-        p = Popen(adam_cmd, shell=True)
-        p.wait()
-
-        if p.returncode == 0:
-            create_SUCCESS_file(target_s3_url(ToastConfig().config['name'],
-                                              edition=self.edition))
-
-    def output(self):
-        return S3FlagTarget(target_s3_url(ToastConfig().config['name'],
-                                          edition=self.edition))
-
-
-class ADAMFlattenPartitionTask(Task):
-
-    adam_command = Parameter()
-    allowed_file_formats = Parameter()
-    partition_strategy_file = Parameter()
-    source_edition = 'flat'
-    edition = 'flat_locuspart'
+    parallelism = Parameter()
 
     def requires(self):
         return ADAMBasicTask(adam_command=self.adam_command,
@@ -359,7 +324,7 @@ def run(self):
                     ' {partition_strategy_file} {source} {target}').format(
             hadoop_home=os.environ['HADOOP_HOME'],
             adam_partitioning_jar=os.environ['ADAM_PARTITIONING_JAR'],
-            parallelism=1,
+            parallelism=self.parallelism,
             partition_strategy_file=self.partition_strategy_file,
             source=target_s3n_url(ToastConfig().config['name'],
                                   edition=self.source_edition),
@@ -376,20 +341,27 @@ def output(self):
         return S3FlagTarget(target_s3_url(ToastConfig().config['name'],
                                           edition=self.edition))
 
-
 class VCF2ADAMTask(Task):
 
     def requires(self):
+        conf = ToastConfig().config
+        parallelism = conf['numPartitionsHint'] if 'numPartitionsHint' in conf else 1
         basic = ADAMBasicTask(adam_command='vcf2adam',
                               allowed_file_formats=['vcf'])
         flat = ADAMFlattenTask(adam_command='vcf2adam',
                                allowed_file_formats=['vcf'])
         locuspart = ADAMPartitionTask(adam_command='vcf2adam',
                                       allowed_file_formats=['vcf'],
-                                      partition_strategy_file='genotypes-partition-strategy')
-        flat_locuspart = ADAMFlattenPartitionTask(adam_command='vcf2adam',
-                                                  allowed_file_formats=['vcf'],
-                                                  partition_strategy_file='flat-genotypes-partition-strategy')
+                                      source_edition='basic',
+                                      edition='locuspart',
+                                      partition_strategy_file='genotypes-partition-strategy',
+                                      parallelism=parallelism)
+        flat_locuspart = ADAMPartitionTask(adam_command='vcf2adam',
+                                           allowed_file_formats=['vcf'],
+                                           source_edition='flat',
+                                           edition='flat_locuspart',
+                                           partition_strategy_file='flat-genotypes-partition-strategy',
+                                           parallelism=parallelism)
         dependencies = [basic]
         conf = ToastConfig().config
         editions = conf['editions'] if 'editions' in conf else []
@@ -414,16 +386,24 @@ def output(self):
 class BAM2ADAMTask(Task):
 
     def requires(self):
+        conf = ToastConfig().config
+        parallelism = conf['numPartitionsHint'] if 'numPartitionsHint' in conf else 1
         basic = ADAMBasicTask(adam_command='transform',
                               allowed_file_formats=['sam', 'bam'])
         flat = ADAMFlattenTask(adam_command='transform',
                                allowed_file_formats=['sam', 'bam'])
         locuspart = ADAMPartitionTask(adam_command='transform',
                                       allowed_file_formats=['sam', 'bam'],
-                                      partition_strategy_file='alignments-partition-strategy')
-        flat_locuspart = ADAMFlattenPartitionTask(adam_command='transform',
-                                                  allowed_file_formats=['sam', 'bam'],
-                                                  partition_strategy_file='flat-alignments-partition-strategy')
+                                      source_edition='basic',
+                                      edition='locuspart',
+                                      partition_strategy_file='alignments-partition-strategy',
+                                      parallelism=parallelism)
+        flat_locuspart = ADAMPartitionTask(adam_command='transform',
+                                           allowed_file_formats=['sam', 'bam'],
+                                           source_edition='flat',
+                                           edition='flat_locuspart',
+                                           partition_strategy_file='flat-alignments-partition-strategy',
+                                           parallelism=parallelism)
         dependencies = [basic]
         conf = ToastConfig().config
         editions = conf['editions'] if 'editions' in conf else []
diff --git a/test/registry/test-1kg-genotypes-subset.json b/test/registry/test-1kg-genotypes-subset.json
index 5961989..d6276bf 100644
--- a/test/registry/test-1kg-genotypes-subset.json
+++ b/test/registry/test-1kg-genotypes-subset.json
@@ -3,6 +3,7 @@
     "title": "Test 1000 Genomes Project VCF data",
     "dag": "VCF2ADAMTask",
     "editions": ["basic", "flat", "locuspart", "flat_locuspart"],
+    "numPartitionsHint": 36,
     "sources": [
       {"format": "vcf", "compression": true, "url": "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20110521/ALL.chr22.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz"}
     ]
diff --git a/test/registry/test-alignments.json b/test/registry/test-alignments.json
index 7084702..5e15d64 100644
--- a/test/registry/test-alignments.json
+++ b/test/registry/test-alignments.json
@@ -3,6 +3,7 @@
     "title": "Test SAM data",
     "dag": "BAM2ADAMTask",
     "editions": ["basic", "flat", "locuspart", "flat_locuspart"],
+    "numPartitionsHint": 1,
     "sources": [
         {"format": "sam", "compression": false, "url": "https://github.com/bigdatagenomics/eggo/raw/master/test/resources/small.sam"}
     ]
diff --git a/test/registry/test-genotypes.json b/test/registry/test-genotypes.json
index 41af00b..3e28717 100644
--- a/test/registry/test-genotypes.json
+++ b/test/registry/test-genotypes.json
@@ -3,6 +3,7 @@
     "title": "Test 1000 Genomes Project VCF data",
     "dag": "VCF2ADAMTask",
     "editions": ["basic", "flat", "locuspart", "flat_locuspart"],
+    "numPartitionsHint": 1,
     "sources": [
         {"format": "vcf", "compression": true, "url": "https://github.com/bigdatagenomics/eggo/raw/master/test/resources/chr22.small.vcf.gz"}
     ]

From 1e236173d5b1c8507659986cf36a63c769e2e8d8 Mon Sep 17 00:00:00 2001
From: Tom White <tom@cloudera.com>
Date: Wed, 22 Apr 2015 14:17:11 +0100
Subject: [PATCH 6/6] Fix typo in ADAM_PARTITIONING_JAR path in README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index bba9c87..e2f4a55 100644
--- a/README.md
+++ b/README.md
@@ -137,7 +137,7 @@ export HADOOP_HOME=~/sw/hadoop-2.5.1/
 export SPARK_HOME=~/sw/spark-1.3.0-bin-hadoop2.4/
 export SPARK_MASTER_URL=local
 export STREAMING_JAR=$HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-2.5.1.jar
-export ADAM_PARTITIONING_JAR=~/workspace/adam-partitioning~/workspace/adam-partitioning/target/adam-partitioning-0.0.1-SNAPSHOT-job.jar
+export ADAM_PARTITIONING_JAR=~/workspace/adam-partitioning/target/adam-partitioning-0.0.1-SNAPSHOT-job.jar
 export PATH=$PATH:$HADOOP_HOME/bin
 ```