diff --git a/tests/test_api.py b/tests/test_api.py
index 8754970..951e01a 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -125,3 +125,21 @@ def test_ignores_subdirs(tmpdir):
         TESTDATA_DIR/"word_count/correct/output",
         tmpdir/"output",
     )
+
+
+def test_input_path_spaces(tmpdir):
+    """Run a simple MapReduce job whose input directory, map executable
+    and reduce executable all have spaces in their paths.
+    """
+    with tmpdir.as_cwd():
+        madoop.mapreduce(
+            input_path=TESTDATA_DIR/"word_count SPACE/input SPACE",
+            output_dir="output",
+            map_exe=TESTDATA_DIR/"word_count SPACE/map SPACE.py",
+            reduce_exe=TESTDATA_DIR/"word_count SPACE/reduce SPACE.py",
+            num_reducers=4
+        )
+    utils.assert_dirs_eq(
+        TESTDATA_DIR/"word_count/correct/output",
+        tmpdir/"output",
+    )
diff --git a/tests/testdata/word_count SPACE/correct/output/part-00000 b/tests/testdata/word_count SPACE/correct/output/part-00000
new file mode 100644
index 0000000..70db879
--- /dev/null
+++ b/tests/testdata/word_count SPACE/correct/output/part-00000
@@ -0,0 +1 @@
+Goodbye 1
diff --git a/tests/testdata/word_count SPACE/correct/output/part-00001 b/tests/testdata/word_count SPACE/correct/output/part-00001
new file mode 100644
index 0000000..ecc21b4
--- /dev/null
+++ b/tests/testdata/word_count SPACE/correct/output/part-00001
@@ -0,0 +1,3 @@
+Bye 1
+Hadoop 2
+World 2
diff --git a/tests/testdata/word_count SPACE/correct/output/part-00002 b/tests/testdata/word_count SPACE/correct/output/part-00002
new file mode 100644
index 0000000..30f4be7
--- /dev/null
+++ b/tests/testdata/word_count SPACE/correct/output/part-00002
@@ -0,0 +1 @@
+Hello 2
diff --git a/tests/testdata/word_count SPACE/input SPACE/input 01.txt b/tests/testdata/word_count SPACE/input SPACE/input 01.txt
new file mode 100644
index 0000000..c614f1f
--- /dev/null
+++ b/tests/testdata/word_count SPACE/input SPACE/input 01.txt
@@ -0,0 +1,2 @@
+Hello World
+Bye World
diff --git a/tests/testdata/word_count SPACE/input SPACE/input 02.txt 
b/tests/testdata/word_count SPACE/input SPACE/input 02.txt
new file mode 100644
index 0000000..acd80a3
--- /dev/null
+++ b/tests/testdata/word_count SPACE/input SPACE/input 02.txt
@@ -0,0 +1,2 @@
+Hello Hadoop
+Goodbye Hadoop
diff --git a/tests/testdata/word_count SPACE/map SPACE.py b/tests/testdata/word_count SPACE/map SPACE.py
new file mode 100755
index 0000000..4d3caf7
--- /dev/null
+++ b/tests/testdata/word_count SPACE/map SPACE.py
@@ -0,0 +1,8 @@
+#!/usr/bin/env python3
+"""Word count mapper: emit each word on stdin with a count of 1."""
+import sys
+
+
+for text in sys.stdin:
+    for token in text.split():
+        print(f"{token}\t1")
diff --git a/tests/testdata/word_count SPACE/reduce SPACE.py b/tests/testdata/word_count SPACE/reduce SPACE.py
new file mode 100755
index 0000000..b244fe0
--- /dev/null
+++ b/tests/testdata/word_count SPACE/reduce SPACE.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+"""Word count reducer: sum the counts emitted for each word."""
+import sys
+import itertools
+
+
+def main():
+    """Split sorted stdin lines into same-key groups and reduce each."""
+    for word, lines in itertools.groupby(sys.stdin, keyfunc):
+        reduce_one_group(word, lines)
+
+
+def keyfunc(line):
+    """Return the text before the first TAB in a key-value line."""
+    key, _, _ = line.partition("\t")
+    return key
+
+
+def reduce_one_group(key, group):
+    """Print the key followed by the sum of the counts in its group."""
+    total = sum(int(line.partition("\t")[2]) for line in group)
+    print(f"{key} {total}")
+
+
+if __name__ == "__main__":
+    main()