From f595c1a3a86d018cc9d0a99d29e8c91a742858d9 Mon Sep 17 00:00:00 2001
From: Andrew DeOrio <awdeorio@umich.edu>
Date: Mon, 29 Jan 2024 08:56:56 -0500
Subject: [PATCH] Add test

---
 tests/test_api.py                             | 18 ++++++++++++
 .../correct/output/part-00000                 |  1 +
 .../correct/output/part-00001                 |  3 ++
 .../correct/output/part-00002                 |  1 +
 .../word_count SPACE/input SPACE/input 01.txt |  2 ++
 .../word_count SPACE/input SPACE/input 02.txt |  2 ++
 tests/testdata/word_count SPACE/map SPACE.py  |  9 ++++++
 .../testdata/word_count SPACE/reduce SPACE.py | 28 +++++++++++++++++++
 8 files changed, 64 insertions(+)
 create mode 100644 tests/testdata/word_count SPACE/correct/output/part-00000
 create mode 100644 tests/testdata/word_count SPACE/correct/output/part-00001
 create mode 100644 tests/testdata/word_count SPACE/correct/output/part-00002
 create mode 100644 tests/testdata/word_count SPACE/input SPACE/input 01.txt
 create mode 100644 tests/testdata/word_count SPACE/input SPACE/input 02.txt
 create mode 100755 tests/testdata/word_count SPACE/map SPACE.py
 create mode 100755 tests/testdata/word_count SPACE/reduce SPACE.py

diff --git a/tests/test_api.py b/tests/test_api.py
index 8754970..951e01a 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -125,3 +125,21 @@ def test_ignores_subdirs(tmpdir):
         TESTDATA_DIR/"word_count/correct/output",
         tmpdir/"output",
     )
+
+
+def test_input_path_spaces(tmpdir):
+    """Run a simple MapReduce job whose input directory, input files, and
+    map/reduce executables all have spaces in their paths.
+    """
+    with tmpdir.as_cwd():
+        madoop.mapreduce(
+            input_path=TESTDATA_DIR/"word_count SPACE/input SPACE",
+            output_dir="output",
+            map_exe=TESTDATA_DIR/"word_count SPACE/map SPACE.py",
+            reduce_exe=TESTDATA_DIR/"word_count SPACE/reduce SPACE.py",
+            num_reducers=4
+        )
+    utils.assert_dirs_eq(
+        TESTDATA_DIR/"word_count/correct/output",
+        tmpdir/"output",
+    )
diff --git a/tests/testdata/word_count SPACE/correct/output/part-00000 b/tests/testdata/word_count SPACE/correct/output/part-00000
new file mode 100644
index 0000000..70db879
--- /dev/null
+++ b/tests/testdata/word_count SPACE/correct/output/part-00000	
@@ -0,0 +1 @@
+Goodbye 1
diff --git a/tests/testdata/word_count SPACE/correct/output/part-00001 b/tests/testdata/word_count SPACE/correct/output/part-00001
new file mode 100644
index 0000000..ecc21b4
--- /dev/null
+++ b/tests/testdata/word_count SPACE/correct/output/part-00001	
@@ -0,0 +1,3 @@
+Bye 1
+Hadoop 2
+World 2
diff --git a/tests/testdata/word_count SPACE/correct/output/part-00002 b/tests/testdata/word_count SPACE/correct/output/part-00002
new file mode 100644
index 0000000..30f4be7
--- /dev/null
+++ b/tests/testdata/word_count SPACE/correct/output/part-00002	
@@ -0,0 +1 @@
+Hello 2
diff --git a/tests/testdata/word_count SPACE/input SPACE/input 01.txt b/tests/testdata/word_count SPACE/input SPACE/input 01.txt
new file mode 100644
index 0000000..c614f1f
--- /dev/null
+++ b/tests/testdata/word_count SPACE/input SPACE/input 01.txt	
@@ -0,0 +1,2 @@
+Hello World
+Bye World
diff --git a/tests/testdata/word_count SPACE/input SPACE/input 02.txt b/tests/testdata/word_count SPACE/input SPACE/input 02.txt
new file mode 100644
index 0000000..acd80a3
--- /dev/null
+++ b/tests/testdata/word_count SPACE/input SPACE/input 02.txt	
@@ -0,0 +1,2 @@
+Hello Hadoop
+Goodbye Hadoop
diff --git a/tests/testdata/word_count SPACE/map SPACE.py b/tests/testdata/word_count SPACE/map SPACE.py
new file mode 100755
index 0000000..4d3caf7
--- /dev/null
+++ b/tests/testdata/word_count SPACE/map SPACE.py	
@@ -0,0 +1,9 @@
+#!/usr/bin/env python3
+"""Word count mapper."""
+import sys
+
+
+for line in sys.stdin:
+    words = line.split()
+    for word in words:
+        print(f"{word}\t1")
diff --git a/tests/testdata/word_count SPACE/reduce SPACE.py b/tests/testdata/word_count SPACE/reduce SPACE.py
new file mode 100755
index 0000000..b244fe0
--- /dev/null
+++ b/tests/testdata/word_count SPACE/reduce SPACE.py	
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+"""Word count reducer."""
+import sys
+import itertools
+
+
+def main():
+    """Divide sorted lines into groups that share a key."""
+    for key, group in itertools.groupby(sys.stdin, keyfunc):
+        reduce_one_group(key, group)
+
+
+def keyfunc(line):
+    """Return the key from a TAB-delimited key-value pair."""
+    return line.partition("\t")[0]
+
+
+def reduce_one_group(key, group):
+    """Reduce one group."""
+    word_count = 0
+    for line in group:
+        count = line.partition("\t")[2]
+        word_count += int(count)
+    print(f"{key} {word_count}")
+
+
+if __name__ == "__main__":
+    main()