From b122ec67821cab77b664d6545b5bcdfe1cab2672 Mon Sep 17 00:00:00 2001
From: Tom Rutten <t.rutten15@gmail.com>
Date: Thu, 16 Sep 2021 21:39:41 -0400
Subject: [PATCH] IMDB Reviews dataset (#12)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* download imdb reviews dataset

* remove shuffle, add arg for example types

* add basic docs

* filter & reduce in comprehension

Co-authored-by: José Valim <jose.valim@dashbit.co>

* remove parenthesis

* add download tests

* change `download*` return type, add specs
& adapt tests to new return type

* spec `download` w/o opts, opts spec more explicit

* compile regex once

Co-authored-by: José Valim <jose.valim@dashbit.co>

* Convert to string once and use binary matching instead of regex

Co-authored-by: José Valim <jose.valim@dashbit.co>

* update tests

* -1 -> 0 for negative examples

* simplify api

* bump release

* alphabetize

Co-authored-by: José Valim <jose.valim@dashbit.co>
---
 README.md                   |  5 +--
 lib/scidata/imdb_reviews.ex | 68 +++++++++++++++++++++++++++++++++++++
 mix.exs                     |  2 +-
 test/imdb_reviews_test.exs  | 58 +++++++++++++++++++++++++++++++
 4 files changed, 130 insertions(+), 3 deletions(-)
 create mode 100644 lib/scidata/imdb_reviews.ex
 create mode 100644 test/imdb_reviews_test.exs

diff --git a/README.md b/README.md
index 7125f9d..46d74fb 100644
--- a/README.md
+++ b/README.md
@@ -4,10 +4,11 @@
 
 Scidata currently supports the following training and test datasets:
 
-- MNIST
-- FashionMNIST
 - CIFAR10
 - CIFAR100
+- FashionMNIST
+- IMDb Reviews
+- MNIST
 
 Download or fetch datasets locally:
 
diff --git a/lib/scidata/imdb_reviews.ex b/lib/scidata/imdb_reviews.ex
new file mode 100644
index 0000000..c10f9b8
--- /dev/null
+++ b/lib/scidata/imdb_reviews.ex
@@ -0,0 +1,68 @@
+defmodule Scidata.IMDBReviews do
+  @moduledoc """
+  Module for downloading the [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/).
+  """
+
+  @base_url "http://ai.stanford.edu/~amaas/data/sentiment/"
+  @dataset_file "aclImdb_v1.tar.gz"
+
+  alias Scidata.Utils
+
+  @type train_sentiment :: :pos | :neg | :unsup
+  @type test_sentiment :: :pos | :neg
+  @type opts :: [
+          example_types: [train_sentiment],
+          transform_inputs: ([binary, ...] -> any),
+          transform_labels: ([1 | 0 | nil, ...] -> any)
+        ]
+
+  @doc """
+  Downloads the IMDB reviews training dataset or fetches it locally.
+
+  `:example_types` selects examples by label: `:pos` (label `1`), `:neg` (label
+  `0`) and `:unsup` (unlabeled, label `nil`); it defaults to `[:pos, :neg]`.
+  `:transform_inputs` and `:transform_labels` are applied to the lists of reviews
+  and labels that form the `:review` and `:sentiment` values of the returned map.
+  """
+  @spec download(opts) :: %{review: any, sentiment: any}
+  def download(opts \\ []), do: download_dataset(:train, opts)
+
+  @doc """
+  Downloads the IMDB reviews test dataset or fetches it locally.
+
+  Takes the same options as `download/1`, but `:example_types` should exclude
+  `:unsup` because all unlabeled examples are in the training set.
+  """
+  @spec download_test(opts) :: %{review: any, sentiment: any}
+  def download_test(opts \\ []), do: download_dataset(:test, opts)
+
+  defp download_dataset(dataset_type, opts) do
+    example_types = opts[:example_types] || [:pos, :neg]
+    transform_inputs = opts[:transform_inputs] || (& &1)
+    transform_labels = opts[:transform_labels] || (& &1)
+
+    files = Utils.get!(@base_url <> @dataset_file).body
+    regex = ~r"#{dataset_type}/(#{Enum.join(example_types, "|")})/"
+
+    # Stringify each entry name once; it drives both the filter and the label.
+    {inputs, labels} =
+      for {fname, contents} <- files,
+          path = List.to_string(fname),
+          path =~ regex,
+          reduce: {[], []} do
+        {inputs, labels} ->
+          {[contents | inputs], [get_label(path) | labels]}
+      end
+
+    %{review: transform_inputs.(inputs), sentiment: transform_labels.(labels)}
+  end
+
+  # Maps the sentiment directory found in an entry path to its label.
+  defp get_label(path) do
+    cond do
+      path =~ "/pos/" -> 1
+      path =~ "/neg/" -> 0
+      path =~ "/unsup/" -> nil
+    end
+  end
+end
diff --git a/mix.exs b/mix.exs
index 592ba8c..b3fbbc7 100644
--- a/mix.exs
+++ b/mix.exs
@@ -1,7 +1,7 @@
 defmodule Scidata.MixProject do
   use Mix.Project
 
-  @version "0.1.1"
+  @version "0.1.2"
   @repo_url "https://github.com/elixir-nx/scidata"
 
   def project do
diff --git a/test/imdb_reviews_test.exs b/test/imdb_reviews_test.exs
new file mode 100644
index 0000000..e38a620
--- /dev/null
+++ b/test/imdb_reviews_test.exs
@@ -0,0 +1,58 @@
+defmodule IMDBReviewsTest do
+  use ExUnit.Case
+
+  alias Scidata.IMDBReviews
+
+  # Each test fetches the full dataset, so allow a generous timeout.
+  @moduletag timeout: 120_000
+
+  describe "download" do
+    test "retrieves training set" do
+      # Default types, explicit [:pos, :neg], and all three types including :unsup.
+      cases = [
+        {[], 25000},
+        {[example_types: [:pos, :neg]], 25000},
+        {[example_types: [:pos, :neg, :unsup]], 75000}
+      ]
+
+      for {opts, expected_size} <- cases do
+        %{review: reviews, sentiment: labels} = IMDBReviews.download(opts)
+
+        assert length(reviews) == expected_size
+        assert length(labels) == expected_size
+      end
+    end
+
+    test "retrieves test set" do
+      %{review: reviews, sentiment: labels} =
+        IMDBReviews.download_test(example_types: [:pos, :neg])
+
+      assert length(reviews) == 25000
+      assert length(labels) == 25000
+      assert [0, 0, 0, 0, 0] = Enum.take(labels, -5)
+    end
+
+    test "utilizes transform opts" do
+      # Keep only the first 21 graphemes of every review.
+      truncate = fn reviews -> Enum.map(reviews, &String.slice(&1, 0..20)) end
+
+      %{review: reviews, sentiment: labels} =
+        IMDBReviews.download(example_types: [:pos], transform_inputs: truncate)
+
+      assert Enum.take(reviews, 10) == [
+               "The story centers aro",
+               "'The Adventures Of Ba",
+               "This film and it's se",
+               "I love this movie lik",
+               "A hit at the time but",
+               "Very smart, sometimes",
+               "With the mixed review",
+               "This movie really kic",
+               "I'd always wanted Dav",
+               "Like I said its a hid"
+             ]
+
+      assert Enum.take(labels, 10) == List.duplicate(1, 10)
+    end
+  end
+end