IMDB Reviews dataset (#12)

* download imdb reviews dataset * remove shuffle, add arg for example types * add basic docs * filter & reduce in comprehension Co-authored-by: José Valim <[email protected]> * remove parenthesis * add download tests * change `download*` return type, add specs & adapt tests to new return type * spec `download` w/o opts, opts spec more explicit * commit regex once Co-authored-by: José Valim <[email protected]> * Convert to string once and use binary matching instead of regex Co-authored-by: José Valim <[email protected]> * update tests * -1 -> 0 for negative examples * simplify api * bump release * alphabetize Co-authored-by: José Valim <[email protected]>
elixir-nx · Sep 17, 2021 · b122ec6 · b122ec6
1 parent bf5e1b2
commit b122ec6
Show file tree

Hide file tree

Showing 4 changed files with 130 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -4,10 +4,11 @@
 
 Scidata currently supports the following training and test datasets:
 
-- MNIST
-- FashionMNIST
 - CIFAR10
 - CIFAR100
+- FashionMNIST
+- IMDb Reviews
+- MNIST
 
 Download or fetch datasets locally:
 

diff --git a/lib/scidata/imdb_reviews.ex b/lib/scidata/imdb_reviews.ex
@@ -0,0 +1,68 @@
+defmodule Scidata.IMDBReviews do
+  @moduledoc """
+  Module for downloading the [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/).
+  """
+
+  @base_url "http://ai.stanford.edu/~amaas/data/sentiment/"
+  @dataset_file "aclImdb_v1.tar.gz"
+
+  alias Scidata.Utils
+
+  @type train_sentiment :: :pos | :neg | :unsup
+  @type test_sentiment :: :pos | :neg
+  @type label :: 0 | 1 | nil
+  @type transform_opt ::
+          {:transform_inputs, ([binary] -> any)} | {:transform_labels, ([label] -> any)}
+  @type opts :: [transform_opt]
+  @type dataset :: %{review: any, sentiment: any}
+
+  @doc """
+  Downloads the IMDB reviews training dataset or fetches it locally.
+
+  Returns `%{review: reviews, sentiment: labels}`. Before any transform, each label is
+  `1` (positive), `0` (negative) or `nil` (unlabeled).
+
+  ## Options
+
+    * `:example_types` - which examples to return: `:pos` for positive, `:neg`
+      for negative and `:unsup` for unlabeled examples. Defaults to `[:pos, :neg]`.
+    * `:transform_inputs` - function applied to the list of reviews before returning.
+    * `:transform_labels` - function applied to the list of labels before returning.
+  """
+  @spec download([{:example_types, [train_sentiment]} | transform_opt]) :: dataset
+  def download(opts \\ []), do: download_dataset(:train, opts)
+
+  @doc """
+  Downloads the IMDB reviews test dataset or fetches it locally.
+
+  Takes the same options as `download/1`, except that `:example_types` excludes `:unsup`
+  because all unlabeled examples are in the training set.
+  """
+  @spec download_test([{:example_types, [test_sentiment]} | transform_opt]) :: dataset
+  def download_test(opts \\ []), do: download_dataset(:test, opts)
+
+  # Fetches the archive and collects the `dataset_type` files of the requested types.
+  defp download_dataset(dataset_type, opts) do
+    example_types = opts[:example_types] || [:pos, :neg]
+    transform_inputs = opts[:transform_inputs] || (& &1)
+    transform_labels = opts[:transform_labels] || (& &1)
+
+    files = Utils.get!(@base_url <> @dataset_file).body
+    regex = ~r"#{dataset_type}/(#{Enum.join(example_types, "|")})/"
+
+    # Paths not matching `regex` give `nil` and are skipped; prepending reverses the order.
+    {inputs, labels} =
+      for {fname, contents} <- files,
+          [_, sentiment] <- [Regex.run(regex, List.to_string(fname))],
+          reduce: {[], []} do
+        {inputs, labels} -> {[contents | inputs], [label(sentiment) | labels]}
+      end
+
+    %{review: transform_inputs.(inputs), sentiment: transform_labels.(labels)}
+  end
+
+  # Maps the sentiment directory captured from a path to its label (`nil` when unlabeled).
+  defp label("pos"), do: 1
+  defp label("neg"), do: 0
+  defp label("unsup"), do: nil
+end
diff --git a/mix.exs b/mix.exs
@@ -1,7 +1,7 @@
 defmodule Scidata.MixProject do
   use Mix.Project
 
-  @version "0.1.1"
+  @version "0.1.2"
   @repo_url "https://github.com/elixir-nx/scidata"
 
   def project do

diff --git a/test/imdb_reviews_test.exs b/test/imdb_reviews_test.exs
@@ -0,0 +1,58 @@
+defmodule IMDBReviewsTest do
+  use ExUnit.Case
+
+  alias Scidata.IMDBReviews
+
+  # Each test calls the downloader on the full dataset, so allow extra time.
+  @moduletag timeout: 120_000
+
+  describe "download" do
+    test "retrieves training set" do
+      # With no options only the labeled examples come back.
+      labeled = IMDBReviews.download()
+      assert length(labeled.review) == 25000
+      assert length(labeled.sentiment) == 25000
+
+      # Spelling out the labeled types explicitly gives the same counts.
+      explicit = IMDBReviews.download(example_types: [:pos, :neg])
+      assert length(explicit.review) == 25000
+      assert length(explicit.sentiment) == 25000
+
+      # Adding the unlabeled examples triples the number of training examples.
+      everything = IMDBReviews.download(example_types: [:pos, :neg, :unsup])
+      assert length(everything.review) == 75000
+      assert length(everything.sentiment) == 75000
+    end
+
+    test "retrieves test set" do
+      held_out = IMDBReviews.download_test(example_types: [:pos, :neg])
+
+      assert length(held_out.review) == 25000
+      assert length(held_out.sentiment) == 25000
+      assert [0, 0, 0, 0, 0] = Enum.take(held_out.sentiment, -5)
+    end
+
+    test "utilizes transform opts" do
+      # Keeps only the first 21 characters (indices 0..20) of every review.
+      truncate = fn texts -> Enum.map(texts, fn text -> String.slice(text, 0..20) end) end
+
+      result = IMDBReviews.download(example_types: [:pos], transform_inputs: truncate)
+
+      expected_prefixes = [
+        "The story centers aro",
+        "'The Adventures Of Ba",
+        "This film and it's se",
+        "I love this movie lik",
+        "A hit at the time but",
+        "Very smart, sometimes",
+        "With the mixed review",
+        "This movie really kic",
+        "I'd always wanted Dav",
+        "Like I said its a hid"
+      ]
+
+      assert Enum.take(result.review, 10) == expected_prefixes
+      assert Enum.take(result.sentiment, 10) == List.duplicate(1, 10)
+    end
+  end
+end