From b122ec67821cab77b664d6545b5bcdfe1cab2672 Mon Sep 17 00:00:00 2001
From: Tom Rutten
Date: Thu, 16 Sep 2021 21:39:41 -0400
Subject: [PATCH] IMDB Reviews dataset (#12)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* download imdb reviews dataset

* remove shuffle, add arg for example types

* add basic docs

* filter & reduce in comprehension

Co-authored-by: José Valim

* remove parenthesis

* add download tests

* change `download*` return type, add specs & adapt tests to new return type

* spec `download` w/o opts, opts spec more explicit

* commit regex once

Co-authored-by: José Valim

* Convert to string once and use binary matching instead of regex

Co-authored-by: José Valim

* update tests

* -1 -> 0 for negative examples

* simplify api

* bump release

* alphabetize

Co-authored-by: José Valim
---
Reviewer notes (text between the "---" line and the diffstat is not part of
the commit):

* The `@spec` of `download/1` now takes `[train_sentiment]`: its docs and the
  training-set test both pass `:unsup`, which `test_sentiment` excludes.
* The `download_test/1` doc now refers to `download/1`; the module defines no
  `download/2`.
* NOTE(review): both return specs still read `sentiment: 1 | 0`, although
  `download_dataset/2` accumulates a list of labels (`nil` for `:unsup`).
  Left as in the original commit to keep the hunk size; confirm and correct
  in a follow-up.
* Both corrections keep the new file at 68 lines, so the hunk header and the
  diffstat below are unchanged. The blob id on the `index` line is the one
  recorded for the original commit.

 README.md                   |  5 +--
 lib/scidata/imdb_reviews.ex | 68 +++++++++++++++++++++++++++++++++++++
 mix.exs                     |  2 +-
 test/imdb_reviews_test.exs  | 58 +++++++++++++++++++++++++++++++
 4 files changed, 130 insertions(+), 3 deletions(-)
 create mode 100644 lib/scidata/imdb_reviews.ex
 create mode 100644 test/imdb_reviews_test.exs

diff --git a/README.md b/README.md
index 7125f9d..46d74fb 100644
--- a/README.md
+++ b/README.md
@@ -4,10 +4,11 @@
 
 Scidata currently supports the following training and test datasets:
 
-- MNIST
-- FashionMNIST
 - CIFAR10
 - CIFAR100
+- FashionMNIST
+- IMDb Reviews
+- MNIST
 
 Download or fetch datasets locally:
 
diff --git a/lib/scidata/imdb_reviews.ex b/lib/scidata/imdb_reviews.ex
new file mode 100644
index 0000000..c10f9b8
--- /dev/null
+++ b/lib/scidata/imdb_reviews.ex
@@ -0,0 +1,68 @@
+defmodule Scidata.IMDBReviews do
+  @moduledoc """
+  Module for downloading the [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/).
+  """
+
+  @base_url "http://ai.stanford.edu/~amaas/data/sentiment/"
+  @dataset_file "aclImdb_v1.tar.gz"
+
+  alias Scidata.Utils
+
+  @type train_sentiment :: :pos | :neg | :unsup
+  @type test_sentiment :: :pos | :neg
+  @type opts :: [
+          transform_inputs: ([binary, ...] -> any),
+          transform_labels: ([integer, ...] -> any)
+        ]
+
+  @doc """
+  Downloads the IMDB reviews training dataset or fetches it locally.
+
+  `example_types` specifies which examples in the dataset should be returned
+  according to each example's label: `:pos` for positive examples, `:neg` for
+  negative examples, and `:unsup` for unlabeled examples.
+  """
+  @spec download(example_types: [train_sentiment]) :: %{review: [binary(), ...], sentiment: 1 | 0}
+  def download(opts \\ []), do: download_dataset(:train, opts)
+
+  @doc """
+  Downloads the IMDB reviews test dataset or fetches it locally.
+
+  `example_types` is the same argument in `download/1` but excludes `:unsup`
+  because all unlabeled examples are in the training set.
+  """
+  @spec download_test(example_types: [test_sentiment]) :: %{
+          review: [binary(), ...],
+          sentiment: 1 | 0
+        }
+  def download_test(opts \\ []), do: download_dataset(:test, opts)
+
+  defp download_dataset(dataset_type, opts) do
+    example_types = opts[:example_types] || [:pos, :neg]
+    transform_inputs = opts[:transform_inputs] || (& &1)
+    transform_labels = opts[:transform_labels] || (& &1)
+
+    files = Utils.get!(@base_url <> @dataset_file).body
+    regex = ~r"#{dataset_type}/(#{Enum.join(example_types, "|")})/"
+
+    {inputs, labels} =
+      for {fname, contents} <- files,
+          List.to_string(fname) =~ regex,
+          reduce: {[], []} do
+        {inputs, labels} ->
+          {[contents | inputs], [get_label(fname) | labels]}
+      end
+
+    %{review: transform_inputs.(inputs), sentiment: transform_labels.(labels)}
+  end
+
+  defp get_label(fname) do
+    fname = List.to_string(fname)
+
+    cond do
+      fname =~ "pos" -> 1
+      fname =~ "neg" -> 0
+      fname =~ "unsup" -> nil
+    end
+  end
+end
diff --git a/mix.exs b/mix.exs
index 592ba8c..b3fbbc7 100644
--- a/mix.exs
+++ b/mix.exs
@@ -1,7 +1,7 @@
 defmodule Scidata.MixProject do
   use Mix.Project
 
-  @version "0.1.1"
+  @version "0.1.2"
   @repo_url "https://github.com/elixir-nx/scidata"
 
   def project do
diff --git a/test/imdb_reviews_test.exs b/test/imdb_reviews_test.exs
new file mode 100644
index 0000000..e38a620
--- /dev/null
+++ b/test/imdb_reviews_test.exs
@@ -0,0 +1,58 @@
+defmodule IMDBReviewsTest do
+  use ExUnit.Case
+
+  @moduletag timeout: 120_000
+
+  describe "download" do
+    test "retrieves training set" do
+      %{review: train_inputs, sentiment: train_targets} =
+        Scidata.IMDBReviews.download()
+
+      assert length(train_inputs) == 25000
+      assert length(train_targets) == 25000
+
+      %{review: train_inputs, sentiment: train_targets} =
+        Scidata.IMDBReviews.download(example_types: [:pos, :neg])
+
+      assert length(train_inputs) == 25000
+      assert length(train_targets) == 25000
+
+      %{review: train_inputs, sentiment: train_targets} =
+        Scidata.IMDBReviews.download(example_types: [:pos, :neg, :unsup])
+
+      assert length(train_inputs) == 75000
+      assert length(train_targets) == 75000
+    end
+
+    test "retrieves test set" do
+      %{review: test_inputs, sentiment: test_targets} =
+        Scidata.IMDBReviews.download_test(example_types: [:pos, :neg])
+
+      assert length(test_inputs) == 25000
+      assert length(test_targets) == 25000
+      assert [0, 0, 0, 0, 0] = Enum.take(test_targets, -5)
+    end
+
+    test "utilizes transform opts" do
+      clip = fn inputs -> Enum.map(inputs, &String.slice(&1, 0..20)) end
+
+      %{review: reviews, sentiment: targets} =
+        Scidata.IMDBReviews.download(example_types: [:pos], transform_inputs: clip)
+
+      assert Enum.take(reviews, 10) == [
+               "The story centers aro",
+               "'The Adventures Of Ba",
+               "This film and it's se",
+               "I love this movie lik",
+               "A hit at the time but",
+               "Very smart, sometimes",
+               "With the mixed review",
+               "This movie really kic",
+               "I'd always wanted Dav",
+               "Like I said its a hid"
+             ]
+
+      assert Enum.take(targets, 10) == [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+    end
+  end
+end