Skip to content

Commit

Permalink
IMDB Reviews dataset (#12)
Browse files Browse the repository at this point in the history
* download imdb reviews dataset

* remove shuffle, add arg for example types

* add basic docs

* filter & reduce in comprehension

Co-authored-by: José Valim <[email protected]>

* remove parentheses

* add download tests

* change `download*` return type, add specs
& adapt tests to new return type

* spec `download` w/o opts, opts spec more explicit

* compile regex once

Co-authored-by: José Valim <[email protected]>

* Convert to string once and use binary matching instead of regex

Co-authored-by: José Valim <[email protected]>

* update tests

* -1 -> 0 for negative examples

* simplify api

* bump release

* alphabetize

Co-authored-by: José Valim <[email protected]>
  • Loading branch information
t-rutten and josevalim authored Sep 17, 2021
1 parent bf5e1b2 commit b122ec6
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 3 deletions.
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@

Scidata currently supports the following training and test datasets:

- MNIST
- FashionMNIST
- CIFAR10
- CIFAR100
- FashionMNIST
- IMDb Reviews
- MNIST

Download or fetch datasets locally:

Expand Down
68 changes: 68 additions & 0 deletions lib/scidata/imdb_reviews.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
defmodule Scidata.IMDBReviews do
  @moduledoc """
  Module for downloading the [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/).

  Both `download/1` and `download_test/1` return a map with two keys:

    * `:review` - the review texts (after `:transform_inputs`, if given)
    * `:sentiment` - the matching labels (after `:transform_labels`, if given):
      `1` for positive, `0` for negative and `nil` for unlabeled examples
  """

  @base_url "http://ai.stanford.edu/~amaas/data/sentiment/"
  @dataset_file "aclImdb_v1.tar.gz"

  alias Scidata.Utils

  @type train_sentiment :: :pos | :neg | :unsup
  @type test_sentiment :: :pos | :neg
  @type opts :: [
          transform_inputs: ([binary, ...] -> any),
          transform_labels: ([0 | 1 | nil, ...] -> any)
        ]

  @doc """
  Downloads the IMDB reviews training dataset or fetches it locally.

  ## Options

    * `:example_types` - which examples in the dataset should be returned
      according to each example's label: `:pos` for positive examples, `:neg`
      for negative examples, and `:unsup` for unlabeled examples.
      Defaults to `[:pos, :neg]`.

    * `:transform_inputs` - function applied to the list of reviews before it
      is returned. Defaults to the identity function.

    * `:transform_labels` - function applied to the list of labels before it
      is returned. Defaults to the identity function.

  """
  @spec download(
          example_types: [train_sentiment],
          transform_inputs: ([binary, ...] -> any),
          transform_labels: ([0 | 1 | nil, ...] -> any)
        ) :: %{review: any, sentiment: any}
  def download(opts \\ []), do: download_dataset(:train, opts)

  @doc """
  Downloads the IMDB reviews test dataset or fetches it locally.

  Accepts the same options as `download/1`, except that `:example_types`
  excludes `:unsup` because all unlabeled examples are in the training set.
  """
  @spec download_test(
          example_types: [test_sentiment],
          transform_inputs: ([binary, ...] -> any),
          transform_labels: ([0 | 1, ...] -> any)
        ) :: %{review: any, sentiment: any}
  def download_test(opts \\ []), do: download_dataset(:test, opts)

  # Fetches the archive, keeps only the entries stored under
  # "<dataset_type>/<example_type>/" and returns the transformed reviews and labels.
  defp download_dataset(dataset_type, opts) do
    example_types = opts[:example_types] || [:pos, :neg]
    transform_inputs = opts[:transform_inputs] || (& &1)
    transform_labels = opts[:transform_labels] || (& &1)

    # NOTE(review): `body` is presumably a list of `{charlist_name, contents}`
    # tuples produced by Scidata.Utils from the tarball - confirm against Utils.
    files = Utils.get!(@base_url <> @dataset_file).body

    # Built once, outside the comprehension. The capture group holds the
    # sentiment directory ("pos", "neg" or "unsup") of a matching entry.
    regex = ~r"#{dataset_type}/(#{Enum.join(example_types, "|")})/"

    # `Regex.run/2` returns `nil` for non-matching names; the generator pattern
    # skips those. Each file name is converted to a string exactly once and the
    # label is taken from the directory that was actually matched.
    # Prepending means the results are in reverse archive order.
    {inputs, labels} =
      for {fname, contents} <- files,
          [_, sentiment] <- [Regex.run(regex, List.to_string(fname))],
          reduce: {[], []} do
        {inputs, labels} ->
          {[contents | inputs], [get_label(sentiment) | labels]}
      end

    %{review: transform_inputs.(inputs), sentiment: transform_labels.(labels)}
  end

  # Maps a sentiment directory name to its numeric label.
  defp get_label("pos"), do: 1
  defp get_label("neg"), do: 0
  defp get_label("unsup"), do: nil
end
2 changes: 1 addition & 1 deletion mix.exs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
defmodule Scidata.MixProject do
use Mix.Project

@version "0.1.1"
@version "0.1.2"
@repo_url "https://github.com/elixir-nx/scidata"

def project do
Expand Down
58 changes: 58 additions & 0 deletions test/imdb_reviews_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
defmodule IMDBReviewsTest do
  use ExUnit.Case

  # Each test calls the downloader, so allow up to two minutes per test.
  @moduletag timeout: 120_000

  # Asserts that a dataset holds `expected` reviews and `expected` labels.
  defp assert_sizes(%{review: reviews, sentiment: sentiments}, expected) do
    assert length(reviews) == expected
    assert length(sentiments) == expected
  end

  describe "download" do
    test "retrieves training set" do
      # Default options return only the labeled examples.
      assert_sizes(Scidata.IMDBReviews.download(), 25000)

      # Spelling the default example types out gives the same amount of data.
      labeled = Scidata.IMDBReviews.download(example_types: [:pos, :neg])
      assert_sizes(labeled, 25000)

      # Including unlabeled examples grows the result to 75000 entries.
      everything = Scidata.IMDBReviews.download(example_types: [:pos, :neg, :unsup])
      assert_sizes(everything, 75000)
    end

    test "retrieves test set" do
      dataset = Scidata.IMDBReviews.download_test(example_types: [:pos, :neg])

      assert_sizes(dataset, 25000)

      %{sentiment: labels} = dataset
      assert Enum.take(labels, -5) == [0, 0, 0, 0, 0]
    end

    test "utilizes transform opts" do
      # Keep only the characters at positions 0..20 of every review.
      truncate = &Enum.map(&1, fn review -> String.slice(review, 0..20) end)

      result = Scidata.IMDBReviews.download(example_types: [:pos], transform_inputs: truncate)
      %{review: clipped, sentiment: labels} = result

      expected_prefixes = [
        "The story centers aro",
        "'The Adventures Of Ba",
        "This film and it's se",
        "I love this movie lik",
        "A hit at the time but",
        "Very smart, sometimes",
        "With the mixed review",
        "This movie really kic",
        "I'd always wanted Dav",
        "Like I said its a hid"
      ]

      assert Enum.take(clipped, 10) == expected_prefixes
      assert Enum.take(labels, 10) == List.duplicate(1, 10)
    end
  end
end

0 comments on commit b122ec6

Please sign in to comment.