-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* download imdb reviews dataset * remove shuffle, add arg for example types * add basic docs * filter & reduce in comprehension Co-authored-by: José Valim <[email protected]> * remove parenthesis * add download tests * change `download*` return type, add specs & adapt tests to new return type * spec `download` w/o opts, opts spec more explicit * commit regex once Co-authored-by: José Valim <[email protected]> * Convert to string once and use binary matching instead of regex Co-authored-by: José Valim <[email protected]> * update tests * -1 -> 0 for negative examples * simplify api * bump release * alphabetize Co-authored-by: José Valim <[email protected]>
- Loading branch information
Showing
4 changed files
with
130 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
defmodule Scidata.IMDBReviews do
  @moduledoc """
  Module for downloading the [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/).
  """

  # NOTE(review): the archive is requested over plain HTTP while the
  # documentation link above uses HTTPS — consider switching once the HTTPS
  # endpoint has been confirmed to serve the same file.
  @base_url "http://ai.stanford.edu/~amaas/data/sentiment/"
  @dataset_file "aclImdb_v1.tar.gz"

  alias Scidata.Utils

  @type train_sentiment :: :pos | :neg | :unsup
  @type test_sentiment :: :pos | :neg
  @type opts :: [
          transform_inputs: ([binary] -> any),
          transform_labels: ([0 | 1 | nil] -> any)
        ]

  @doc """
  Downloads the IMDB reviews training dataset or fetches it locally.

  Returns a map with two keys: `:review`, holding the review texts, and
  `:sentiment`, holding one label per review (`1` for positive, `0` for
  negative and `nil` for unlabeled examples).

  ## Options

    * `:example_types` - which examples in the dataset should be returned
      according to each example's label: `:pos` for positive examples, `:neg`
      for negative examples, and `:unsup` for unlabeled examples. Defaults to
      `[:pos, :neg]`.

    * `:transform_inputs` - a function applied to the list of review texts
      before it is returned. Defaults to the identity function.

    * `:transform_labels` - a function applied to the list of labels before
      it is returned. Defaults to the identity function.

  """
  @spec download(
          example_types: [train_sentiment],
          transform_inputs: ([binary] -> any),
          transform_labels: ([0 | 1 | nil] -> any)
        ) :: %{review: any, sentiment: any}
  def download(opts \\ []), do: download_dataset(:train, opts)

  @doc """
  Downloads the IMDB reviews test dataset or fetches it locally.

  Accepts the same options as `download/1`, except that `:example_types`
  excludes `:unsup` because all unlabeled examples are in the training set.
  """
  @spec download_test(
          example_types: [test_sentiment],
          transform_inputs: ([binary] -> any),
          transform_labels: ([0 | 1] -> any)
        ) :: %{review: any, sentiment: any}
  def download_test(opts \\ []), do: download_dataset(:test, opts)

  # Fetches the archive, keeps only the entries stored under
  # "<dataset_type>/<example_type>/" and returns the transformed reviews
  # together with their labels.
  defp download_dataset(dataset_type, opts) do
    example_types = opts[:example_types] || [:pos, :neg]
    transform_inputs = opts[:transform_inputs] || (& &1)
    transform_labels = opts[:transform_labels] || (& &1)

    files = Utils.get!(@base_url <> @dataset_file).body
    regex = ~r"#{dataset_type}/(#{Enum.join(example_types, "|")})/"

    # The second generator doubles as a filter: entries whose path does not
    # match yield `nil`, which fails the `[sentiment]` pattern and is skipped.
    # Capturing the directory here means the label comes from the very same
    # match that selected the file, and the name is converted to a string once.
    #
    # Entries are prepended, so the result is in reverse archive order.
    {inputs, labels} =
      for {fname, contents} <- files,
          [sentiment] <- [Regex.run(regex, List.to_string(fname), capture: :all_but_first)],
          reduce: {[], []} do
        {inputs, labels} ->
          {[contents | inputs], [label_for(sentiment) | labels]}
      end

    %{review: transform_inputs.(inputs), sentiment: transform_labels.(labels)}
  end

  # Maps the sentiment directory name of an example to its numeric label.
  defp label_for("pos"), do: 1
  defp label_for("neg"), do: 0
  defp label_for("unsup"), do: nil
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
defmodule IMDBReviewsTest do
  use ExUnit.Case

  alias Scidata.IMDBReviews

  # Raise the per-test timeout to two minutes.
  @moduletag timeout: 120_000

  describe "download" do
    test "retrieves training set" do
      # Default options.
      assert_example_count(IMDBReviews.download(), 25000)

      # Explicitly requesting the labeled examples.
      assert_example_count(IMDBReviews.download(example_types: [:pos, :neg]), 25000)

      # Labeled plus unlabeled examples.
      assert_example_count(IMDBReviews.download(example_types: [:pos, :neg, :unsup]), 75000)
    end

    test "retrieves test set" do
      dataset = IMDBReviews.download_test(example_types: [:pos, :neg])

      assert_example_count(dataset, 25000)

      %{sentiment: labels} = dataset
      assert [0, 0, 0, 0, 0] = Enum.take(labels, -5)
    end

    test "utilizes transform opts" do
      # Keep only the leading characters of every review.
      clip = fn texts ->
        for text <- texts, do: String.slice(text, 0..20)
      end

      %{review: clipped, sentiment: labels} =
        IMDBReviews.download(example_types: [:pos], transform_inputs: clip)

      expected_prefixes = [
        "The story centers aro",
        "'The Adventures Of Ba",
        "This film and it's se",
        "I love this movie lik",
        "A hit at the time but",
        "Very smart, sometimes",
        "With the mixed review",
        "This movie really kic",
        "I'd always wanted Dav",
        "Like I said its a hid"
      ]

      assert Enum.take(clipped, 10) == expected_prefixes
      assert Enum.take(labels, 10) == List.duplicate(1, 10)
    end
  end

  # Asserts that both the reviews and the sentiments of `dataset` contain
  # exactly `expected` entries.
  defp assert_example_count(dataset, expected) do
    %{review: reviews, sentiment: sentiments} = dataset

    assert length(reviews) == expected
    assert length(sentiments) == expected
  end
end