From 605881e8939a1a9500f0a59e740c6e3ce6f416ca Mon Sep 17 00:00:00 2001 From: Shubham Gupta Date: Sat, 4 Dec 2021 12:45:08 +0800 Subject: [PATCH 01/10] Add Yelp Polarity Reviews dataset --- lib/scidata/utils.ex | 2 +- lib/scidata/yelp_polarity_reviews.ex | 48 ++++++++++++++++++++++++++++ mix.exs | 3 +- mix.lock | 2 ++ 4 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 lib/scidata/yelp_polarity_reviews.ex diff --git a/lib/scidata/utils.ex b/lib/scidata/utils.ex index 8beffca..72a5fee 100644 --- a/lib/scidata/utils.ex +++ b/lib/scidata/utils.ex @@ -56,7 +56,7 @@ defmodule Scidata.Utils do defp decode({request, response}) do cond do - String.ends_with?(request.url, ".tar.gz") -> + String.ends_with?(request.url, ".tar.gz") or String.ends_with?(request.url, ".tgz") -> {:ok, files} = :erl_tar.extract({:binary, response.body}, [:memory, :compressed]) response = %{response | body: files} {request, response} diff --git a/lib/scidata/yelp_polarity_reviews.ex b/lib/scidata/yelp_polarity_reviews.ex new file mode 100644 index 0000000..6c09b06 --- /dev/null +++ b/lib/scidata/yelp_polarity_reviews.ex @@ -0,0 +1,48 @@ +defmodule Scidata.YelpPolarityReviews do + @moduledoc """ + Module for downloading the [Yelp Reviews dataset](https://www.yelp.com/dataset). + """ + + @base_url "https://s3.amazonaws.com/fast-ai-nlp/" + + @dataset_file "yelp_review_polarity_csv.tgz" + + alias Scidata.Utils + + @doc """ + Downloads the Yelp Polarity Reviews training dataset or fetches it locally. + """ + @spec download() :: %{review: [binary(), ...], sentiment: 2 | 1} + def download(), do: download_dataset(:train) + + @doc """ + Downloads the Yelp Polarity Reviews test dataset or fetches it locally. + """ + @spec download_test() :: %{ + review: [binary(), ...], + sentiment: 2 | 1 + } + def download_test(), do: download_dataset(:test) + + defp download_dataset(dataset_type) do + files = Utils.get!(@base_url <> @dataset_file).body + regex = ~r"#{dataset_type}" + + [records | _] = + for {fname, contents} <- files do + if List.to_string(fname) =~ regex do + contents + |> StringIO.open() + |> elem(1) + |> IO.binstream(:line) + |> CSV.decode!() + |> Enum.to_list() + end + end + + %{ + review: records |> Enum.map(&List.last(&1)), + sentiment: records |> Enum.map(fn x -> x |> List.first() |> String.to_integer() end) + } + end +end diff --git a/mix.exs b/mix.exs index b3fbbc7..40b0877 100644 --- a/mix.exs +++ b/mix.exs @@ -30,7 +30,8 @@ defmodule Scidata.MixProject do defp deps do [ - {:ex_doc, ">= 0.24.0", only: :dev, runtime: false} + {:ex_doc, ">= 0.24.0", only: :dev, runtime: false}, + {:csv, "~> 2.4"} ] end diff --git a/mix.lock b/mix.lock index 0808273..9253e66 100644 --- a/mix.lock +++ b/mix.lock @@ -1,8 +1,10 @@ %{ + "csv": {:hex, :csv, "2.4.1", "50e32749953b6bf9818dbfed81cf1190e38cdf24f95891303108087486c5925e", [:mix], [{:parallel_stream, "~> 1.0.4", [hex: :parallel_stream, repo: "hexpm", optional: false]}], "hexpm", "54508938ac67e27966b10ef49606e3ad5995d665d7fc2688efb3eab1307c9079"}, "earmark_parser": {:hex, :earmark_parser, "1.4.13", "0c98163e7d04a15feb62000e1a891489feb29f3d10cb57d4f845c405852bbef8", [:mix], [], "hexpm", "d602c26af3a0af43d2f2645613f65841657ad6efc9f0e361c3b6c06b578214ba"}, "ex_doc": {:hex, :ex_doc, "0.24.2", "e4c26603830c1a2286dae45f4412a4d1980e1e89dc779fcd0181ed1d5a05c8d9", [:mix], [{:earmark_parser, "~> 1.4.0", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "e134e1d9e821b8d9e4244687fb2ace58d479b67b282de5158333b0d57c6fb7da"}, "makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"}, "makeup_elixir": {:hex, :makeup_elixir, "0.15.1", "b5888c880d17d1cc3e598f05cdb5b5a91b7b17ac4eaf5f297cb697663a1094dd", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.1", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "db68c173234b07ab2a07f645a5acdc117b9f99d69ebf521821d89690ae6c6ec8"}, "makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"}, "nimble_parsec": {:hex, :nimble_parsec, "1.1.0", "3a6fca1550363552e54c216debb6a9e95bd8d32348938e13de5eda962c0d7f89", [:mix], [], "hexpm", "08eb32d66b706e913ff748f11694b17981c0b04a33ef470e33e11b3d3ac8f54b"}, + "parallel_stream": {:hex, :parallel_stream, "1.0.6", "b967be2b23f0f6787fab7ed681b4c45a215a81481fb62b01a5b750fa8f30f76c", [:mix], [], "hexpm", "639b2e8749e11b87b9eb42f2ad325d161c170b39b288ac8d04c4f31f8f0823eb"}, } From 564caf15ee4445d2b92114456b71e501f7f3cd45 Mon Sep 17 00:00:00 2001 From: Shubham Gupta Date: Sat, 4 Dec 2021 14:13:52 +0800 Subject: [PATCH 02/10] Add unit test for yelp polarity reviews dataset --- lib/scidata/yelp_polarity_reviews.ex | 24 ++++++++++++++---------- test/yelp_polarity_reviews_test.exs | 22 ++++++++++++++++++++++ 2 files changed, 36 insertions(+), 10 deletions(-) create mode 100644 test/yelp_polarity_reviews_test.exs diff --git a/lib/scidata/yelp_polarity_reviews.ex b/lib/scidata/yelp_polarity_reviews.ex index 6c09b06..db7c4c6 100644 --- a/lib/scidata/yelp_polarity_reviews.ex +++ b/lib/scidata/yelp_polarity_reviews.ex @@ -28,16 +28,11 @@ defmodule Scidata.YelpPolarityReviews do files = Utils.get!(@base_url <> @dataset_file).body regex = ~r"#{dataset_type}" - [records | _] = - for {fname, contents} <- files do - if List.to_string(fname) =~ regex do - contents - |> StringIO.open() - |> elem(1) - |> IO.binstream(:line) - |> CSV.decode!() - |> Enum.to_list() - end + records = + for {fname, contents} <- files, + List.to_string(fname) =~ regex, + reduce: [[]] do + _ -> parse_csv(contents) end %{ @@ -45,4 +40,13 @@ defmodule Scidata.YelpPolarityReviews do sentiment: records |> Enum.map(fn x -> x |> List.first() |> String.to_integer() end) } end + + defp parse_csv(content) do + content + |> StringIO.open() + |> elem(1) + |> IO.binstream(:line) + |> CSV.decode!() + |> Enum.to_list() + end end diff --git a/test/yelp_polarity_reviews_test.exs b/test/yelp_polarity_reviews_test.exs new file mode 100644 index 0000000..7c5ca10 --- /dev/null +++ b/test/yelp_polarity_reviews_test.exs @@ -0,0 +1,22 @@ +defmodule YelpPolarityReviewsTest do + use ExUnit.Case + + @moduletag timeout: 120_0000 + + describe "download" do + test "retrieves training set" do + %{review: train_inputs, sentiment: train_targets} = Scidata.YelpPolarityReviews.download() + + assert length(train_inputs) == 560_000 + assert length(train_targets) == 560_000 + end + + test "retrieves test set" do + %{review: test_inputs, sentiment: test_targets} = + Scidata.YelpPolarityReviews.download_test() + + assert length(test_inputs) == 38000 + assert length(test_targets) == 38000 + end + end +end From c12d0bba46063c1852f631551a9c5f716680e8b9 Mon Sep 17 00:00:00 2001 From: Shubham Gupta Date: Sat, 4 Dec 2021 14:26:24 +0800 Subject: [PATCH 03/10] Add support for yelp full reviews dataset --- lib/scidata/yelp_full_reviews.ex | 52 +++++++++++++++++++++++++++++ test/yelp_full_reviews_test.exs | 21 ++++++++++++ test/yelp_polarity_reviews_test.exs | 2 +- 3 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 lib/scidata/yelp_full_reviews.ex create mode 100644 test/yelp_full_reviews_test.exs diff --git a/lib/scidata/yelp_full_reviews.ex b/lib/scidata/yelp_full_reviews.ex new file mode 100644 index 0000000..d8b5202 --- /dev/null +++ b/lib/scidata/yelp_full_reviews.ex @@ -0,0 +1,52 @@ +defmodule Scidata.YelpFullReviews do + @moduledoc """ + Module for downloading the [Yelp Reviews dataset](https://www.yelp.com/dataset). + """ + + @base_url "https://s3.amazonaws.com/fast-ai-nlp/" + + @dataset_file "yelp_review_full_csv.tgz" + + alias Scidata.Utils + + @doc """ + Downloads the Yelp Reviews training dataset or fetches it locally. + """ + @spec download() :: %{review: [binary(), ...], sentiment: 2 | 1} + def download(), do: download_dataset(:train) + + @doc """ + Downloads the Yelp Reviews test dataset or fetches it locally. + """ + @spec download_test() :: %{ + review: [binary(), ...], + sentiment: 2 | 1 + } + def download_test(), do: download_dataset(:test) + + defp download_dataset(dataset_type) do + files = Utils.get!(@base_url <> @dataset_file).body + regex = ~r"#{dataset_type}" + + records = + for {fname, contents} <- files, + List.to_string(fname) =~ regex, + reduce: [[]] do + _ -> parse_csv(contents) + end + + %{ + review: records |> Enum.map(&List.last(&1)), + rating: records |> Enum.map(fn x -> x |> List.first() |> String.to_integer() end) + } + end + + defp parse_csv(content) do + content + |> StringIO.open() + |> elem(1) + |> IO.binstream(:line) + |> CSV.decode!() + |> Enum.to_list() + end +end diff --git a/test/yelp_full_reviews_test.exs b/test/yelp_full_reviews_test.exs new file mode 100644 index 0000000..251f561 --- /dev/null +++ b/test/yelp_full_reviews_test.exs @@ -0,0 +1,21 @@ +defmodule YelpFullReviewsTest do + use ExUnit.Case + + @moduletag timeout: 120_000 + + describe "download" do + test "retrieves training set" do + %{review: train_inputs, rating: train_targets} = Scidata.YelpFullReviews.download() + + assert length(train_inputs) == 650_000 + assert length(train_targets) == 650_000 + end + + test "retrieves test set" do + %{review: test_inputs, rating: test_targets} = Scidata.YelpFullReviews.download_test() + + assert length(test_inputs) == 50000 + assert length(test_targets) == 50000 + end + end +end diff --git a/test/yelp_polarity_reviews_test.exs b/test/yelp_polarity_reviews_test.exs index 7c5ca10..1df2499 100644 --- a/test/yelp_polarity_reviews_test.exs +++ b/test/yelp_polarity_reviews_test.exs @@ -1,7 +1,7 @@ defmodule YelpPolarityReviewsTest do use ExUnit.Case - @moduletag timeout: 120_0000 + @moduletag timeout: 120_000 describe "download" do test "retrieves training set" do From fe292307f099fd5abfbe27a0d0d1ad421d14bc18 Mon Sep 17 00:00:00 2001 From: Shubham Gupta Date: Sat, 4 Dec 2021 14:35:45 +0800 Subject: [PATCH 04/10] Add assertions for unique values in targets --- lib/scidata/yelp_full_reviews.ex | 4 ++-- lib/scidata/yelp_polarity_reviews.ex | 18 +++++++++++++++--- test/yelp_full_reviews_test.exs | 2 ++ test/yelp_polarity_reviews_test.exs | 2 ++ 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/lib/scidata/yelp_full_reviews.ex b/lib/scidata/yelp_full_reviews.ex index d8b5202..a155ad6 100644 --- a/lib/scidata/yelp_full_reviews.ex +++ b/lib/scidata/yelp_full_reviews.ex @@ -12,7 +12,7 @@ defmodule Scidata.YelpFullReviews do @doc """ Downloads the Yelp Reviews training dataset or fetches it locally. """ - @spec download() :: %{review: [binary(), ...], sentiment: 2 | 1} + @spec download() :: %{review: [binary(), ...], rating: 5 | 4 | 3 | 2 | 1} def download(), do: download_dataset(:train) @doc """ @@ -20,7 +20,7 @@ defmodule Scidata.YelpFullReviews do """ @spec download_test() :: %{ review: [binary(), ...], - sentiment: 2 | 1 + rating: 5 | 4 | 3 | 2 | 1 } def download_test(), do: download_dataset(:test) diff --git a/lib/scidata/yelp_polarity_reviews.ex b/lib/scidata/yelp_polarity_reviews.ex index db7c4c6..b2f5b84 100644 --- a/lib/scidata/yelp_polarity_reviews.ex +++ b/lib/scidata/yelp_polarity_reviews.ex @@ -12,7 +12,7 @@ defmodule Scidata.YelpPolarityReviews do @doc """ Downloads the Yelp Polarity Reviews training dataset or fetches it locally. """ - @spec download() :: %{review: [binary(), ...], sentiment: 2 | 1} + @spec download() :: %{review: [binary(), ...], sentiment: 1 | 0} def download(), do: download_dataset(:train) @doc """ @@ -20,7 +20,7 @@ defmodule Scidata.YelpPolarityReviews do """ @spec download_test() :: %{ review: [binary(), ...], - sentiment: 2 | 1 + sentiment: 1 | 0 } def download_test(), do: download_dataset(:test) @@ -37,7 +37,7 @@ defmodule Scidata.YelpPolarityReviews do %{ review: records |> Enum.map(&List.last(&1)), - sentiment: records |> Enum.map(fn x -> x |> List.first() |> String.to_integer() end) + sentiment: get_rating(records) } end @@ -49,4 +49,16 @@ defmodule Scidata.YelpPolarityReviews do |> CSV.decode!() |> Enum.to_list() end + + defp get_rating(records) do + records + |> Enum.map(fn x -> + x + |> List.first() + |> case do + "1" -> 0 + "2" -> 1 + end + end) + end end diff --git a/test/yelp_full_reviews_test.exs b/test/yelp_full_reviews_test.exs index 251f561..216ad6e 100644 --- a/test/yelp_full_reviews_test.exs +++ b/test/yelp_full_reviews_test.exs @@ -9,6 +9,7 @@ defmodule YelpFullReviewsTest do assert length(train_inputs) == 650_000 assert length(train_targets) == 650_000 + assert train_targets |> Enum.uniq() |> Enum.sort() == [1, 2, 3, 4, 5] end test "retrieves test set" do @@ -16,6 +17,7 @@ defmodule YelpFullReviewsTest do assert length(test_inputs) == 50000 assert length(test_targets) == 50000 + assert test_targets |> Enum.uniq() |> Enum.sort() == [1, 2, 3, 4, 5] end end end diff --git a/test/yelp_polarity_reviews_test.exs b/test/yelp_polarity_reviews_test.exs index 1df2499..af0e686 100644 --- a/test/yelp_polarity_reviews_test.exs +++ b/test/yelp_polarity_reviews_test.exs @@ -9,6 +9,7 @@ defmodule YelpPolarityReviewsTest do assert length(train_inputs) == 560_000 assert length(train_targets) == 560_000 + assert train_targets |> Enum.uniq() |> Enum.sort() == [0, 1] end test "retrieves test set" do @@ -17,6 +18,7 @@ defmodule YelpPolarityReviewsTest do assert length(test_inputs) == 38000 assert length(test_targets) == 38000 + assert test_targets |> Enum.uniq() |> Enum.sort() == [0, 1] end end end From c82b64e15a886b9f5101a7c4f74a2fd60a5bc451 Mon Sep 17 00:00:00 2001 From: Shubham Gupta Date: Sat, 4 Dec 2021 15:32:00 +0800 Subject: [PATCH 05/10] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3e23047..846055e 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ Scidata currently supports the following training and test datasets: - CIFAR100 - FashionMNIST - IMDb Reviews +- Yelp Reviews(Full and Polarity) - MNIST Download or fetch datasets locally: From fa23abbb9aa414d4fecceb12977044c30eb9a828 Mon Sep 17 00:00:00 2001 From: Shubham Gupta Date: Sun, 5 Dec 2021 01:51:37 +0800 Subject: [PATCH 06/10] Replace CSV with Nimble CSV and address comments --- lib/scidata/yelp_full_reviews.ex | 12 ++---------- lib/scidata/yelp_polarity_reviews.ex | 23 +++++------------------ mix.exs | 2 +- mix.lock | 1 + 4 files changed, 9 insertions(+), 29 deletions(-) diff --git a/lib/scidata/yelp_full_reviews.ex b/lib/scidata/yelp_full_reviews.ex index a155ad6..320fd6d 100644 --- a/lib/scidata/yelp_full_reviews.ex +++ b/lib/scidata/yelp_full_reviews.ex @@ -8,6 +8,7 @@ defmodule Scidata.YelpFullReviews do @dataset_file "yelp_review_full_csv.tgz" alias Scidata.Utils + alias NimbleCSV.RFC4180, as: CSV @doc """ Downloads the Yelp Reviews training dataset or fetches it locally. @@ -32,7 +33,7 @@ defmodule Scidata.YelpFullReviews do for {fname, contents} <- files, List.to_string(fname) =~ regex, reduce: [[]] do - _ -> parse_csv(contents) + _ -> CSV.parse_string(contents, skip_headers: false) end %{ @@ -40,13 +41,4 @@ defmodule Scidata.YelpFullReviews do rating: records |> Enum.map(fn x -> x |> List.first() |> String.to_integer() end) } end - - defp parse_csv(content) do - content - |> StringIO.open() - |> elem(1) - |> IO.binstream(:line) - |> CSV.decode!() - |> Enum.to_list() - end end diff --git a/lib/scidata/yelp_polarity_reviews.ex b/lib/scidata/yelp_polarity_reviews.ex index b2f5b84..0003459 100644 --- a/lib/scidata/yelp_polarity_reviews.ex +++ b/lib/scidata/yelp_polarity_reviews.ex @@ -8,6 +8,7 @@ defmodule Scidata.YelpPolarityReviews do @dataset_file "yelp_review_polarity_csv.tgz" alias Scidata.Utils + alias NimbleCSV.RFC4180, as: CSV @doc """ Downloads the Yelp Polarity Reviews training dataset or fetches it locally. @@ -32,7 +33,7 @@ defmodule Scidata.YelpPolarityReviews do for {fname, contents} <- files, List.to_string(fname) =~ regex, reduce: [[]] do - _ -> parse_csv(contents) + _ -> CSV.parse_string(contents, skip_headers: false) end %{ @@ -41,24 +42,10 @@ defmodule Scidata.YelpPolarityReviews do } end - defp parse_csv(content) do - content - |> StringIO.open() - |> elem(1) - |> IO.binstream(:line) - |> CSV.decode!() - |> Enum.to_list() - end - defp get_rating(records) do - records - |> Enum.map(fn x -> - x - |> List.first() - |> case do - "1" -> 0 - "2" -> 1 - end + Enum.map(records, fn + ["1" | _] -> 0 + ["2" | _] -> 1 end) end end diff --git a/mix.exs b/mix.exs index 40b0877..7e22e60 100644 --- a/mix.exs +++ b/mix.exs @@ -31,7 +31,7 @@ defmodule Scidata.MixProject do defp deps do [ {:ex_doc, ">= 0.24.0", only: :dev, runtime: false}, - {:csv, "~> 2.4"} + {:nimble_csv, "~> 1.1"} ] end diff --git a/mix.lock b/mix.lock index 9253e66..f76d6e3 100644 --- a/mix.lock +++ b/mix.lock @@ -5,6 +5,7 @@ "makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"}, "makeup_elixir": {:hex, :makeup_elixir, "0.15.1", "b5888c880d17d1cc3e598f05cdb5b5a91b7b17ac4eaf5f297cb697663a1094dd", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.1", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "db68c173234b07ab2a07f645a5acdc117b9f99d69ebf521821d89690ae6c6ec8"}, "makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"}, + "nimble_csv": {:hex, :nimble_csv, "1.1.0", "b1dba4a86be9e03065c9de829050468e591f569100332db949e7ce71be0afc25", [:mix], [], "hexpm", "e986755bc302832cac429be6deda0fc9d82d3c82b47abefb68b3c17c9d949a3f"}, "nimble_parsec": {:hex, :nimble_parsec, "1.1.0", "3a6fca1550363552e54c216debb6a9e95bd8d32348938e13de5eda962c0d7f89", [:mix], [], "hexpm", "08eb32d66b706e913ff748f11694b17981c0b04a33ef470e33e11b3d3ac8f54b"}, "parallel_stream": {:hex, :parallel_stream, "1.0.6", "b967be2b23f0f6787fab7ed681b4c45a215a81481fb62b01a5b750fa8f30f76c", [:mix], [], "hexpm", "639b2e8749e11b87b9eb42f2ad325d161c170b39b288ac8d04c4f31f8f0823eb"}, } From 56548246b7c7cab42b1ec8bad87b57e1ea013cc1 Mon Sep 17 00:00:00 2001 From: Shubham Gupta Date: Sun, 5 Dec 2021 01:53:16 +0800 Subject: [PATCH 07/10] Fix specs --- lib/scidata/yelp_full_reviews.ex | 4 ++-- lib/scidata/yelp_polarity_reviews.ex | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/scidata/yelp_full_reviews.ex b/lib/scidata/yelp_full_reviews.ex index 320fd6d..aeb9237 100644 --- a/lib/scidata/yelp_full_reviews.ex +++ b/lib/scidata/yelp_full_reviews.ex @@ -13,7 +13,7 @@ defmodule Scidata.YelpFullReviews do @doc """ Downloads the Yelp Reviews training dataset or fetches it locally. """ - @spec download() :: %{review: [binary(), ...], rating: 5 | 4 | 3 | 2 | 1} + @spec download() :: %{review: [binary(), ...], rating: [5 | 4 | 3 | 2 | 1]} def download(), do: download_dataset(:train) @doc """ @@ -21,7 +21,7 @@ defmodule Scidata.YelpFullReviews do """ @spec download_test() :: %{ review: [binary(), ...], - rating: 5 | 4 | 3 | 2 | 1 + rating: [5 | 4 | 3 | 2 | 1] } def download_test(), do: download_dataset(:test) diff --git a/lib/scidata/yelp_polarity_reviews.ex b/lib/scidata/yelp_polarity_reviews.ex index 0003459..2d3c657 100644 --- a/lib/scidata/yelp_polarity_reviews.ex +++ b/lib/scidata/yelp_polarity_reviews.ex @@ -13,7 +13,7 @@ defmodule Scidata.YelpPolarityReviews do @doc """ Downloads the Yelp Polarity Reviews training dataset or fetches it locally. """ - @spec download() :: %{review: [binary(), ...], sentiment: 1 | 0} + @spec download() :: %{review: [binary(), ...], sentiment: [1 | 0]} def download(), do: download_dataset(:train) @doc """ @@ -21,7 +21,7 @@ defmodule Scidata.YelpPolarityReviews do """ @spec download_test() :: %{ review: [binary(), ...], - sentiment: 1 | 0 + sentiment: [1 | 0] } def download_test(), do: download_dataset(:test) From 44c60765331c74212fdfa7c5af4471bb8aa14a0f Mon Sep 17 00:00:00 2001 From: Shubham Gupta Date: Sun, 5 Dec 2021 01:56:10 +0800 Subject: [PATCH 08/10] Remove CSV deps from mix.lock --- mix.lock | 2 -- 1 file changed, 2 deletions(-) diff --git a/mix.lock b/mix.lock index f76d6e3..f153f00 100644 --- a/mix.lock +++ b/mix.lock @@ -1,5 +1,4 @@ %{ - "csv": {:hex, :csv, "2.4.1", "50e32749953b6bf9818dbfed81cf1190e38cdf24f95891303108087486c5925e", [:mix], [{:parallel_stream, "~> 1.0.4", [hex: :parallel_stream, repo: "hexpm", optional: false]}], "hexpm", "54508938ac67e27966b10ef49606e3ad5995d665d7fc2688efb3eab1307c9079"}, "earmark_parser": {:hex, :earmark_parser, "1.4.13", "0c98163e7d04a15feb62000e1a891489feb29f3d10cb57d4f845c405852bbef8", [:mix], [], "hexpm", "d602c26af3a0af43d2f2645613f65841657ad6efc9f0e361c3b6c06b578214ba"}, "ex_doc": {:hex, :ex_doc, "0.24.2", "e4c26603830c1a2286dae45f4412a4d1980e1e89dc779fcd0181ed1d5a05c8d9", [:mix], [{:earmark_parser, "~> 1.4.0", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "e134e1d9e821b8d9e4244687fb2ace58d479b67b282de5158333b0d57c6fb7da"}, "makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"}, @@ -7,5 +6,4 @@ "makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"}, "nimble_csv": {:hex, :nimble_csv, "1.1.0", "b1dba4a86be9e03065c9de829050468e591f569100332db949e7ce71be0afc25", [:mix], [], "hexpm", "e986755bc302832cac429be6deda0fc9d82d3c82b47abefb68b3c17c9d949a3f"}, "nimble_parsec": {:hex, :nimble_parsec, "1.1.0", "3a6fca1550363552e54c216debb6a9e95bd8d32348938e13de5eda962c0d7f89", [:mix], [], "hexpm", "08eb32d66b706e913ff748f11694b17981c0b04a33ef470e33e11b3d3ac8f54b"}, - "parallel_stream": {:hex, :parallel_stream, "1.0.6", "b967be2b23f0f6787fab7ed681b4c45a215a81481fb62b01a5b750fa8f30f76c", [:mix], [], "hexpm", "639b2e8749e11b87b9eb42f2ad325d161c170b39b288ac8d04c4f31f8f0823eb"}, } From d7b8bddb23c8cd60ac12bc7942a4e9f021f55666 Mon Sep 17 00:00:00 2001 From: Shubham Gupta <14368181+goodhamgupta@users.noreply.github.com> Date: Tue, 7 Dec 2021 09:59:32 +0800 Subject: [PATCH 09/10] Update README.md Fix formatting Co-authored-by: Tom Rutten --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 846055e..c5cd74f 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Scidata currently supports the following training and test datasets: - CIFAR100 - FashionMNIST - IMDb Reviews -- Yelp Reviews(Full and Polarity) +- Yelp Reviews (Full and Polarity) - MNIST Download or fetch datasets locally: From 00823c2e8346591822e291205d24f8a129f71eb3 Mon Sep 17 00:00:00 2001 From: Shubham Gupta <14368181+goodhamgupta@users.noreply.github.com> Date: Tue, 7 Dec 2021 10:02:11 +0800 Subject: [PATCH 10/10] Update url for Yelp reviews dataset Co-authored-by: Tom Rutten --- lib/scidata/yelp_polarity_reviews.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/scidata/yelp_polarity_reviews.ex b/lib/scidata/yelp_polarity_reviews.ex index 2d3c657..7da9c3f 100644 --- a/lib/scidata/yelp_polarity_reviews.ex +++ b/lib/scidata/yelp_polarity_reviews.ex @@ -1,6 +1,6 @@ defmodule Scidata.YelpPolarityReviews do @moduledoc """ - Module for downloading the [Yelp Reviews dataset](https://www.yelp.com/dataset). + Module for downloading the [Yelp Polarity Reviews dataset](https://course.fast.ai/datasets#nlp). """ @base_url "https://s3.amazonaws.com/fast-ai-nlp/"