From 728b9d12bea2dddaf361db22110fa8888b1632e3 Mon Sep 17 00:00:00 2001 From: Vitalina Komashko Date: Fri, 23 Aug 2024 15:05:34 -0700 Subject: [PATCH] added notebooks and python module that outline generation of the synthetic data used for 2 AWS blogs on PyDeequ --- .../01-synthetic-data-electronics.ipynb | 2201 ++++++++++++ .../02-synthetic-data-jewelry.ipynb | 1700 +++++++++ .../03-synthetic-data-other-products.ipynb | 3194 +++++++++++++++++ .../review_generation_helpers.py | 506 +++ 4 files changed, 7601 insertions(+) create mode 100644 tutorials/synthetic_data/01-synthetic-data-electronics.ipynb create mode 100644 tutorials/synthetic_data/02-synthetic-data-jewelry.ipynb create mode 100644 tutorials/synthetic_data/03-synthetic-data-other-products.ipynb create mode 100644 tutorials/synthetic_data/review_generation_helpers.py diff --git a/tutorials/synthetic_data/01-synthetic-data-electronics.ipynb b/tutorials/synthetic_data/01-synthetic-data-electronics.ipynb new file mode 100644 index 0000000..5365c29 --- /dev/null +++ b/tutorials/synthetic_data/01-synthetic-data-electronics.ipynb @@ -0,0 +1,2201 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "201a7dc9-732c-4441-82c9-a0576f874f5e", + "metadata": {}, + "source": [ + "# Generation of synthetic reviews dataset for the AWS Blog \"Testing data quality at scale with PyDeequ\"\n", + "\n", + "This notebook outlines the steps to generate synthetic data for the AWS blog post [\"Testing data quality at scale with PyDeequ\"](https://aws.amazon.com/blogs/big-data/testing-data-quality-at-scale-with-pydeequ/) for the update of the blog published in June 2024. The new synthetic dataset replaces the original Amazon reviews dataset; however, it retains the characteristics necessary to demonstrate the features of the PyDeequ library. 
\n", + "\n", + "This synthetic dataset resides in the public S3 bucket: `s3://aws-bigdata-blog/generated_synthetic_reviews/data/product_category=Electronics/` as parquet files.\n", + "\n", + "Install [awswrangler](https://aws-sdk-pandas.readthedocs.io/en/stable/) and [essential_generators](https://pypi.org/project/essential-generators/):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39aede62", + "metadata": {}, + "outputs": [], + "source": [ + "!python3 -m pip install awswrangler==3.7.2\n", + "!python3 -m pip install essential_generators==1.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efa8bcb9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "import numpy as np\n", + "rng = np.random.default_rng(seed = 42)\n", + "\n", + "import awswrangler as wr\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from essential_generators import DocumentGenerator\n", + "\n", + "import random\n", + "random.seed(a = 42, version=2)\n", + "import string" + ] + }, + { + "cell_type": "markdown", + "id": "35e88014", + "metadata": {}, + "source": [ + "Supporting functions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "609b37a6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import review_generation_helpers as rgh" + ] + }, + { + "cell_type": "markdown", + "id": "1f0f3557-dec7-42bb-8652-cacd4796e2db", + "metadata": {}, + "source": [ + "## Engineer data columns" + ] + }, + { + "cell_type": "markdown", + "id": "28a42597-13d6-4b5b-8497-563c5a10e020", + "metadata": {}, + "source": [ + "### Generate marketplace, review_headline and review_body" + ] + }, + { + "cell_type": "markdown", + "id": "d7c7a097-661e-4889-8a55-82b51749b2a8", + "metadata": {}, + "source": [ + "To generate review titles/headlines and bodies we use [essential_generators](https://pypi.org/project/essential-generators/) module. 
This will create nonsensical sentences (for the titles) and paragraphs (for the review bodies)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "704b3901-4982-4e11-8d5d-a6f4cdebfce0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "gen = DocumentGenerator()\n", + "\n", + "template = {'marketplace': ['US', 'UK', 'DE', 'JP', 'FR', None, ''], \n", + " 'review_headline':'sentence', \n", + " 'review_body': 'paragraph'}\n", + "gen.set_template(template)\n", + "documents = gen.documents(3010972)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d1813b9-d294-4705-8792-e6c64927eb30", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "len(documents)" + ] + }, + { + "cell_type": "markdown", + "id": "4f845288-a7f0-4b13-bd70-6b9041bc2c54", + "metadata": {}, + "source": [ + "Review a few generated examples:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "345f996b-14d6-4b33-b17c-6e9ba4fa1f9f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "documents[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10a1306d-349f-4d04-bdc1-e09ddd34ee56", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "documents[100]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d44a75a-cd74-41c3-857a-36523786ca2c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "documents[10000]" + ] + }, + { + "cell_type": "markdown", + "id": "545995c5-716b-40fd-af8b-40873bc820cc", + "metadata": {}, + "source": [ + "Convert the json object to pandas DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24fb8ad5-62bb-4353-99ac-9bd11c74a156", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = pd.DataFrame(documents)\n", + "dat.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18ed554b-b84e-4f13-938d-319b625f016b", + "metadata": { 
+ "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"marketplace\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6eb0f8b6-d2f0-4c58-8365-9bf6dac3552f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat.shape" + ] + }, + { + "cell_type": "markdown", + "id": "8339ecc5-0458-450a-a529-39b982f9ef8a", + "metadata": { + "tags": [] + }, + "source": [ + "### Generate review years" + ] + }, + { + "cell_type": "markdown", + "id": "2d388808-9928-4fa8-ae6b-954f16868316", + "metadata": {}, + "source": [ + "We assume that each year brings more reviews for a successful retailer, therefore the values in year column will be drawn with weights according to the exponential distribution.\n", + "\n", + "Total number of reviews (and the rows in the dataset):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72dfb83f-55c4-4da4-8716-c4465d6dfb35", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "n = dat.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea337a7b-e15e-4626-94b1-73db52613400", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# array of years\n", + "years_range = np.arange(1996, 2017, 1)\n", + "\n", + "# generate the weights:\n", + "exp_weights = rng.exponential(1, size = len(years_range))\n", + "exp_weights.sort()\n", + "\n", + "# select the year according to the weights:\n", + "years = rng.choice(years_range, size = n, p = exp_weights/exp_weights.sum())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1a85589-5f7b-439a-b249-20f5f943ec56", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "np.unique(years, return_counts = True)" + ] + }, + { + "cell_type": "markdown", + "id": "8835d4aa-fb1c-4db1-9b61-a018fb755320", + "metadata": { + "tags": [] + }, + "source": [ + "The blog focuses on data quality. Introduce out-of-range years to be detected by PyDeequ checks." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03036fcc-2baa-4d14-b073-37a09a60f8f1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "k = np.where(years == 2002)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b8b0d04-03b2-4e56-807c-27b45bdbacc9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "years[k[0][0]] = 2202" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1de8a624-6886-444a-abec-fda755e913c8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "years[years == 2202]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e91886ba-cfc7-49fd-a528-9fb92800d4f2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "k = np.where(years == 1996)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73526bb2-ec7f-43c7-b0ae-1338834f294c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "years[[k[0][0], k[0][1]]] = 1696" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fd12e37-6be8-4f20-8d76-8ef41ef96e23", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "years[years == 1696]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d735b82-e2c5-4003-9b04-186e760f9621", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "k = np.where(years == 2001)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f33e343c-1d1c-4eaa-8641-6716f4dd9edd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "years[[k[0][0], k[0][30]]] = 2101" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52496546-6848-440a-83e3-e38f98a748b8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "years[years == 2101]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6a5d6c8-1286-4285-9cb6-72571d3caa3d", + "metadata": { + "tags": [] + 
}, + "outputs": [], + "source": [ + "years.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "481f33c4-78e2-4697-bb0c-f08d78e0ddf2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "np.unique(years, return_counts = True)" + ] + }, + { + "cell_type": "markdown", + "id": "4a33f8ff-078b-4973-ac13-5f56ec5d94cf", + "metadata": {}, + "source": [ + "### Generate review dates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db7ce977-6a09-416f-b0c0-711ea791053f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "review_year_date = rgh.generate_dates(rng, years)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c80417f-f3ab-4641-9102-280a92f86403", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "review_year_date" + ] + }, + { + "cell_type": "markdown", + "id": "04c49725-1ab7-4684-8d55-b5d6043032c6", + "metadata": {}, + "source": [ + "### Generate user ratings (star_rating)\n", + "\n", + "The original dataset used in the blog post had ratings between 1 and 5 stars, with an average rating of 4 stars, and 74.9% of reviews had the star rating 4 or higher (the last condition was defined in the blog [\"Test data quality at scale with Deequ\"](https://aws.amazon.com/blogs/big-data/test-data-quality-at-scale-with-deequ/)).\n", + "\n", + "To satisfy the requirements for the star rating statistics we found that the following distribution of the number of stars works: `array([1, 1, 3, 7, 8])` for 1, 2, 3, 4, and 5 stars respectively. This distribution was optimized on an array of length 20. We will use the numpy.repeat() function to generate an array of final length." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa9ea8b2-7a1e-40fe-90ad-d5dee6e9cdc4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "repeats = np.round(n/20)\n", + "repeats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63f59b0f-6e6b-4d7b-88bd-cb38281f95fb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "arr_sample = np.array([1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5])\n", + "ratings_all = np.repeat(arr_sample, repeats = repeats, axis = 0)\n", + "ratings_all.shape" + ] + }, + { + "cell_type": "markdown", + "id": "6d4abf20-54ad-45b0-912c-bdfd1c4218a9", + "metadata": {}, + "source": [ + "Checking that it satisfies the data statistics of the original dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bde10ae9-f29f-468a-ba89-47e147cc6724", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ratings_all.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c40cd19-8b64-4e0e-91ea-7e7c6803fefb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ratings_all[ratings_all >= 4].shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ab1cd17-fa5c-40e3-92b9-0232b70cf5a1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "2258235*100/3010980" + ] + }, + { + "cell_type": "markdown", + "id": "9ab834fe-60f3-41c6-a689-114edd4df765", + "metadata": {}, + "source": [ + "Remove 8 elements from the back of the array (a few 5 star ratings) to get the final count needed for the dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a039d985-9ccc-4287-a894-1964beb06522", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ratings = ratings_all[0:(len(ratings_all)-8)]\n", + "ratings.shape[0] == n" + ] + }, + { + "cell_type": "markdown", + "id": "991e058a-8fe7-4134-beb4-80ae3ca4a73c", + "metadata": {}, + 
"source": [ + "Check that the new array still satisfies the requirements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09fa7633-6bd5-41ca-8623-a2a1ee7041c1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ratings.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9da833c-25ba-4dd2-825a-91f6bb01155d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ratings[ratings >= 4].shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98819bb5-f1f3-4684-915e-7b797c170cc9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "2258227*100/n" + ] + }, + { + "cell_type": "markdown", + "id": "d9d4be8c-4f66-4887-a82d-72cdc5aa09dc", + "metadata": {}, + "source": [ + "Shuffle the ratings:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "633f6a62-0a57-4b02-9904-b646b7e8a5bf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rng.shuffle(ratings)\n", + "ratings[1:40]" + ] + }, + { + "cell_type": "markdown", + "id": "eda9092a-ce18-4b30-84a7-da065352e371", + "metadata": {}, + "source": [ + "### Generate helpful and total votes\n", + "\n", + "For these 2 columns we have experimented with the mean and the variance to get the target correlation (0.99365) based on the formula for correlation. We ensured that there are no negative votes. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "009ae73a-5103-400e-af91-fca83e35ef6a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cov = [[15, 8.59668777], [8.59668777, 5]]\n", + "mean = [20, 15]\n", + "total_votes, helpful_votes = rng.multivariate_normal(mean, cov, n).T\n", + "total_votes.shape[0] == n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "822ab827-e33f-4a81-97ce-10c95fb07895", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "helpful_votes.shape[0] == n" + ] + }, + { + "cell_type": "markdown", + "id": "cd9ff9a9-1077-4bb4-8359-897171bfd5b5", + "metadata": {}, + "source": [ + "Visualize total vs helpful votes:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b858dbf-92fa-4db8-ae29-c6e9447e63d5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "plt.scatter(total_votes, helpful_votes, alpha = 0.5)\n", + "plt.xlabel('total votes')\n", + "plt.ylabel('helpful votes')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "28416497-0d52-4ff9-8cf8-5e6a8ce1ee31", + "metadata": {}, + "source": [ + "Check the correlation numerically:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "429f6d20-1bae-42c6-a3ca-0af4c0414de8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "np.corrcoef(total_votes, helpful_votes)" + ] + }, + { + "cell_type": "markdown", + "id": "f4b02f23-b5ec-41c6-8b7a-372e90414189", + "metadata": {}, + "source": [ + "This is a sufficiently close result." 
+ ] + }, + { + "cell_type": "markdown", + "id": "40bc5fb2-e070-45a5-8509-7f31ef47cd00", + "metadata": {}, + "source": [ + "Total votes and star rating shouldn't be correlated:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23a745f7-66b2-4536-9e9a-041d1f8ab650", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "np.corrcoef(total_votes, ratings)" + ] + }, + { + "cell_type": "markdown", + "id": "61ae2fcf-07bf-40f8-a2ab-487752f0654e", + "metadata": {}, + "source": [ + "### Generate product ids and product titles\n", + "\n", + "Create product titles and add additional variability by adding prefixes (product descriptions) and suffixes (additional product features):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ff6e5ca-fb98-42e0-994c-af6d40079731", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# pool of product names, prefixes and suffixes from which we will generate the product titles\n", + "product_pool= [\"fax machine\", \"banknote counter\", \"electronic alarm clock\",\"electric pencil sharpener\", \n", + " \"blu-ray\", \"floor lamp\", \"hair dryer\", \"paper copier\", \"electric drill\", \"video camera\", \n", + " \"radio\", \"air purifier\", \"floor heater\", \"cd player\", \"iron\", \"kettle\", \"mp3 player\",\n", + " \"video player\", \"electric stove\", \"electric razor\", \"dvd\", \"curling iron\", \n", + " \"office printer\", \"wireless speaker\", \"kitchen scale\", \"theater receiver\", \"electronic cigarettes\", \n", + " \"computer\", \"television\", \"smartphone\", \"surge protector\", \"remote control\", \"headset\", \n", + " \"game controller\", \"cellular phone\", \"bluetooth speaker\"]\n", + "product_prefix_pool = [\"large\", \"red\", \"small\", \"orange\", \"green\", \"black\", \"silver\", \"yellow\", \"compact\", \n", + " \"energy-efficient\", \"vintage\", \"pink\", \"portable\", \"white\", \"metal\", \"stainless-steel\"]\n", + "product_suffix_pool = [\"newest 
model\", \"refurbished\", \"renewed\", \"1996 model\", \"with the storage case\", \n", + " \"charger included\", \"charger not included\", \"batteries not included\", \"waterproof\", \n", + " \"with adapter\", \"with wooden inlays\", \"with silver details\", \"with black handle\", \n", + " \"EU compatible\", \"Japan compatible\", \"US compatible\"]" + ] + }, + { + "cell_type": "markdown", + "id": "9bcfda45-ccc9-4b5a-8aac-8b6fc0884983", + "metadata": {}, + "source": [ + "Generate products using combinations of product names, prefixes and suffixes. Each product will then get a unique product id." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89c4e32b-ea72-43ee-a276-04089c2d1e1a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "products = rgh.generate_products(rng, [product_prefix_pool, product_pool, product_suffix_pool], n)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2800be78-7ab6-4dfb-b3dc-d85877691444", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "products" + ] + }, + { + "cell_type": "markdown", + "id": "a2633a16-10f9-42b7-a2f9-e3e315203b0f", + "metadata": {}, + "source": [ + "### Generate customer ids and insight\n", + "\n", + "Create a column 'insight' to indicate influential reviewers.\n", + "\n", + "Generate the following distribution of the number of reviews per customer:\n", + "\n", + "- 10% - 1:1 (each customer created 1 review)\n", + "- 15% - 2:1 (each customer created 2 reviews)\n", + "- 10% - 3:1 (each customer created 3 reviews)\n", + "- 10% - 4:1 (each customer created 4 reviews)\n", + "- 20% - 7:1 (each customer created 7 reviews)\n", + "- 35% - 26:1 (each customer created 26 reviews)\n", + "\n", + "Create more \"insightful\" reviews for customers who wrote more reviews. Insight is a Y/N field." 
+ ] + }, + { + "cell_type": "markdown", + "id": "895bcbb5-f67b-45aa-8a71-58d6ddf8fa81", + "metadata": {}, + "source": [ + "#### Generate customer_id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aedec858-b050-46ae-b985-07290ca3a476", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "temp = np.round(np.array([0.1, 0.15, 0.2])*n).astype(int)\n", + "temp" + ] + }, + { + "cell_type": "markdown", + "id": "4db85ce5-2671-4d7c-8ad0-502e886c82d0", + "metadata": {}, + "source": [ + "Experiment with different numbers of customers to obtain the distribution as outlined above. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff9fe35e-86a3-41b3-b3d8-b574f6f6f6eb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# 10% of the reviews come from a single customer (1 review from 1 customer)\n", + "r1 = 301097" + ] + }, + { + "cell_type": "markdown", + "id": "5145e5a6-c020-49f2-834a-fe9205a9ed31", + "metadata": {}, + "source": [ + "15% of reviews are expected to come from customers who wrote 2 reviews each:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c713d5d3-16cc-4804-a553-3f26fb7af883", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "451646/2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c73cd6c3-f599-4cc4-b8ed-4b36ad83a0cc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "r2 = 451646" + ] + }, + { + "cell_type": "markdown", + "id": "ae13a879-5fcd-4bbf-b3e6-edfd9141c754", + "metadata": {}, + "source": [ + "10% of reviews came from customers who have written 3 reviews each: use 301101." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5750105e-de4e-4c40-990a-f936e65234ef", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "(r1+4)/3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a54713c-2b26-4687-9469-5922ac51157b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "r3 = r1 + 4\n", + "r3" + ] + }, + { + "cell_type": "markdown", + "id": "613241bb-7435-4497-82f1-fe15c6f297c4", + "metadata": {}, + "source": [ + "10% reviews came from the customers who have written 4 reviews each, use 301100." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "972904f5-b7e2-4af3-9368-475a72757f84", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "(r1 + 3)/4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c2da659-90f3-47b9-ad52-aba9486e284d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "r4 = r1 +3 \n", + "r4" + ] + }, + { + "cell_type": "markdown", + "id": "616bca47-658c-4ec0-8c14-be2110acb53b", + "metadata": {}, + "source": [ + "20% of reviews came from the customers who have written 7 reviews each. Use 602196." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1213685f-4921-477a-a0de-e590c5dbbd57", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "(602194+2)/7" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efee8b80-6ad2-4f2d-a3a9-93795adb2db1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "r7 = 602194+2\n", + "r7" + ] + }, + { + "cell_type": "markdown", + "id": "e3b3f5a9-b391-4c90-b10a-09610342683a", + "metadata": {}, + "source": [ + "The rest of the reviews:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a08c11a-5737-40c2-871f-f324ed5f7e89", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "n - r1 - r2 - r3 - r4 - r7" + ] + }, + { + "cell_type": "markdown", + "id": "447e1a3f-7d26-4b17-a8f0-f3ae3b44b388", + "metadata": {}, + "source": [ + "26 reviews per customer gives us a round number of customers:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcca9845-8321-43e4-bed4-7c264c35b799", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "1053832/26" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb5c7d5b-6303-4845-9542-0064c6e30e03", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# these people have written 26 reviews\n", + "rmax = 1053832" + ] + }, + { + "cell_type": "markdown", + "id": "07a6251f-cdd4-45b7-872c-e5e5cb37f0c5", + "metadata": {}, + "source": [ + "Verify that the sum is still correct:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1e37b65-3c34-473d-a274-81df516f2035", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "(r1 + r2 + r3 + r4 + r7 + rmax) == n" + ] + }, + { + "cell_type": "markdown", + "id": "d8c070f0-8aba-4ed6-bc67-3b296f093890", + "metadata": {}, + "source": [ + "How many customers do we actually need?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cb2a96f-068f-4d31-b555-ed1d6df3d6f7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "c1, c2, c3, c4, c7, c26 = r1, int(r2/2), int(r3/3), int(r4/4), int(r7/7), int(rmax/26)\n", + "total_customers_needed = c1 + c2 + c3 + c4 + c7 + c26\n", + "total_customers_needed" + ] + }, + { + "cell_type": "markdown", + "id": "f6ff400e-ffbf-4737-ba0d-32c665145ab5", + "metadata": {}, + "source": [ + "Generate random customer ids and shuffle." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01008fb8-b754-47e9-bc32-e86fb8108a60", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "customer_ids = np.arange(100000, 100000 + total_customers_needed)\n", + "rng.shuffle(customer_ids)" + ] + }, + { + "cell_type": "markdown", + "id": "b42d1281-e07e-4fbd-8cd5-527855e62c9d", + "metadata": {}, + "source": [ + "Next, construct indices to split this array into:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9ce0674-fcdf-4472-a205-c76437b63cea", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "split_indices = [c1, \n", + " c1 + c2, \n", + " c1 + c2 + c3,\n", + " c1 + c2 + c3 + c4, \n", + " c1 + c2 + c3 + c4 + c7]\n", + "split_indices" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14b3aac1-e173-49d8-b038-783fdeec77eb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "customer_cohorts = np.split(customer_ids, split_indices)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f03767b-143b-4387-895a-a2a1850fcdee", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "len(customer_cohorts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b174f136-96e8-4965-b62b-b70edf8fa253", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "[len(x) for x in customer_cohorts]" + ] + }, + { + "cell_type": "markdown", + 
"id": "add9ccdd-53ce-4956-9679-f9d70979e7b8", + "metadata": {}, + "source": [ + "The code below completes generation of customer ids relative to the ratio of reviews as defined in the beginning of the section. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ba2d4c2-62a8-48ba-8d11-d62123de9276", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "customers = np.hstack([customer_cohorts[0], \n", + " np.repeat(customer_cohorts[1], 2),\n", + " np.repeat(customer_cohorts[2], 3),\n", + " np.repeat(customer_cohorts[3], 4),\n", + " np.repeat(customer_cohorts[4], 7),\n", + " np.repeat(customer_cohorts[5], 26)\n", + " ])\n", + "customers.shape[0] == n" + ] + }, + { + "cell_type": "markdown", + "id": "0fadfbd8-921c-48f9-a5be-872236842caf", + "metadata": {}, + "source": [ + "Next, we need to distribute 'insight' accordingly and shuffle." + ] + }, + { + "cell_type": "markdown", + "id": "a447f7dd-8d3e-4afb-92ba-f404746bbbdb", + "metadata": {}, + "source": [ + "#### Create vine = insight\n", + "\n", + "Customers with more reviews should have more Y than N in the vine = insight column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4c17d4a-86fd-406d-a0ce-b7edf2b27757", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "insight = np.hstack([rng.choice(['N'], r1), \n", + " rng.choice(['N'], r2),\n", + " rng.choice(['N'], r3),\n", + " rng.choice(['Y', 'N'], r4, p = [0.2, 0.8]),\n", + " rng.choice(['Y', 'N'], r7, p = [0.5, 0.5]),\n", + " rng.choice(['Y', 'N'], rmax, p = [0.9, 0.1])])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c82bf09-9508-466b-8b9c-44334ee63e69", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "insight.shape[0] == n" + ] + }, + { + "cell_type": "markdown", + "id": "ca6c7bf4-3514-44d7-9f38-911905a55df7", + "metadata": {}, + "source": [ + "### Associate customer ids with the indicator for insight and shuffle" + ] + }, + { + "cell_type": "markdown", + "id": "b80f5201-6d4a-4453-801f-c746fdebf76f", + "metadata": {}, + "source": [ + "Combine into a single array:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dce82615-d9b3-4200-9fff-d88a710fea2d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cust_insight = np.vstack([customers, insight])\n", + "cust_insight.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2bb8a8a-01ef-4e91-8edb-9d1b0847335e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ind = np.arange(cust_insight.shape[1])\n", + "ind[0:100]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7183a324-bf67-4100-9f5d-7a47c421911d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rng.shuffle(ind)\n", + "ind[0:100]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4de5b51-c2d8-4e77-9386-eef4ee2c0dfa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ind.max()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"8308da74-3020-4068-ad14-21abeeecd89a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ind.min()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a001bf3d-fe25-4a1c-8a71-f4ce5520509f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cust_insight_shuffled = cust_insight[:, ind]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d82e23d-8b38-4dea-8009-8e1d2c2c036f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cust_insight[:, 0:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1929d97d-9db6-4847-ad99-9601c8dc2354", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cust_insight_shuffled[:, 0:10]" + ] + }, + { + "cell_type": "markdown", + "id": "b5ef47bf-c832-49d5-acd2-b0b170482b2a", + "metadata": {}, + "source": [ + "### Generate review ids\n", + "\n", + "The review ids need to be mostly unique: the fraction of unique values should be 0.9926566948782706. Each review id is 15 characters long: the letter 'R' followed by 14 characters drawn from uppercase letters and digits. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7153dc5-5f33-4c24-8fa6-eb4facdfeac9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews_unique = ['R' + ''.join(random.choices(string.ascii_uppercase + string.digits, k=14)) for x in range(n)]\n", + "\n", + "len(reviews_unique)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df67375a-ec06-43ab-ae4c-f82dd90fa108", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews_unique[0:100]" + ] + }, + { + "cell_type": "markdown", + "id": "6c785c9a-dac5-4048-a382-e64acb94326f", + "metadata": {}, + "source": [ + "Verify uniqueness:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ad87899-bf21-4828-b24f-dcdd9ab40bbe", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "temp = set(reviews_unique)\n", + "len(temp)" + ] + }, + { + "cell_type": "markdown", + "id": "d74832e7-6ea2-47b8-a997-0cc25667235b", + "metadata": {}, + "source": [ + "Introduce duplicated ids:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "367e5509-260f-451e-bb4c-faed6c644d25", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "count_dup = (n - np.round(n*0.9926566948782706)).astype(int)\n", + "count_dup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9522d581-1f28-4317-9bee-f99af719d998", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews_unique[count_dup:count_dup*2] = reviews_unique[0:count_dup]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba2b7424-b0cc-448b-a5a6-f68ff75f7b1a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "len(set(reviews_unique))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e052210-4fab-4c75-a204-25547c73c219", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "2988862/n" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "2e632028-50a5-4936-adb0-bc7d75101918", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews_non_unique = np.array(reviews_unique)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6da9e131-45fa-4044-bc98-6aaa3801f7d5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rng.shuffle(reviews_non_unique)" + ] + }, + { + "cell_type": "markdown", + "id": "e1048305-cf49-4998-b2ab-75755a97683c", + "metadata": {}, + "source": [ + "## Assemble the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73d8a36f-73dd-44f1-a734-ab8d14e9fa72", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40e3fde5-bb6c-4dbd-b28b-afb64d3c62e8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"customer_id\"] = cust_insight_shuffled[0, :]\n", + "\n", + "dat[\"review_id\"] = reviews_non_unique\n", + "\n", + "dat[\"product_title\"] = products[0, :]\n", + "dat[\"product_id\"] = products[1, :]\n", + "\n", + "dat[\"star_rating\"] = ratings\n", + "\n", + "dat[\"helpful_votes\"] = helpful_votes.astype(\"int\")\n", + "dat[\"total_votes\"] = total_votes.astype(\"int\")\n", + "dat[\"insight\"] = cust_insight_shuffled[1, :]\n", + "dat[[\"review_year\", \"review_date\"]] = review_year_date\n", + "dat[\"review_year\"] = dat[\"review_year\"].astype(\"int\")\n", + "dat[\"review_date\"] = pd.to_datetime(dat[\"review_date\"])\n", + "dat[\"product_category\"] = \"Electronics\"\n", + "\n", + "dat.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b052d19", + "metadata": {}, + "outputs": [], + "source": [ + "s3_bucket_name = " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e941f1a3-6aac-4c00-881b-e7650754d9ab", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = 
dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_name,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols=['product_category']\n", + ")" + ] + } + ], + "metadata": { + "availableInstances": [ + { + "_defaultOrder": 0, + "_isFastLaunch": true, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 4, + "name": "ml.t3.medium", + "vcpuNum": 2 + }, + { + "_defaultOrder": 1, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 8, + "name": "ml.t3.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 2, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.t3.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 3, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.t3.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 4, + "_isFastLaunch": true, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 8, + "name": "ml.m5.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 5, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.m5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 6, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.m5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 7, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.m5.4xlarge", + 
"vcpuNum": 16 + }, + { + "_defaultOrder": 8, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": "ml.m5.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 9, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.m5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 10, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": "ml.m5.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 11, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 384, + "name": "ml.m5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 12, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 8, + "name": "ml.m5d.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 13, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.m5d.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 14, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.m5d.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 15, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.m5d.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 16, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": "ml.m5d.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 17, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.m5d.12xlarge", + "vcpuNum": 
48 + }, + { + "_defaultOrder": 18, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": "ml.m5d.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 19, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 384, + "name": "ml.m5d.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 20, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": true, + "memoryGiB": 0, + "name": "ml.geospatial.interactive", + "supportedImageNames": [ + "sagemaker-geospatial-v1-0" + ], + "vcpuNum": 0 + }, + { + "_defaultOrder": 21, + "_isFastLaunch": true, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 4, + "name": "ml.c5.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 22, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 8, + "name": "ml.c5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 23, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.c5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 24, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.c5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 25, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 72, + "name": "ml.c5.9xlarge", + "vcpuNum": 36 + }, + { + "_defaultOrder": 26, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 96, + "name": "ml.c5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 27, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": 
false, + "memoryGiB": 144, + "name": "ml.c5.18xlarge", + "vcpuNum": 72 + }, + { + "_defaultOrder": 28, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.c5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 29, + "_isFastLaunch": true, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.g4dn.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 30, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.g4dn.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 31, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.g4dn.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 32, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": "ml.g4dn.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 33, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.g4dn.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 34, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": "ml.g4dn.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 35, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 61, + "name": "ml.p3.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 36, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "hideHardwareSpecs": false, + "memoryGiB": 244, + "name": "ml.p3.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 37, + "_isFastLaunch": false, + "category": 
"Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 488, + "name": "ml.p3.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 38, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 768, + "name": "ml.p3dn.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 39, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.r5.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 40, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.r5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 41, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.r5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 42, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": "ml.r5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 43, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": "ml.r5.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 44, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 384, + "name": "ml.r5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 45, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 512, + "name": "ml.r5.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 46, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 768, + "name": "ml.r5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 47, + "_isFastLaunch": false, + 
"category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.g5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 48, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.g5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 49, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.g5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 50, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": "ml.g5.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 51, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": "ml.g5.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 52, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.g5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 53, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "hideHardwareSpecs": false, + "memoryGiB": 384, + "name": "ml.g5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 54, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 768, + "name": "ml.g5.48xlarge", + "vcpuNum": 192 + }, + { + "_defaultOrder": 55, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 1152, + "name": "ml.p4d.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 56, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 1152, + "name": "ml.p4de.24xlarge", + "vcpuNum": 96 + 
}, + { + "_defaultOrder": 57, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.trn1.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 58, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 512, + "name": "ml.trn1.32xlarge", + "vcpuNum": 128 + }, + { + "_defaultOrder": 59, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 512, + "name": "ml.trn1n.32xlarge", + "vcpuNum": 128 + } + ], + "instance_type": "ml.m5.2xlarge", + "kernelspec": { + "display_name": "Python 3 (Data Science 3.0)", + "language": "python", + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/sagemaker-data-science-310-v1" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/synthetic_data/02-synthetic-data-jewelry.ipynb b/tutorials/synthetic_data/02-synthetic-data-jewelry.ipynb new file mode 100644 index 0000000..c9ec851 --- /dev/null +++ b/tutorials/synthetic_data/02-synthetic-data-jewelry.ipynb @@ -0,0 +1,1700 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c5003099-9b9c-4ac5-b870-4adfed85c585", + "metadata": {}, + "source": [ + "# Generation of synthetic reviews dataset for the AWS Blog \"Monitor data quality in your data lake using PyDeequ and AWS Glue\"\n", + "\n", + "\n", + "This notebook outlines the steps to generate synthetic data for the AWS blog post [\"Monitor data quality in your data lake using PyDeequ and AWS Glue\"](https://aws.amazon.com/blogs/big-data/monitor-data-quality-in-your-data-lake-using-pydeequ-and-aws-glue/) for 
the blog update published in August 2024. The new synthetic dataset replaces the original Amazon reviews dataset; however, it retains characteristics necessary to demonstrate features of the PyDeequ library in AWS Glue. \n", + "\n", + "This synthetic dataset resides in the public S3 bucket: `s3://aws-bigdata-blog/generated_synthetic_reviews/data/product_category=Jewelry/` as parquet files.\n", + "\n", + "Install [awswrangler](https://aws-sdk-pandas.readthedocs.io/en/stable/) and [essential_generators](https://pypi.org/project/essential-generators/):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73801505", + "metadata": {}, + "outputs": [], + "source": [ + "!python3 -m pip install awswrangler==3.7.2\n", + "!python3 -m pip install essential_generators==1.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "035f589c-77f6-41aa-8fd0-dfe8bdeb213b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from matplotlib import ticker\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from scipy.linalg import cholesky\n", + "import awswrangler as wr\n", + "\n", + "from essential_generators import DocumentGenerator" + ] + }, + { + "cell_type": "markdown", + "id": "ab1a29dd-b954-418d-86e3-ce5f3640f9ac", + "metadata": {}, + "source": [ + "Import supporting functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfc21bc4-524e-4920-9171-f2cb33ef4c8a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import review_generation_helpers as rgh" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f62fe07-64cf-450b-a248-e5250aa1aeaa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rng = np.random.default_rng()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "882d4a33-5fe9-4033-b770-5c20e42f0735", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + 
"float_formatter = \"{:.4f}\".format\n", + "np.set_printoptions(formatter={'float_kind':float_formatter})" + ] + }, + { + "cell_type": "markdown", + "id": "814e08e0-2f7f-41b2-8f54-1042dfc461a3", + "metadata": {}, + "source": [ + "## Engineer data columns" + ] + }, + { + "cell_type": "markdown", + "id": "febda416-f4fc-41c0-826a-0e7655219f0c", + "metadata": { + "tags": [] + }, + "source": [ + "### Generate customer ids and insight\n", + "\n", + "Create a column 'insight' to indicate influential reviewers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ccb3d568-298b-44c1-8915-ab0e6269814b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cust_ins = rgh.create_customers_insight(rng)\n", + "cust_ins_ready = rgh.shuffle_customer_insight(rng, cust_ins)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0eade18-0a0a-497d-9ccf-449a536f046f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cust_ins" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "699dc442-4c16-4cda-8820-a0b079148efa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cust_ins_ready" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09d4d909-872f-4564-bed5-c3b65eb48433", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cust_ins_ready.shape" + ] + }, + { + "cell_type": "markdown", + "id": "17526ef2-4bcb-4cd7-bfa2-0c09fee9408f", + "metadata": {}, + "source": [ + "The number of reviews in this dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d0caf2a-a7b9-429b-83d5-21f3ba4ab1bd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "n = cust_ins_ready.shape[1]\n", + "n" + ] + }, + { + "cell_type": "markdown", + "id": "4113cece-ec4d-42f8-a649-82d2fe66050b", + "metadata": {}, + "source": [ + "### Generate total votes" + ] + }, + { + "cell_type": "markdown", + "id": "0b8afad5-48ab-4295-8004-1231cd8bacb2", + "metadata": {}, + 
"source": [ + "The code for generating seasonal semi-correlated total votes is based on the [PyMC notebook](https://www.pymc.io/projects/examples/en/latest/time_series/MvGaussianRandomWalk_demo.html).\n", + "\n", + "In the blog we investigate seasonal correlation in the monthly count of total reviews across 3 years. Since the number of product reviews is not divisible by 12, we take the closest number and then trim the result to match the size of the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd48a6c7-f8ca-4145-8caf-03f2eb98c39f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0795d6d-70aa-46e8-8347-75d24626d0ca", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "12*268734" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4955017d-45e4-4c97-8e2a-031304f29787", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "3224808/3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c7d0acd-c69f-46be-8f56-fcdff7fd0fe0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "D = 3 # Dimension of random walks = time series for each year\n", + "N = 1074936 # Number of steps = data points\n", + "sections = 12 # Number of sections = months\n", + "period = N / sections # Number steps in each section\n", + "\n", + "Sigma_alpha = rng.standard_normal((D, D))\n", + "Sigma_alpha = Sigma_alpha.T.dot(Sigma_alpha) # Construct covariance matrix for alpha\n", + "L_alpha = cholesky(Sigma_alpha, lower=True) # Obtain its Cholesky decomposition\n", + "\n", + "Sigma_beta = rng.standard_normal((D, D))\n", + "Sigma_beta = Sigma_beta.T.dot(Sigma_beta) # Construct covariance matrix for beta\n", + "L_beta = cholesky(Sigma_beta, lower=True) # Obtain its Cholesky decomposition\n", + "\n", + "# Gaussian random walks:\n", + "alpha = np.cumsum(L_alpha.dot(rng.standard_normal((D, 
sections))), axis=1).T\n", + "beta = np.cumsum(L_beta.dot(rng.standard_normal((D, sections))), axis=1).T\n", + "t = np.arange(N)[:, None] / N\n", + "alpha = np.repeat(alpha, period, axis=0)\n", + "beta = np.repeat(beta, period, axis=0)\n", + "\n", + "# Correlated series\n", + "sigma = 0.1\n", + "\n", + "# This is number of points (N) by 3 years array:\n", + "y = alpha + beta * t + sigma * rng.standard_normal((N, 1))" + ] + }, + { + "cell_type": "markdown", + "id": "4d62f94a-9a78-42ce-a37e-7c0396ea894f", + "metadata": {}, + "source": [ + "Since `y` represents votes, it can't be negative, so we take the absolute value. We also scale the values up by a factor of 10 to emulate realistic vote counts." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b8f9738-3dd6-481f-96c5-d3687f347b98", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "y.min()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be455f12-a13c-44d1-9496-c65790614d7a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "y.max()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec044934-8ab0-47d2-9ee8-bd29b0b65c77", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "total_votes = np.abs(np.round(y*10))\n", + "total_votes.max()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "344b3973-b553-46ba-97a4-b52e3e53d2d9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "total_votes.min()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26ee00a8-949f-4e83-98e7-3b1324668116", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "total_votes.shape" + ] + }, + { + "cell_type": "markdown", + "id": "6e9e26c6-c1ef-4b13-9318-45020a72fabc", + "metadata": {}, + "source": [ + "Plot the series:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9dd95200-00a3-447a-9c19-c667585a50fb", + "metadata": { + "tags": [] + }, + "outputs": [], + 
"source": [ + "plt.figure(figsize=(12, 5))\n", + "plt.plot(t, total_votes, \".\", markersize=2, label=(\"y_0 data\", \"y_1 data\", \"y_2 data\"))\n", + "plt.title(\"Three Correlated Series\")\n", + "plt.xlabel(\"Time\")\n", + "plt.legend()\n", + "plt.show();" + ] + }, + { + "cell_type": "markdown", + "id": "0bb2cccb-f0ed-4346-9038-21ff6d6abb9a", + "metadata": {}, + "source": [ + "#### Plot the sum of total votes by month and year" + ] + }, + { + "cell_type": "markdown", + "id": "aaf658a6-f4fa-424d-8d4a-2800afe2722c", + "metadata": {}, + "source": [ + "We need a figure to demonstrate monthly and yearly variability and correlation in the sum of total votes. Calculate the sum and generate the figure." + ] + }, + { + "cell_type": "markdown", + "id": "48546e3f-8208-4192-a3ec-0ca599daef49", + "metadata": {}, + "source": [ + "Prepare 1D array of the total votes:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e97b0ea-a386-46e8-8e08-50bbf149f694", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "total_votes_1D = np.reshape(total_votes, (total_votes.shape[0]*3,) , order='F')" + ] + }, + { + "cell_type": "markdown", + "id": "9464eff2-f9c5-4674-92bb-46cbfe29abb5", + "metadata": { + "tags": [] + }, + "source": [ + "Split the array by month to calculate monthly sum and plot:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e375abd-5aff-40f3-8a44-76b3e4bd7072", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "month_split = np.split(total_votes, list(range(0, N, int(N/12)))[1:], axis=0)" + ] + }, + { + "cell_type": "markdown", + "id": "c49b317b-9cda-43a2-9d13-c5381db87467", + "metadata": { + "tags": [] + }, + "source": [ + "Calculate the sum for each month/year:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab58886e-077d-4db7-a73f-baa2e0dfef7d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sum_month_year_list = [month_split[x].sum(axis = 0) 
for x in range(len(month_split))]\n", + "sum_month_year_list_array = np.array(sum_month_year_list)\n", + "sum_month_year_list_array.max()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2b316f7-dd07-45d0-93e2-e438321434d1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sum_month_year_list_array.min()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba6e428e-0f42-49e0-b9f6-4c09c2bccd8c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(layout='constrained')\n", + "ax.plot([\"Jan\", \"Feb\", \"Mar\", \"Apr\", \"May\", \"Jun\", \"Jul\", \"Aug\", \"Sep\", \"Oct\", \"Nov\", \"Dec\"], \n", + " sum_month_year_list_array, \n", + " marker = 's',\n", + " markersize=5, label=(\"2013\", \"2014\", \"2015\"))\n", + "ax.set_title(\"Total votes in jewelry review\")\n", + "ax.set_xlabel(\"Month\")\n", + "ax.set_ylabel(\"Total votes\")\n", + "ax.grid(True, which='major', color = 'lightgrey', alpha = 0.5)\n", + "ax.yaxis.set_major_formatter(ticker.StrMethodFormatter(\"{x:,.0f}\"))\n", + "ax.legend()\n", + "plt.savefig('total_votes_in_jewelry_review.jpg')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "72212e82-80a6-4677-bf6d-864ff5d40c8f", + "metadata": {}, + "source": [ + "### Generate review years\n", + "\n", + "In the blog the dataset has reviews from 3 years: 2013, 2014 and 2015:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14729c86-aefe-4bee-ae20-feb5be881ffd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "years = np.repeat([[2013, 2014, 2015]], total_votes.shape[0], axis = 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5c6c083-42c5-4e78-a424-802e2a41c86e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "years.shape" + ] + }, + { + "cell_type": "markdown", + "id": "d964eebd-5063-4583-8bf8-5046d714b9f7", + "metadata": {}, + "source": [ + "Reshape to 
a 1D array:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5087120-aad7-4c46-b3b8-38e8d04411a2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "years_1D = np.reshape(years, (years.shape[0]*3,) , order='F')" + ] + }, + { + "cell_type": "markdown", + "id": "2b466346-1fbb-42d7-928b-517058177ccf", + "metadata": {}, + "source": [ + "### Generate review dates" + ] + }, + { + "cell_type": "markdown", + "id": "320171c7-e9c5-455f-9da4-92fa7c119e79", + "metadata": {}, + "source": [ + "Generate the review dates in accordance with the periods used in the generation of the total votes. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6b0a51c-2f11-47bb-8a44-b867c4ec7225", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "review_dates = rgh.generate_dates_even_per_month(rng, [2013, 2014, 2015], days_per_month = int(N/12))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb9604a5-22b4-48ed-8314-3c5b97c2f7da", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "len(review_dates)" + ] + }, + { + "cell_type": "markdown", + "id": "2fffd7b7-e447-467b-b6fa-b8e49a78a1f5", + "metadata": {}, + "source": [ + "### Assemble years, review dates, total votes and trim to size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "089a02ee-2633-4a4c-b3be-df33fa224d04", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = pd.DataFrame({\"review_year\": years_1D, \"review_date\": review_dates, \"total_votes\": total_votes_1D})\n", + "dat.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5defd6f-4d56-4668-90c4-2de5c8382f3a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = dat.iloc[0:-2,]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6360dd0-d283-4267-81c5-9a1526e36ed6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat.shape" + ] 
+ }, + { + "cell_type": "code", + "execution_count": null, + "id": "79b7abf7-3017-4208-a912-f9ec75c45bb3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat.shape[0] == n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f451c4bb-7da2-4409-a137-810e75d6a2e7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat.head()" + ] + }, + { + "cell_type": "markdown", + "id": "1c1425ae-2021-4ea5-be8c-81225def4cd8", + "metadata": {}, + "source": [ + "Add customer id and the insight to the DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22aa2d81-4fd9-44ea-8a00-f71ef0eafde0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"customer_id\"] = cust_ins_ready[0, :]\n", + "dat[\"insight\"] = cust_ins_ready[1, :]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "990c7a2e-dfb8-4a7e-a665-536e481f7a82", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b0c9660-e657-498c-b26e-7b5c436e21a3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat.dtypes" + ] + }, + { + "cell_type": "markdown", + "id": "843259a1-ac1a-453a-b0af-cbdd3f3f4daf", + "metadata": {}, + "source": [ + "Correct the data types:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1d9874d-ff93-4fb8-9cb0-c505482bf02e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_year\"] = dat[\"review_year\"].astype(\"int\")\n", + "dat[\"review_date\"] = pd.to_datetime(dat[\"review_date\"])\n", + "dat[\"total_votes\"] = dat[\"total_votes\"].astype(\"int\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5aa4cba1-d3ff-40bb-969e-e57ffe98d5c7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat.dtypes" + ] + }, + { + "cell_type": "markdown", + "id": 
"388bef32-a4ce-45f3-8608-25e8c83136f5", + "metadata": {}, + "source": [ + "### Generate product ids and product titles\n", + "\n", + "Create product titles, add variability by adding prefixes (adjectives) and suffixes:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f10ce57-ae9f-4b7c-bcfe-31aa655b5757", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# pool of product names, prefixes and suffixes from which we will generate the product titles\n", + "product_pool= [\"earrings\", \"crown\", \"headband\",\"hairclip\", 'armlet', 'bracelet', \"cuff links\", \"ring\", \"pin\", \"brooch\",\n", + " \"buckle\", \"toe ring\", \"anklet\", \"amulet\", \"beads\", \"jewelry\", \"necklace\", \"pendant\", \"tie clip\"]\n", + "product_prefix_pool = [\"wire wrapped\", \"charm\", \"Italian\", \"friendship\", \"silver\", \"gold\", \"ametist\", \"coral\", \"silver-plated\", \"gold-plated\",\n", + " \"vintage\", \"unique\", \"cute\", \"adorable\", \"elegant\", \"designer\", \"championship\", \"class\", \"engagement\", \n", + " \"promise\", \"wedding\", \"art\", \"estate\", \"forever love heart\", \"jade\", \"pearl\", \"gold pearl\", \"silver pearl\",\n", + " \"imitation pearl\", \"black pearl\", \"white gold\", \"yellow gold\", \"white and yellow gold\", \"paw print\",\n", + " \"chunky\", \"gold dipped\"]\n", + "product_suffix_pool = [\"with natural stones\", \"with cubic zirconia\", \"with diamonds\", \"with princess cut-stones\", \"size adjustable\",\n", + " \"with custom engraving\", \"for women\", \"for men\", \"for men and women\", \"for couples\", \"unisex\", \"excellent gift\",\n", + " \"engagement gift\", \"best gift for friends\", \"best for Mother's day\", \"excellent gift for mother-in-law\", \n", + " \"set of 3\", \"for every day of the week\", \"with interchangeable stones\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02be0605-6017-4cae-afe5-e86c63085542", + "metadata": { + "tags": [] + }, + 
"outputs": [], + "source": [ + "products = rgh.generate_products(rng, [product_prefix_pool, product_pool, product_suffix_pool], n) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3a193f9-f401-43c1-961f-d3ec630bb9ff", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "products.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dff0b48-989c-4241-9c33-2242eae0ad3d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "products" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "636b27ba-0192-40b0-9e44-0d10d41360dc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"product_title\"] = products[0, :]\n", + "dat[\"product_id\"] = products[1, :]\n", + "dat.head()" + ] + }, + { + "cell_type": "markdown", + "id": "a10613dc-2ef7-425c-83b0-d622195c297b", + "metadata": {}, + "source": [ + "### Generate review titles and text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "585fa6ca-09a0-4e39-bc73-f809be6e673f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "gen = DocumentGenerator()\n", + "\n", + "template = {'review_headline':'sentence', \n", + " 'review_body': 'paragraph'}\n", + "gen.set_template(template)\n", + "documents = gen.documents(n)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1aa37840-7c9b-4963-9ad6-fd96571e93ea", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "len(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6bec4f2-60cd-4f21-98f3-181b8afe5d08", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "documents[0:3]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83a99f48-0c69-4938-a0ac-2d2ed4e2775e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews = pd.DataFrame(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "fbfd1605-156b-4638-9a7c-4db3c3108967", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews[\"review_headline\"]\n", + "dat[\"review_body\"] = reviews[\"review_body\"]" + ] + }, + { + "cell_type": "markdown", + "id": "4fd643d1-722b-42a5-919c-296cfb519b28", + "metadata": {}, + "source": [ + "### Generate marketplace codes\n", + "\n", + "Majority of the reviews have code \"US\" and ~1000 have code \"MX\"." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8502c9ce-1d0b-4946-91aa-bb12b20b600b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "marketplace = np.repeat([\"US\"], n)\n", + "marketplace.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c88b7ad-4e7f-4f6b-bd1e-c2046f5078f3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "random_subs = rng.choice(np.arange(n), 1000)\n", + "marketplace[random_subs] = \"MX\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85e63320-520b-47bb-b36f-0559317a7c2e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "np.unique(marketplace, return_counts = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c86abc6f-1e53-473d-a9c9-7dc9b9dff9c5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"marketplace\"] = marketplace" + ] + }, + { + "cell_type": "markdown", + "id": "7b47eeb9-db44-4a86-8e50-3a5375b70a6a", + "metadata": {}, + "source": [ + "### Generate review ids" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebf44f6d-5bc0-426e-96b2-66ee1e3459ee", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_id\"] = rgh.generate_random_review_id(n)" + ] + }, + { + "cell_type": "markdown", + "id": "10328041-889b-4341-b305-1415f6a4e58c", + "metadata": {}, + "source": [ + "### Generate star ratings\n", + "\n", + "Use exponential distribution:" + ] + }, 
+ { + "cell_type": "code", + "execution_count": null, + "id": "2a39612d-cf0a-4452-916d-33c2a5751a5d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "star_rating = np.array([1, 2, 3, 4, 5])\n", + "\n", + "dat[\"star_rating\"] = rgh.subset_array_exponential(rng, star_rating, n, scale = 0.9, sort = False)" + ] + }, + { + "cell_type": "markdown", + "id": "2e2514e8-6cf9-410b-9de9-64777daba006", + "metadata": {}, + "source": [ + "### Generate helpful votes\n", + "\n", + "Use exponential distribution:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a26f60e-c88e-48d9-843f-917bf77da8f0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "votes = np.arange(0, 55, 5)\n", + "\n", + "dat[\"helpful_votes\"] = rgh.subset_array_exponential(rng, votes, n, scale = 0.9, sort = False)\n", + "dat[\"helpful_votes\"] = dat[\"helpful_votes\"].astype(\"int\")\n", + "dat[\"star_rating\"] = dat[\"star_rating\"].astype(\"int\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df9e8fcf-f190-43d9-932c-ff70fdf0db69", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"product_category\"] = \"Jewelry\"" + ] + }, + { + "cell_type": "markdown", + "id": "bce38e2a-0e0d-4dd9-85b3-0fe0fe4d188a", + "metadata": {}, + "source": [ + "## Write the data to S3 in parquet format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4994a610", + "metadata": {}, + "outputs": [], + "source": [ + "s3_bucket_name = " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adda3b72", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_name,\n", + " 
dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols = ['product_category']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bae8fd48-c43f-4297-9842-d2001d6fd628", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "availableInstances": [ + { + "_defaultOrder": 0, + "_isFastLaunch": true, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 4, + "name": "ml.t3.medium", + "vcpuNum": 2 + }, + { + "_defaultOrder": 1, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 8, + "name": "ml.t3.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 2, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.t3.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 3, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.t3.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 4, + "_isFastLaunch": true, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 8, + "name": "ml.m5.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 5, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.m5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 6, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.m5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 7, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.m5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 8, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": 
false, + "memoryGiB": 128, + "name": "ml.m5.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 9, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.m5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 10, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": "ml.m5.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 11, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 384, + "name": "ml.m5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 12, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 8, + "name": "ml.m5d.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 13, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.m5d.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 14, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.m5d.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 15, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.m5d.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 16, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": "ml.m5d.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 17, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.m5d.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 18, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + 
"memoryGiB": 256, + "name": "ml.m5d.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 19, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 384, + "name": "ml.m5d.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 20, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": true, + "memoryGiB": 0, + "name": "ml.geospatial.interactive", + "supportedImageNames": [ + "sagemaker-geospatial-v1-0" + ], + "vcpuNum": 0 + }, + { + "_defaultOrder": 21, + "_isFastLaunch": true, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 4, + "name": "ml.c5.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 22, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 8, + "name": "ml.c5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 23, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.c5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 24, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.c5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 25, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 72, + "name": "ml.c5.9xlarge", + "vcpuNum": 36 + }, + { + "_defaultOrder": 26, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 96, + "name": "ml.c5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 27, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 144, + "name": "ml.c5.18xlarge", + "vcpuNum": 72 + }, + { + "_defaultOrder": 28, + "_isFastLaunch": false, + "category": 
"Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.c5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 29, + "_isFastLaunch": true, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.g4dn.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 30, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.g4dn.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 31, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.g4dn.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 32, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": "ml.g4dn.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 33, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.g4dn.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 34, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": "ml.g4dn.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 35, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 61, + "name": "ml.p3.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 36, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "hideHardwareSpecs": false, + "memoryGiB": 244, + "name": "ml.p3.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 37, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 488, + "name": "ml.p3.16xlarge", + "vcpuNum": 64 + }, + { + 
"_defaultOrder": 38, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 768, + "name": "ml.p3dn.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 39, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.r5.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 40, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.r5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 41, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.r5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 42, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": "ml.r5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 43, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": "ml.r5.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 44, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 384, + "name": "ml.r5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 45, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 512, + "name": "ml.r5.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 46, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 768, + "name": "ml.r5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 47, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.g5.xlarge", + "vcpuNum": 4 + }, + 
{ + "_defaultOrder": 48, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.g5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 49, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.g5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 50, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": "ml.g5.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 51, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": "ml.g5.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 52, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.g5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 53, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "hideHardwareSpecs": false, + "memoryGiB": 384, + "name": "ml.g5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 54, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 768, + "name": "ml.g5.48xlarge", + "vcpuNum": 192 + }, + { + "_defaultOrder": 55, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 1152, + "name": "ml.p4d.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 56, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 1152, + "name": "ml.p4de.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 57, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 0, + "hideHardwareSpecs": false, + 
"memoryGiB": 32, + "name": "ml.trn1.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 58, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 512, + "name": "ml.trn1.32xlarge", + "vcpuNum": 128 + }, + { + "_defaultOrder": 59, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 512, + "name": "ml.trn1n.32xlarge", + "vcpuNum": 128 + } + ], + "instance_type": "ml.m5.2xlarge", + "kernelspec": { + "display_name": "Python 3 (Data Science 3.0)", + "language": "python", + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/sagemaker-data-science-310-v1" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/synthetic_data/03-synthetic-data-other-products.ipynb b/tutorials/synthetic_data/03-synthetic-data-other-products.ipynb new file mode 100644 index 0000000..6a6aa36 --- /dev/null +++ b/tutorials/synthetic_data/03-synthetic-data-other-products.ipynb @@ -0,0 +1,3194 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "374c20bb-4eb0-4f01-9206-74389ca0d576", + "metadata": {}, + "source": [ + "# Generation of the synthetic reviews data for other product categories" + ] + }, + { + "cell_type": "markdown", + "id": "43566700-3db8-49f7-a070-9e02a0fef35e", + "metadata": {}, + "source": [ + "**DISCLAIMER:** This notebook is an extension of the synthetic product reviews generation for products in [Electronics](./01-synthetic-data-electronics.ipynb) and [Jewelry](./02-synthetic-data-jewelry.ipynb) to 18 other product categories. 
The data for these categories does not follow any temporal trends and there are no relationships between the variables. This data shouldn't be used for data science projects as it doesn't have any meaningful patterns (or these patterns are random or accidental). However, this dataset is sufficiently large to demonstrate 'big data' capabilities of services, databases and query engines.\n", + "\n", + "We generate the synthetic data for the following product categories:\n", + "\n", + "- Automotive\n", + "- Home_Kitchen\n", + "- Beauty_Personal_Care\n", + "- Apparel\n", + "- Video_Games\n", + "- Toys_Games\n", + "- Office_Products\n", + "- Pet_Supplies\n", + "- Sports_Outdoors\n", + "- Tools_Home_Improvement\n", + "- Garden_Outdoor\n", + "- Arts_Crafts_Sewing\n", + "- Health_Household\n", + "- Computers\n", + "- Books\n", + "- Music\n", + "- Movies_TV\n", + "- Grocery_Gourment_Food \n", + "\n", + "Product titles in the majority of cases were generated using prompts to Anthropic's Claude v3 Sonnet model using Amazon Bedrock. These prompts are recorded.\n", + "\n", + "Review titles and bodies have been generated separately using a helper function `review_generation_helpers.generate_review_headline_body()`. These were saved in an Amazon S3 bucket and subsequently read back to augment the data for each product category. \n", + "\n", + "The final size of the synthetic data is approximately 130 million rows. 
\n", + "\n", + "The dataset resides in the public S3 bucket: `s3://aws-bigdata-blog/generated_synthetic_reviews/data/` as parquet files partitioned by product_category as listed above.\n", + "\n", + "License: CC-BY-4.0\n", + "\n", + "Install [awswrangler](https://aws-sdk-pandas.readthedocs.io/en/stable/) and [essential_generators](https://pypi.org/project/essential-generators/):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d071f7e1", + "metadata": {}, + "outputs": [], + "source": [ + "!python3 -m pip install awswrangler==3.7.2\n", + "!python3 -m pip install essential_generators==1.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f91bdd1-446c-4049-b51d-cd22cb6ea934", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# custom module: review_generation_helpers.py\n", + "import review_generation_helpers as rgh\n", + "\n", + "import awswrangler as wr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f89a04f", + "metadata": {}, + "outputs": [], + "source": [ + "# private bucket with review titles and bodies\n", + "s3_bucket_text = \n", + "s3_bucket_output = " + ] + }, + { + "cell_type": "markdown", + "id": "061ca488-f6e7-4019-abc4-4a5eed98ac41", + "metadata": {}, + "source": [ + "## Automotive" + ] + }, + { + "cell_type": "markdown", + "id": "9e09c4f6-170a-4482-ba15-a6c1ed2c4983", + "metadata": { + "tags": [] + }, + "source": [ + "Prompt to Claude v3 Sonnet: \"Generate 50 products related to automotive. 
Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "315e6551-27c2-41b9-8954-f53ba1d7e9c0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_pool = ['tire', 'battery', 'engine', 'headlight', 'wiper', 'brake', 'muffler', 'mirror', 'radiator', \n", + " 'alternator', 'spark', 'plug', 'filter', 'bearing', 'belt', 'fuse', 'sensor', 'gasket', 'hose', \n", + " 'pump', 'piston', 'caliper', 'rotor', 'suspension', 'strut', 'bushing', 'control', 'arm', 'axle', \n", + " 'driveshaft', 'differential', 'transmission', 'clutch', 'flywheel', 'starter', 'ignition', 'coil', \n", + " 'distributor', 'carburetor', 'injector', 'valve', 'cylinder', 'turbocharger', 'supercharger', \n", + " 'catalytic', 'converter', 'manifold', 'thermostat', 'compressor']" + ] + }, + { + "cell_type": "markdown", + "id": "17a9d72e-92cc-4edb-8981-61b76a9b9045", + "metadata": {}, + "source": [ + "Prompt to Claude v3 Sonnet: \"Generate 50 automotive product characteristics. Each word must be an adjective. 
Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "502c29f9-6ddb-436f-9c08-505ec13b4b17", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_prefix_pool = ['durable', 'efficient', 'powerful', 'sleek', 'sophisticated', 'innovative', \n", + " 'eco-friendly', 'luxurious', 'sporty', 'advanced', 'spacious', 'versatile', 'ergonomic', \n", + " 'aerodynamic', 'stylish', 'comfortable', 'responsive', 'intuitive', 'intelligent', \n", + " 'rugged', 'agile', 'dynamic', 'premium', 'reliable', 'sturdy', 'silent', 'smooth', \n", + " 'swift', 'elegant', 'secure', 'customizable', 'futuristic', 'energy-efficient', \n", + " 'performance-oriented', 'technologically-advanced', 'user-friendly', \n", + " 'environmentally-conscious', 'cutting-edge', 'safety-focused', 'connected', \n", + " 'automated', 'handy', 'compact', 'maneuverable', 'sustainable', 'adaptable', 'refined', \n", + " 'robust', 'smart']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a210b459-f131-4e5f-bd16-d0a8d2690045", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# author generated\n", + "product_suffix_pool = [\"foldable\", \"collapsible\", \"expandable\", \"with slogans\", \"new\", \"refurbished\", \"replacement\", \"12x\",\n", + " \"new in box\", \"with dividers\", \"large capacity\", \"rechargeable\", \"for the trunk\", \"extra-large\", \"compact\", \n", + " \"4x\", \"for pets\", \"against pets\", \"for cars and homes\", \"for the roof\", \"for the interior\", \"with automatic sensor\", \n", + " \"high performance\", \"with charger\", \"with mirror\", \"with aerodinamic features\", \"secure\", \"with app\",\n", + " '2 years warranty']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c810186-8076-4bff-ac3d-b20b5d7ecb90", + "metadata": { + "tags": [] + }, + "outputs": 
[], + "source": [ + "dat = rgh.create_dataset(size_factor = 3, \n", + " total_votes = np.arange(0, 65, 5),\n", + " helpful_votes = np.arange(0, 32), \n", + " scale = 1, \n", + " review_years = np.arange(1996, 2017), \n", + " product_category = 'Automotive', \n", + " product_components = [product_prefix_pool, product_pool, product_suffix_pool],\n", + " marketplace_factor = 0.8)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "362a8e77-7f8a-41b5-abc7-885f43e0b09f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews = pd.read_parquet(s3_bucket_text + \n", + " '/review_body_headline/10d7317906cf4aa6845a8d7d66b0c651_0.snappy.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df77683d-ced3-4df4-b185-3f39f980a155", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews.iloc[0:dat.shape[0]][\"review_headline\"].array\n", + "dat[\"review_body\"] = reviews.iloc[0:dat.shape[0]][\"review_body\"].array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2dd92b6c-d6b0-4d57-a8a5-d6ff28748364", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_output,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols=['product_category']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "35ac2825-ac79-4059-83fa-e3e99b4ada69", + "metadata": {}, + "source": [ + "## Home_Kitchen\n", + "\n", + "Prompt: \"Generate 50 products related to home and kitchen. 
Provide the output as a comma separated list, \n", + "surround each word in single quotes, each word as lower case, each word is singular.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0087e37-4006-49d1-8b78-c3c61b844ded", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_pool = ['blender', 'saucepan', 'oven', 'spatula', 'chopping board', 'kettle', 'toaster', 'microwave', 'refrigerator',\n", + " 'dishwasher', 'coffee maker', 'food processor', 'mixer', 'frying pan', 'baking tray', 'pressure cooker', \n", + " 'slow cooker', 'knife set', 'cutting board', 'peeler', 'grater', 'strainer', 'whisk', 'ladle', 'tongs', \n", + " 'dish rack', 'oven mitt', 'kitchen towel', 'apron', 'trash can', 'storage container', 'jar', 'bottle', \n", + " 'bowl', 'plate', 'cup', 'glass', 'mug', 'fork', 'knife', 'spoon', 'napkin', 'coaster', 'placemat', \n", + " 'tablecloth', 'vase', 'candle holder', 'centerpiece']" + ] + }, + { + "cell_type": "markdown", + "id": "353f4d23-d25c-4711-9b50-966f545f8a7e", + "metadata": {}, + "source": [ + "Prompt: \"Generate 50 home and kitchen product characteristics. Each word must be an adjective. 
Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b353505-1d11-4258-8fc9-ed2dc82e4587", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_prefix_pool = ['durable', 'efficient', 'sleek', 'versatile', 'ergonomic', 'eco-friendly', 'compact', 'multifunctional', 'innovative', \n", + " 'stylish', 'user-friendly', 'dishwasher-safe', 'energy-saving', 'non-stick', 'portable', 'stainless', 'cordless', \n", + " 'rust-resistant', 'powerful', 'quiet', 'programmable', 'decorative', 'adjustable', 'insulated', 'collapsible', \n", + " 'lightweight', 'lockable', 'spill-proof', 'automated', 'hygienic', 'microwave-safe', 'retractable', 'wireless', \n", + " 'leak-proof', 'robust', 'customizable', 'washable', 'scratch-resistant', 'biodegradable', 'tamper-proof', \n", + " 'heat-resistant', 'space-saving', 'child-proof', 'soundproof', 'breathable', 'waterproof', 'odor-free', 'shatterproof']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39c0a1bf-052b-4dfa-a887-7dde4c015296", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# author generated\n", + "product_suffix_pool = [\"a perfect gift\", \"santa's little helper\", \"with cord storage\", \"with beautiful print\",\n", + " \"for a beautiful home\", \"for a well-appointed kitchen\", \"long-lasting\", \"with ergonimic handle\", \"latest model\",\n", + " \"quick ship\", \"set of 12\", '2 years warranty', '1 year warranty']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cd92406-1c5a-4972-a167-f7ad6272eb9a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = rgh.create_dataset(size_factor = 3, \n", + " total_votes = np.arange(0, 100, 2),\n", + " helpful_votes = np.arange(0, 52), \n", + " scale = 0.7, \n", + " review_years = np.arange(1997, 2018), \n", + " 
product_category = 'Home_Kitchen', \n", + " product_components = [product_prefix_pool, product_pool, product_suffix_pool],\n", + " marketplace_factor = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b492f5d1-d52f-4e8e-917c-cfc0874b64da", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews = pd.read_parquet(s3_bucket_text+\n", + " '/review_body_headline/10d7317906cf4aa6845a8d7d66b0c651_1.snappy.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3606843c-c7a2-41fe-9fec-7f880f93ae99", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews.iloc[0:dat.shape[0]][\"review_headline\"].array\n", + "dat[\"review_body\"] = reviews.iloc[0:dat.shape[0]][\"review_body\"].array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65747f3d-bf6c-4d7d-a1bf-de7451f785e2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_output,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols = ['product_category']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8cd907f2-c708-429f-9077-45556c0d252c", + "metadata": {}, + "source": [ + "## Beauty_Personal_Care\n", + "\n", + "Prompt: \"Generate 50 products related to beauty and personal care. 
Provide the output as a comma separated list, \n", + "surround each word in single quotes, each word as lower case, each word is singular.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ef64daf-9ad1-46ee-b25d-84068e5d586a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_pool = ['lipstick', 'mascara', 'eyeliner', 'foundation', 'concealer', 'blush', 'bronzer', 'highlighter', \n", + " 'eyeshadow', 'lip gloss', 'lip balm', 'face powder', 'face serum', 'face cream', 'face mask', \n", + " 'face scrub', 'face toner', 'face cleanser', 'body lotion', 'body butter', 'body scrub', \n", + " 'body wash', 'shampoo', 'conditioner', 'hair oil', 'hair serum', 'hair spray', 'hair gel', \n", + " 'hair mousse', 'hair dye', 'nail polish', 'nail polish remover', 'nail file', 'nail clipper', \n", + " 'tweezers', 'razor', 'shaving cream', 'aftershave', 'deodorant', 'perfume', 'cologne', 'bath bomb', \n", + " 'bath salt', 'loofah', 'toothbrush', 'toothpaste', 'mouthwash', 'floss']" + ] + }, + { + "cell_type": "markdown", + "id": "164fd28e-0a33-43c9-9512-c2b80d6ec4f4", + "metadata": {}, + "source": [ + "Prompt: \"Generate 50 beauty and personal care product characteristics. Each word must be an adjective. 
Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "462c6163-e0e7-4279-874c-57469b6c5fad", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_prefix_pool = ['radiant', 'nourishing', 'rejuvenating', 'silky', 'luminous', 'hydrating', 'soothing', \n", + " 'revitalizing', 'purifying', 'luxurious', 'anti-aging', 'botanical', 'gentle', 'organic', \n", + " 'natural', 'enriching', 'firming', 'smoothing', 'antioxidant', 'protecting', 'repairing', \n", + " 'clarifying', 'refining', 'toning', 'balancing', 'reviving', 'energizing', 'brightening', \n", + " 'lifting', 'anti-wrinkle', 'moisturizing', 'detoxifying', 'invigorating', 'regenerating', \n", + " 'refreshing', 'softening', 'volumizing', 'conditioning', 'replenishing', 'soothing', 'illuminating', \n", + " 'mattifying', 'nurturing', 'therapeutic', 'pampering', 'calming', 'vitalizing', 'renewing', 'polishing']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ec5ea3f-f328-4b18-bc05-7f7d44b18ea4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# author generated\n", + "product_suffix_pool = [\"in renewable packaging\", \"in recyclable packaging\", \"two-pack\", \"for the whole family\", \"good for sensitive skin\",\n", + " \"appropriate for all skin types\", \"16oz jar\", \"8oz jar\", \"200 ml\", \"appropriate for any skin tone\", \"appropriate for any hair type\",\n", + " \"gentle smell\", \"no artificial components\", \"no artificial colors\", \"no artificial fragrances\", \"refreshing aroma\", \"refreshing smell\", \n", + " \"long-lasting\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ae050a1-2ed5-433b-898b-80c58068d5bf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = rgh.create_dataset(size_factor = 1, \n", + " total_votes = np.arange(0, 150),\n", 
+ " helpful_votes = np.arange(0, 60), \n", + " scale = 0.5, \n", + " review_years = np.arange(1998, 2015), \n", + " product_category = 'Beauty_Personal_Care', \n", + " product_components = [product_prefix_pool, product_pool, product_suffix_pool],\n", + " marketplace_factor = 0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba2a1837-43be-4bd9-aed2-b15a80b5bedf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews = pd.read_parquet(s3_bucket_text + \n", + " '/review_body_headline/10d7317906cf4aa6845a8d7d66b0c651_2.snappy.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7ff9274-2d5c-4702-bcfd-589015e1b2dc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "row_marker = dat.shape[0]\n", + "row_marker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42af9241-d2dc-4c18-8807-a0030ca5790f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews.iloc[0:dat.shape[0]][\"review_headline\"].array\n", + "dat[\"review_body\"] = reviews.iloc[0:dat.shape[0]][\"review_body\"].array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7058c397-c027-451e-92d5-37f2c32cbb28", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_output,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols = ['product_category']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "94f530e4-6b08-40cf-8743-c6f2dd5dc9d7", + "metadata": {}, + "source": [ + "## Apparel\n", + "\n", + "Prompt: \"Generate 50 apparel items. 
Provide the output as a comma separated list, \n", + "surround each word in single quotes, each word as lower case, each word is singular.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6dae3276-ec7e-438f-9f9f-b8ce0ba5bcb0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_pool = ['shirt', 'blouse', 'dress', 'skirt', 'pants', 'jeans', 'shorts', 'jacket', 'coat', 'sweater', \n", + " 'cardigan', 'vest', 't-shirt', 'tank', 'camisole', 'hoodie', 'sweatshirt', 'blazer', 'suit', \n", + " 'robe', 'gown', 'jumpsuit', 'romper', 'leggings', 'tights', 'stockings', 'socks', 'shoes', 'boots', \n", + " 'sandals', 'slippers', 'flip-flops', 'sneakers', 'heels', 'flats', 'hat', 'cap', 'beanie', 'scarf', \n", + " 'gloves', 'mittens', 'belt', 'tie', 'bowtie', 'necklace', 'bracelet', 'ring', 'earrings', 'watch']" + ] + }, + { + "cell_type": "markdown", + "id": "66397589-e07f-45a3-b24f-6e873ae19928", + "metadata": {}, + "source": [ + "Prompt: \"Generate 50 apparel characteristics. Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\". The author then provided modifications." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78b22ffd-8248-496e-8440-9ae5782aefeb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_prefix_pool = ['multi-colored', 'modern pattern', 'pleasing texture', 'made with natural materials', \n", + " 'modern fit for men and women', 'in the latest style', \n", + " 'floor length', 'short sleeve', 'long-sleeve', 'flat collar', 'no collar', 'round neckline', \n", + " 'with embellishments', 'slimming silhouette', 'in fun prints!', 'heavy weight fabric', 'long lasting material',\n", + " 'drape', 'sheerness', 'stretch', 'opacity', 'high breathability',\n", + " 'very washable fabric', 'with wrinkle-resistant properties', 'insulation for low temperatures', \n", + " 'moisture-wicking', 'odor-resistance in any circumstances', 'sun-protection', 'warm and cozy', \n", + " 'unbelievable softness', 'with high sheen', 'shimmering fabric', 'matte fabric', 'shiny fabric', 'rugged construction', \n", + " 'with delicate lace', 'casual and comfy style', 'business formal', 'vintage', 'contemporary', \n", + " 'classic style', 'trendy lines', 'feminine or masculine']" + ] + }, + { + "cell_type": "markdown", + "id": "ed2b76f4-ca49-4ba4-9e0b-0955424bc044", + "metadata": {}, + "source": [ + "Prompt for the suffixes: \"Generate 50 apparel characteristics. Each word must be an adjective. Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0685c13-16da-43fe-b2f9-2010246ffdcc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_suffix_pool = ['slim','sleek','loose','baggy','tight','flowy','sheer','opaque','vibrant','muted','patterned','solid',\n", + " 'striped','polka-dotted','floral','abstract','plain','faded','distressed','embroidered','sequined',\n", + " 'beaded','fringed','pleated','ruffled','belted','layered','oversized','cropped','cinched','tapered',\n", + " 'asymmetric','structured','draped','knitted','woven','lace','mesh','crocheted','denim','leather','suede',\n", + " 'satin','silk','cotton','linen']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a20067e7-b440-4be4-91c3-975b91a4451d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = rgh.create_dataset(size_factor = 2, \n", + " total_votes = np.arange(0, 40),\n", + " helpful_votes = np.arange(0, 38), \n", + " scale = 0.9, \n", + " review_years = np.arange(2000, 2017), \n", + " product_category = 'Apparel', \n", + " product_components = [product_prefix_pool, product_pool, product_suffix_pool],\n", + " marketplace_factor = 1)" + ] + }, + { + "cell_type": "markdown", + "id": "7938ff71-fc26-4b87-ab89-c71c13e18a8a", + "metadata": {}, + "source": [ + "Use remaining rows from the last reviews file:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a437e953-587c-4754-9e9e-40acc40e082d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews.iloc[row_marker:(row_marker + dat.shape[0])][\"review_headline\"].array\n", + "dat[\"review_body\"] = reviews.iloc[row_marker:(row_marker + dat.shape[0])][\"review_body\"].array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc4dfae7-c0ea-4acd-a1e4-2918f3e996b2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = 
dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_output,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols = ['product_category']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "dfcf9488-c122-4832-8023-ae59e777f212", + "metadata": {}, + "source": [ + "## Video_Games" + ] + }, + { + "cell_type": "markdown", + "id": "4a5ae0e9-589f-47e3-9aeb-4463bffd0c38", + "metadata": {}, + "source": [ + "Prompt: \"Generate 50 products related to video games. Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\". The author then provided modifications." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4682f240-04df-4845-a0ca-22970357cac9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_pool = ['video game', 'game controller', 'console', 'controller', 'joystick', 'headset', 'mouse', 'keyboard', 'monitor', \n", + " 'speaker', 'microphone', 'webcam', 'chair', 'desk', 'laptop', 'graphics card', 'memory card', 'processor', \n", + " 'motherboard', 'memory', 'storage', 'hard-drive', 'drive', 'power', 'supply', 'cooling fan', 'fan', 'case', \n", + " 'accessory', 'cable', 'software', 'antivirus protection', 'antivirus software', 'firewall', 'peripheral', 'printer', \n", + " 'scanner', 'platform', 'lighting', 'lamp', 'touch panel', 'headphones', 'docking station',\n", + " 'subscription service', 'merchandise', 'figurine', 'plushie', 'apparel', 'poster', 'book', 'guide', 'walkthrough sheet', \n", + " 'strategy walkthrough', 'cheat-sheet', 'fidget spinner', 'sweatband']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6470d623-3804-43bf-94b1-0d01a9482fe1", + "metadata": 
{ + "tags": [] + }, + "outputs": [], + "source": [ + "product_prefix_pool = ['ergonomic', 'gaming', 'large', 'fastest', 'cooling', 'stress relief', '', 'wearable', 'physical', 'virtual',\n", + " 'computer', 'single', 'streaming', 'board', 'play', 'flat' ]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eef327a9-f212-42a8-a5ca-2569a3af9707", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# author generated\n", + "product_suffix_pool = ['alleviates carpal tunnel syndrome', 'with thumb support', 'with elbow support', \n", + " 'in all colors', 'with large capacity', 'newest version', 'set', 'mount', 'adapter', \n", + " 'can be rotated in any direction', 'peripheral set']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e5079f6-0d4d-453a-92eb-ae3b752cfa47", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = rgh.create_dataset(size_factor = 1, \n", + " total_votes = np.arange(0, 87, 2),\n", + " helpful_votes = np.arange(0, 54, 3), \n", + " scale = 0.7, \n", + " review_years = np.arange(1999, 2018), \n", + " product_category = 'Video_Games', \n", + " product_components = [product_prefix_pool, product_pool, product_suffix_pool],\n", + " marketplace_factor = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efb79700-9063-4ea0-96e0-a128cd12a545", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews = pd.read_parquet(s3_bucket_text + \n", + " '/review_body_headline/7b80671e92144a27bcc4ea59e656e543_0.snappy.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba57ea70-dbe0-4b23-9383-78f2fefc0fa4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews.iloc[0:dat.shape[0]][\"review_headline\"].array\n", + "dat[\"review_body\"] = reviews.iloc[0:dat.shape[0]][\"review_body\"].array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"531aa3b9-107b-4f9c-aaf0-0c9b9a583dd8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_output,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols = ['product_category']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f72ad4a-f5ce-47f9-8398-d1563d7935b0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "row_marker = dat.shape[0]" + ] + }, + { + "cell_type": "markdown", + "id": "3e8d3869-0131-41d0-9451-1bd95e83adcd", + "metadata": {}, + "source": [ + "## Toys_Games\n", + "\n", + "Prompt: \"Generate 50 toys and games products. Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9cfce6e-8c5e-444f-93f7-9ec5a118f31e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_pool = ['doll', 'puzzle', 'action figure', 'board game', 'stuffed animal', 'video game', 'lego set', 'card game', \n", + " 'toy car', 'art supply', 'ball', 'playset', 'toy robot', 'kite', 'jump rope', 'musical instrument', \n", + " 'building block', 'science kit', 'puzzle cube', 'toy truck', 'coloring book', 'playing card', 'frisbee', \n", + " 'yo-yo', 'bubble wand', 'remote control car', 'chalk', 'play dough', 'jigsaw puzzle', 'plush toy', \n", + " 'educational game', 'toy train', 'sticker book', 'craft kit', 'memory game', 'model kit', 'playground ball', \n", + " 'spinning top', 'marionette', 'kaleidoscope', 'dress-up costume', 'puppet', 'play tent', 'magic set', \n", + " 'toy laptop', 'pretend play set', 'juggling 
ball', 'sports equipment', 'musical toy']" + ] + }, + { + "cell_type": "markdown", + "id": "f04a92bb-ca80-41d6-a124-681b2da48159", + "metadata": {}, + "source": [ + "Prompt: \"Generate 50 words related to toys and games products. Each word must be an adjective. Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5465e856-5adb-46ae-8de4-a888c7290d8e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_prefix_pool = ['educational','fun','interactive','colorful','wooden','plastic','electronic','durable','safe',\n", + " 'entertaining','stimulating','creative','imaginative','developmental','engaging','challenging',\n", + " 'collectible','lifelike','plush','cuddly','sturdy','eco-friendly','vibrant','musical','artistic',\n", + " 'construction','coding','virtual','augmented','puzzling','competitive','strategic','collaborative',\n", + " 'skill-building','themed','licensed','nostalgic','retro','innovative','compact','portable','versatile',\n", + " 'customizable','programmable','robotic','battery-powered','motion-activated']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcd4f402-8096-4f85-a4eb-70f30af0ad39", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# author generated\n", + "product_suffix_pool = ['unisex', 'for boys and girls', 'for kids and adults', 'for ages 0-1', 'for ages 1 to 3', \n", + " 'for ages 3 to 7', 'for ages 8 to 13', 'for ages 13+', 'for adults', 'for stress relief', \n", + " 'battery operated', 'in box', 'with a storage box', 'perfect birthday gift', 'for little girls', '10 pieces',\n", + " 'extra large', 'extra pieces included', 'for all ages', 'set of 2', 'pastel colors', 'full activity set']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3832fa1-73ff-41f0-a6ee-7780408807b3", + "metadata": { + "tags": [] 
+ }, + "outputs": [], + "source": [ + "dat = rgh.create_dataset(size_factor = 2, \n", + " total_votes = np.arange(0, 20),\n", + " helpful_votes = np.arange(0, 18), \n", + " scale = 0.6, \n", + " review_years = np.arange(2000, 2015), \n", + " product_category = 'Toys_Games', \n", + " product_components = [product_prefix_pool, product_pool, product_suffix_pool],\n", + " marketplace_factor = 0.5)" + ] + }, + { + "cell_type": "markdown", + "id": "9dbb7e59-35af-49cb-aaab-19045df624f0", + "metadata": {}, + "source": [ + "Use remaining rows from the last reviews file:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8f51289-931c-4e27-ac95-8602f769972e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews.iloc[row_marker:(row_marker + dat.shape[0])][\"review_headline\"].array\n", + "dat[\"review_body\"] = reviews.iloc[row_marker:(row_marker + dat.shape[0])][\"review_body\"].array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5537acbd-3ff2-443b-89ff-de35ccaca4fb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_output,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols = ['product_category']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "0c7cc8c0-0bb5-4afb-8853-d714a3bfc227", + "metadata": {}, + "source": [ + "## Office_Products\n", + "\n", + "Prompt: \"Generate 50 office products. 
Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3aa1bb6b-dc11-4b97-bcf8-b63e65c4130f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_pool = ['pen', 'pencil', 'eraser', 'ruler', 'stapler', 'scissors', 'paper', 'folder', 'clipboard', 'binder', 'tape', \n", + " 'glue', 'notebook', 'marker', 'highlighter', 'envelope', 'label', 'stamp', 'calendar', 'calculator', 'desk', \n", + " 'chair', 'lamp', 'whiteboard', 'projector', 'printer', 'scanner', 'shredder', 'telephone', 'headset', 'monitor', \n", + " 'keyboard', 'mouse', 'laptop', 'tablet', 'dock', 'cable', 'adapter', 'charger', 'battery', 'fan', 'humidifier', \n", + " 'dehumidifier', 'clock', 'planner', 'organizer', 'cup', 'mug']" + ] + }, + { + "cell_type": "markdown", + "id": "e01c503a-be87-462c-b879-ec585739718c", + "metadata": {}, + "source": [ + "Prompt: \"Generate 50 words related to office products. Each word must be an adjective. 
Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81cc8cdf-7079-4bc7-9bc3-306c4e37579d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_prefix_pool = ['ergonomic','comfortable','multifunctional','compact','durable','affordable','versatile','efficient',\n", + " 'eco-friendly','stylish','portable','adjustable','wireless','noise-canceling','premium','innovative',\n", + " 'sturdy','sleek','secure','paperless','spacious','customizable','integrated','organized','lightweight',\n", + " 'programmable','user-friendly','modern','modular','automated','energy-saving','hygienic','smart',\n", + " 'flexible','intuitive','collaborative','productive','clutter-free','convenient','mobile','sophisticated',\n", + " 'minimalist','elegant','digital','streamlined','reliable','intuitive']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b2af3d8-258b-4fc7-8115-e31793b77601", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# author generated\n", + "product_suffix_pool = ['makes a great gift for boss', 'best manager gift', 'excellent employee gift', 'can be modified with your logo',\n", + " '5 boxes', 'box of 3', 'box of 10', 'box of 5', 'set of 10', 'set of 2', 'set of 20', 'elegant gift', '50-count',\n", + " '100-count', '200-count', \n", + " 'with your custom message', 'recyclable', 'made in the USA', 'non-slip', 'large capacity', 'comfortable grip',\n", + " 'soft grip', 'one touch', 'with remote', 'assorted colors', '96 color bulk pack', 'for adults and kids', \n", + " 'non-toxic', 'color-coordinated']\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eeeda4be-b8db-4a2e-8a6d-6aff5e32c76d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = rgh.create_dataset(size_factor = 3, \n", + " total_votes = np.arange(2, 
44, 2),\n", + " helpful_votes = np.arange(0, 32), \n", + " scale = 0.4, \n", + " review_years = np.arange(1995, 2014), \n", + " product_category = 'Office_Products', \n", + " product_components = [product_prefix_pool, product_pool, product_suffix_pool],\n", + " marketplace_factor = 0.4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "229c009f-ca0a-4817-8d63-30b6399267a2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews = pd.read_parquet(s3_bucket_text + \n", + " '/review_body_headline/7b80671e92144a27bcc4ea59e656e543_1.snappy.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9136ae6c-86c1-4b8a-b13c-8bbf328da9a1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews.iloc[0:dat.shape[0]][\"review_headline\"].array\n", + "dat[\"review_body\"] = reviews.iloc[0:dat.shape[0]][\"review_body\"].array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6468cd23-80be-4743-a1bf-56499557bb5c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_output,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols = ['product_category']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6024e8d0-9071-4065-9b79-d268e7f12f29", + "metadata": {}, + "source": [ + "## Pet_Supplies\n", + "\n", + "Prompt: \"Generate 50 pet supplies related products. Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\". The author then provided modifications." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce6c801d-8f55-47fe-9faa-16c5990eb44e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_pool = ['leash', 'collar', 'bed', 'toy', 'bowl', 'treat', 'brush', 'crate', 'shampoo', 'conditioner', 'harness',\n", + " 'sweater', 'carrier', 'litter', 'box', 'scratching pad', 'scratching post', 'aquarium', 'filter', 'food', 'dispenser',\n", + " 'water', 'fountain', 'grooming kit', 'nail clipper', 'vitamin', 'supplement', 'training pad', 'pad', \n", + " 'odor', 'remover', 'waste bag', 'waste bag dispenser', 'travel', 'bowl', 'pet', 'gate', 'ramp', 'stairs', 'playpen', 'feeder',\n", + " 'house', 'costume', 'bandana', 'tag', 'charm', 'bed', 'cover', 'heating pad', 'wheel', 'tree']" + ] + }, + { + "cell_type": "markdown", + "id": "685b0d9e-6241-4f87-80df-4151e29676f1", + "metadata": {}, + "source": [ + "Prompt: \"Generate 50 words related to pet supplies. Each word must be an adjective. Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\" " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1e151ba-ba3a-4d2e-98b5-27435a3e8984", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_prefix_pool = ['absorbent', 'chewy','squeaky','durable','cozy','scratchy','feathery', 'collared',\n", + " 'groomed','nibbled','digestible','cuddly',\n", + " 'indestructible','chewable','nutritious','hypoallergenic','ergonomic','eco-friendly','orthopedic',\n", + " 'automated','heated','cooling','antibacterial','deodorizing','waterproof','plush','removable',\n", + " 'adjustable','machine-washable','non-toxic','refillable','training','corrective','enzymatic',\n", + " 'tasty','flavored','interactive','robotic','intelligent','monitored','wearable','portable','hands-free',\n", + " 'remote-controlled']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"155d8fcf-943e-4ed1-abc2-f0832d7a4151", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# author generated\n", + "product_suffix_pool = ['for pets', 'for cats', 'for small and large pets', 'for dogs', 'for your furry baby', \n", + " 'for your furrbaby', 'for small dogs', 'for large dogs', 'for birds', 'for hamsters', 'for indoor and outdoor',\n", + " 'with antimicrobial protection', 'no smell', 'perfect for Halloween', 'collapsible for easy storage',\n", + " 'for cars and home', 'for travel']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13c2bbed-25b5-4da5-a87f-06d3d3d3820c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = rgh.create_dataset(size_factor = 3, \n", + " total_votes = np.arange(10, 30),\n", + " helpful_votes = np.arange(1, 25), \n", + " scale = 0.4, \n", + " review_years = np.arange(2003, 2015), \n", + " product_category = 'Pet_Supplies', \n", + " product_components = [product_prefix_pool, product_pool, product_suffix_pool],\n", + " marketplace_factor = 0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "225c2c8f-7b30-4eaf-89a4-94a514878722", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews = pd.read_parquet(s3_bucket_text + \n", + " '/review_body_headline/7b80671e92144a27bcc4ea59e656e543_2.snappy.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b56739d-ecb2-4d4f-8531-db37fc869e57", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews.iloc[0:dat.shape[0]][\"review_headline\"].array\n", + "dat[\"review_body\"] = reviews.iloc[0:dat.shape[0]][\"review_body\"].array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b60aee1-8e33-4c83-b9c1-ab0f45f4fc9f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", 
\"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_output,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols = ['product_category']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "db1820ed-60ec-4075-b1bd-f918e3717b1a", + "metadata": {}, + "source": [ + "## Sports_Outdoors" + ] + }, + { + "cell_type": "markdown", + "id": "659d162c-e792-46fb-a5b0-4d39dfb92459", + "metadata": {}, + "source": [ + "Prompt: \"Generate 50 product names related to sports and outdoor activities. Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3261e02d-df50-4d12-b478-9f1325144b24", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_pool = ['basketball', 'football', 'baseball', 'tennis', 'volleyball', 'frisbee', 'swimsuit', 'surfboard', 'kayak', 'canoe', \n", + " 'tent', 'backpack', 'sleeping bag', 'compass', 'binoculars', 'hiking boots', 'fishing rod', 'fishing line', 'tackle box', 'life jacket', \n", + " 'water bottle', 'sunscreen', 'hat', 'snorkel', 'diving mask', 'wetsuit', 'skateboard', 'rollerblades', 'bicycle', \n", + " 'helmet', 'knee pad', 'elbow pad', 'climbing rope', 'carabiner', 'harness', 'golf club', 'golf ball', 'tee', \n", + " 'racquet', 'shuttlecock', 'goalpost', 'whistle', 'stopwatch', 'scoreboard', 'trophy', 'medal', 'jersey', 'shorts',\n", + " 'meal replacement', 'napkin']" + ] + }, + { + "cell_type": "markdown", + "id": "2089dfc2-f203-48a7-838d-7f4c092e2071", + "metadata": {}, + "source": [ + "Prompt: \"Generate 50 words that describe products for sports and outdoor activities. Each word must be an adjective. 
Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fab7f30b-906f-4003-a48b-1058b31b352c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_prefix_pool = ['lightweight', 'durable', 'breathable', 'waterproof', 'moisture-wicking', 'insulated', 'quick-drying', \n", + " 'abrasion-resistant', 'shock-absorbing', 'ergonomic', 'buoyant', 'reflective', 'aerodynamic', 'non-slip', \n", + " 'sweat-resistant', 'adjustable', 'flexible', 'padded', 'compressible', 'weatherproof', 'ventilated', \n", + " 'UV-resistant', 'high-visibility', 'anti-microbial', 'tear-resistant', 'grip-enhancing', 'cushioned', \n", + " 'thermal', 'odor-resistant', 'windproof', 'impact-resistant', 'breathable', 'water-repellent', 'anti-chafe', \n", + " 'snug-fitting', 'quick-release', 'versatile', 'rugged', 'lightweight', 'compact', 'sturdy', 'reinforced', \n", + " 'hypoallergenic', 'anti-static', 'anti-glare', 'heat-retaining', 'sweat-wicking', 'disposable']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aec8e794-03fd-4d72-9043-6436dca35a52", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# author generated\n", + "product_suffix_pool = ['with long lasting properties', 'set of 6', 'with heavy duty clips', 'D ring shape', 'mix in a mesh bag',\n", + " '36 pack', 'personalized with your name', 'personalized with your logo', 'for the whole team', 'with team logo',\n", + " 'multi-colored', 'great for daily trips', 'organized', 'easy to store', 'with the instruction manual',\n", + " 'for hunting', 'for fishing', 'for walking', 'for running and walking']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9140d2eb-6a3d-40a3-ba14-fd3c51870e9b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = rgh.create_dataset(size_factor = 1, \n", + " 
total_votes = np.arange(10, 71),\n", + " helpful_votes = np.arange(3, 34, 3), \n", + " scale = 0.3, \n", + " review_years = np.arange(2001, 2013), \n", + " product_category = 'Sports_Outdoors', \n", + " product_components = [product_prefix_pool, product_pool, product_suffix_pool],\n", + " marketplace_factor = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0f04fa0-7355-46fe-9546-3fcdf7428c4d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews = pd.read_parquet(s3_bucket_text + \n", + " '/review_body_headline/a15fc40fcfa545bb93730f74a462d1a3_0.snappy.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9ed8635-cf9d-4f54-acea-d45e00d193d6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews.iloc[0:dat.shape[0]][\"review_headline\"].array\n", + "dat[\"review_body\"] = reviews.iloc[0:dat.shape[0]][\"review_body\"].array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76b5d16c-f46c-4047-bd28-234db4a26b8d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_output,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols = ['product_category']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc7646c3-77c8-4a57-9965-7be294f8b050", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "row_marker = dat.shape[0]\n", + "row_marker" + ] + }, + { + "cell_type": "markdown", + "id": "9e087be3-ccc0-42ef-a108-f3375b3c8ffc", + "metadata": {}, + "source": [ + "## Tools_Home_Improvement\n", + "\n", + "Prompt: \"Generate 50 
product names related to tools and home improvement. Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41a7fd9f-f194-4854-b0f6-4c16e7eed05b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_pool = ['hammer', 'screwdriver', 'wrench', 'pliers', 'drill', 'saw', 'chisel', 'level', 'tape measure', \n", + " 'ladder', 'paintbrush', 'roller', 'sandpaper', 'putty knife', 'caulk gun', 'utility knife', \n", + " 'stud finder', 'clamp', 'vise', 'hacksaw', 'crowbar', 'socket set', 'shovel', 'rake', 'wheelbarrow', \n", + " 'trowel', 'sander', 'grinder', 'hedge trimmer', 'lawnmower', 'leaf blower', 'chainsaw', 'nail gun', \n", + " 'heat gun', 'welding torch', 'soldering iron', 'multimeter', 'voltmeter', 'pipe wrench', 'wire stripper', \n", + " 'bolt cutter', 'sledgehammer', 'pickaxe', 'pry bar', 'plumb bob', 'spirit level', 'chalk line']" + ] + }, + { + "cell_type": "markdown", + "id": "cb965898-1c16-41dd-b042-4af684df622b", + "metadata": {}, + "source": [ + "Prompt: \"Generate 50 words that describe products such as tools and that are used for home improvement. Each word must be an adjective. 
Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e33cad6b-ec5d-47ad-9eef-df46151c6010", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_prefix_pool = ['sturdy', 'durable', 'ergonomic', 'versatile', 'efficient', 'powerful', 'precise', 'lightweight', \n", + " 'reliable', 'rust-resistant', 'corrosion-resistant', 'heavy-duty', 'high-performance', 'sleek', \n", + " 'compact', 'user-friendly', 'multipurpose', 'innovative', 'energy-saving', 'eco-friendly', 'robust', \n", + " 'long-lasting', 'portable', 'weatherproof', 'scratch-resistant', 'vibration-resistant', 'noise-reducing', \n", + " 'adjustable', 'flexible', 'lockable', 'swiveling', 'retractable', 'collapsible', 'foldable', 'seamless', \n", + " 'cordless', 'rechargeable', 'detachable', 'interchangeable', 'customizable', 'programmable', 'automated', \n", + " 'digital', 'intuitive', 'heated', 'insulated', 'magnetic', 'laser-guided']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3079797-1205-40fe-807b-e07d7ee97c92", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# author generated\n", + "product_suffix_pool = ['with magnetics holders', 'rust resistant metal parts', 'double sided', 'with large easy to read letters', \n", + " 'with fiberglass handles', 'general purpose', 'all purpose', 'for smaller hands', 'with slip cushion grip', \n", + " 'large size', 'small size', 'red', 'black', 'yellow', 'quality steel', 'quality materials', 'long lasting materials',\n", + " 'exceptional worksmanship', 'with 40V lithium-ion battery', 'orange', 'orange and black colors', '2 years warranty']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f97e9842-cb76-45bc-ad91-f20bc1fbb169", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = rgh.create_dataset(size_factor = 
2, \n", + " total_votes = np.arange(10, 66, 2),\n", + " helpful_votes = np.arange(2, 48), \n", + " scale = 1, \n", + " review_years = np.arange(1996, 2010), \n", + " product_category = 'Tools_Home_Improvement', \n", + " product_components = [product_prefix_pool, product_pool, product_suffix_pool],\n", + " marketplace_factor = 0.6)" + ] + }, + { + "cell_type": "markdown", + "id": "1ea0fcec-b90b-4316-b7a1-ba08cdcfde53", + "metadata": {}, + "source": [ + "Use remaining rows from the last reviews file:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08ec4b0a-ad51-41c7-a2b8-27a22c5a84b9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews.iloc[row_marker:(row_marker + dat.shape[0])][\"review_headline\"].array\n", + "dat[\"review_body\"] = reviews.iloc[row_marker:(row_marker + dat.shape[0])][\"review_body\"].array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd80208d-6506-40fe-8f80-0d90c1d5e628", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_output,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols = ['product_category']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "bc3c1cd7-35c0-48c3-882f-a679aeadd4f0", + "metadata": {}, + "source": [ + "## Garden_Outdoor\n", + "\n", + "Prompt: \"Generate 50 product names related to garden and outdoor. Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\". 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d0ad961-ead9-4293-ab9c-c31cc7dc6d74", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_pool = ['rake', 'shovel', 'hose', 'sprinkler', 'lawnmower', 'wheelbarrow', 'pruner', 'trowel', 'watering_can', \n", + " 'hoe', 'hedge trimmer', 'leaf blower', 'patio set', 'hammock', 'bird feeder', 'bird bath', 'garden gnome', \n", + " 'sundial', 'wind chime', 'garden bench', 'potting soil', 'fertilizer', 'seed packet', 'bulb', 'plant pot', \n", + " 'garden sculpture', 'outdoor lighting', 'patio heater', 'grill', 'smoker', 'fire pit', 'umbrella', 'gazebo', \n", + " 'trellis', 'pergola', 'garden_hose', 'nozzle', 'garden fork', 'cultivator', 'edger', 'pruning saw', \n", + " 'loppers', 'garden shears', 'gardening gloves', 'gardening hat', 'gardening apron', 'gardening knee pad', \n", + " 'gardening tool set', 'garden cart', 'compost_bin']" + ] + }, + { + "cell_type": "markdown", + "id": "f5341fad-b16c-419d-b02c-12aabd78b3f0", + "metadata": {}, + "source": [ + "Prompt: \"Generate 50 words that describe products for garden and outdoors. Each word must be an adjective. Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\". The author then provided modifications." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7f215e9-e3d0-45b1-a85a-d0f91970d4a2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_prefix_pool = ['durable','weather-resistant','eco-friendly','sustainable','ergonomic','portable','lightweight','compact',\n", + " 'multi-functional','versatile','decorative','ornamental','rust-proof','water-resistant','low-maintenance',\n", + " 'vibrant','colorful','elegant','rustic','vintage','stylish','contemporary','sleek','modern','innovative',\n", + " 'space-saving','efficient','user-friendly','sturdy','robust','reliable','easy-to-clean','easy-to-assemble',\n", + " 'foldable','adjustable','lockable','breathable','ventilated','insulated','energy-saving','rechargeable',\n", + " 'cordless','solar-powered','automated','programmable', 'uv-resistant', 'uv-protective', 'spacious'] " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddf0f745-1afb-4097-bdb0-815f07e90f65", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# author generated\n", + "product_suffix_pool = ['from heavy duty material', 'for patio, backyard and garden', 'DIY kit', 'quality structure', \n", + " 'one size fits all', 'washable canvas', 'with non-slip grip', 'excellent gift for a new homeowner', \n", + " 'with storage basket', \"ideal gift for women, birthdays\",\"great gift for a gardener\", \n", + " \"great gift for a beginner\", \"for raised beds\", \"deer resistant\", \"for pots\", '2 years warranty']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c2534df-167a-4d35-ab1a-e2d827fe7aca", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = rgh.create_dataset(size_factor = 3, \n", + " total_votes = np.arange(1, 40, 3),\n", + " helpful_votes = np.arange(1, 30, 2), \n", + " scale = 0.6, \n", + " review_years = np.arange(2002, 2018), \n", + " product_category = 'Garden_Outdoor', \n", + " product_components = [product_prefix_pool, 
product_pool, product_suffix_pool],\n", + " marketplace_factor = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef3325bc-94b3-4740-919f-553178dafd2b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews = pd.read_parquet(s3_bucket_text + \n", + " '/review_body_headline/a15fc40fcfa545bb93730f74a462d1a3_1.snappy.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77c37f42-ac81-42c2-a0e4-7421299231e5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews.iloc[0:dat.shape[0]][\"review_headline\"].array\n", + "dat[\"review_body\"] = reviews.iloc[0:dat.shape[0]][\"review_body\"].array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1b52420-d62c-4528-acde-b2bd8bcb1207", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_output,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols = ['product_category']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1f7e6e5f-467b-47e0-b571-18f93467b53d", + "metadata": {}, + "source": [ + "## Arts_Crafts_Sewing\n", + "\n", + "Prompt: \"Generate 100 product names related to arts, crafts and sewing. 
Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dad3abdf-7a0e-442b-856b-51f03c6ced31", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_pool = ['paintbrush', 'sketchbook', 'easel', 'canvas', 'palette', 'pastel', 'marker', 'charcoal', 'calligraphy', 'ink', \n", + " 'watercolor', 'acrylic paints', 'oil brushes', 'oil for painting', 'pencil', 'eraser', 'sharpener', 'ruler', 'compass', 'protractor', 'scissor', \n", + " 'glue', 'tape', 'stencil', 'stamp', 'embosser', 'embroidery', 'needle', 'thread', 'yarn', 'fabric', 'felt', 'ribbon', \n", + " 'button', 'zipper', 'clasp', 'hook', 'bead', 'sequin', 'feather', 'leather', 'clay', 'polymer clay', 'resin', 'mold', \n", + " 'air dry clay', 'carving tools', 'engraving tools', 'etching tools', 'printing supplies', 'calligraphy kit', \n", + " 'lettering', 'font', 'typeface', \n", + " 'quilling', 'origami', 'papercutting', 'decoupage', 'collage', 'mosaic', 'macrame', 'weaving', 'knitting', \n", + " 'crocheting', 'felting', 'dyeing', 'painting', 'sketching', 'drawing', 'coloring', 'scrapbooking', 'journaling', \n", + " 'cardmaking', 'stamping', 'stenciling', 'woodburning', 'woodcarving', 'metalsmithing', 'jewelry', 'beading', \n", + " 'wirework', 'enameling', 'glassblowing', 'ceramics', 'pottery', 'sculpture', 'photography', 'framing', 'matting']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ccf0d034-e2c8-48ee-969a-0698f90a28d6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# author generated\n", + "product_prefix_pool = ['cotton', 'polyester', 'glass', 'wooden', 'full-color', 'multicolored', 'natural', 'premium', 'custom',\n", + " 'rubber', 'personalized', 'durable', 'sturdy', 'easy to handle', 'beautiful', 'sleek']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"d6f55def-8cb9-49ad-82a8-5255712c8d92", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# author generated\n", + "product_suffix_pool = ['for beginners', 'perfect for beginners', 'complete kit', '120 pieces kit', 'starter kit', 'for Christmas trees',\n", + " 'pack of 1', 'pack of 3', 'pack of 5', 'pack of 10', 'for drawing', 'for illustration', 'for all media', 'for coloring',\n", + " 'acid free', 'for wedding', 'for anniversary', 'for birthdays', 'for engagement', 'with floral pattern',\n", + " 'mounted', 'for card making', 'large bundle', 'for crafting', 'for DIY projects', 'for toys, sporting goods and glass',\n", + " 'DIY arts, crafts project', 'for beginners and professionals']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e94258c-43ac-48e4-ae7e-1281d8736ded", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = rgh.create_dataset(size_factor = 3, \n", + " total_votes = np.arange(10, 80),\n", + " helpful_votes = np.arange(9, 77), \n", + " scale = 0.6, \n", + " review_years = np.arange(2001, 2011), \n", + " product_category = 'Arts_Crafts_Sewing', \n", + " product_components = [product_prefix_pool, product_pool, product_suffix_pool],\n", + " marketplace_factor = 0.3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb41c15b-9259-4a87-9256-b59d880760b0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews = pd.read_parquet(s3_bucket_text + \n", + " '/review_body_headline/a15fc40fcfa545bb93730f74a462d1a3_2.snappy.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07d41ed7-4811-45d5-b052-9d5e32c82504", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews.iloc[0:dat.shape[0]][\"review_headline\"].array\n", + "dat[\"review_body\"] = reviews.iloc[0:dat.shape[0]][\"review_body\"].array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"4704a914-9d8d-4776-a9e0-bb3fab890cb1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_output,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols = ['product_category']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "abb4b7e2-867d-4b87-a179-d281b65f5383", + "metadata": {}, + "source": [ + "## Health_Household\n", + "\n", + "Prompt: \"Generate 50 products related to health and household. Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1e2e353-2e71-4325-841d-859e26780aeb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_pool = ['vitamin', 'supplement', 'medicine', 'bandage', 'thermometer', 'disinfectant', 'soap', 'shampoo', 'conditioner', \n", + " 'toothbrush', 'toothpaste', 'floss', 'mouthwash', 'deodorant', 'lotion', 'sunscreen', 'cleanser', 'towel', \n", + " 'tissue', 'detergent', 'bleach', 'sponge', 'brush', 'mop', 'broom', 'vacuum', 'duster', 'trash', 'can', 'bag', \n", + " 'container', 'box', 'basket', 'organizer', 'shelf', 'rack', 'closet', 'cabinet', 'drawer', 'light', 'bulb', \n", + " 'battery', 'charger', 'cable', 'adapter', 'extension', 'cord', 'tape', \n", + " 'glue', 'scissors']" + ] + }, + { + "cell_type": "markdown", + "id": "9cd37cfe-f975-40bb-9453-a38590f87802", + "metadata": {}, + "source": [ + "Prompt: \"Generate 50 words that describe products for health and household. Each word must be an adjective. 
Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f20c564-cafc-4706-aca1-aa5ee4cde846", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_prefix_pool = ['natural','organic','eco-friendly','sustainable','biodegradable','non-toxic','hypoallergenic',\n", + " 'cruelty-free','vegan','plant-based','gentle','fragrance-free','antibacterial','antimicrobial',\n", + " 'multipurpose','versatile','durable','long-lasting','compact','lightweight','ergonomic',\n", + " 'energy-efficient','water-saving','stain-resistant','scratch-proof','unscented','disinfecting',\n", + " 'deodorizing','air-purifying','odor-eliminating','moisture-wicking','quick-drying','absorbent',\n", + " 'leak-proof','spill-proof','dustproof','lint-free','pet-friendly','child-safe','dishwasher-safe',\n", + " 'microwave-safe','oven-safe','freezer-safe','insulated','recyclable','compostable']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28e39112-7b0f-48ac-b5ab-5598c0e3e021", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# author generated\n", + "product_suffix_pool = ['under sink storage', 'for office and living room', 'for baby room', 'with skin restorative complex',\n", + " 'with hair restorative complex', 'navy blue', 'green', 'blue', 'yellow', 'white', 'grey', 'black',\n", + " 'natural color', 'with woodgrain pattern', 'with hooks', 'smell preventing', 'smell trapping', 'for kitchen',\n", + " 'for bathroom', 'for toilets', 'for closet', 'child-safe', 'kid-safe']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79a1d07d-976c-48fb-b564-2c3dcaecc8a1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = rgh.create_dataset(size_factor = 2, \n", + " total_votes = np.arange(7, 69),\n", + " helpful_votes = np.arange(4, 50), \n", + " scale = 0.2, 
\n", + " review_years = np.arange(1998, 2013), \n", + " product_category = 'Health_Household', \n", + " product_components = [product_prefix_pool, product_pool, product_suffix_pool],\n", + " marketplace_factor = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "954ff6a0-1a7e-42e9-8e17-2143595a9560", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews = pd.read_parquet(s3_bucket_text + \n", + " '/review_body_headline/c74b5acddec04b65a16193140fd25fab_0.snappy.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ede875aa-8b05-4737-8ef1-7c63e033509d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews.iloc[0:dat.shape[0]][\"review_headline\"].array\n", + "dat[\"review_body\"] = reviews.iloc[0:dat.shape[0]][\"review_body\"].array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efef98e1-b7dc-425f-8774-e87e7e61c2df", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "row_marker = dat.shape[0]\n", + "row_marker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa5d55ca-ad17-4cdb-8b4f-be394dd45536", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_output,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols = ['product_category']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "aae37da3-a9c5-470d-8b18-b57d89b3cc6a", + "metadata": {}, + "source": [ + "## Computers\n", + "\n", + "Prompt: \"Generate 50 products related to computers. 
Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac2ffda7-b54c-4b35-a1ab-69bf6bc1608d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_pool = ['laptop','desktop','keyboard','mouse','monitor','printer','speaker','webcam','headset','microphone','router',\n", + " 'modem','cable','adapter','charger','battery','case','ram','processor','motherboard','graphics card',\n", + " 'power supply','hard drive','solid state drive','optical drive','projector','scanner','camera','camcorder',\n", + " 'tablet','stylus','dock','hub','antivirus','firewall','vpn','network switch','ethernet cable','wireless adapter',\n", + " 'server','software','operating system','application','external_drive','usb flash drive','cooling fan']" + ] + }, + { + "cell_type": "markdown", + "id": "127171ff-bad9-4503-a646-63ed579df2c2", + "metadata": {}, + "source": [ + "Prompt: \"Generate 50 words related to computer products. Each word must be an adjective. 
Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a67c8e2e-d1d4-4789-99c8-8ea0a2516450", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_prefix_pool = ['portable','lightweight','compact','ergonomic','sleek','durable','intuitive','innovative','versatile',\n", + " 'responsive','efficient','secure','user-friendly','high-performance','multifunctional','customizable',\n", + " 'advanced','intelligent','eco-friendly','wireless','interactive','multimedia','cutting-edge','integrated',\n", + " 'futuristic','sophisticated','robust','powerful','flexible','dynamic','compatible','intuitive',\n", + " 'energy-efficient','reliable','stylish','elegant','rugged','practical','affordable','accessible',\n", + " 'high-quality','fast','smart','premium','versatile','modern','adaptable','convenient','streamlined']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5405b889-73e1-44cb-a923-791960e8747b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# author generated\n", + "product_suffix_pool = ['with 6 outlets', 'quiet', 'very quiet', 'small size', 'under the desk storage', 'modern standby support',\n", + " 'black', 'fully modular', '1 year warranty', '2 years warranty', '10 years warranty', 'for gaming', \n", + " 'with cloud backup and virus protection', 'with cloud backup and black-web monitoring', 'upgraded version',\n", + " 'newest version', 'latest model', 'latest version', 'with updated firmware', 'water and dust resistant',\n", + " '2nd generation', '1st generation', '3d generation', 'with a sleeve', 'unlocked', 'refurbished', 'renewed']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80009f98-c89c-4b33-b1bb-eceb3d1369a8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = rgh.create_dataset(size_factor = 1, \n", 
+ " total_votes = np.arange(20, 120),\n", + " helpful_votes = np.arange(10, 60), \n", + " scale = 0.8, \n", + " review_years = np.arange(1996, 2017), \n", + " product_category = 'Computers', \n", + " product_components = [product_prefix_pool, product_pool, product_suffix_pool],\n", + " marketplace_factor = 0.4)" + ] + }, + { + "cell_type": "markdown", + "id": "4798119a-6773-44da-9f01-c2b40eeff2c5", + "metadata": {}, + "source": [ + "Use remaining rows from the last reviews file:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "607cadc1-ae19-4add-9e88-67f45256af75", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews.iloc[row_marker:(row_marker + dat.shape[0])][\"review_headline\"].array\n", + "dat[\"review_body\"] = reviews.iloc[row_marker:(row_marker + dat.shape[0])][\"review_body\"].array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37bc49bd-9474-4720-99bc-29b6b25161de", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_output,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols = ['product_category']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "bb56ae94-4c1d-4449-ab1e-5f79e8aa9562", + "metadata": {}, + "source": [ + "## Books\n", + "\n", + "Prompt: \"Generate 50 products related to books. 
Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b51ad050-b25f-4de2-8aa5-63fe0eae279b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_pool = ['book', 'novel', 'textbook', 'magazine', 'comic', 'journal', 'dictionary', 'thesaurus', 'encyclopedia', \n", + " 'atlas', 'anthology', 'memoir', 'biography', 'autobiography', 'essay', 'poetry', 'play', 'script', \n", + " 'manuscript', 'ebook', 'audiobook', 'paperback', 'hardcover', 'bookmark', 'bookcase', 'bookshelf', \n", + " 'bookstand', 'bookholder', 'bookend', 'booklight', 'bookbag', 'bookcover', 'bookbinding', 'bookplate', \n", + " 'bookmarker', 'booklet', \n", + " 'bookmark']" + ] + }, + { + "cell_type": "markdown", + "id": "36d1b028-4c0f-429e-b2fc-4e0884b89aa6", + "metadata": {}, + "source": [ + "Prompt: \"Generate 50 products related to books. Each word must be an adjective. 
Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3109c7ba-131f-4d88-a3c4-622f22063f70", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_prefix_pool = ['literary', 'educational', 'fictional', 'non-fictional', 'classic', 'modern', 'best-selling', 'popular', \n", + " 'acclaimed', 'award-winning', 'illustrated', 'hardcover', 'paperback', 'digital', 'audio', 'signed', \n", + " 'collectible', 'rare', 'vintage', 'antique', 'scholarly', 'academic', 'reference', 'informative', \n", + " 'instructional', 'inspirational', 'motivational', 'self-help', 'biographical', 'autobiographical', \n", + " 'historical', 'scientific', 'technical', 'childrens', 'young adult', 'adventure', 'romantic', 'mysterious', \n", + " 'suspenseful', 'thrilling', 'horror', 'fantasy', 'sci-fi', 'comic', 'graphic', 'coffee table', 'travel', \n", + " 'cookbook']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0253035-6741-43d8-87c7-b1fbab9fb6a9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# author generated\n", + "product_suffix_pool = ['large print', 'electronic version', 'with authors commentary', 'with excerpts from the upcoming book',\n", + " 'latest edition', 'comprehensive volume', 'full volume', 'unabridged', 'English translation', 'French translation',\n", + " 'Spanish translation', 'Russian translation', 'Chinese translation', 'unabridged version', '2-volume set',\n", + " 'with updates', 'student text', 'self-teaching guide', 'selected works', 'trilogy', 'boxed set', 'leather bound set',\n", + " 'cardstock', '1 pc', 'cute']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e61db5e9-a9f5-4371-9371-2f1ec194f387", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = rgh.create_dataset(size_factor = 3, \n", + " total_votes = 
np.arange(40, 150, 3),\n", + " helpful_votes = np.arange(10, 30), \n", + " scale = 0.4, \n", + " review_years = np.arange(1996, 2018), \n", + " product_category = 'Books', \n", + " product_components = [product_prefix_pool, product_pool, product_suffix_pool],\n", + " marketplace_factor = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d1a36bd-0f0c-4297-8db0-d3ee6e37f96a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews = pd.read_parquet(s3_bucket_text + \n", + " '/review_body_headline/c74b5acddec04b65a16193140fd25fab_1.snappy.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6adb0753-77ee-4447-8582-3d4f7d30f85d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews.iloc[0:dat.shape[0]][\"review_headline\"].array\n", + "dat[\"review_body\"] = reviews.iloc[0:dat.shape[0]][\"review_body\"].array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27ec57ad-705a-4c94-aa42-0acb2f258948", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_output,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols = ['product_category']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3c5df42d-fb0b-4bf0-9c8d-09587dd4f4bf", + "metadata": {}, + "source": [ + "## Music\n", + "\n", + "Prompt: \"Generate 100 music products. 
Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40f01ef9-d127-4dd1-9467-2de3cbf225f3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_pool = ['guitar', 'piano', 'drum', 'violin', 'trumpet', 'saxophone', 'harmonica', 'flute', 'clarinet', 'trombone', \n", + " 'cello', 'harp', 'ukulele', 'synthesizer', 'keyboard', 'microphone', 'amplifier', 'speaker', 'headphone', \n", + " 'mixer', 'turntable', 'recorder', 'tuner', 'metronome', 'cajon', 'tambourine', 'maracas', 'cowbell', \n", + " 'shaker', 'triangle', 'cymbal', 'gong', 'xylophone', 'vibraphone', 'melodica', 'kazoo', 'accordion', \n", + " 'banjo', 'mandolin', 'sitar', 'kalimba', 'didgeridoo', 'djembe', 'bongo', 'congas', 'timbale', 'clave', \n", + " 'guiro', 'rainstick', 'beatbox', 'sampler', 'vocoder', 'distortion', 'reverb', 'delay', 'compressor', \n", + " 'equalizer', 'looper', 'sequencer', 'drumpad', 'groovebox', 'guitar', \n", + " 'pedal board', 'cable manager', 'music stand', 'sustain', 'capo', 'slidebar', 'pick', 'drumstick', 'mallets', \n", + " 'bagpipe', 'shehnai', 'erhu', 'koto', 'shamisen', 'saz', 'oud', 'qanun', 'daf', 'bodhrán', 'concertina', \n", + " 'harmonica', 'melodion', 'steelpan', 'washboard', 'jawsharp', 'mridangam', 'ghatam', 'kanjira', 'veena', 'sarangi']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e25e7df-c9f2-44b5-80cd-dddcace31773", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# author generated\n", + "product_prefix_pool = ['acoustic', 'electric', 'bass', 'mahogany', 'portable', 'full size', 'black', 'professional',\n", + " 'semi-professional', 'smart', 'rosewood', 'miniature', 'wooden', 'all wood']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96fae3be-a5e0-4a39-89d2-3a60a2f46aee", + "metadata": { + "tags": [] + }, + "outputs": 
[], + "source": [ + "# author generated\n", + "product_suffix_pool = ['instrument kit', 'with tuner', 'with shoulder strap', 'with picks', 'with picks for beginners',\n", + " 'with the guide book for beginners', 'with extra strings', 'table version, collectable', 'collectable',\n", + " 'small size', 'table top set', 'with travel case', 'with power supply', 'for beginners', 'expandable',\n", + " 'for electronic music making', 'black and silver', 'silver', 'black', 'handheld', 'with a switch', \n", + " 'concert grade', 'for kids', 'for adults', 'full size', 'natural wood gloss']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0a38552-3e5b-44a4-9887-a438ca0ca2d0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = rgh.create_dataset(size_factor = 3, \n", + " total_votes = np.arange(11, 91),\n", + " helpful_votes = np.arange(3, 71), \n", + " scale = 0.4, \n", + " review_years = np.arange(1996, 2018), \n", + " product_category = 'Music', \n", + " product_components = [product_prefix_pool, product_pool, product_suffix_pool],\n", + " marketplace_factor = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51234601-b35c-4205-8af3-74ba2cce5f01", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews = pd.read_parquet(s3_bucket_text + \n", + " '/review_body_headline/c74b5acddec04b65a16193140fd25fab_2.snappy.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d8acaec-018c-45e3-a518-37f38d7b1ddf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews.iloc[0:dat.shape[0]][\"review_headline\"].array\n", + "dat[\"review_body\"] = reviews.iloc[0:dat.shape[0]][\"review_body\"].array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70fca5a8-ec63-487d-b9ff-696658c3e82a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = 
dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_output,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols = ['product_category']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "fe8e4aee-62ea-4369-ba98-e11ee8047262", + "metadata": {}, + "source": [ + "## Movies_TV\n", + "\n", + "Here the authors sent multiple requests to the model to generate movies and TV titles for different genres. Prompts:\n", + "\n", + "- Generate 50 titles for action movies. Provide the output as a comma separated list, surround each title in single quotes.\n", + "\n", + "- Generate 50 titles for romantic movies. Provide the output as a comma separated list, surround each title in single quotes.\n", + "\n", + "- Generate 50 titles for documentary movies. Provide the output as a comma separated list, surround each title in single quotes.\n", + "\n", + "- Generate 50 titles for historic movies. Provide the output as a comma separated list, surround each title in single quotes.\n", + "\n", + "- Generate 50 titles for children's movies. Provide the output as a comma separated list, surround each title in single quotes.\n", + "\n", + "- Generate 50 titles for comedic movies. Provide the output as a comma separated list, surround each title in single quotes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3352cb64-f48d-492a-aee2-18a7d3ef5c12", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_pool = ['Rogue Assassin', 'Explosive Vengeance', 'Lethal Strike', 'Adrenaline Rush', 'Bullet Proof', 'Unstoppable Force',\n", + " 'Maximum Impact', 'Relentless Pursuit', 'Crimson Requiem', 'Shattered Redemption', 'Fists of Fury', \n", + " 'Chaos Reigned', 'Collateral Damage', 'Eternal Reckoning', 'Unleashing the Beast', 'Scorched Earth', \n", + " 'Mercenary Uprising', 'Ultima Protocol', 'Deadly Infiltration', 'Expendable Retribution', 'Savage Fury', \n", + " 'Firestorm Rising', 'Armageddon Operative', 'Doomsday Directive', 'Renegade Enforcer', 'Detonation Point', \n", + " 'Apex Predator', 'Decimation Code', 'Crimson Retaliation', 'Havoc Unleashed', 'Cataclysmic Impact', \n", + " 'Hellfire Blitz', 'Obliteration Protocol', 'Annihilation Sequence', 'Infernal Supremacy', 'Archangels Wrath', \n", + " 'Epoch of Carnage', 'Extinction Event', 'Devastation Vortex', 'Apocalyptic Purge', 'Maelstrom Operative', \n", + " 'Cyber Insurrection', 'Terminal Velocity', 'Rampage Agenda', 'Vanguard of Chaos', 'Cataclysmic Vengeance', \n", + " 'Maelstrom Rising', 'Eternal Embrace', 'Loves Symphony', 'Whispers of the Heart', 'Destined Souls', \n", + " 'Midnight Serenade', 'Autumn Bliss', 'Unbreakable Bond', 'Crimson Desire', 'Serendipitous Encounter', \n", + " 'Forever and a Day', 'Twilight Rendezvous', 'Celestial Dance', 'Velvet Embrace', 'Moonlit Passion', \n", + " 'Everlasting Promise', 'Amorous Rhapsody', 'Rapturous Rhythm', 'Enchanted Affection', 'Soulful Symphony', \n", + " 'Ardent Devotion', 'Poetic Bliss', 'Amorous Overture', 'Intoxicating Melody', 'Cherished Rapture', \n", + " 'Infinite Adoration', 'Ethereal Caress', 'Spellbinding Aria', 'Transcendent Desire', 'Impassioned Embrace', \n", + " 'Entrancing Sonata', 'Tender Rhapsody', 'Fervent Allegro', 'Blissful Serenade', 'Enamored Concerto', \n", 
+ " 'Exquisite Affinity', 'Mesmerizing Overture', 'Euphoric Cadence', 'Rapturous Reverie', 'Impassioned Crescendo', \n", + " 'Enthralling Opus', 'Amorous Elegy', 'Fervent Adagio', 'Soulful Interlude', 'Captivating Refrain', \n", + " 'Entrancing Idyll', 'Ethereal Rhapsody', 'Enraptured Fantasia', 'Unraveling the Universe', \n", + " 'The Secret Lives of Oceans', 'Resilience: Stories of Human Triumph', 'Endangered: A Cry for Conservation', \n", + " 'The Digital Revolution: Reshaping Our World', 'Culinary Explorers: A Gastronomic Journey', \n", + " 'Homeless Voices: Unheard Stories', 'Behind the Canvas: The Art of Masterpieces', \n", + " 'Untamed Wilderness: Exploring Earths Last Frontiers', 'The Rise of Artificial Intelligence', \n", + " 'Forgotten Histories: Untold Tales of the Past', 'The Mindfulness Movement: Finding Inner Peace', \n", + " 'Extreme Survival: Pushing Human Limits', 'Beneath the Surface: Uncovering Ancient Civilizations', \n", + " 'The Future of Energy: Sustainable Solutions', 'Monuments of Humankind: Architectural Wonders', \n", + " 'Unsung Heroes: Inspiring Acts of Courage', 'The Cosmic Odyssey: Exploring the Universe', \n", + " 'Music Across Cultures: A Melodic Journey', 'The Science of Happiness: Unlocking Lifes Secrets', \n", + " 'Pioneers of Progress: Visionaries Who Shaped Our World', 'The World of Sports: Passion and Perseverance', \n", + " 'Forgotten Wars: Untold Stories of Conflict', 'The Future of Medicine: Revolutionary Breakthroughs', \n", + " 'Extraordinary Minds: Exploring Genius', 'The Language Barrier: Bridging Cultural Divides', \n", + " 'Sustainable Living: A Path to a Greener Future', 'Unsung Innovators: Ideas That Changed the World', \n", + " 'The Power of Dreams: Inspiring Life Stories', 'Vanishing Traditions: Preserving Cultural Heritage', \n", + " 'The Invisible Struggle: Mental Health Unveiled', 'Frontiers of Exploration: Pushing the Boundaries', \n", + " 'The Science of Food: Unraveling Culinary Mysteries', 'The Art of 
Storytelling: Captivating Narratives', \n", + " 'The Human Condition: Exploring Our Existence', 'Saving Species: Conservation Efforts Worldwide', \n", + " 'The World of Dance: Movement and Expression', 'Underwater Wonders: Exploring Earths Oceans', \n", + " 'The Future of Transportation: Innovative Solutions', 'Citizen Scientists: Unlocking Knowledge for All', \n", + " 'The Hidden World of Insects: Tiny Marvels', 'Extraordinary Educators: Inspiring Young Minds', \n", + " 'The Science of Love: Understanding Human Connections', 'Unsung Heroes of History: Untold Stories of Bravery',\n", + " 'The Music Revolution: Shaping Cultural Movements', 'Extreme Environments: Life on the Edge',\n", + " 'The Conquerors Legacy', 'Echoes of Valor', 'Destinys Crucible', 'Whispers of Revolution', \n", + " 'The Monarchs Downfall', 'Forged in Fire', 'Clash of Empires', 'The Forgotten Warrior', \n", + " 'Shadows of the Past', 'A Nation Divided', 'The Price of Freedom', 'Legends of the Battleground', \n", + " 'Triumph and Tragedy', 'Relics of Antiquity', 'Echoes of Rebellion', 'The Sands of Time', 'Immortal Legacies', \n", + " 'The Rise of a Dynasty', 'Whispers of the Ancients', 'Eternal Conquest', 'The Unforgotten Heroes', \n", + " 'Remnants of Glory', 'Echoes of Rebellion', 'The Vanquished Empire', 'Remnants of an Era', \n", + " 'Relics of a Bygone Age', 'The Fallen Kingdoms', 'Whispers of the Ancestors', 'Echoes of Valor', \n", + " 'Immortal Legacies', 'The Rise of a Dynasty', 'Whispers of the Ancients', 'Eternal Conquest', \n", + " 'The Unforgotten Heroes', 'Remnants of Glory', 'Echoes of Rebellion', 'The Vanquished Empire', \n", + " 'Remnants of an Era', 'Relics of a Bygone Age', 'The Fallen Kingdoms', 'Whispers of the Ancestors', \n", + " 'Echoes of Valor', 'Immortal Legacies', 'The Rise of a Dynasty', 'Whispers of the Ancients', \n", + " 'Eternal Conquest', 'The Magical Treehouse Adventure', 'Dino Explorers', 'Starry Night Dreamers', \n", + " 'The Curious Robot', 'Mermaids Treasure 
Cove', 'Superhero Schoolyard', 'Friendship Forest Fables', \n", + " 'Circus of Wonders', 'Enchanted Woodlands', 'Intergalactic Explorers', 'Unicorn Meadows', 'Pirate Island Quest', \n", + " 'Fairytale Kingdom Chronicles', 'Wizards Apprentice', 'Dinosaur Rescue Rangers', 'Candy Land Shenanigans', \n", + " 'Mythical Creature Companions', 'Rainforest Rumble', 'Underwater Odyssey', 'Playtime Pandemonium', \n", + " 'Secret Clubhouse Capers', 'Puppet Theater Antics', 'Farmyard Frolic', 'Storybook Singalong', \n", + " 'Alien Adventurers', 'Magical Museum Mishaps', 'Superhero School', 'Enchanted Toy Emporium', \n", + " 'Outer Space Odyssey', 'Junkyard Robots', 'Mythical Creature Academy', 'Fairytale Forest Friends', \n", + " 'Circus of Dreams', 'Dinosaur Discoveries', 'Wonderland Wanders', 'Pirate Ship Escapades', \n", + " 'Magical Treehouse Chronicles', 'Superhero Sidekicks', 'Mermaid Melodies', 'Woodland Whimsies', \n", + " 'Alien Explorers', 'Candy Land Quests', 'Puppet Theater Plays', 'Farmyard Follies', 'Storybook Singalongs', \n", + " 'Rainforest Ramblings', 'The Laughing Llama', 'Giggles in the Grocery Aisle', 'Punchline Pandemonium', \n", + " 'Chuckle Chaos', 'Hysteria on the High Seas', 'Splitting Sides in Split', 'Knee-Slapping Knights', \n", + " 'Guffaw Galaxy', 'Chortles and Chaos', 'Snicker Snafus', 'Laughter Liftoff', 'Funny Farm Fiasco', \n", + " 'Giggle Galore', 'Mirth Mayhem', 'Hilarity Hijinks', 'Titter Town', 'Cackle Capers', 'Grin and Guffaw', \n", + " 'Rib-Tickling Romp', 'Laugh Riot Rampage', 'Yuks Unleashed', 'Snort-Worthy Shenanigans', 'Howl with Hilarity', \n", + " 'Chortle Circus', 'Guffaw Gala', 'Chuckle Champs', 'Funny Bone Bonanza', 'Laughter Lollapalooza', \n", + " 'Giggle Gang', 'Mirth Marathon', 'Grin Galore', 'Chuckle Chaos', 'Laugh-a-Palooza,' 'Comedy Caper',\n", + " 'Guffaw Gauntlet', 'Snicker Showdown', 'Hilarity Hell', 'Titter Tornado', 'Giggle Gauntlet', 'Mirth Madness',\n", + " 'Chuckle Champs', 'Laugh Riot Rodeo', 'Guffaw Galore', 
'Snort-Worthy Showdown', 'Chortle Chaos', \n", + " 'Grin and Giggle', 'Mirth Mania']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd1de0b1-77c4-4151-9015-4fb72a7efedf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = rgh.create_dataset(size_factor = 3, \n", + " total_votes = np.arange(15, 2000, 5),\n", + " helpful_votes = np.arange(15, 1500), \n", + " scale = 0.6, \n", + " review_years = np.arange(2004, 2017), \n", + " product_category = 'Movies_TV', \n", + " product_components = [product_pool],\n", + " marketplace_factor = 0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4a2cd71-7768-45ce-81b2-5f5bd3d70b78", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews = pd.read_parquet(s3_bucket_text + \n", + " '/review_body_headline/instance1_dat1.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd6612b1-ff55-49e6-a1e7-ddb5aadf4a7c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews.iloc[0:dat.shape[0]][\"review_headline\"].array\n", + "dat[\"review_body\"] = reviews.iloc[0:dat.shape[0]][\"review_body\"].array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e9c949d-f56c-4be9-912c-a07f2c633183", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_output,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols = ['product_category']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "5a812de6-d1f4-4744-9297-3075050cda4e", + "metadata": {}, + "source": [ + "## Grocery_Gourmet_Food" + ] + }, + { + 
"cell_type": "markdown", + "id": "ac240677-16ef-4460-8b2c-45962fa056ee", + "metadata": {}, + "source": [ + "Prompt: \"Generate 100 products in groceries and gourmet foods. Provide the output as a comma separated list, surround each word in single quotes, each word as lower case, each word is singular\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f278f389-d171-4156-a168-57b61fc35f4e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_pool = ['apple', 'orange', 'banana', 'grape', 'lemon', 'lime', 'pineapple', 'mango', 'strawberry', 'blueberry', 'raspberry', 'blackberry', 'kiwi', 'peach', 'plum', 'apricot', 'nectarine', 'avocado', 'tomato', 'onion', 'potato', 'carrot', 'broccoli', 'cauliflower', 'spinach', 'lettuce', 'cucumber', 'bell pepper', 'zucchini', 'eggplant', 'mushroom', 'garlic', 'ginger', 'bread', 'bagel', 'croissant', 'muffin', 'doughnut', 'cookie', 'cake', 'pie', 'tart', 'pastry', 'cheese', 'yogurt', 'milk', 'butter', 'egg', 'beef', 'chicken', 'turkey', 'pork', 'fish', 'shrimp', \n", + " 'crab', 'lobster', 'oyster', 'pasta', 'rice', 'cereal', 'oatmeal', 'honey', 'jam', 'peanut butter', 'olive oil', \n", + " 'vinegar', 'ketchup', 'mustard', 'mayonnaise', 'salsa', 'hummus', 'guacamole', 'salad dressing', 'chip', 'cracker', \n", + " 'pretzel', 'popcorn', 'nut', 'seed', 'dried fruit', 'candy', 'chocolate', 'ice cream', 'sorbet', 'froyo', 'coffee', \n", + " 'tea', 'juice', 'soda', 'water', 'beer', 'wine', 'liquor']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b30197c7-fdc3-436d-9f69-b48ed904ee75", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "product_prefix_pool = ['fresh','organic','gourmet','savory','sweet','tangy','zesty','creamy','crunchy','juicy',\n", + " 'succulent','tender','flavorful','aromatic','spicy','pungent','earthy','nutty','robust',\n", + " 'velvety','buttery','crispy','cheesy','baked','sauteed','grilled','smoked','brined','pickled',\n", + " 
'fermented','dried','candied','glazed','frosted','toasted','infused','aged','artisanal',\n", + " 'premium','imported','exotic','rare','authentic','decadent','indulgent','exquisite','wholesome',\n", + " 'nutritious','delectable','mouthwatering']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a2c403a-3b86-45b2-a53e-97855b5c97f6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# author generated\n", + "product_suffix_pool = ['box of 10', 'box of 5', 'box of 12', 'box of 24',\n", + " '6 cans', '1 can', '5 bars', '10 bars',\n", + " '1 bag', 'bag of 12', 'family pack', 'party pack', 'family sized bag',\n", + " '1 pound', '5 pounds', \n", + " '1 bottle', '2 bottles', '3 bottles', '12 bottles',\n", + " 'bundle of 3', '1 carton', '12 cartons',\n", + " 'a gift basket', 'celebration basket', \n", + " 'equally portioned', 'low calories', 'low in sugar', 'low in fat', 'vegan', 'vegetarian',\n", + " 'nuts and seeds free', 'low glycemic index', 'healthy food', 'great snack', 'great for kids',\n", + " 'great for kids and adults', 'sugarless', 'only natural sugar', 'no artificial sweeteners',\n", + " 'no high fructose syrup', 'no transfats', 'perfect for low calory diet']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2017d9b4-36e9-4f5e-91a9-c9e133230573", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat = rgh.create_dataset(size_factor = 3, \n", + " total_votes = np.arange(7, 50, 2),\n", + " helpful_votes = np.arange(2, 45, 3), \n", + " scale = 0.3, \n", + " review_years = np.arange(1997, 2007), \n", + " product_category = 'Grocery_Gourmet_Food', \n", + " product_components = [product_prefix_pool, product_pool, product_suffix_pool],\n", + " marketplace_factor = 0.9)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be9fc646-8578-48df-8735-f944d1f0519e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "reviews = pd.read_parquet(s3_bucket_text + 
\n", + " '/review_body_headline/instance1_dat2.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a618034-556d-45a6-8773-2e48dc97f7ca", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dat[\"review_headline\"] = reviews.iloc[0:dat.shape[0]][\"review_headline\"].array\n", + "dat[\"review_body\"] = reviews.iloc[0:dat.shape[0]][\"review_body\"].array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce2a8662-80e3-4871-a9f5-f36d2564ad57", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wr.s3.to_parquet(\n", + " df = dat[[\"product_category\", \"marketplace\", \"customer_id\", \"review_id\", \"product_id\", \"product_title\", \"star_rating\",\n", + " \"helpful_votes\", \"total_votes\", \"insight\", \"review_headline\", \"review_body\", \"review_date\", \"review_year\"]],\n", + " path = s3_bucket_output,\n", + " dataset = True,\n", + " max_rows_by_file = 3000000,\n", + " partition_cols = ['product_category']\n", + ")" + ] + } + ], + "metadata": { + "availableInstances": [ + { + "_defaultOrder": 0, + "_isFastLaunch": true, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 4, + "name": "ml.t3.medium", + "vcpuNum": 2 + }, + { + "_defaultOrder": 1, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 8, + "name": "ml.t3.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 2, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.t3.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 3, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.t3.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 4, + "_isFastLaunch": true, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 8, 
+ "name": "ml.m5.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 5, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.m5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 6, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.m5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 7, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.m5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 8, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": "ml.m5.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 9, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.m5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 10, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": "ml.m5.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 11, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 384, + "name": "ml.m5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 12, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 8, + "name": "ml.m5d.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 13, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.m5d.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 14, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.m5d.2xlarge", 
+ "vcpuNum": 8 + }, + { + "_defaultOrder": 15, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.m5d.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 16, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": "ml.m5d.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 17, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.m5d.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 18, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": "ml.m5d.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 19, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 384, + "name": "ml.m5d.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 20, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "hideHardwareSpecs": true, + "memoryGiB": 0, + "name": "ml.geospatial.interactive", + "supportedImageNames": [ + "sagemaker-geospatial-v1-0" + ], + "vcpuNum": 0 + }, + { + "_defaultOrder": 21, + "_isFastLaunch": true, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 4, + "name": "ml.c5.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 22, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 8, + "name": "ml.c5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 23, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.c5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 24, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + 
"hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.c5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 25, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 72, + "name": "ml.c5.9xlarge", + "vcpuNum": 36 + }, + { + "_defaultOrder": 26, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 96, + "name": "ml.c5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 27, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 144, + "name": "ml.c5.18xlarge", + "vcpuNum": 72 + }, + { + "_defaultOrder": 28, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.c5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 29, + "_isFastLaunch": true, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.g4dn.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 30, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.g4dn.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 31, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.g4dn.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 32, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": "ml.g4dn.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 33, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.g4dn.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 34, + "_isFastLaunch": false, + "category": 
"Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": "ml.g4dn.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 35, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 61, + "name": "ml.p3.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 36, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "hideHardwareSpecs": false, + "memoryGiB": 244, + "name": "ml.p3.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 37, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 488, + "name": "ml.p3.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 38, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 768, + "name": "ml.p3dn.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 39, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.r5.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 40, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.r5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 41, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.r5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 42, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": "ml.r5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 43, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": "ml.r5.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 44, + "_isFastLaunch": 
false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 384, + "name": "ml.r5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 45, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 512, + "name": "ml.r5.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 46, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 768, + "name": "ml.r5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 47, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 16, + "name": "ml.g5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 48, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.g5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 49, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 64, + "name": "ml.g5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 50, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 128, + "name": "ml.g5.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 51, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "hideHardwareSpecs": false, + "memoryGiB": 256, + "name": "ml.g5.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 52, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "hideHardwareSpecs": false, + "memoryGiB": 192, + "name": "ml.g5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 53, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "hideHardwareSpecs": false, + "memoryGiB": 384, + "name": "ml.g5.24xlarge", + "vcpuNum": 96 + }, + { + 
"_defaultOrder": 54, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 768, + "name": "ml.g5.48xlarge", + "vcpuNum": 192 + }, + { + "_defaultOrder": 55, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 1152, + "name": "ml.p4d.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 56, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "hideHardwareSpecs": false, + "memoryGiB": 1152, + "name": "ml.p4de.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 57, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 32, + "name": "ml.trn1.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 58, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 512, + "name": "ml.trn1.32xlarge", + "vcpuNum": 128 + }, + { + "_defaultOrder": 59, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 0, + "hideHardwareSpecs": false, + "memoryGiB": 512, + "name": "ml.trn1n.32xlarge", + "vcpuNum": 128 + } + ], + "instance_type": "ml.m5.4xlarge", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/synthetic_data/review_generation_helpers.py b/tutorials/synthetic_data/review_generation_helpers.py new file mode 100644 index 0000000..12fcd4e --- /dev/null +++ b/tutorials/synthetic_data/review_generation_helpers.py @@ -0,0 +1,506 @@ +import numpy as np +import pandas as pd + +# for review_id 
generation +import string +import random + +# for review body and title generation +from essential_generators import DocumentGenerator + +def generate_review_headline_body(n): + """Generate review headlines and review bodies + + Args: + n: integer, number of pairs to generate + + Returns: + pandas DataFrame with 2 columns: review_headline, review_body; number of rows = n + + """ + gen = DocumentGenerator() + template = {'review_headline':'sentence', + 'review_body': 'paragraph'} + gen.set_template(template) + documents = gen.documents(n) + + return pd.DataFrame(documents) + + +def create_customers_insight(random_number_generator, factor = 1): + """Returns numpy array with customer ids (repeated once per review) and the insight flag. + + The function assumes a certain distribution: 1% of the customers wrote 100 + reviews, 1%: 50 reviews, 2%: 30 reviews, 10%: 15 reviews, 20%: 10 reviews, + 30%: 5 reviews, 30%: 3 reviews, 6%: 1 review. + + Review ids are unique. + + Insight (influencer flag, Y/N): customers with more reviews have a higher + probability of being an insighter. The probabilities for Y are the following + for the groups above: [0.99, 0.8, 0.4, 0.1, 0, 0, 0, 0] + + To increase the number of reviews change the factor to the next integer. + + Args: + factor: value of 1 (default) will generate 3,224,806 reviews from 400,100 + customers. random_number_generator: random number generator object from + numpy + + Returns: + numpy array of the shape (2, number of reviews). 
+ """ + + # define customer proportions, number of reviews from each group of customers, + # and the proportion of 'Y' for the insight + distribution_parameters = np.array([[0.01, 0.01, 0.02, 0.1, 0.2, 0.3, 0.3, 0.06], + [100, 50, 30, 15, 10, 5, 3, 1], + [0.99, 0.8, 0.4, 0.1, 0, 0, 0, 0]]) + + total_customers = 400100*factor + # customer_to_reviews_factor = np.multiply(distribution_parameters[0], + # distribution_parameters[1]).sum() + + # calculate the number of customers in each review frequency group + customers_per_group = distribution_parameters[0]*total_customers + + # create array indices to split customer_ids into cohorts + split_indices = customers_per_group.cumsum() + + # generate customer ids: + customer_ids = np.arange(100000, 100000 + total_customers) + + # create arrays with customer_ids: these will generate reviews at different + # frequency as defined in the customer_review_insight[1] + customer_cohorts = np.split(customer_ids, split_indices[0:-1].astype(int)) + + # replicate customer_ids according to the number of reviews to generate + # the array of the size of the reviews dataset + customers_list = [] + for i in range(len(customer_cohorts)): + customers_list.append(np.repeat(customer_cohorts[i], + distribution_parameters[1][i])) + customers_array = np.hstack(customers_list) + + # generate insight vector which needs to be tightly associated with + # customers + insight_list = [] + for i in range(distribution_parameters.shape[1]): + length_needed = (distribution_parameters[1][i]*customer_cohorts[i].shape[0]).astype(int) + probability_needed = distribution_parameters[2][i] + insight_list.append(random_number_generator.choice(['Y', 'N'], + length_needed, + p = [probability_needed, + 1 - probability_needed])) + insight_array = np.hstack(insight_list) + + # combine customers and the insight flag into a single array: + # it should be shuffled for later use + cust_insight_combined = np.vstack([customers_array, insight_array]) + + return 
cust_insight_combined + + +def shuffle_customer_insight(random_number_generator, customer_insight_array): + """Shuffles the array of customers and the insight flag. + + Args: + customer_insight_array: numpy array of the shape (2, number_of_reviews) + + Returns: + numpy array of the shape (2, number_of_reviews) shuffled. + + """ + # create indices + ind = np.arange(customer_insight_array.shape[1]) + # shuffled the indices + random_number_generator.shuffle(ind) + # subset the original array using randomized indices + customer_insight_shuffled = customer_insight_array[:, ind] + + return customer_insight_shuffled + + +def subset_customer_data(random_number_generator, customer_insight_array): + """Reduce the customer+insight array by a random number between 100 and 2000 + + The goal is to introduce some variability to the datasets generated since + the overall size is defined by function create_customers_insight and the + associated factor. + + Args: + customer_insight_array: numpy array of the shape (2, number_of_reviews) + random_number_generator: random number generator object from numpy + + Returns: + numpy array of the shape (2, number_of_reviews - (random number between + 100 and 2000)) + """ + + rand_number = random_number_generator.integers(100, 2001) + print(rand_number) + customers_to_remove = random_number_generator.choice(np.arange(0,customer_insight_array.shape[1]), size = rand_number) + return np.delete(customer_insight_array, customers_to_remove, axis = 1) + + +def generate_dates_even_per_month(random_number_generator, years, days_per_month): + """ Generates review dates given a list of years + + This function was used to generate the same number of dates per months for the Jewelry + dataset where trends needed to be observed. + + Args: + years: list of years for which to generate dates + + days_per_month: how many dates is needed for each month, int. 
The same number of random + dates will be generated for each month + + random_number_generator: random number generator object from numpy + + Returns: list of dates in the natural order for month, the same order for years as in the input arg 'years' + """ + output = [] + # source of days per each month + month_days = {'01': 31, '02': 28, '03': 31, '04':30, '05':31, '06':30, '07':31, '08':31, '09':30, '10':31, '11': 30, '12': 31} + for e in years: + for m,d in month_days.items(): + # generate an array of days for a given month, convert to string + d_range = np.arange(1, d+1).astype(str) + # pad the array of days with 0s on the right to ensure correct date representation + d_range_str = np.char.zfill(d_range, 2) + # create an array of 'year-month-' strings + year_month = np.repeat(str(e) + '-' + m + '-', d_range_str.shape[0]) + # concatenate the array of 'year-month-' strings with the days + source_dates = np.char.add(year_month, d_range_str) + # create a sample of dates for a given year/month, add to the output array + output.extend(random_number_generator.choice(source_dates, size = days_per_month)) + return output + + +def generate_dates(random_number_generator, year_array): + """Generate random dates for each instance of year in the input array + + The function will subset from all dates in 2023 (non-leap) in uniform fashion. + + Args: + year_array: numpy array of years, shape is (number_of_reviews, ). + random_number_generator: random number generator object from numpy. + + Returns: pandas DataFrame with 2 columns ("review_year" and "review_date" in ISO format) and + the number of rows equal to the size of the year_array input. 
+ """ + # we will sample from a single non-leap year + test_date1, test_date2 = np.datetime64('2023-01-01'), np.datetime64('2023-12-31') + + # calculate the difference in days + dates_bet = test_date2 - test_date1 + + # sample the required number of offsets, to be added to test_date1 + + days_offset = random_number_generator.choice(dates_bet.astype(int), + size = year_array.shape[0]).astype('timedelta64[D]') + + # create a new datetime object by adding the time delta + start_date = np.repeat(test_date1, year_array.shape[0], axis = 0) + random_dates = start_date + days_offset + + # extract month and day, assemble results to a data frame + months = random_dates.astype('datetime64[M]').astype(int) % 12 + 1 + days = (random_dates - random_dates.astype('datetime64[M]') + 1).astype(int) + + # assemble into a data frame and join to get dates, it was easier to work with pandas than numpy + # in this case + year_ser = pd.Series(year_array.astype(str)) + days_ser = pd.Series(days.astype(str)).str.pad(width = 2, side = 'left', fillchar = '0') + month_ser = pd.Series(months.astype(str)).str.pad(width = 2, side = 'left', fillchar = '0') + df = pd.concat([year_ser, month_ser, days_ser], axis = 1) + df.columns = ["review_year", "month", "day"] + df['review_date'] = df.agg('-'.join, axis=1) + + return df[["review_year", "review_date"]] + + +def generate_products(random_number_generator, product_components, n): + """Generate 2D array with product names and the associated product ids. + + The function generates a list of 10,000 random combinations of product + names, product prefixes (color etc), product suffixes (to be added after + product name and a comma). With the this pool it then generates the final + list of n products and the product ids according to the exponential + distribution (scale = 1). Product id is generated as a random integer from + 10,000 to 100,000. + + Args: + random_number_generator: random number generator object from numpy. 
+ + product_components: list of 1 or 3 lists + If 3 lists are found we assume that the first one contains prefixes + (such as color, material, make, etc.), the second list contains product + names, and the third list contains sufffixes (such as 'with + stones','with knobs', 'with extra features', 'perfect gift' etc). In + this case we will generate new product names by creating random + combinations between prefixes, names and suffixes. If 1 list is found + we assume that the product titles already provided and we will only + sample from them instead of creating random combinations. This + parameterer is used to create product titles + + n: integer, final number of product names and product ids to generate. + + Returns: + numpy array of the shape (2, n). + + """ + + if len(product_components) == 3: + # create a pool of 10000 products + all_products = [''.join(random_number_generator.choice(product_components[0]) + + ' ' + + random_number_generator.choice(product_components[1]) + + ', ' + + random_number_generator.choice(product_components[2])) + for x in range(10000)] + elif len(product_components) == 1: + all_products = random_number_generator.choice(product_components[0], + size = 10000) + else: + raise ValueError("The length of product_components list must be either 3 or 1.") + + # create random ids to be associated with the products + ids_pool = random_number_generator.choice(np.arange(10000,100000), + size = len(all_products)) + + # combine products and the ids + out = np.vstack((all_products, ids_pool), dtype = str) + + # draw from the exponential distribution + exp_weights = random_number_generator.exponential(scale = 1, + size = len(all_products)) + + # generate random indices according to the exponential distribution + indx = random_number_generator.choice(out.shape[1], size = n, + p = exp_weights/exp_weights.sum()) + + # create a random set of product names and product ids: + product_titles_ids = out[:,indx] + + return product_titles_ids + + +def 
generate_random_review_id(n): + """Generate n strings of length 15 + + This simulates unique review ids. We generate n strings which start with R + and end with a mix of uppercase letters and digits. + + Args: + n: integer, total number of strings to generate + + Returns: list of n strings + """ + + return ['R' + + ''.join( + random.choices(string.ascii_uppercase + string.digits, k=14) + ) + for x in range(n) + ] + + +def subset_array_exponential(random_number_generator, input_array, n, scale, sort = False): + """Generate an array of size n from a smaller array + + Generation follows exponential distribution. + + Example: generate n star ratings where each star rating (1 to 5) is drawn + from the shuffled exponential distribution + + Args: + random_number_generator: random number generator object from numpy. + + input_array: numpy array with the desired values (such as star ratings, + helpful votes, etc) + + n: integer, length of the final array + + scale: float. The scale parameter, inverse of the rate parameter, must be + non-negative. Used in numpy.random.exponential() + + sort: bool. Whether to sort the exponential weight. If False then the weights + are taken as is (random), if True then the weights are sorted in increasing + order. Sorting is recommended when the input_array is years to mimic increasing + number of reviews with time. 
+ + + + Returns: numpy array of shape (n,) + """ + + # draw from exponential distribution according to the size of the input array + exp_weights = random_number_generator.exponential(scale = scale, + size = input_array.shape[0]) + + if sort: + exp_weights.sort() + + # generate random indices according to the exponential distribution + indx = random_number_generator.choice(np.arange(0, input_array.shape[0]), + size = n, + p = exp_weights/exp_weights.sum()) + # randomly subset the input array to the needed size, return + return input_array[indx,] + + +def generate_marketplace_array(random_number_generator, n, scale, factor = 0.5): + """Generate array size n of marketplace codes + + Args: + random_number_generator: random number generator object from numpy. + + n: integer, length of the final array + + scale: float. The scale parameter, inverse of the rate parameter, must be + non-negative. + + factor: float, portion of marketplaces to subset. 1 = 100%, 0.5 = 50%. + 0.5 is the default + + Returns: numpy string array of shape (n, ) with marketplace codes generated + according to exponential distribution. + """ + mp_pool = np.array(['US', 'CA', 'MX', 'AR', 'AU', 'UK', 'BR', 'CN', 'CO', 'CR', + 'FR', 'DE', 'HK', 'IN', 'IT', 'JP', 'PL', 'SG', 'ES', 'CH'], + dtype = str) + mp_selected = random_number_generator.choice(mp_pool, + size = int(factor*mp_pool.shape[0])) + + return subset_array_exponential(random_number_generator, + mp_selected, + n, + scale = scale, + sort = False) + + + +def create_dataset(size_factor = 1, + total_votes = np.arange(0, 50, 5), + helpful_votes = np.arange(0, 31), + scale = 1, + review_years = np.arange(1997, 2015), + product_category = 'Electronics', + product_components = [["pink"], ["stove"], ["with knobs"]], + marketplace_factor = 1): + + """Generate columns for reviews + + + Args: size_factor: int + when = 1 the initial number of reviews generated is 3,224,806 from + 400,100 customers. 
To increase the number of customers (and, + correspondingly, reviews) use a different factor (2 for 400,100*2, 3 for + 400,100*3 etc). The final number of reviews will be reduced by a random + number between 100 and 2000. This defines the size of the final dataset + total_votes: numpy array + used to select a random number to assign the number of total votes to + each review. Can provide as numpy.arange() + helpful_votes: numpy array + used to select a random number to assign the number of helpful votes to + each review. Can provide as numpy.arange() + scale: float + beta or scale parameter in numpy.random.exponential. Will be used for + all data generators that rely on exponential distribution + review_years: numpy array + used to select years for reviews. Number of reviews per year follows + exponential distribution + product_category: string + name of the product category + product_components: list of 1 or 3 lists + If 3 lists are found we assume that the first one contains prefixes + (such as color, material, make, etc.), the second list contains product + names, and the third list contains sufffixes (such as 'with + stones','with knobs', 'with extra features', 'perfect gift' etc). In + this case we will generate new product names by creating random + combinations between prefixes, names and suffixes. If 1 list is found + we assume that the product titles already provided and we will only + sample from them instead of creating random combinations. This + parameterer is used to create product titles + marketplace_factor: float + used as % of marketplaces to use for the data. 
1 will use 20 + marketplaces, 0.5 will use 10 marketplaces etc + """ + + rng = np.random.default_rng() + + # generate customer ids and the insight column: + cust_ins = create_customers_insight(rng, factor = size_factor) + + # shuffle + cust_ins_shuffle = shuffle_customer_insight(rng, cust_ins) + + # reduce the final size by a small random number, output: numpy (2, n) + cust_ins_ready = subset_customer_data(rng, cust_ins_shuffle) + + # define the total number of rows in the final dataset + n = cust_ins_ready.shape[1] + + # generate product titles and product ids, output: numpy array (2, n) + products = generate_products(rng, product_components, n) + + # generate review identifiers, output: list of length n + random.seed() + review_id = generate_random_review_id(n) + + # generate star ratings, output: numpy array (n,) + stars = np.array([1, 2, 3, 4, 5]) + star_rating = subset_array_exponential(rng, stars, n, + scale = scale, sort = False) + + # generate total votes, output: numpy array (n,) + total_votes = subset_array_exponential(rng, total_votes, n, + scale = scale, sort = False) + + # generate helpful votes, output: numpy array (n,) + helpful_votes = subset_array_exponential(rng, helpful_votes, n, + scale = scale, sort = False) + + # generate review years, output: numpy array (n,) + review_years = subset_array_exponential(rng, review_years, n, + scale = scale, sort = True) + + # generate review dates, add review years; output: pandas DataFrame with 2 + # columns + review_dates_years = generate_dates(rng, review_years) + + # generate marketplace codes, output: numpy array (n,) + marketplace = generate_marketplace_array(rng, n, scale = scale, + factor = marketplace_factor) + + # assemble the results into pandas DataFrame + + dat = review_dates_years + dat["review_year"] = dat["review_year"].astype("int") + dat["review_date"] = pd.to_datetime(dat["review_date"]) + dat["product_category"] = product_category + dat["marketplace"] = marketplace + dat["customer_id"] = 
cust_ins_ready[0, :] + dat["insight"] = cust_ins_ready[1, :] + dat["review_id"] = review_id + dat["product_title"] = products[0, :] + dat["product_id"] = products[1, :] + dat["star_rating"] = star_rating + dat["star_rating"] = dat["star_rating"].astype("int") + dat["helpful_votes"] = helpful_votes + dat["helpful_votes"] = dat["helpful_votes"].astype("int") + dat["total_votes"] = total_votes + dat["total_votes"] = dat["total_votes"].astype("int") + + return dat + + + + + + + + + + \ No newline at end of file