From ee59a254331c4d63e79554ff905573dc330cb2ab Mon Sep 17 00:00:00 2001 From: davitbzh Date: Wed, 6 Sep 2023 23:56:26 +0200 Subject: [PATCH 1/6] java tutorial --- .gitignore | 2 +- java/README.md | 43 ++++++ java/pom.xml | 144 ++++++++++++++++++ java/setup/create_fg_fv.py | 139 +++++++++++++++++ .../tutorials/FeatureVectorBenchMarks.java | 66 ++++++++ 5 files changed, 393 insertions(+), 1 deletion(-) create mode 100644 java/README.md create mode 100644 java/pom.xml create mode 100644 java/setup/create_fg_fv.py create mode 100644 java/src/main/java/com/hopsworks/tutorials/FeatureVectorBenchMarks.java diff --git a/.gitignore b/.gitignore index f0849c11..3bd65a8f 100644 --- a/.gitignore +++ b/.gitignore @@ -124,7 +124,7 @@ dmypy.json .vscode *.iml target/ - +**target/ # Mac .DS_Store diff --git a/java/README.md b/java/README.md new file mode 100644 index 00000000..1b508a61 --- /dev/null +++ b/java/README.md @@ -0,0 +1,43 @@ +# Online Feature Vector Retrieval Using Java Application + +## Introduction +In this tutorial you will learn how to fetch feature vectors from online feature store for near real-time model serving +using external java application. + + + +## Clone tutorials repository +This section requires maven; java 1.8 and git. + +```bash +git clone https://github.com/logicalclocks/hopsworks-tutorials +cd ./hopsworks-tutorials/java +mvn clean package +``` + +## Create Feature Group and Feature View +This tutorial comes with pyspark program with a code to create feature group and feature view: +- `./setup/create_fg_fv.py` + +Feature group data is generated using `dbldatagen` library. + +You can execute this pyspark program directly on Hopsworks cluster. Follow the documentation how to set up and run +[spark jobs](https://docs.hopsworks.ai/hopsworks-api/3.3/generated/api/jobs/) + +## Execute java application: +Now you will create [connection](https://docs.hopsworks.ai/hopsworks-api/3.3/generated/api/connection/) with +your Hopsworks cluster. For this you need to have Hopsworks cluster host address and [api key](https://docs.hopsworks.ai/3.3/user_guides/projects/api_key/create_api_key/) + +Then define environment variables + +```bash +HOPSWORKS_HOST=REPLACE_WITH_YOUR_HOPSWORKS_CLUSTER_HOST +HOPSWORKS_API_KEY=REPLACE_WITH_YOUR_HOPSWORKS_API_KEY +HOPSWORKS_PROJECT_NAME=REPLACE_WITH_YOUR_HOPSWORKS_PROJECT_NAME +export FEATURE_VIEW_NAME=products_fv +export FEATURE_VIEW_VERSION=1 +``` + +```bash +java -jar ./target/hopsworks-java-tutorial-3.4.0-SNAPSHOT-jar-with-dependencies.jar $HOPSWORKS_HOST $HOPSWORKS_API_KEY $HOPSWORKS_PROJECT_NAME $FEATURE_VIEW_NAME $FEATURE_VIEW_VERSION +``` diff --git a/java/pom.xml b/java/pom.xml new file mode 100644 index 00000000..66eb5a8b --- /dev/null +++ b/java/pom.xml @@ -0,0 +1,144 @@ + + + 4.0.0 + + com.hopsworks.tutorials + hopsworks-java-tutorial + 3.4.0-SNAPSHOT + + + 1.8 + 1.8 + 14.0.1 + 4.5.6 + 4.4.13 + 2.22.0 + + + + + com.logicalclocks + hsfs + ${project.version} + + + + org.apache.httpcomponents + httpclient + ${httpclient.version} + + + + org.apache.httpcomponents + httpcore + ${httpcore.version} + + + + com.google.guava + guava + ${guava.version} + + + + mysql + mysql-connector-java + 8.0.33 + runtime + + + + com.google.guava + guava + 32.1.2-jre + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + + com.hopsworks.tutorials.FeatureVectorBenchMarks + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + 2.4.1 + + + + com.hopsworks.tutorials.FeatureVectorBenchMarks + + + + + jar-with-dependencies + + + + + make-assembly + + package + + single + + + + + + org.apache.maven.plugins + maven-surefire-plugin + ${surefire-plugin.version} + + + + hadoop.home.dir + ${project.basedir}/src/test/resources/hadoop/ + + + src/test/resources/system.properties + + + + + + src/test/resources + + + + + + Hops + Hops Repo + https://archiva.hops.works/repository/Hops/ + + true + + + true + + + + + + diff --git a/java/setup/create_fg_fv.py b/java/setup/create_fg_fv.py new file mode 100644 index 00000000..49faf909 --- /dev/null +++ b/java/setup/create_fg_fv.py @@ -0,0 +1,139 @@ +### 📝 Imports + +import dbldatagen as dg +from pyspark.sql import SparkSession +from pyspark.sql.types import IntegerType, FloatType, StringType, BooleanType, TimestampType, ArrayType +from pyspark.sql.functions import pandas_udf + +import pandas as pd +import numpy as np + +import hopsworks +import random + +### 👩🏻‍🔬 Generation Function +spark = SparkSession.builder.enableHiveSupport().getOrCreate() + + +def generate_data(n_rows): + df_spec = ( + dg.DataGenerator(spark, name="Products_Data", rows=n_rows, partitions=4) + .withColumn( + "user_id", + IntegerType(), + minValue=0, + maxValue=300000, + random=True, + ) + .withColumn( + "product_id", + IntegerType(), + minValue=0, + maxValue=1000, + random=True, + ) + .withColumn( + "timestamp", + TimestampType(), + random=True, + ) + .withColumn( + "col_float", + FloatType(), + expr="floor(rand() * 350) * (86400 + 3600)", + numColumns=110, + random=True, + ) + .withColumn( + "col_str", + StringType(), + numColumns=8, + values=['a', 'b', 'c', 'd', 'e', 'f', 'g'], + random=True, + + ) + .withColumn( + "col_int", + IntegerType(), + numColumns=6, + minValue=0, + maxValue=500, + random=True, + ) + .withColumn( + "col_bool", + BooleanType(), + numColumns=4, + random=True, + ) + ) + + df = df_spec.build() + + return df + + +@pandas_udf(ArrayType(IntegerType())) +def generate_list_col(rows: pd.Series) -> pd.Series: + return pd.Series([np.random.randint(100, size=random.randint(10, 31)) for _ in range(len(rows))]) + +@pandas_udf(ArrayType(IntegerType())) +def generate_click_col(rows: pd.Series) -> pd.Series: + return pd.Series([np.random.randint(10, size=random.randint(0, 5)) for _ in range(len(rows))]) + +## 🔮 Generate Data +n_rows = 5_000 + +data_generated_products = generate_data(n_rows) + +# Get the number of rows +num_rows = data_generated_products.count() + +# Get the number of columns +num_columns = len(data_generated_products.columns) + +print("Number of rows:", num_rows) +print("Number of columns:", num_columns) + +for i in range(6): + data_generated_products = data_generated_products.withColumn( + f'col_list_{i}', + generate_list_col(data_generated_products.product_id + i) + ) + +data_generated_products = data_generated_products.withColumn("clicks", generate_click_col(data_generated_products.product_id + i)) + +clicks_df = data_generated_products.select("user_id", "product_id", "timestamp", "clicks") +products_df = data_generated_products.drop("user_id", "clicks") + +# Get the number of rows +num_rows = data_generated_products.count() + +# Get the number of columns +num_columns = len(data_generated_products.columns) + +print("Number of rows:", num_rows) +print("Number of columns:", num_columns) + +## 🪄 Feature Group Creation +project = hopsworks.login() +fs = project.get_feature_store() + +products_fg = fs.get_or_create_feature_group( + name="products", + version=1, + description="Products Data", + primary_key=["product_id"], + event_time="timestamp", + stream=True, + online_enabled=True, +) +products_fg.insert(products_df) + +products_fg = fs.get_feature_group(name="products", version=1) + +fs.get_or_create_feature_view( + name='products_fv', + version=1, + query=products_fg.select_all(), +) diff --git a/java/src/main/java/com/hopsworks/tutorials/FeatureVectorBenchMarks.java b/java/src/main/java/com/hopsworks/tutorials/FeatureVectorBenchMarks.java new file mode 100644 index 00000000..b5f6d022 --- /dev/null +++ b/java/src/main/java/com/hopsworks/tutorials/FeatureVectorBenchMarks.java @@ -0,0 +1,66 @@ +package com.hopsworks.tutorials; + +import com.logicalclocks.hsfs.FeatureStore; +import com.logicalclocks.hsfs.FeatureView; +import com.logicalclocks.hsfs.HopsworksConnection; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +public class FeatureVectorBenchMarks { + + public static void main(String[] args) throws Exception { + + String host = args[0]; + String projectName = args[1]; + String apiKey = args[2]; + String fvName = args[3]; + Integer fvVersion = Integer.parseInt(args[4]); + + FeatureStore fs = HopsworksConnection.builder() + .host(host) + .project(projectName) + .apiKeyValue(apiKey) + .hostnameVerification(false) + .build() + .getFeatureStore(); + + // get feature view + FeatureView fv = fs.getFeatureView(fvName, fvVersion); + + // single lookup sering vector + List singleVector = fv.getFeatureVector(new HashMap() {{ + put("product_id", productIdGenerator()); + }}); + System.out.println("Feature values from single vector lookup"); + singleVector.stream().forEach(System.out::println); + + // batch lookup sering vector + fv.initServing(true, true); + List> batchVector = fv.getFeatureVectors(productIdGenerator(160)); + + // print results + for (List vector: batchVector) { + System.out.println("Feature values from batch lookup"); + vector.stream().forEach(System.out::println); + } + } + + private static int productIdGenerator() { + int leftLimit = 0; + int rightLimit = 1000; + return leftLimit + (int) (Math.random() * (rightLimit - leftLimit)); + } + + private static java.util.Map> productIdGenerator(int batch) { + List productIds = new ArrayList<>(); + while (productIds.size() <= batch){ + int productId = productIdGenerator(); + if (!productIds.contains(productId)) { + productIds.add(productId); + } + } + return new HashMap>() {{put("product_id", productIds);}}; + } +} From 963c47b98915f2e1abf3a6affcff4dd060921302 Mon Sep 17 00:00:00 2001 From: davitbzh Date: Thu, 7 Sep 2023 11:28:44 +0200 Subject: [PATCH 2/6] java client --- java/README.md | 2 -- java/pom.xml | 4 ++-- .../{FeatureVectorBenchMarks.java => FeatureVectors.java} | 4 +++- 3 files changed, 5 insertions(+), 5 deletions(-) rename java/src/main/java/com/hopsworks/tutorials/{FeatureVectorBenchMarks.java => FeatureVectors.java} (95%) diff --git a/java/README.md b/java/README.md index 1b508a61..e0809867 100644 --- a/java/README.md +++ b/java/README.md @@ -4,8 +4,6 @@ In this tutorial you will learn how to fetch feature vectors from online feature store for near real-time model serving using external java application. - - ## Clone tutorials repository This section requires maven; java 1.8 and git. diff --git a/java/pom.xml b/java/pom.xml index 66eb5a8b..2b3b4214 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -65,7 +65,7 @@ - com.hopsworks.tutorials.FeatureVectorBenchMarks + com.hopsworks.tutorials.FeatureVectors @@ -78,7 +78,7 @@ - com.hopsworks.tutorials.FeatureVectorBenchMarks + com.hopsworks.tutorials.FeatureVectors diff --git a/java/src/main/java/com/hopsworks/tutorials/FeatureVectorBenchMarks.java b/java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java similarity index 95% rename from java/src/main/java/com/hopsworks/tutorials/FeatureVectorBenchMarks.java rename to java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java index b5f6d022..280c4405 100644 --- a/java/src/main/java/com/hopsworks/tutorials/FeatureVectorBenchMarks.java +++ b/java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java @@ -3,12 +3,13 @@ import com.logicalclocks.hsfs.FeatureStore; import com.logicalclocks.hsfs.FeatureView; import com.logicalclocks.hsfs.HopsworksConnection; +import com.logicalclocks.hsfs.SecretStore; import java.util.ArrayList; import java.util.HashMap; import java.util.List; -public class FeatureVectorBenchMarks { +public class FeatureVectors { public static void main(String[] args) throws Exception { @@ -20,6 +21,7 @@ public static void main(String[] args) throws Exception { FeatureStore fs = HopsworksConnection.builder() .host(host) + .port(8181) .project(projectName) .apiKeyValue(apiKey) .hostnameVerification(false) From 77b46e76aaa868abe8b7074621d8ad2d4432c77e Mon Sep 17 00:00:00 2001 From: davitbzh Date: Thu, 7 Sep 2023 11:33:14 +0200 Subject: [PATCH 3/6] java client --- java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java | 1 - 1 file changed, 1 deletion(-) diff --git a/java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java b/java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java index 280c4405..966b5001 100644 --- a/java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java +++ b/java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java @@ -21,7 +21,6 @@ public static void main(String[] args) throws Exception { FeatureStore fs = HopsworksConnection.builder() .host(host) - .port(8181) .project(projectName) .apiKeyValue(apiKey) .hostnameVerification(false) From f3b5c9164ad81f6537a009b49ad023eb77ae5bb7 Mon Sep 17 00:00:00 2001 From: davitbzh <44586065+davitbzh@users.noreply.github.com> Date: Wed, 4 Oct 2023 10:46:39 +0200 Subject: [PATCH 4/6] Update java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java Co-authored-by: kennethmhc --- java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java b/java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java index 966b5001..c4a96cfd 100644 --- a/java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java +++ b/java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java @@ -35,7 +35,7 @@ public static void main(String[] args) throws Exception { put("product_id", productIdGenerator()); }}); System.out.println("Feature values from single vector lookup"); - singleVector.stream().forEach(System.out::println); + System.out.println("[" + Joiner.on(", ").join(singleVector) + "]"); // batch lookup sering vector fv.initServing(true, true); From dfbfc8b28f96d103107dd83ede5b2b3e7ae4c19c Mon Sep 17 00:00:00 2001 From: davitbzh <44586065+davitbzh@users.noreply.github.com> Date: Wed, 4 Oct 2023 10:46:48 +0200 Subject: [PATCH 5/6] Update java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java Co-authored-by: kennethmhc --- .../src/main/java/com/hopsworks/tutorials/FeatureVectors.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java b/java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java index c4a96cfd..72ac5b85 100644 --- a/java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java +++ b/java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java @@ -42,9 +42,9 @@ public static void main(String[] args) throws Exception { List> batchVector = fv.getFeatureVectors(productIdGenerator(160)); // print results + System.out.println("Feature values from batch lookup"); for (List vector: batchVector) { - System.out.println("Feature values from batch lookup"); - vector.stream().forEach(System.out::println); + System.out.println("[" + Joiner.on(", ").join(vector) + "]"); } } From 970dab036ea9760dfe5ae73305a0da9ffab829a3 Mon Sep 17 00:00:00 2001 From: davitbzh Date: Tue, 30 Jan 2024 16:57:10 +0100 Subject: [PATCH 6/6] bump version --- java/pom.xml | 2 +- java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/pom.xml b/java/pom.xml index 2b3b4214..b8c9d814 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -6,7 +6,7 @@ com.hopsworks.tutorials hopsworks-java-tutorial - 3.4.0-SNAPSHOT + 3.7.0-SNAPSHOT 1.8 diff --git a/java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java b/java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java index 72ac5b85..094652f1 100644 --- a/java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java +++ b/java/src/main/java/com/hopsworks/tutorials/FeatureVectors.java @@ -1,9 +1,9 @@ package com.hopsworks.tutorials; +import com.google.common.base.Joiner; import com.logicalclocks.hsfs.FeatureStore; import com.logicalclocks.hsfs.FeatureView; import com.logicalclocks.hsfs.HopsworksConnection; -import com.logicalclocks.hsfs.SecretStore; import java.util.ArrayList; import java.util.HashMap;