From 178da9f00dc9b74a38bbc1946ee122ad86a481dc Mon Sep 17 00:00:00 2001 From: JalenCato Date: Tue, 23 Jan 2024 21:55:27 +0000 Subject: [PATCH] Remove commented-out legacy tokenizer UDF and return scaled_df in dist_bert_transformation --- .../dist_bert_transformation.py | 23 +------------------ 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_bert_transformation.py b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_bert_transformation.py index 27990b68f8..1883013f0c 100644 --- a/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_bert_transformation.py +++ b/graphstorm-processing/graphstorm_processing/data_transformations/dist_transformations/dist_bert_transformation.py @@ -82,28 +82,7 @@ def tokenize(text): # Apply the UDF to the DataFrame scaled_df = input_df.withColumn(cols[0], tokenize(input_df[cols[0]])) - - # @udf(returnType=schema, useArrow=True) - # def tokenize(text): - # # Check if text is a string - # if not isinstance(text, str): - # raise ValueError("The input of the tokenizer has to be a string.") - # - # # Tokenize the text - # t = tokenizer(text, max_length=max_seq_length, truncation=True, padding='max_length', return_tensors='np') - # token_type_ids = t.get('token_type_ids', np.zeros_like(t['input_ids'], dtype=np.int8)) - # result = { - # 'input_ids': t['input_ids'][0].tolist(), # Convert tensor to list - # 'attention_mask': t['attention_mask'][0].astype(np.int8).tolist(), - # 'token_type_ids': token_type_ids[0].astype(np.int8).tolist() - # } - # return result - # - # # Define the UDF with the appropriate return type - # tokenize_udf = udf(tokenize, MapType(StringType(), ArrayType(IntegerType()))) - # - # # Apply the UDF to the DataFrame - # scaled_df = input_df.withColumn(cols[0], tokenize_udf(input_df[cols[0]])) + return scaled_df