From 28cc227f283b7df9e2e0421ab9ce890f6b5559f1 Mon Sep 17 00:00:00 2001
From: Aaron Liang <aaronliang@google.com>
Date: Fri, 9 Aug 2024 10:13:01 -0700
Subject: [PATCH] Add kubectl ray cluster log command

---
 kubectl-plugin/README.md               |  53 +--
 kubectl-plugin/pkg/cmd/log/log.go      | 301 +++++++++++++++
 kubectl-plugin/pkg/cmd/log/log_test.go | 504 +++++++++++++++++++++++++
 kubectl-plugin/pkg/cmd/ray.go          |   2 +
 4 files changed, 839 insertions(+), 21 deletions(-)
 create mode 100644 kubectl-plugin/pkg/cmd/log/log.go
 create mode 100644 kubectl-plugin/pkg/cmd/log/log_test.go

diff --git a/kubectl-plugin/README.md b/kubectl-plugin/README.md
index ca606d3db3..7f3a23bc41 100644
--- a/kubectl-plugin/README.md
+++ b/kubectl-plugin/README.md
@@ -23,24 +23,35 @@ Kubectl plugin/extension for Kuberay CLI that provides the ability to manage ray
     Aliases:
     get, list
 
-    Flags:
-    -A, --all-namespaces                 If present, list the requested clusters across all namespaces. Namespace in current context is ignored even if specified with --namespace.
-        --as string                      Username to impersonate for the operation. User could be a regular user or a service account in a namespace.
-        --as-group stringArray           Group to impersonate for the operation, this flag can be repeated to specify multiple groups.
-        --as-uid string                  UID to impersonate for the operation.
-        --cache-dir string               Default cache directory
-        --certificate-authority string   Path to a cert file for the certificate authority
-        --client-certificate string      Path to a client certificate file for TLS
-        --client-key string              Path to a client key file for TLS
-        --cluster string                 The name of the kubeconfig cluster to use
-        --context string                 The name of the kubeconfig context to use
-        --disable-compression            If true, opt-out of response compression for all requests to the server
-    -h, --help                           help for get
-        --insecure-skip-tls-verify       If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure
-        --kubeconfig string              Path to the kubeconfig file to use for CLI requests.
-    -n, --namespace string               If present, the namespace scope for this CLI request
-        --request-timeout string         The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0")
-    -s, --server string                  The address and port of the Kubernetes API server
-        --tls-server-name string         Server name to use for server certificate validation. If it is not provided, the hostname used to contact the server is used
-        --token string                   Bearer token for authentication to the API server
-        --user string                    The name of the kubeconfig user to use
+    Description:
+    Retrieves the ray cluster information.
+
+    ```
+    $ kubectl ray cluster get
+    NAME                       NAMESPACE   DESIRED WORKERS   AVAILABLE WORKERS   CPUS   GPUS   TPUS   MEMORY   AGE
+    random-kuberay-cluster   default     1                 1                   5      1      0      24Gi     13d
+    ```
+
+### Retrieve Ray Cluster Head Logs
+
+    Usage:
+    log (RAY_CLUSTER_NAME) [--out-dir directory] [--node-type all|head|worker]
+    **Currently only `head` is supported**
+
+    Aliases:
+    log, logs
+
+    Description:
+    Retrieves ray cluster head pod logs and all the logs under `/tmp/ray/session_latest/logs/`
+
+    ```
+    $ kubectl ray cluster logs --out-dir temp-dir
+    ...
+    $ ls temp-dir/
+    stable-diffusion-cluster-head-hqcxt
+    $ ls temp-dir/stable-diffusion-cluster-head-hqcxt
+    agent-424238335.err dashboard_agent.log gcs_server.err      monitor.err     old                     raylet.out              stdout.log
+    agent-424238335.out debug_state.txt     gcs_server.out      monitor.log     ray_client_server.err   runtime_env_agent.err
+    dashboard.err       debug_state_gcs.txt log_monitor.err     monitor.out     ray_client_server.out   runtime_env_agent.log
+    dashboard.log       events              log_monitor.log     nsight          raylet.err              runtime_env_agent.out
+    ```
diff --git a/kubectl-plugin/pkg/cmd/log/log.go b/kubectl-plugin/pkg/cmd/log/log.go
new file mode 100644
index 0000000000..0f09e41b07
--- /dev/null
+++ b/kubectl-plugin/pkg/cmd/log/log.go
@@ -0,0 +1,301 @@
+package log
+
+import (
+	"archive/tar"
+	"bytes"
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"log"
+	"math"
+	"net/url"
+	"os"
+	"path"
+	"path/filepath"
+
+	"github.com/spf13/cobra"
+	corev1 "k8s.io/api/core/v1"
+	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/cli-runtime/pkg/genericclioptions"
+	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
+	"k8s.io/client-go/rest"
+	"k8s.io/client-go/tools/remotecommand"
+	cmdutil "k8s.io/kubectl/pkg/cmd/util"
+)
+
+const filePathInPod = "/tmp/ray/session_latest/logs/"
+
+type ClusterLogOptions struct {
+	configFlags *genericclioptions.ConfigFlags
+	ioStreams   *genericclioptions.IOStreams
+	Executor    RemoteExecutor
+	outputDir   string
+	nodeType    string
+	args        []string
+}
+
+func NewClusterLogOptions(streams genericclioptions.IOStreams) *ClusterLogOptions {
+	return &ClusterLogOptions{
+		configFlags: genericclioptions.NewConfigFlags(true),
+		ioStreams:   &streams,
+		Executor:    &DefaultRemoteExecutor{},
+	}
+}
+
+func NewClusterLogCommand(streams genericclioptions.IOStreams) *cobra.Command {
+	options := NewClusterLogOptions(streams)
+	// Initialize the factory for later use with the current config flag
+	cmdFactory := cmdutil.NewFactory(options.configFlags)
+
+	cmd := &cobra.Command{
+		Use:          "log (RAY_CLUSTER_NAME) [--out-dir DIR_PATH] [--node-type all|head|worker]",
+		Short:        "Get ray cluster log",
+		Aliases:      []string{"logs"},
+		SilenceUsage: true,
+		RunE: func(cmd *cobra.Command, args []string) error {
+			if err := options.Complete(args); err != nil {
+				return err
+			}
+			if err := options.Validate(); err != nil {
+				return err
+			}
+			return options.Run(cmd.Context(), cmdFactory)
+		},
+	}
+	cmd.Flags().StringVar(&options.outputDir, "out-dir", options.outputDir, "File Directory PATH of where to download the file logs to.")
+	cmd.Flags().StringVar(&options.nodeType, "node-type", options.nodeType, "Type of Ray node to download the files for.")
+	options.configFlags.AddFlags(cmd.Flags())
+	return cmd
+}
+
+func (options *ClusterLogOptions) Complete(args []string) error {
+	options.args = args
+
+	if options.nodeType == "" {
+		options.nodeType = "head"
+	}
+
+	return nil
+}
+
+func (options *ClusterLogOptions) Validate() error {
+	// Overrides and binds the kube config then retrieves the merged result
+	config, err := options.configFlags.ToRawKubeConfigLoader().RawConfig()
+	if err != nil {
+		return fmt.Errorf("Error retrieving raw config: %w", err)
+	}
+	if len(config.CurrentContext) == 0 {
+		return fmt.Errorf("no context is currently set, use %q to select a new one", "kubectl config use-context <context>")
+	}
+
+	// Command must have ray cluster name
+	if len(options.args) != 1 {
+		return fmt.Errorf("must have at only one argument")
+	} else if options.outputDir == "" {
+		fmt.Fprintln(options.ioStreams.Out, "No output directory specified, creating dir under current directory using cluster name.")
+		options.outputDir = options.args[0]
+		err := os.MkdirAll(options.outputDir, 0o755)
+		if err != nil {
+			return fmt.Errorf("could not create directory with cluster name %s: %w", options.outputDir, err)
+		}
+	}
+
+	switch options.nodeType {
+	case "all":
+		return fmt.Errorf("node type `all` is currently not supported")
+	case "head":
+		break
+	case "worker":
+		return fmt.Errorf("node type `worker` is currently not supported")
+	default:
+		return fmt.Errorf("unknown node type `%s`", options.nodeType)
+	}
+
+	info, err := os.Stat(options.outputDir)
+	if os.IsNotExist(err) {
+		return fmt.Errorf("Directory does not exist. Failed with: %w", err)
+	} else if err != nil {
+		return fmt.Errorf("Error occurred will checking directory: %w", err)
+	} else if !info.IsDir() {
+		return fmt.Errorf("Path is Not a directory. Please input a directory and try again")
+	}
+
+	return nil
+}
+
+func (options *ClusterLogOptions) Run(ctx context.Context, factory cmdutil.Factory) error {
+	kubeClientSet, err := factory.KubernetesClientSet()
+	if err != nil {
+		return fmt.Errorf("failed to retrieve kubernetes client set: %w", err)
+	}
+
+	var listopts v1.ListOptions
+	if options.nodeType == "head" {
+		listopts = v1.ListOptions{
+			LabelSelector: fmt.Sprintf("ray.io/group=headgroup, ray.io/cluster=%s", options.args[0]),
+		}
+	}
+
+	// Get list of nodes that are considered ray heads
+	rayHeads, err := kubeClientSet.CoreV1().Pods(*options.configFlags.Namespace).List(ctx, listopts)
+	if err != nil {
+		return fmt.Errorf("failed to retrieve head node for cluster %s: %w", options.args[0], err)
+	}
+
+	// Get a list of logs of the ray heads.
+	var logList []*bytes.Buffer
+	for _, rayHead := range rayHeads.Items {
+		request := kubeClientSet.CoreV1().Pods(rayHead.Namespace).GetLogs(rayHead.Name, &corev1.PodLogOptions{})
+
+		podLogs, err := request.Stream(ctx)
+		if err != nil {
+			return fmt.Errorf("Error retrieving log for kuberay-head %s: %w", rayHead.Name, err)
+		}
+		defer podLogs.Close()
+
+		// Get current logs:
+		buf := new(bytes.Buffer)
+		_, err = io.Copy(buf, podLogs)
+		if err != nil {
+			return fmt.Errorf("Failed to get read current logs for kuberay-head %s: %w", rayHead.Name, err)
+		}
+
+		logList = append(logList, buf)
+	}
+
+	// Pod file name format is name of the ray head
+	for ind, logList := range logList {
+		curFilePath := filepath.Join(options.outputDir, rayHeads.Items[ind].Name, "stdout.log")
+		dirPath := filepath.Join(options.outputDir, rayHeads.Items[ind].Name)
+		err := os.MkdirAll(dirPath, 0o755)
+		if err != nil {
+			return fmt.Errorf("failed to create directory within path %s: %w", dirPath, err)
+		}
+		file, err := os.OpenFile(curFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o644)
+		if err != nil {
+			return fmt.Errorf("failed to create/open file for kuberay-head with path: %s: %w", curFilePath, err)
+		}
+		defer file.Close()
+
+		_, err = logList.WriteTo(file)
+		if err != nil {
+			return fmt.Errorf("failed to write to file for kuberay-head: %s: %w", rayHeads.Items[ind].Name, err)
+		}
+
+		req := kubeClientSet.CoreV1().RESTClient().
+			Get().
+			Namespace(rayHeads.Items[ind].Namespace).
+			Resource("pods").
+			Name(rayHeads.Items[ind].Name).
+			SubResource("exec").
+			VersionedParams(&corev1.PodExecOptions{
+				Command: []string{"tar", "--warning=no-file-changed", "-cf", "-", "-C", filePathInPod, "."},
+				Stdin:   true,
+				Stdout:  true,
+				Stderr:  true,
+				TTY:     false,
+			}, clientgoscheme.ParameterCodec)
+
+		restconfig, err := factory.ToRESTConfig()
+		if err != nil {
+			return fmt.Errorf("failed to get restconfig: %w", err)
+		}
+
+		exec, err := options.Executor.CreateExecutor(restconfig, req.URL())
+		if err != nil {
+			return fmt.Errorf("failed to create executor with error: %w", err)
+		}
+
+		err = options.downloadRayLogFiles(ctx, exec, rayHeads.Items[ind])
+		if err != nil {
+			return fmt.Errorf("failed to download ray head log files with error: %w", err)
+		}
+	}
+	return nil
+}
+
+// RemoteExecutor creates the executor for executing exec on the pod - provided for testing purposes
+type RemoteExecutor interface {
+	CreateExecutor(restConfig *rest.Config, url *url.URL) (remotecommand.Executor, error)
+}
+
+type DefaultRemoteExecutor struct{}
+
+// CreateExecutor returns the executor created by NewSPDYExecutor
+func (dre *DefaultRemoteExecutor) CreateExecutor(restConfig *rest.Config, url *url.URL) (remotecommand.Executor, error) {
+	return remotecommand.NewSPDYExecutor(restConfig, "POST", url)
+}
+
+// downloadRayLogFiles will use to the executor and retrieve the logs file from the inputted ray head
+func (options *ClusterLogOptions) downloadRayLogFiles(ctx context.Context, exec remotecommand.Executor, rayhead corev1.Pod) error {
+	outreader, outStream := io.Pipe()
+	go func() {
+		defer outStream.Close()
+		err := exec.StreamWithContext(ctx, remotecommand.StreamOptions{
+			Stdin:  options.ioStreams.In,
+			Stdout: outStream,
+			Stderr: options.ioStreams.ErrOut,
+			Tty:    false,
+		})
+		if err != nil {
+			log.Fatalf("Error occurred while calling remote command: %v", err)
+		}
+	}()
+
+	// Goes through the tar and create/copy them one by one into the destination dir
+	tarReader := tar.NewReader(outreader)
+	header, err := tarReader.Next()
+	if err != nil && !errors.Is(err, io.EOF) {
+		return fmt.Errorf("error will extracting head tar file for ray head %s: %w", rayhead.Name, err)
+	}
+	for !errors.Is(err, io.EOF) {
+		fmt.Printf("Downloading file %s for Ray Head %s\n", header.Name, rayhead.Name)
+		if err != nil {
+			return fmt.Errorf("Error reading tar archive: %w", err)
+		}
+
+		// Construct the full local path and a directory for the tmp file logs
+		localFilePath := filepath.Join(path.Clean(options.outputDir), path.Clean(rayhead.Name), path.Clean(header.Name))
+
+		switch header.Typeflag {
+		case tar.TypeDir:
+			if err := os.MkdirAll(localFilePath, 0o755); err != nil {
+				return fmt.Errorf("Error creating directory: %w", err)
+			}
+		case tar.TypeReg:
+			// Check for overflow: G115
+			if header.Mode < 0 || header.Mode > math.MaxUint32 {
+				fmt.Fprintf(options.ioStreams.Out, "file mode out side of accceptable value %d skipping file", header.Mode)
+			}
+			// Create file and write contents
+			outFile, err := os.OpenFile(localFilePath, os.O_CREATE|os.O_RDWR, os.FileMode(header.Mode)) //nolint:gosec // lint failing due to file mode conversion from uint64 to int32, checked above
+			if err != nil {
+				return fmt.Errorf("Error creating file: %w", err)
+			}
+			defer outFile.Close()
+			// This is to limit the copy size for a decompression bomb, currently set arbitrarily
+			for {
+				n, err := io.CopyN(outFile, tarReader, 1000000)
+				if err != nil {
+					if errors.Is(err, io.EOF) {
+						break
+					}
+					return fmt.Errorf("failed while writing to file: %w", err)
+				}
+				if n == 0 {
+					break
+				}
+			}
+		default:
+			fmt.Printf("Ignoring unsupported file type: %b", header.Typeflag)
+		}
+
+		header, err = tarReader.Next()
+		if header == nil && err != nil && !errors.Is(err, io.EOF) {
+			return fmt.Errorf("error while extracting tar file with error: %w", err)
+		}
+	}
+
+	return nil
+}
diff --git a/kubectl-plugin/pkg/cmd/log/log_test.go b/kubectl-plugin/pkg/cmd/log/log_test.go
new file mode 100644
index 0000000000..c5c38c58f6
--- /dev/null
+++ b/kubectl-plugin/pkg/cmd/log/log_test.go
@@ -0,0 +1,504 @@
+package log
+
+import (
+	"archive/tar"
+	"bytes"
+	"context"
+	"io"
+	"net/http"
+	"net/url"
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/cli-runtime/pkg/genericclioptions"
+	"k8s.io/cli-runtime/pkg/genericiooptions"
+	"k8s.io/cli-runtime/pkg/resource"
+	"k8s.io/client-go/rest"
+	restclient "k8s.io/client-go/rest"
+	"k8s.io/client-go/rest/fake"
+	"k8s.io/client-go/tools/clientcmd"
+	"k8s.io/client-go/tools/clientcmd/api"
+	"k8s.io/client-go/tools/remotecommand"
+	cmdtesting "k8s.io/kubectl/pkg/cmd/testing"
+	"k8s.io/kubectl/pkg/scheme"
+)
+
+// Mocked NewSPDYExecutor
+var fakeNewSPDYExecutor = func(method string, url *url.URL, inputbuf *bytes.Buffer) (remotecommand.Executor, error) {
+	return &fakeExecutor{method: method, url: url, buf: inputbuf}, nil
+}
+
+type fakeExecutor struct {
+	url    *url.URL
+	buf    *bytes.Buffer
+	method string
+}
+
+// Stream is needed for implementing remotecommand.Execute
+func (f *fakeExecutor) Stream(_ remotecommand.StreamOptions) error {
+	return nil
+}
+
+// downloadRayLogFiles uses StreamWithContext so this is the real function that we are mocking
+func (f *fakeExecutor) StreamWithContext(_ context.Context, options remotecommand.StreamOptions) error {
+	_, err := io.Copy(options.Stdout, f.buf)
+	return err
+}
+
+// createFakeTarFile creates the fake tar file that will be used for testing
+func createFakeTarFile() (*bytes.Buffer, error) {
+	// Create a buffer to hold the tar archive
+	tarbuff := new(bytes.Buffer)
+
+	// Create a tar writer
+	tw := tar.NewWriter(tarbuff)
+
+	// Define the files/directories to include
+	files := []struct {
+		ModTime time.Time
+		Name    string
+		Body    string
+		IsDir   bool
+		Mode    int64
+	}{
+		{time.Now(), "/", "", true, 0o755},
+		{time.Now(), "file1.txt", "This is the content of file1.txt\n", false, 0o644},
+		{time.Now(), "file2.txt", "Content of file2.txt inside subdir\n", false, 0o644},
+	}
+
+	// Add each file/directory to the tar archive
+	for _, file := range files {
+		hdr := &tar.Header{
+			Name:    file.Name,
+			Mode:    file.Mode,
+			ModTime: file.ModTime,
+			Size:    int64(len(file.Body)),
+		}
+		if file.IsDir {
+			hdr.Typeflag = tar.TypeDir
+		} else {
+			hdr.Typeflag = tar.TypeReg
+		}
+
+		// Write the header
+		if err := tw.WriteHeader(hdr); err != nil {
+			return nil, err
+		}
+
+		// Write the file content (if not a directory)
+		if !file.IsDir {
+			if _, err := tw.Write([]byte(file.Body)); err != nil {
+				return nil, err
+			}
+		}
+	}
+
+	// Close the tar writer
+	if err := tw.Close(); err != nil {
+		return nil, err
+	}
+	return tarbuff, nil
+}
+
+type FakeRemoteExecutor struct{}
+
+func (dre *FakeRemoteExecutor) CreateExecutor(_ *rest.Config, url *url.URL) (remotecommand.Executor, error) {
+	return fakeNewSPDYExecutor("GET", url, new(bytes.Buffer))
+}
+
+func TestRayClusterLogComplete(t *testing.T) {
+	testStreams, _, _, _ := genericclioptions.NewTestIOStreams()
+	fakeClusterLogOptions := NewClusterLogOptions(testStreams)
+	fakeArgs := []string{"Expectedoutput"}
+
+	err := fakeClusterLogOptions.Complete(fakeArgs)
+
+	assert.Equal(t, fakeClusterLogOptions.nodeType, "head")
+	assert.Nil(t, err)
+	assert.Equal(t, fakeClusterLogOptions.args, fakeArgs)
+}
+
+func TestRayClusterLogValidate(t *testing.T) {
+	testStreams, _, _, _ := genericclioptions.NewTestIOStreams()
+
+	testNS, testContext, testBT, testImpersonate := "test-namespace", "test-contet", "test-bearer-token", "test-person"
+
+	// Fake directory for kubeconfig
+	fakeDir, err := os.MkdirTemp("", "fake-config")
+	assert.Nil(t, err)
+	defer os.RemoveAll(fakeDir)
+
+	// Set up fake config for kubeconfig
+	config := &api.Config{
+		Clusters: map[string]*api.Cluster{
+			"test-cluster": {
+				Server:                "https://fake-kubernetes-cluster.example.com",
+				InsecureSkipTLSVerify: true, // For testing purposes
+			},
+		},
+		Contexts: map[string]*api.Context{
+			"my-fake-context": {
+				Cluster:  "my-fake-cluster",
+				AuthInfo: "my-fake-user",
+			},
+		},
+		CurrentContext: "my-fake-context",
+		AuthInfos: map[string]*api.AuthInfo{
+			"my-fake-user": {
+				Token: "", // Empty for testing without authentication
+			},
+		},
+	}
+
+	fakeFile := filepath.Join(fakeDir, ".kubeconfig")
+
+	if err := clientcmd.WriteToFile(*config, fakeFile); err != nil {
+		t.Fatalf("Failed to write kubeconfig to temp file: %v", err)
+	}
+
+	// Initialize the fake config flag with the fake kubeconfig and values
+	fakeConfigFlags := &genericclioptions.ConfigFlags{
+		Namespace:        &testNS,
+		Context:          &testContext,
+		KubeConfig:       &fakeFile,
+		BearerToken:      &testBT,
+		Impersonate:      &testImpersonate,
+		ImpersonateGroup: &[]string{"fake-group"},
+	}
+
+	tests := []struct {
+		name        string
+		opts        *ClusterLogOptions
+		expect      string
+		expectError string
+	}{
+		{
+			name: "Test validation when no context is set",
+			opts: &ClusterLogOptions{
+				configFlags: genericclioptions.NewConfigFlags(false),
+				outputDir:   fakeDir,
+				args:        []string{"fake-cluster"},
+				nodeType:    "head",
+				ioStreams:   &testStreams,
+			},
+			expectError: "no context is currently set, use \"kubectl config use-context <context>\" to select a new one",
+		},
+		{
+			name: "Test validation when more than 1 arg",
+			opts: &ClusterLogOptions{
+				// Use fake config to bypass the config flag checks
+				configFlags: fakeConfigFlags,
+				outputDir:   fakeDir,
+				args:        []string{"fake-cluster", "another-fake"},
+				nodeType:    "head",
+				ioStreams:   &testStreams,
+			},
+			expectError: "must have at only one argument",
+		},
+		{
+			name: "Test validation when node type is `all`",
+			opts: &ClusterLogOptions{
+				// Use fake config to bypass the config flag checks
+				configFlags: fakeConfigFlags,
+				outputDir:   fakeDir,
+				args:        []string{"fake-cluster"},
+				nodeType:    "all",
+				ioStreams:   &testStreams,
+			},
+			expectError: "node type `all` is currently not supported",
+		},
+		{
+			name: "Test validation when node type is `worker`",
+			opts: &ClusterLogOptions{
+				// Use fake config to bypass the config flag checks
+				configFlags: fakeConfigFlags,
+				outputDir:   fakeDir,
+				args:        []string{"fake-cluster"},
+				nodeType:    "worker",
+				ioStreams:   &testStreams,
+			},
+			expectError: "node type `worker` is currently not supported",
+		},
+		{
+			name: "Test validation when node type is `random-string`",
+			opts: &ClusterLogOptions{
+				// Use fake config to bypass the config flag checks
+				configFlags: fakeConfigFlags,
+				outputDir:   fakeDir,
+				args:        []string{"fake-cluster"},
+				nodeType:    "random-string",
+				ioStreams:   &testStreams,
+			},
+			expectError: "unknown node type `random-string`",
+		},
+		{
+			name: "Successful validation call",
+			opts: &ClusterLogOptions{
+				// Use fake config to bypass the config flag checks
+				configFlags: fakeConfigFlags,
+				outputDir:   fakeDir,
+				args:        []string{"random_arg"},
+				nodeType:    "head",
+				ioStreams:   &testStreams,
+			},
+			expectError: "",
+		},
+		{
+			name: "Validate output directory when no out-dir i set.",
+			opts: &ClusterLogOptions{
+				// Use fake config to bypass the config flag checks
+				configFlags: fakeConfigFlags,
+				outputDir:   "",
+				args:        []string{"cluster-name"},
+				nodeType:    "head",
+				ioStreams:   &testStreams,
+			},
+			expectError: "",
+		},
+		{
+			name: "Failed validation call with output directory not exist",
+			opts: &ClusterLogOptions{
+				// Use fake config to bypass the config flag checks
+				configFlags: fakeConfigFlags,
+				outputDir:   "randomPath-here",
+				args:        []string{"random_arg"},
+				nodeType:    "head",
+				ioStreams:   &testStreams,
+			},
+			expectError: "Directory does not exist. Failed with: stat randomPath-here: no such file or directory",
+		},
+		{
+			name: "Failed validation call with output directory is file",
+			opts: &ClusterLogOptions{
+				// Use fake config to bypass the config flag checks
+				configFlags: fakeConfigFlags,
+				outputDir:   fakeFile,
+				args:        []string{"random_arg"},
+				nodeType:    "head",
+				ioStreams:   &testStreams,
+			},
+			expectError: "Path is Not a directory. Please input a directory and try again",
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			err := tc.opts.Validate()
+			if tc.expectError != "" {
+				assert.Equal(t, tc.expectError, err.Error())
+			} else {
+				if tc.opts.outputDir == "" {
+					assert.Equal(t, tc.opts.args[0], tc.opts.outputDir)
+				}
+				assert.True(t, err == nil)
+			}
+		})
+	}
+}
+
+func TestRayClusterLogRun(t *testing.T) {
+	tf := cmdtesting.NewTestFactory().WithNamespace("test")
+	defer tf.Cleanup()
+
+	fakeDir, err := os.MkdirTemp("", "fake-directory")
+	assert.Nil(t, err)
+	defer os.RemoveAll(fakeDir)
+
+	testStreams, _, _, _ := genericiooptions.NewTestIOStreams()
+
+	fakeClusterLogOptions := NewClusterLogOptions(testStreams)
+	// Uses the mocked executor
+	fakeClusterLogOptions.Executor = &FakeRemoteExecutor{}
+	fakeClusterLogOptions.args = []string{"test-cluster"}
+	fakeClusterLogOptions.outputDir = fakeDir
+
+	// Create list of fake ray heads
+	rayHeadsList := &v1.PodList{
+		ListMeta: metav1.ListMeta{
+			ResourceVersion: "15",
+		},
+		Items: []v1.Pod{
+			{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-cluster-kuberay-head-1",
+					Namespace: "test",
+					Labels: map[string]string{
+						"ray.io/group":    "headgroup",
+						"ray.io/clusters": "test-cluster",
+					},
+				},
+				Spec: v1.PodSpec{
+					Containers: []v1.Container{
+						{
+							Name:  "mycontainer",
+							Image: "nginx:latest",
+						},
+					},
+				},
+				Status: v1.PodStatus{
+					Phase: v1.PodRunning,
+					PodIP: "10.0.0.1",
+				},
+			},
+			{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-cluster-kuberay-head-2",
+					Namespace: "test",
+					Labels: map[string]string{
+						"ray.io/group":    "headgroup",
+						"ray.io/clusters": "test-cluster",
+					},
+				},
+				Spec: v1.PodSpec{
+					Containers: []v1.Container{
+						{
+							Name:  "anothercontainer",
+							Image: "busybox:latest",
+						},
+					},
+				},
+				Status: v1.PodStatus{
+					Phase: v1.PodPending,
+				},
+			},
+		},
+	}
+
+	// create logs for multiple head pods and turn them into io streams so they can be returned with the fake client
+	fakeLogs := []string{
+		"This is some fake log data for first pod.\nStill first pod logs\n",
+		"This is some fake log data for second pod.\nStill second pod logs\n",
+	}
+	logReader1 := io.NopCloser(bytes.NewReader([]byte(fakeLogs[0])))
+	logReader2 := io.NopCloser(bytes.NewReader([]byte(fakeLogs[1])))
+
+	// fakes the client and the REST calls.
+	codec := scheme.Codecs.LegacyCodec(scheme.Scheme.PrioritizedVersionsAllGroups()...)
+	tf.Client = &fake.RESTClient{
+		GroupVersion:         v1.SchemeGroupVersion,
+		NegotiatedSerializer: resource.UnstructuredPlusDefaultContentConfig().NegotiatedSerializer,
+		Client: fake.CreateHTTPClient(func(req *http.Request) (*http.Response, error) {
+			switch req.URL.Path {
+			case "/api/v1/pods":
+				return &http.Response{StatusCode: http.StatusOK, Header: cmdtesting.DefaultHeader(), Body: cmdtesting.ObjBody(codec, rayHeadsList)}, nil
+			case "/api/v1/namespaces/test/pods/test-cluster-kuberay-head-1/log":
+				return &http.Response{StatusCode: http.StatusOK, Header: cmdtesting.DefaultHeader(), Body: logReader1}, nil
+			case "/api/v1/namespaces/test/pods/test-cluster-kuberay-head-2/log":
+				return &http.Response{StatusCode: http.StatusOK, Header: cmdtesting.DefaultHeader(), Body: logReader2}, nil
+			default:
+				t.Fatalf("request url: %#v,and request: %#v", req.URL, req)
+				return nil, nil
+			}
+		}),
+	}
+
+	tf.ClientConfigVal = &restclient.Config{
+		ContentConfig: restclient.ContentConfig{GroupVersion: &v1.SchemeGroupVersion},
+	}
+
+	err = fakeClusterLogOptions.Run(context.Background(), tf)
+	assert.Nil(t, err)
+
+	// Check that the two directories are there
+	entries, err := os.ReadDir(fakeDir)
+	assert.Nil(t, err)
+	assert.Equal(t, 2, len(entries))
+
+	assert.Equal(t, "test-cluster-kuberay-head-1", entries[0].Name())
+	assert.Equal(t, "test-cluster-kuberay-head-2", entries[1].Name())
+
+	// Check the first directory for the logs
+	for ind, entry := range entries {
+		currPath := filepath.Join(fakeDir, entry.Name())
+		currDir, err := os.ReadDir(currPath)
+		assert.Nil(t, err)
+		assert.Equal(t, 1, len(currDir))
+		openfile, err := os.Open(filepath.Join(currPath, "stdout.log"))
+		assert.Nil(t, err)
+		actualContent, err := io.ReadAll(openfile)
+		assert.Nil(t, err)
+		assert.Equal(t, fakeLogs[ind], string(actualContent))
+	}
+}
+
+func TestDownloadRayLogFiles(t *testing.T) {
+	fakeDir, err := os.MkdirTemp("", "fake-directory")
+	assert.Nil(t, err)
+	defer os.RemoveAll(fakeDir)
+
+	testStreams, _, _, _ := genericiooptions.NewTestIOStreams()
+
+	fakeClusterLogOptions := NewClusterLogOptions(testStreams)
+	fakeClusterLogOptions.args = []string{"test-cluster"}
+	fakeClusterLogOptions.outputDir = fakeDir
+
+	// create fake tar files to test
+	fakeTar, err := createFakeTarFile()
+	assert.Nil(t, err)
+
+	// Ray head needed for calling the downloadRayLogFiles command
+	rayHead := v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "test-cluster-kuberay-head-1",
+			Namespace: "test",
+			Labels: map[string]string{
+				"ray.io/group":    "headgroup",
+				"ray.io/clusters": "test-cluster",
+			},
+		},
+		Spec: v1.PodSpec{
+			Containers: []v1.Container{
+				{
+					Name:  "mycontainer",
+					Image: "nginx:latest",
+				},
+			},
+		},
+		Status: v1.PodStatus{
+			Phase: v1.PodRunning,
+			PodIP: "10.0.0.1",
+		},
+	}
+
+	executor, _ := fakeNewSPDYExecutor("GET", &url.URL{}, fakeTar)
+
+	err = fakeClusterLogOptions.downloadRayLogFiles(context.Background(), executor, rayHead)
+	assert.Nil(t, err)
+
+	entries, err := os.ReadDir(fakeDir)
+	assert.Nil(t, err)
+	assert.Equal(t, 1, len(entries))
+
+	// Assert the files
+	assert.True(t, entries[0].IsDir())
+	files, err := os.ReadDir(filepath.Join(fakeDir, entries[0].Name()))
+	assert.Nil(t, err)
+	assert.Equal(t, 2, len(files))
+
+	expectedfileoutput := []struct {
+		Name string
+		Body string
+	}{
+		{"file1.txt", "This is the content of file1.txt\n"},
+		{"file2.txt", "Content of file2.txt inside subdir\n"},
+	}
+
+	// Goes through and check the temp directory with the downloaded files
+	for ind, file := range files {
+		fileInfo, err := file.Info()
+		assert.Nil(t, err)
+		curr := expectedfileoutput[ind]
+
+		assert.Equal(t, curr.Name, fileInfo.Name())
+		openfile, err := os.Open(filepath.Join(fakeDir, entries[0].Name(), file.Name()))
+		assert.Nil(t, err)
+		actualContent, err := io.ReadAll(openfile)
+		assert.Nil(t, err)
+		assert.Equal(t, curr.Body, string(actualContent))
+	}
+}
diff --git a/kubectl-plugin/pkg/cmd/ray.go b/kubectl-plugin/pkg/cmd/ray.go
index 100f57e519..59b4d61622 100644
--- a/kubectl-plugin/pkg/cmd/ray.go
+++ b/kubectl-plugin/pkg/cmd/ray.go
@@ -6,6 +6,7 @@ import (
 	"github.com/spf13/cobra"
 
 	"github.com/ray-project/kuberay/kubectl-plugin/pkg/cmd/cluster"
+	"github.com/ray-project/kuberay/kubectl-plugin/pkg/cmd/log"
 	"github.com/ray-project/kuberay/kubectl-plugin/pkg/cmd/session"
 )
 
@@ -22,5 +23,6 @@ func NewRayCommand(streams genericiooptions.IOStreams) *cobra.Command {
 
 	cmd.AddCommand(cluster.NewClusterCommand(streams))
 	cmd.AddCommand(session.NewSessionCommand(streams))
+	cmd.AddCommand(log.NewClusterLogCommand(streams))
 	return cmd
 }