LeNet model for heterogeneous testing #393

Merged: 34 commits, Oct 30, 2024

Commits
cbbba15
temp
wdjyd Jun 19, 2024
0858f29
fix/maxpool2d_simplify
wdjyd Jun 19, 2024
f2fd572
fix/maxpool2d_simplify
wdjyd Jun 19, 2024
b2c4c29
add json_encoder and json_decoder
wdjyd Jun 20, 2024
d09dd73
add json_encoder and json_decoder
wdjyd Jun 20, 2024
4e77924
add gpu.container_module
wdjyd Jul 31, 2024
43b3624
[frontend] Add GPU MLIR lowering path with ReLU operation support
wdjyd Aug 12, 2024
2d4eef1
delete env.sh
wdjyd Aug 12, 2024
78f6bca
delete env.sh
wdjyd Aug 12, 2024
abce382
[BuddyTest] Add Test Model E2E example.
wdjyd Aug 16, 2024
3d00fe6
[BuddyTest] Add README.
wdjyd Aug 16, 2024
ae794aa
[BuddyTest] Add README.
wdjyd Aug 16, 2024
b57103c
[frontend] Add GPU MLIR lowering path with Conv2d operation support
wdjyd Aug 30, 2024
0adf1df
[frontend] Add GPU MLIR lowering path with MaxPool2d operation support
wdjyd Sep 2, 2024
f636341
[frontend] Fix Permute Op
wdjyd Sep 3, 2024
72cdc82
[frontend] Fix implementation error in permute and conv_2d operation
wdjyd Sep 10, 2024
cf703c7
[frontend] Add LeNet example for E2E execution in GPU device
wdjyd Sep 18, 2024
9a88cb2
[frontend] Add the custom subgraph partitioning interface
wdjyd Sep 21, 2024
2f91175
[frontend] Fix error in graph partitioning interface
wdjyd Sep 22, 2024
20be444
Merge remote-tracking branch 'origin/fix/maxpool2d_simplify' into wafer
wdjyd Sep 26, 2024
3e88a45
[frontend] Add JSON format interface for subgraph partitioning implem…
wdjyd Sep 26, 2024
29745ef
[frontend] Add JSON format interface for subgraph partitioning implem…
wdjyd Sep 26, 2024
7544589
standby
WuXintong123 Sep 29, 2024
00c9485
Merge branch 'buddy-compiler:main' into LeNet-GPU
WuXintong123 Oct 14, 2024
59e4cd8
Merge remote-tracking branch 'wdjyd/wafer' into LeNet-GPU
WuXintong123 Oct 15, 2024
82b92f8
The GPU OP-enabled version
WuXintong123 Oct 15, 2024
0c473dc
Merge branch 'buddy-compiler:main' into LeNet-GPU
WuXintong123 Oct 27, 2024
a8569cc
Separate for heterogeneous demo
WuXintong123 Oct 29, 2024
53d69d6
CPU, GPU, Custom
WuXintong123 Oct 29, 2024
2488faf
temp
WuXintong123 Oct 29, 2024
321927c
Pass the test
WuXintong123 Oct 30, 2024
5d5a844
correct
WuXintong123 Oct 30, 2024
69c4262
temp
WuXintong123 Oct 30, 2024
bf7ca39
final
WuXintong123 Oct 30, 2024
3 changes: 3 additions & 0 deletions examples/BuddyLeNet/.gitignore
@@ -3,8 +3,11 @@ log.ll
log.s
data
*.data
*.json
*.dot
__pycache__
*.pth
lenet.mlir
forward.mlir
subgraph0.mlir
subgraph1.mlir
58 changes: 53 additions & 5 deletions examples/BuddyLeNet/CMakeLists.txt
@@ -1,7 +1,7 @@
add_custom_command(
OUTPUT ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/forward.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/arg0.data
OUTPUT ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/forward.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/arg0.data
COMMAND python3 ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/buddy-lenet-import.py
COMMENT "Generating forward.mlir, subgraph0.mlir and parameter files"
COMMENT "Generating forward.mlir, subgraph0.mlir, subgraph1.mlir and parameter files"
)

add_custom_command(
@@ -50,13 +50,61 @@ add_custom_command(
COMMENT "Building subgraph0.o"
VERBATIM)

add_library(LENET STATIC subgraph0.o forward.o)
set(ONE_SHOT_BUFFERIZE_OPTION "bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map")
set(LOWER_TO_NVVM_OPTION "cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin")
add_custom_command(
OUTPUT subgraph1.o
COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" |
${LLVM_TOOLS_BINARY_DIR}/mlir-opt
-one-shot-bufferize=${ONE_SHOT_BUFFERIZE_OPTION}
-buffer-deallocation
-convert-linalg-to-parallel-loops
-canonicalize
-gpu-map-parallel-loops
-convert-parallel-loops-to-gpu
-gpu-kernel-outlining
-canonicalize
-cse |
${BUDDY_BINARY_DIR}/buddy-opt -convert-memcpy-to-gpu -gpu-async-region -canonicalize |
${LLVM_TOOLS_BINARY_DIR}/mlir-opt -llvm-request-c-wrappers --test-lower-to-nvvm=${LOWER_TO_NVVM_OPTION} |
${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
${LLVM_TOOLS_BINARY_DIR}/llvm-as |
${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o
DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir
COMMENT "Building subgraph1.o"
VERBATIM)

add_library(LENET STATIC subgraph0.o subgraph1.o forward.o)

SET_TARGET_PROPERTIES(LENET PROPERTIES LINKER_LANGUAGE C)

add_executable(buddy-lenet-run buddy-lenet-main.cpp)
target_link_directories(buddy-lenet-run PRIVATE ${LLVM_LIBRARY_DIR})

set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils ${PNG_LIBRARIES})

set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils mlir_async_runtime mlir_runner_utils mlir_cuda_runtime BuddyLibDIP ${PNG_LIBRARIES})
target_link_libraries(buddy-lenet-run ${BUDDY_LENET_LIBS})
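
Note on the new build rule: subgraph1.mlir (the GPU partition) is lowered in two hops — TOSA to named Linalg ops inside each function, then bufferization, conversion to parallel loops, mapping onto GPU blocks and threads, kernel outlining, and finally NVVM translation into an object file with an embedded fatbin (cubin-chip=sm_80 targets Ampere-class GPUs). Both subgraph objects are linked into the static LENET library, and the runner now also needs mlir_async_runtime, mlir_runner_utils, and mlir_cuda_runtime. To debug a single stage outside CMake, the same pipe can be replayed step by step; the sketch below is a minimal reproduction, assuming the LLVM and buddy tools are on PATH and subgraph1.mlir sits in the working directory:

```python
# Minimal sketch: replay the subgraph1.o pipeline outside CMake for
# debugging. Assumes mlir-opt, buddy-opt, mlir-translate, llvm-as, and
# llc are on PATH and subgraph1.mlir is in the working directory; every
# flag is copied verbatim from the CMake rule above.
import subprocess

STAGES = [
    ["mlir-opt", "subgraph1.mlir", "-pass-pipeline",
     "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg,"
     " tosa-to-tensor, tosa-to-arith))"],
    ["mlir-opt",
     "-one-shot-bufferize=bufferize-function-boundaries=1"
     " function-boundary-type-conversion=identity-layout-map",
     "-buffer-deallocation", "-convert-linalg-to-parallel-loops",
     "-canonicalize", "-gpu-map-parallel-loops",
     "-convert-parallel-loops-to-gpu", "-gpu-kernel-outlining",
     "-canonicalize", "-cse"],
    ["buddy-opt", "-convert-memcpy-to-gpu", "-gpu-async-region",
     "-canonicalize"],
    ["mlir-opt", "-llvm-request-c-wrappers",
     "--test-lower-to-nvvm=cubin-chip=sm_80 cubin-features=+ptx71"
     " cubin-format=fatbin"],
    ["mlir-translate", "-mlir-to-llvmir"],
    ["llvm-as"],
    ["llc", "-filetype=obj", "-relocation-model=pic", "-O0",
     "-o", "subgraph1.o"],
]

data = None
for cmd in STAGES:
    # Each stage consumes the previous stage's stdout, like the shell
    # pipe; the first stage reads subgraph1.mlir directly from disk.
    data = subprocess.run(cmd, input=data, capture_output=True,
                          check=True).stdout
```

Dumping `data` between stages shows exactly where a lowering breaks.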
22 changes: 16 additions & 6 deletions examples/BuddyLeNet/buddy-lenet-import.py
@@ -27,8 +27,14 @@

from buddy.compiler.frontend import DynamoCompiler
from buddy.compiler.graph import GraphDriver
from buddy.compiler.graph.transform import simply_fuse
from buddy.compiler.ops import tosa
from buddy.compiler.graph.transform import (
simply_fuse,
gpu_fuse,
custom_partition,
)
from buddy.compiler.graph.type import DeviceType
from buddy.compiler.ops import tosa, gpu
from buddy.compiler.graph.json_decoder import json_to_graph
from model import LeNet

# Retrieve the LeNet model path from environment variables.
@@ -56,13 +62,17 @@
assert len(graphs) == 1
graph = graphs[0]
params = dynamo_compiler.imported_params[graph]
pattern_list = [simply_fuse]
graphs[0].fuse_ops(pattern_list)
driver = GraphDriver(graphs[0])
driver.subgraphs[0].lower_to_top_level_ir()
pattern_list = [custom_partition]
graph.fuse_ops(pattern_list)
path_prefix = os.path.dirname(os.path.abspath(__file__))
driver = GraphDriver(graph)
driver.subgraphs[0].lower_to_top_level_ir()
with open(os.path.join(path_prefix, "subgraph0.mlir"), "w") as module_file:
print(driver.subgraphs[0]._imported_module, file=module_file)
# Add heterogeneous hardware partition
driver.subgraphs[1].lower_to_top_level_ir()
with open(os.path.join(path_prefix, "subgraph1.mlir"), "w") as module_file:
print(driver.subgraphs[1]._imported_module, file=module_file)
with open(os.path.join(path_prefix, "forward.mlir"), "w") as module_file:
print(driver.construct_main_graph(True), file=module_file)

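Note on the importer change: replacing simply_fuse with custom_partition makes GraphDriver produce one subgraph per device — subgraph0.mlir stays on the CPU/TOSA path while subgraph1.mlir is routed through the GPU pipeline above, and forward.mlir stitches the two together. The implementation of custom_partition is not part of this diff; purely as an illustration of the shape such a transform could take, here is a hypothetical device-tagging pass (only DeviceType comes from this PR — graph.body, node.device, and the op class names are assumptions):

```python
# Hypothetical sketch of a device-tagging partition pass. The real
# custom_partition lives in buddy.compiler.graph.transform and is not
# shown in this diff; every name below except DeviceType is illustrative.
from buddy.compiler.graph.type import DeviceType

GPU_FRIENDLY = {"Conv2dOp", "MaxPool2dOp", "ReluOp"}  # assumed op names

def tag_devices(graph):
    """Tag each op with a device so GraphDriver can split the graph into
    per-device subgraphs (subgraph0 -> CPU, subgraph1 -> GPU)."""
    for node in graph.body:            # 'body' is an assumed attribute
        if type(node).__name__ in GPU_FRIENDLY:
            node.device = DeviceType.GPU   # 'device' is assumed as well
        else:
            node.device = DeviceType.CPU
```

The json_to_graph import above suggests a partition can also be described declaratively in a *.json file, which matches the new .gitignore entries.
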
17 changes: 17 additions & 0 deletions examples/BuddyLeNet/makefile
@@ -20,6 +20,22 @@ MLIR_ASYNC_RUNTIME := ${LLVM_BUILD_DIR}/lib/libmlir_async_runtime.dylib
MTRIPLE := x86_64-apple-darwin
endif

buddy-gpu-matmul-lower:
@${BUDDY_OPT} subgraph0.mlir \
-transform-preload-library="transform-library-paths=transform.mlir" \
-transform-interpreter="entry-point=codegen" \
-o log.mlir

buddy-gpu-matmul:
@${BUDDY_OPT} subgraph0.mlir -transform-preload-library="transform-library-paths=transform.mlir" -transform-interpreter="entry-point=codegen" | \
${BUDDY_OPT} --pass-pipeline='builtin.module(func.func(nvgpu-optimize-shared-memory))' | \
${BUDDY_OPT} -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -linalg-bufferize -convert-linalg-to-affine-loops -affine-loop-fusion -affine-parallelize -lower-affine -canonicalize -func-bufferize -arith-bufferize -tensor-bufferize -buffer-deallocation -finalizing-bufferize -canonicalize | \
${BUDDY_OPT} -gpu-launch-sink-index-computations -canonicalize -legalize-shmem-outlining -canonicalize | \
${BUDDY_OPT} -convert-memcpy-to-gpu -gpu-async-region -canonicalize | \
${BUDDY_OPT} -convert-scf-to-cf -memref-expand -finalize-memref-to-llvm -convert-arith-to-llvm --convert-vector-to-llvm -convert-gpu-to-nvvm='has-redux=1' | \
${BUDDY_OPT} -llvm-request-c-wrappers -canonicalize -cse -sccp | \
${MLIR_OPT} --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" -o matmul-cubin.mlir

buddy-lenet-lower:
@${BUDDY_OPT} ./fake-lenet.mlir \
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | \
@@ -124,3 +140,4 @@ buddy-lenet-opt-run:
-reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}

2 changes: 2 additions & 0 deletions frontend/Python/frontend.py
@@ -45,6 +45,7 @@
from .graph import Graph, TensorDType, TensorMeta
from .graph.operation import *
from .graph.transform import maxpool2d_simplify
from .graph.type import *


class DynamoCompiler:
@@ -284,6 +285,7 @@ def _compiler(_gm: torch.fx.GraphModule, _inputs: List[torch.Tensor]):
fake_params,
self._ops_registry,
self._func_name,
DeviceType.CPU,
self._verbose
)
for gm_node in _gm.graph.nodes:
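
Note on the frontend change: every Graph built by DynamoCompiler now carries an explicit device tag, defaulting to DeviceType.CPU, which partition transforms can override before GraphDriver splits the graph. The enum itself is defined in frontend/Python/graph/type.py and is not shown in this diff; a minimal sketch of what it plausibly looks like, assuming the usual CPU/GPU pair:

```python
# Hypothetical sketch of the DeviceType enum implied by this diff; the
# actual definition in frontend/Python/graph/type.py may differ.
from enum import Enum

class DeviceType(Enum):
    CPU = "cpu"
    GPU = "gpu"
```

Defaulting the Dynamo-built graph to CPU keeps the existing single-device examples working unchanged; only graphs that pass through a partitioning transform ever see another device tag.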