Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for IR2Vec #615

Closed
Closed
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 47 additions & 1 deletion WORKSPACE
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,6 @@ cc_library(
name = "fmt",
srcs = glob(["src/*.cc"]),
hdrs = glob(["include/fmt/*.h"]),
copts = ["-Iexternal/fmt/include"],
strip_include_prefix = "include",
visibility = ["//visibility:public"],
)
Expand Down Expand Up @@ -358,3 +357,50 @@ http_archive(
load("@programl//tools:bzl/deps.bzl", "programl_deps")

programl_deps()

# === IR2Vec ===
# https://github.com/IITH-Compilers/IR2Vec

# Fetches the IR2Vec v1.1.0 release tarball and builds it using the inline
# BUILD file given in build_file_content.
http_archive(
name = "ir2vec",
# The inline BUILD file declares two targets:
#   * ":version" - a genrule that writes version.h containing
#     `#define IR2VEC_VERSION "1"`.
#   * ":ir2vec" - a public cc_library built from src/*.cpp plus the generated
#     version.h, exporting the headers in src/include (with that prefix
#     stripped) and depending on @eigen and @llvm//10.0.0.
build_file_content = """
genrule(
name = "version",
outs = ["version.h"],
cmd = "echo '#define IR2VEC_VERSION \\"1\\"' > $@",
)

cc_library(
name = "ir2vec",
srcs = glob(["src/*.cpp"]) + [":version.h"],
hdrs = glob(["src/include/*.h"]),
copts = ["-Iexternal/ir2vec/src/include"],
strip_include_prefix = "src/include",
visibility = ["//visibility:public"],
deps = [
"@eigen//:eigen",
"@llvm//10.0.0",
],
)
""",
# Checksum of the v1.1.0 tarball referenced in `urls`.
sha256 = "92cbe1d023593c2d45588caf2b1530795f376045e8bc3d2868ba349fb8d61ea5",
# Top-level directory inside the tarball that is stripped on extraction.
strip_prefix = "IR2Vec-1.1.0",
urls = ["https://github.com/IITH-Compilers/IR2Vec/archive/refs/tags/v1.1.0.tar.gz"],
)

# === Eigen ===
# https://eigen.tuxfamily.org/index.php?title=Main_Page

# Fetches the Eigen 3.3.7 source tarball. The inline BUILD file exposes
# everything under Eigen/ as headers of a public cc_library; the target has no
# srcs, so nothing is compiled for it.
http_archive(
name = "eigen",
build_file_content = """
cc_library(
name = "eigen",
hdrs = glob(["Eigen/**/*"]),
visibility = ["//visibility:public"],
)
""",
# Checksum of the 3.3.7 tarball referenced in `urls`.
sha256 = "d56fbad95abf993f8af608484729e3d87ef611dd85b3380a8bad1d5cbc373a57",
# Top-level directory inside the tarball that is stripped on extraction.
strip_prefix = "eigen-3.3.7",
urls = ["https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.gz"],
)
3 changes: 3 additions & 0 deletions compiler_gym/envs/llvm/service/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ filegroup(
name = "service",
srcs = [
":compiler_gym-llvm-service",
# Runtime data dependencies:
"//compiler_gym/third_party/ir2vec:embeddings",
] + select({
"@llvm//:darwin": [],
"//conditions:default": [
Expand Down Expand Up @@ -245,6 +247,7 @@ cc_library(
"//compiler_gym/util:GrpcStatusMacros",
"@boost//:filesystem",
"@glog",
"@ir2vec",
"@llvm//10.0.0",
"@magic_enum",
"@nlohmann_json//:json",
Expand Down
4 changes: 3 additions & 1 deletion compiler_gym/envs/llvm/service/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ if(DARWIN)
endif()
cg_filegroup(
NAME "service"
DEPENDS ${_DEPS}
DEPENDS
${_DEPS}
compiler_gym::third_party::ir2vec::embeddings
)

cg_genrule(
Expand Down
60 changes: 60 additions & 0 deletions compiler_gym/envs/llvm/service/Observation.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@
#include "compiler_gym/third_party/autophase/InstCount.h"
#include "compiler_gym/third_party/llvm/InstCount.h"
#include "compiler_gym/util/GrpcStatusMacros.h"
#include "compiler_gym/util/RunfilesPath.h"
#include "llvm/Bitcode/BitcodeWriter.h"
// #include "llvm/IR/Metadata.h"
#include "IR2Vec.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
#include "nlohmann/json.hpp"
Expand Down Expand Up @@ -83,6 +85,64 @@ Status setObservation(LlvmObservationSpace space, const fs::path& workingDirecto
*reply.mutable_int64_list()->mutable_value() = {features.begin(), features.end()};
break;
}
case LlvmObservationSpace::IR2VEC_FA: {
  // Program-level IR2Vec embedding computed in flow-aware mode and returned
  // as a list of doubles. The seed vocabulary file is located through
  // util::getRunfilesPath().
  const auto vocabularyPath = util::getRunfilesPath(
      "compiler_gym/third_party/ir2vec/seedEmbeddingVocab-300-llvm10.txt");

  IR2Vec::Embeddings ir2vec(benchmark.module(), IR2Vec::IR2VecMode::FlowAware,
                            vocabularyPath.string());
  const auto programVector = ir2vec.getProgramVector();

  auto* replyValues = reply.mutable_double_list()->mutable_value();
  *replyValues = {programVector.begin(), programVector.end()};
  break;
}
case LlvmObservationSpace::IR2VEC_SYM: {
  // Program-level IR2Vec embedding computed in symbolic mode and returned as
  // a list of doubles. The seed vocabulary file is located through
  // util::getRunfilesPath().
  const auto vocabularyPath = util::getRunfilesPath(
      "compiler_gym/third_party/ir2vec/seedEmbeddingVocab-300-llvm10.txt");

  IR2Vec::Embeddings ir2vec(benchmark.module(), IR2Vec::IR2VecMode::Symbolic,
                            vocabularyPath.string());
  const auto programVector = ir2vec.getProgramVector();

  auto* replyValues = reply.mutable_double_list()->mutable_value();
  *replyValues = {programVector.begin(), programVector.end()};
  break;
}
case LlvmObservationSpace::IR2VEC_FUN_FA: {
  // Function-level IR2Vec embeddings (flow-aware mode), serialized to a JSON
  // string. The result is a JSON array holding one single-key object per
  // function: [{"<function name>": [<embedding values>]}, ...].
  const auto ir2vecEmbeddingsPath = util::getRunfilesPath(
      "compiler_gym/third_party/ir2vec/seedEmbeddingVocab-300-llvm10.txt");
  IR2Vec::Embeddings embeddings(benchmark.module(), IR2Vec::IR2VecMode::FlowAware,
                                ir2vecEmbeddingsPath.string());
  // Bind by const reference so the map is not copied if the accessor returns
  // a reference (a by-value return is lifetime-extended and works equally).
  const auto& functionVectors = embeddings.getFunctionVecMap();
  json functionEmbeddings = json::array({});

  // Iterate by const reference: copying each entry would duplicate the whole
  // embedding vector of every function.
  for (const auto& entry : functionVectors) {
    json functionJson;
    // getName() yields an llvm::StringRef; .str() makes the conversion to
    // std::string explicit instead of relying on the implicit conversion.
    functionJson[entry.first->getName().str()] =
        std::vector<double>(entry.second.begin(), entry.second.end());
    functionEmbeddings.push_back(functionJson);
  }
  *reply.mutable_string_value() = functionEmbeddings.dump();
  break;
}
case LlvmObservationSpace::IR2VEC_FUN_SYM: {
  // Function-level IR2Vec embeddings (symbolic mode), serialized to a JSON
  // string. The result is a JSON array holding one single-key object per
  // function: [{"<function name>": [<embedding values>]}, ...].
  const auto ir2vecEmbeddingsPath = util::getRunfilesPath(
      "compiler_gym/third_party/ir2vec/seedEmbeddingVocab-300-llvm10.txt");
  IR2Vec::Embeddings embeddings(benchmark.module(), IR2Vec::IR2VecMode::Symbolic,
                                ir2vecEmbeddingsPath.string());
  // Bind by const reference so the map is not copied if the accessor returns
  // a reference (a by-value return is lifetime-extended and works equally).
  const auto& functionVectors = embeddings.getFunctionVecMap();
  json functionEmbeddings = json::array({});

  // Iterate by const reference: copying each entry would duplicate the whole
  // embedding vector of every function.
  for (const auto& entry : functionVectors) {
    json functionJson;
    // getName() yields an llvm::StringRef; .str() makes the conversion to
    // std::string explicit instead of relying on the implicit conversion.
    functionJson[entry.first->getName().str()] =
        std::vector<double>(entry.second.begin(), entry.second.end());
    functionEmbeddings.push_back(functionJson);
  }
  *reply.mutable_string_value() = functionEmbeddings.dump();
  break;
}
case LlvmObservationSpace::PROGRAML:
case LlvmObservationSpace::PROGRAML_JSON: {
// Build the ProGraML graph.
Expand Down
68 changes: 68 additions & 0 deletions compiler_gym/envs/llvm/service/ObservationSpaces.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ namespace compiler_gym::llvm_service {

// The number of features in the Autophase feature vector.
static constexpr size_t kAutophaseFeatureDim = 56;

// The number of features in the IR2Vec feature vector.
static constexpr size_t kIR2VecFeatureDim = 300;

// 4096 is the maximum path length for most filesystems.
static constexpr size_t kMaximumPathLength = 4096;

Expand Down Expand Up @@ -90,6 +94,70 @@ std::vector<ObservationSpace> getLlvmObservationSpaceList() {
defaultValue.begin(), defaultValue.end()};
break;
}
case LlvmObservationSpace::IR2VEC_FA: {
ScalarRange featureSize;
featureSize.mutable_min()->set_value(0.0);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure about this. This code says that values are in the range [0,∞], but when I run your code I see plenty of negative values:

>>> env.observation["Ir2vecFa"]
array([ 16.67847493,  -8.76068458, -49.90505585,  12.22742195,
        15.365804  , -10.31495722, -42.74864244, -10.26059285,
       -44.02249729,  23.81014121,  29.06372466,  24.08734658,
       -14.95525244,  19.8942756 , -29.91043964,  10.30582115,
       -24.02354845,   0.10980253,  -1.66427926,  14.17916835,
       -34.78192827,  37.14407874,  -8.33318256,  -3.45480279,
       -16.80741089, -32.45384884,  45.50566991,  37.82753753,
       -49.07060102,  -8.93597257, -52.5364784 ,   1.33546551,
       -12.41253508,  29.89899298,  10.97634208,  10.21049925,
        31.45356546,  16.61958681,  13.0980088 ,  -8.284721  ,

What are the bounds for embedding values?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that ScalarRange wasn't a good fit for the embeddings, since their values are not really bounded. I switched this to a Sequence type and fixed its length at 300 (for both min and max). Could you please check the new code?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the shape of the two non-function-level spaces? Is it a single 300 dimension vector? Or a list of 300 dimension vectors?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a single 300 dimension vector

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, it should be an int64_list then. You can copy the Autophase space and adjust the dimensionality and limits:

ScalarRange featureSize;
featureSize.mutable_min()->set_value(0);
std::vector<ScalarRange> featureSizes;
featureSizes.reserve(kAutophaseFeatureDim);
for (size_t i = 0; i < kAutophaseFeatureDim; ++i) {
featureSizes.push_back(featureSize);
}
*space.mutable_int64_range_list()->mutable_range() = {featureSizes.begin(),
featureSizes.end()};
space.set_deterministic(true);
space.set_platform_dependent(false);
std::vector<int64_t> defaultValue(kAutophaseFeatureDim, 0);
*space.mutable_default_value()->mutable_int64_list()->mutable_value() = {
defaultValue.begin(), defaultValue.end()};

If there is no lower bound, remove this line:

        featureSize.mutable_min()->set_value(0);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, it should be an int64_list then. You can copy the Autophase space and adjust the dimensionality and limits:

ScalarRange featureSize;
featureSize.mutable_min()->set_value(0);
std::vector<ScalarRange> featureSizes;
featureSizes.reserve(kAutophaseFeatureDim);
for (size_t i = 0; i < kAutophaseFeatureDim; ++i) {
featureSizes.push_back(featureSize);
}
*space.mutable_int64_range_list()->mutable_range() = {featureSizes.begin(),
featureSizes.end()};
space.set_deterministic(true);
space.set_platform_dependent(false);
std::vector<int64_t> defaultValue(kAutophaseFeatureDim, 0);
*space.mutable_default_value()->mutable_int64_list()->mutable_value() = {
defaultValue.begin(), defaultValue.end()};

If there is no lower bound, remove this line:

        featureSize.mutable_min()->set_value(0);

Did you mean a double_list? The values for the embeddings are floating-point numbers.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yes, of course, sorry :)

std::vector<ScalarRange> featureSizes;
featureSizes.reserve(kIR2VecFeatureDim);
for (size_t i = 0; i < kIR2VecFeatureDim; ++i) {
featureSizes.push_back(featureSize);
}
*space.mutable_double_range_list()->mutable_range() = {featureSizes.begin(),
featureSizes.end()};
space.set_deterministic(true);
space.set_platform_dependent(false);
std::vector<double> defaultValue(kIR2VecFeatureDim, 0.0);
*space.mutable_default_value()->mutable_double_list()->mutable_value() = {
defaultValue.begin(), defaultValue.end()};
break;
}
case LlvmObservationSpace::IR2VEC_SYM: {
  // Program-level symbolic IR2Vec embedding: kIR2VecFeatureDim doubles, with
  // one range entry per embedding dimension.
  //
  // The ranges are deliberately left without a min or max. Embedding
  // components are real-valued and are not bounded below, so declaring a
  // minimum of 0 would misreport the space.
  // NOTE(review): negative components were observed for the flow-aware
  // space; the symbolic space is built from the same seed vocabulary, so it
  // is treated the same way here - confirm.
  std::vector<ScalarRange> featureSizes(kIR2VecFeatureDim);
  *space.mutable_double_range_list()->mutable_range() = {featureSizes.begin(),
                                                         featureSizes.end()};
  space.set_deterministic(true);
  space.set_platform_dependent(false);
  // The default observation is the all-zero vector.
  std::vector<double> defaultValue(kIR2VecFeatureDim, 0.0);
  *space.mutable_default_value()->mutable_double_list()->mutable_value() = {
      defaultValue.begin(), defaultValue.end()};
  break;
}
case LlvmObservationSpace::IR2VEC_FUN_FA: {
space.set_opaque_data_format("json://");
space.mutable_string_size_range()->mutable_min()->set_value(0.0);
space.set_deterministic(true);
space.set_platform_dependent(false);
std::vector<double> defaultEmbs;
for (double i = 0; i < 300; i++) defaultEmbs.push_back(i);
ChrisCummins marked this conversation as resolved.
Show resolved Hide resolved
json vectorJson = defaultEmbs;
json FunctionKey;
json embeddings;
FunctionKey["default"] = vectorJson;
embeddings["embeddings"] = FunctionKey;
*space.mutable_default_value()->mutable_string_value() = embeddings.dump();
break;
}
case LlvmObservationSpace::IR2VEC_FUN_SYM: {
  // Function-level symbolic IR2Vec embeddings, exposed as a JSON string with
  // a minimum size of zero.
  space.set_opaque_data_format("json://");
  space.mutable_string_size_range()->mutable_min()->set_value(0.0);
  space.set_deterministic(true);
  space.set_platform_dependent(false);

  // Placeholder default value:
  //   {"embeddings": {"default": [0, 1, ..., kIR2VecFeatureDim - 1]}}
  // The vector length comes from kIR2VecFeatureDim rather than a repeated
  // literal, and the loop uses an integral counter.
  // NOTE(review): the IR2VEC_FUN_SYM case of setObservation() emits a JSON
  // array of {"<function name>": [...]} objects, which is a different shape
  // from this default - confirm which one consumers expect.
  std::vector<double> defaultEmbs;
  defaultEmbs.reserve(kIR2VecFeatureDim);
  for (size_t i = 0; i < kIR2VecFeatureDim; ++i) {
    defaultEmbs.push_back(static_cast<double>(i));
  }
  json embeddings;
  embeddings["embeddings"]["default"] = defaultEmbs;
  *space.mutable_default_value()->mutable_string_value() = embeddings.dump();
  break;
}
case LlvmObservationSpace::PROGRAML: {
// ProGraML serializes the graph to JSON.
space.set_opaque_data_format("json://networkx/MultiDiGraph");
Expand Down
58 changes: 55 additions & 3 deletions compiler_gym/envs/llvm/service/ObservationSpaces.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ namespace compiler_gym::llvm_service {
* 1. Add a new entry to this LlvmObservationSpace enum.
* 2. Add a new switch case to getLlvmObservationSpaceList() to return the
* ObservationSpace.
* 3. Add a new switch case to LlvmSession::getObservation() to compute
* the actual observation.
* 4. Run `bazel test //compiler_gym/...` and update the newly failing tests.
* 3. Add a new switch case to setObservation() to compute the actual
* observation.
* 4. Run `make test` and update the newly failing tests.
*/
enum class LlvmObservationSpace {
/**
Expand All @@ -46,6 +46,58 @@ enum class LlvmObservationSpace {
* deep reinforcement learning. FCCM.
*/
AUTOPHASE,
/**
* The IR2Vec Program Level Flow-Aware embeddings.
*
* From:
*
* S. VenkataKeerthy, Rohit Aggarwal, Shalini Jain, Maunendra Sankar Desarkar,
Ramakrishna Upadrasta, and Y. N. Srikant. (2020).
IR2VEC: LLVM IR Based Scalable Program Embeddings.
ACM Trans. Archit. Code Optim. 17, 4, Article 32 (December 2020), 27
pages. DOI:https://doi.org/10.1145/3418463
*
*/
IR2VEC_FA,
ChrisCummins marked this conversation as resolved.
Show resolved Hide resolved
/**
* The IR2Vec Program Level Symbolic embeddings.
*
* From:
*
* S. VenkataKeerthy, Rohit Aggarwal, Shalini Jain, Maunendra Sankar Desarkar,
Ramakrishna Upadrasta, and Y. N. Srikant. (2020).
IR2VEC: LLVM IR Based Scalable Program Embeddings.
ACM Trans. Archit. Code Optim. 17, 4, Article 32 (December 2020), 27
pages. DOI:https://doi.org/10.1145/3418463
*
*/
IR2VEC_SYM,
/**
* The IR2Vec Function Level Flow-Aware embeddings.
ChrisCummins marked this conversation as resolved.
Show resolved Hide resolved
*
* From:
*
* S. VenkataKeerthy, Rohit Aggarwal, Shalini Jain, Maunendra Sankar Desarkar,
Ramakrishna Upadrasta, and Y. N. Srikant. (2020).
IR2VEC: LLVM IR Based Scalable Program Embeddings.
ACM Trans. Archit. Code Optim. 17, 4, Article 32 (December 2020), 27
pages. DOI:https://doi.org/10.1145/3418463
*
*/
IR2VEC_FUN_FA,
/**
* The IR2Vec Function Level Symbolic embeddings.
*
* From:
*
* S. VenkataKeerthy, Rohit Aggarwal, Shalini Jain, Maunendra Sankar Desarkar,
Ramakrishna Upadrasta, and Y. N. Srikant. (2020).
IR2VEC: LLVM IR Based Scalable Program Embeddings.
ACM Trans. Archit. Code Optim. 17, 4, Article 32 (December 2020), 27
pages. DOI:https://doi.org/10.1145/3418463
*
*/
IR2VEC_FUN_SYM,
/**
* Returns the graph representation of a program as a networkx Graph.
*
Expand Down
12 changes: 12 additions & 0 deletions compiler_gym/third_party/ir2vec/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# IR2Vec. https://github.com/IITH-Compilers/IR2Vec

# Publicly visible file group containing the IR2Vec seed embedding vocabulary
# file (seedEmbeddingVocab-300-llvm10.txt) so other packages can depend on it.
filegroup(
name = "embeddings",
srcs = ["seedEmbeddingVocab-300-llvm10.txt"],
visibility = ["//visibility:public"],
)
12 changes: 12 additions & 0 deletions compiler_gym/third_party/ir2vec/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

cg_add_all_subdirs()

# File group "embeddings": the IR2Vec seed embedding vocabulary file that sits
# next to this CMakeLists.txt.
cg_filegroup(
NAME "embeddings"
FILES
"${CMAKE_CURRENT_LIST_DIR}/seedEmbeddingVocab-300-llvm10.txt"
)
Loading