MLPerf apps renaming #1162

Merged 6 commits on Mar 8, 2024 (changes from all commits shown below).
@@ -1,6 +1,6 @@
# Identification of this CM script
alias: reproduce-mlperf-inference-dummy
uid: 7dde40c221e144fb
alias: app-mlperf-inference-dummy
uid: 5b71627383a94576
cache: false

automation_alias: script
@@ -16,18 +16,18 @@ tags:
- mlperf
- inference
- harness
- dummy-harness
- dummy
- dummy-harness
- dummy
- redhat-harness
- redhat
- openshift-harness
- openshift

# Default environment
default_env:
CM_MLPERF_LOADGEN_SCENARIO: Offline
CM_MLPERF_LOADGEN_MODE: performance
CM_SKIP_PREPROCESS_DATASET: 'no'
CM_SKIP_MODEL_DOWNLOAD: 'no'
CM_MLPERF_SUT_NAME_IMPLEMENTATION_PREFIX: dummy
CM_MLPERF_SUT_NAME_IMPLEMENTATION_PREFIX: redhat_openshift
CM_MLPERF_SKIP_RUN: 'no'

env:
@@ -283,15 +283,8 @@ variations:

uint8:
group: precision
env:
CM_MLPERF_MODEL_PRECISION: uint8
fp16:
group: precision
env:
CM_MLPERF_MODEL_PRECISION: float16
fp32:
group: precision
default: true
env:
CM_MLPERF_MODEL_PRECISION: float32
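
Since the script keeps the standard CM interface after the rename, it can still be selected purely by tags. Below is a minimal, hypothetical sketch (not part of this PR) of invoking it through the cmind Python API, using the new redhat tag and the fp32 variation from the YAML above while overriding one of the default_env values; the exact input keys ('env', 'quiet', 'out') are assumptions based on common CM script usage.

import cmind

r = cmind.access({
    'action': 'run',
    'automation': 'script',
    # 'redhat' and '_fp32' correspond to the new tags and the fp32 variation above
    'tags': 'app,mlperf,inference,redhat,_fp32',
    # override one of the default_env values (performance -> accuracy)
    'env': {'CM_MLPERF_LOADGEN_MODE': 'accuracy'},
    'quiet': True,
    'out': 'con'
})
if r['return'] > 0:
    # CM calls report errors through the returned dict rather than exceptions
    print(r.get('error', 'unknown error'))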

@@ -38,6 +38,7 @@ def get_run_cmd(model, i):
outdir = env['CM_MLPERF_OUTPUT_DIR']
mlperf_conf_path = env['CM_MLPERF_CONF']
user_conf_path = env['CM_MLPERF_USER_CONF']
api_server = env.get('CM_MLPERF_INFERENCE_API_SERVER', 'localhost')
model_path = env['GPTJ_CHECKPOINT_PATH']
dataset_path = env['CM_DATASET_CNNDM_EVAL_PATH']
precision = env['CM_MLPERF_MODEL_PRECISION']
@@ -46,9 +47,9 @@ def get_run_cmd(model, i):
else:
accuracy_string = ""

run_cmd = f"python3 -u main.py --scenario {scenario} --model-path {model_path} --mlperf-conf {mlperf_conf_path} {accuracy_string} --user-conf {user_conf_path} --dataset-path {dataset_path} --output-log-dir {outdir} --dtype {precision} --device {device} "
run_cmd = f"python3 -u main.py --scenario {scenario} --model-path {model_path} --api-server {api_server} --api-model-name gpt-j-cnn --mlperf-conf {mlperf_conf_path} {accuracy_string} --vllm --user-conf {user_conf_path} --dataset-path {dataset_path} --output-log-dir {outdir} --dtype float32 --device {device} "

run_dir = os.path.join(env['CM_MLPERF_INFERENCE_IMPLEMENTATION_REPO'], "open", "Submitter", "code", "gptj-99")
run_dir = os.path.join(env['CM_MLPERF_INFERENCE_IMPLEMENTATION_REPO'], "open", "RedHat", "code", "gptj-99")

return {'return': 0, 'run_cmd': run_cmd, 'run_dir': run_dir}
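
The functional change in this harness wrapper is the vLLM/API-server path: get_run_cmd now reads CM_MLPERF_INFERENCE_API_SERVER with a localhost fallback and forwards it to main.py via --api-server together with --vllm and --api-model-name gpt-j-cnn. A small, hypothetical illustration of the fallback (resolve_api_server is an illustrative helper, not code from this PR):

import os

def resolve_api_server(env):
    # mirrors env.get('CM_MLPERF_INFERENCE_API_SERVER', 'localhost') in get_run_cmd
    return env.get('CM_MLPERF_INFERENCE_API_SERVER', 'localhost')

print(resolve_api_server({}))                                              # -> localhost
print(resolve_api_server({'CM_MLPERF_INFERENCE_API_SERVER': '10.0.0.5'}))  # -> 10.0.0.5
print(resolve_api_server(dict(os.environ)))                                # -> whatever the shell exports, else localhost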

@@ -1,5 +1,5 @@
# Identification of this CM script
alias: reproduce-mlperf-inference-intel
alias: app-mlperf-inference-intel
uid: c05a90433bb04cc1
cache: false
can_force_cache: true
@@ -1,5 +1,5 @@
# Identification of this CM script
alias: app-mlperf-inference-cpp
alias: app-mlperf-inference-mlcommons-cpp
uid: bf62405e6c7a44bf

automation_alias: script
@@ -0,0 +1,38 @@
FROM ubuntu:22.04

# Maintained by the MLCommons taskforce on automation and reproducibility
LABEL github="https://github.com/mlcommons/ck"
LABEL maintainer="https://cKnowledge.org/mlcommons-taskforce"

SHELL ["/bin/bash", "-c"]
ARG CM_GH_TOKEN

# Notes: https://runnable.com/blog/9-common-dockerfile-mistakes
# Install system dependencies
RUN apt-get update -y
RUN apt-get install -y python3 python3-pip git sudo wget

# Install python packages
RUN python3 -m pip install cmind requests giturlparse

# Setup docker environment
ENTRYPOINT ["/bin/bash", "-c"]
ENV TZ="US/Pacific"
ENV PATH="${PATH}:/home/cmuser/.local/bin"
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ >/etc/timezone

# Setup docker user
RUN groupadd cm
RUN useradd -g cm --create-home --shell /bin/bash cmuser
RUN echo "cmuser ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
USER cmuser:cm
WORKDIR /home/cmuser

# Download CM repo for scripts
RUN cm pull repo ctuning@mlcommons-ck

# Install all system dependencies
RUN cm run script --tags=get,sys-utils-cm --quiet

# Run commands
RUN cm run script --tags=app,mlperf,inference,_intel-original,_gptj-99 --quiet --fake_run --env.CM_RUN_STATE_DOCKER=True
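
The new Dockerfile prepares an Ubuntu 22.04 image with CM installed, a non-root cmuser, and a --fake_run of the Intel GPT-J harness, presumably to pre-install its dependencies into the image. Below is a hypothetical sketch (not part of this PR) of building and using the image from Python; the image tag is an arbitrary placeholder, while the docker flags are standard and the cm command is taken from the Dockerfile.

import os
import subprocess

tag = "cm-mlperf-intel-gptj"  # placeholder image name
build_cmd = ["docker", "build", "-t", tag, "."]
token = os.environ.get("CM_GH_TOKEN")
if token:
    # matches the ARG CM_GH_TOKEN declared at the top of the Dockerfile
    build_cmd += ["--build-arg", f"CM_GH_TOKEN={token}"]
subprocess.run(build_cmd, check=True)

# The ENTRYPOINT is "/bin/bash -c", so the argument below is executed as a shell
# command inside the container: the same cm invocation as the last RUN line,
# but without --fake_run, to perform a real run.
subprocess.run(
    ["docker", "run", "--rm", tag,
     "cm run script --tags=app,mlperf,inference,_intel-original,_gptj-99 --quiet"],
    check=True,
)
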
@@ -0,0 +1,132 @@
#ifndef ONNXRUNTIME_BACKEND_H_
#define ONNXRUNTIME_BACKEND_H_

#include <memory>
#include <vector>
#include <cstring>

#include "onnxruntime_cxx_api.h"

#include "loadgen.h"

#include "backend.h"

class OnnxRuntimeBackend : public Backend {
public:
OnnxRuntimeBackend(
std::shared_ptr<Model> &model, std::shared_ptr<Device> &device,
size_t performance_sample_count, size_t batch_size,
bool use_cuda)
: Backend(model, device, performance_sample_count, batch_size)
, env(ORT_LOGGING_LEVEL_WARNING, "env") {
for (size_t i = 0; i < device->NumMemory(); i++) {
memory_infos.emplace_back(
use_cuda ? "Cuda" : "Cpu",
OrtAllocatorType::OrtArenaAllocator, i, OrtMemTypeDefault);
}

for (size_t i = 0; i < device->NumConcurrency(); i++) {
Ort::SessionOptions session_options;
// arm64 does not work with optimization level 3 (ORT_ENABLE_ALL)
if (getenv("ORT_ENABLE_ALL", "") == "no")
session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);

const auto &api = Ort::GetApi();

std::vector<const char *> keys{"device_id"};
std::vector<const char *> values{std::to_string(i).c_str()};

OrtCUDAProviderOptionsV2 *cuda_options = nullptr;
if (use_cuda) {
Ort::ThrowOnError(api.CreateCUDAProviderOptions(&cuda_options));

Ort::ThrowOnError(api.UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size()));

Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_CUDA_V2(
static_cast<OrtSessionOptions *>(session_options),
cuda_options));
}

sessions.emplace_back(env, model->model_path.c_str(), session_options);
bindings.emplace_back(sessions[i]);

if (use_cuda) {
api.ReleaseCUDAProviderOptions(cuda_options);
}
}
}

void RunInference(
size_t concurrency_index,
const std::vector<mlperf::QuerySample> &batch,
std::vector<void *> &batch_data) override {
Ort::Session &session = sessions[concurrency_index];
Ort::IoBinding &binding = bindings[concurrency_index];
size_t memory_index = device->GetMemoryIndex(concurrency_index);

for (size_t i = 0; i < model->num_inputs; i++) {
size_t size = batch.size() * GetSampleSize(batch.front().index, i);
const std::vector<size_t> &shape = GetSampleShape(batch.front().index, i);
std::vector<int64_t> input_shape;
input_shape.push_back(batch.size());
for (size_t dim : shape)
input_shape.push_back(dim);
ONNXTensorElementDataType input_type =
session.GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetElementType();
Ort::Value value = Ort::Value::CreateTensor(
memory_infos[memory_index],
batch_data[i], size,
input_shape.data(), input_shape.size(),
input_type);
binding.BindInput(model->input_names[i].c_str(), value);
}

for (std::string &output : model->output_names)
binding.BindOutput(output.c_str(), memory_info_cpu);

session.Run(Ort::RunOptions(), binding);

std::vector<Ort::Value> outputs = binding.GetOutputValues();
std::vector<mlperf::QuerySampleResponse> responses(batch.size());
std::vector<std::vector<uint8_t>> response_buffers(batch.size());
for (size_t i = 0; i < batch.size(); i++) {
// get output data and shapes
std::vector<void *> output_buffers(outputs.size());
std::vector<std::vector<size_t>> output_shapes(outputs.size());
for (size_t j = 0; j < outputs.size(); j++) {
// assume ith position in output is ith sample in batch
output_buffers[j] =
static_cast<uint8_t *>(outputs[j].GetTensorMutableData<void>())
+ i * model->output_sizes[j];
size_t rank = outputs[j].GetTensorTypeAndShapeInfo().GetDimensionsCount();
std::vector<int64_t> output_shape(rank);
outputs[j].GetTensorTypeAndShapeInfo().GetDimensions(output_shape.data(), rank);
output_shapes[j].resize(rank);
for (size_t k = 0; k < rank; k++)
output_shapes[j][k] = output_shape[k];
}

model->PostProcess(
batch[i].index, output_buffers, output_shapes, response_buffers[i]);

responses[i].id = batch[i].id;
responses[i].data = reinterpret_cast<uintptr_t>(response_buffers[i].data());
responses[i].size = response_buffers[i].size();
}

mlperf::QuerySamplesComplete(responses.data(), responses.size());

binding.ClearBoundInputs();
binding.ClearBoundOutputs();
};

private:
Ort::Env env;
std::vector<Ort::Session> sessions;
std::vector<Ort::IoBinding> bindings;
std::vector<Ort::MemoryInfo> memory_infos;
Ort::MemoryInfo memory_info_cpu{
Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemTypeDefault)};
};

#endif // ONNXRUNTIME_BACKEND_H_
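
For readers more familiar with the Python API, here is a short, hypothetical sketch (not from this PR) of the same IoBinding pattern the new C++ backend implements: create a session with the CUDA execution provider and a per-session device_id, bind inputs, bind outputs to host memory, run, and copy the results back. The model path, input shape, and data are placeholders.

import numpy as np
import onnxruntime as ort

opts = ort.SessionOptions()
# same optimization level the C++ code falls back to when ORT_ENABLE_ALL is disabled
opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED

session = ort.InferenceSession(
    "model.onnx",  # placeholder model path
    sess_options=opts,
    providers=[("CUDAExecutionProvider", {"device_id": 0}), "CPUExecutionProvider"],
)

binding = session.io_binding()
batch = np.zeros((1, 3, 224, 224), dtype=np.float32)  # placeholder batch
binding.bind_cpu_input(session.get_inputs()[0].name, batch)
for out in session.get_outputs():
    binding.bind_output(out.name)  # outputs land in CPU memory by default

session.run_with_iobinding(binding)
results = binding.copy_outputs_to_cpu()  # list of numpy arrays, one per output
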
@@ -1,5 +1,5 @@
# Identification of this CM script
alias: app-mlperf-inference-reference
alias: app-mlperf-inference-mlcommons-python
uid: ff149e9781fc4b65

automation_alias: script
@@ -1,5 +1,5 @@
# Identification of this CM script
alias: reproduce-mlperf-inference-nvidia
alias: app-mlperf-inference-nvidia
uid: bc3b17fb430f4732
cache: false
can_force_cache: true
@@ -1,5 +1,5 @@
# Identification of this CM script
alias: reproduce-mlperf-inference-qualcomm
alias: app-mlperf-inference-qualcomm
uid: eef1aca5d7c0470e
cache: false
can_force_cache: true
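
The remaining hunks rename the other harness scripts in the same way (for example reproduce-mlperf-inference-intel becomes app-mlperf-inference-intel, app-mlperf-inference-cpp becomes app-mlperf-inference-mlcommons-cpp, and app-mlperf-inference-reference becomes app-mlperf-inference-mlcommons-python) while leaving their uids unchanged, so tag- and uid-based lookups keep working. A hypothetical sketch (not part of this PR) of listing the renamed scripts through the cmind API; the 'search' action and the shape of its result are assumed from standard CM behaviour:

import cmind

r = cmind.access({
    'action': 'search',
    'automation': 'script',
    'tags': 'app,mlperf,inference'
})
if r['return'] == 0:
    for artifact in r['list']:
        # each entry should correspond to one of the renamed app-mlperf-inference-* scripts
        print(artifact.path)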