Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement packing local variable and constant data on 64-bit if stack usage is high #156

Merged
merged 3 commits into from
Sep 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/Walrus.h
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ if (f.type == Type::B) { puts("failed in msvc."); }
std::unique_ptr<uint8_t[]> Result##HolderWhenUsingMalloc; \
size_t bytes##Result = (Bytes); \
Type* Result; \
if (LIKELY(bytes##Result < 512)) { \
if (LIKELY(bytes##Result < 2048)) { \
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any special reason for increasing the size of alloca?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Code generated by emcc sometimes uses a lot of stack space.

Result = (Type*)alloca(bytes##Result); \
} else { \
Result##HolderWhenUsingMalloc = std::unique_ptr<uint8_t[]>(new uint8_t[bytes##Result]); \
Expand Down
6 changes: 6 additions & 0 deletions src/interpreter/ByteCode.h
Original file line number Diff line number Diff line change
Expand Up @@ -2037,6 +2037,7 @@ class GlobalGet32 : public ByteCode {
// Debug printer for a global.get32 instruction.
// Emits the mnemonic, hands dstOffset to the DUMP_BYTECODE_OFFSET macro
// (defined elsewhere; presumably prints that operand's stack offset), and
// finishes with the index of the global being read.
// `pos` is not referenced by the statements visible in this body.
void dump(size_t pos)
{
    printf("global.get32 ");
    DUMP_BYTECODE_OFFSET(dstOffset);
    printf("index: %" PRId32, m_index);
}
Expand All @@ -2063,6 +2064,7 @@ class GlobalGet64 : public ByteCode {
// Debug printer for a global.get64 instruction.
// Emits the mnemonic, hands dstOffset to the DUMP_BYTECODE_OFFSET macro
// (defined elsewhere; presumably prints that operand's stack offset), and
// finishes with the index of the global being read.
// `pos` is not referenced by the statements visible in this body.
void dump(size_t pos)
{
    printf("global.get64 ");
    DUMP_BYTECODE_OFFSET(dstOffset);
    printf("index: %" PRId32, m_index);
}
Expand All @@ -2089,6 +2091,7 @@ class GlobalGet128 : public ByteCode {
// Debug printer for a global.get128 instruction.
// Emits the mnemonic, hands dstOffset to the DUMP_BYTECODE_OFFSET macro
// (defined elsewhere; presumably prints that operand's stack offset), and
// finishes with the index of the global being read.
// `pos` is not referenced by the statements visible in this body.
void dump(size_t pos)
{
    printf("global.get128 ");
    DUMP_BYTECODE_OFFSET(dstOffset);
    printf("index: %" PRId32, m_index);
}
Expand All @@ -2115,6 +2118,7 @@ class GlobalSet32 : public ByteCode {
// Debug printer for a global.set32 instruction.
// Emits the mnemonic, hands srcOffset to the DUMP_BYTECODE_OFFSET macro
// (defined elsewhere; presumably prints that operand's stack offset), and
// finishes with the index of the global being written.
// `pos` is not referenced by the statements visible in this body.
void dump(size_t pos)
{
    printf("global.set32 ");
    DUMP_BYTECODE_OFFSET(srcOffset);
    printf("index: %" PRId32, m_index);
}
Expand All @@ -2141,6 +2145,7 @@ class GlobalSet64 : public ByteCode {
// Debug printer for a global.set64 instruction.
// Emits the mnemonic, hands srcOffset to the DUMP_BYTECODE_OFFSET macro
// (defined elsewhere; presumably prints that operand's stack offset), and
// finishes with the index of the global being written.
// `pos` is not referenced by the statements visible in this body.
void dump(size_t pos)
{
    printf("global.set64 ");
    DUMP_BYTECODE_OFFSET(srcOffset);
    printf("index: %" PRId32, m_index);
}
Expand All @@ -2167,6 +2172,7 @@ class GlobalSet128 : public ByteCode {
// Debug printer for a global.set128 instruction.
// Emits the mnemonic, hands srcOffset to the DUMP_BYTECODE_OFFSET macro
// (defined elsewhere; presumably prints that operand's stack offset), and
// finishes with the index of the global being written.
// `pos` is not referenced by the statements visible in this body.
void dump(size_t pos)
{
    printf("global.set128 ");
    DUMP_BYTECODE_OFFSET(srcOffset);
    printf("index: %" PRId32, m_index);
}
Expand Down
169 changes: 91 additions & 78 deletions src/parser/WASMParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,7 @@ class WASMBinaryReader : public wabt::WASMBinaryReaderDelegate {
bool m_inPreprocess;
WASMBinaryReader& m_reader;
std::vector<LocalVariableInfo> m_localVariableInfo;
// <ConstantValue, reference count or position>
std::vector<std::pair<Walrus::Value, size_t>> m_constantData;
};

Expand All @@ -485,8 +486,10 @@ class WASMBinaryReader : public wabt::WASMBinaryReaderDelegate {
std::vector<CatchInfo> m_catchInfo;
struct LocalInfo {
Walrus::Value::Type m_valueType;
LocalInfo(Walrus::Value::Type type)
size_t m_position;
LocalInfo(Walrus::Value::Type type, size_t position)
: m_valueType(type)
, m_position(position)
{
}
};
Expand Down Expand Up @@ -586,10 +589,12 @@ class WASMBinaryReader : public wabt::WASMBinaryReaderDelegate {
m_currentFunctionType = mf->functionType();
m_localInfo.clear();
m_localInfo.reserve(m_currentFunctionType->param().size());
size_t pos = 0;
for (size_t i = 0; i < m_currentFunctionType->param().size(); i++) {
m_localInfo.push_back(LocalInfo(m_currentFunctionType->param()[i]));
m_localInfo.push_back(LocalInfo(m_currentFunctionType->param()[i], pos));
pos += Walrus::valueStackAllocatedSize(m_localInfo[i].m_valueType);
}
m_currentFunction->m_requiredStackSizeDueToParameterAndLocal = m_initialFunctionStackSize = m_functionStackSizeSoFar = m_currentFunctionType->paramStackSize();
m_initialFunctionStackSize = m_functionStackSizeSoFar = m_currentFunctionType->paramStackSize();
m_currentFunction->m_requiredStackSize = std::max(
m_currentFunction->m_requiredStackSize, m_functionStackSizeSoFar);
}
Expand Down Expand Up @@ -969,11 +974,10 @@ class WASMBinaryReader : public wabt::WASMBinaryReaderDelegate {
while (count) {
auto wType = toValueKind(type);
m_currentFunction->m_local.push_back(wType);
m_localInfo.push_back(LocalInfo(wType));
m_localInfo.push_back(LocalInfo(wType, m_functionStackSizeSoFar));
auto sz = Walrus::valueStackAllocatedSize(wType);
m_initialFunctionStackSize += sz;
m_functionStackSizeSoFar += sz;
m_currentFunction->m_requiredStackSizeDueToParameterAndLocal += sz;
count--;
}
m_currentFunction->m_requiredStackSize = std::max(
Expand Down Expand Up @@ -1007,29 +1011,90 @@ class WASMBinaryReader : public wabt::WASMBinaryReaderDelegate {
m_vmStack.clear();

m_preprocessData.organizeData();
// init local if needs

// set const variables position
for (size_t i = 0; i < m_preprocessData.m_constantData.size(); i++) {
auto constType = m_preprocessData.m_constantData[i].first.type();
m_preprocessData.m_constantData[i].second = m_initialFunctionStackSize;
m_initialFunctionStackSize += Walrus::valueStackAllocatedSize(constType);
}

#if defined(WALRUS_64)
#ifndef WALRUS_ENABLE_LOCAL_VARIABLE_PACKING_MIN_SIZE
#define WALRUS_ENABLE_LOCAL_VARIABLE_PACKING_MIN_SIZE 64
#endif
// pack local variables if needed
constexpr size_t enableLocalVaraiblePackingMinSize = WALRUS_ENABLE_LOCAL_VARIABLE_PACKING_MIN_SIZE;
if (m_initialFunctionStackSize >= enableLocalVaraiblePackingMinSize) {
m_initialFunctionStackSize = m_currentFunctionType->paramStackSize();
// put already aligned variables first
for (size_t i = m_currentFunctionType->param().size(); i < m_localInfo.size(); i++) {
auto& info = m_localInfo[i];
if (Walrus::hasCPUWordAlignedSize(info.m_valueType) || needsCPUWordAlignedAddress(info.m_valueType)) {
info.m_position = m_initialFunctionStackSize;
m_initialFunctionStackSize += Walrus::valueStackAllocatedSize(info.m_valueType);
}
}
for (size_t i = 0; i < m_preprocessData.m_constantData.size(); i++) {
auto constType = m_preprocessData.m_constantData[i].first.type();
if (Walrus::hasCPUWordAlignedSize(constType) || needsCPUWordAlignedAddress(constType)) {
m_preprocessData.m_constantData[i].second = m_initialFunctionStackSize;
m_initialFunctionStackSize += Walrus::valueStackAllocatedSize(constType);
}
}

// pack the remaining values
for (size_t i = m_currentFunctionType->param().size(); i < m_localInfo.size(); i++) {
auto& info = m_localInfo[i];
if (!Walrus::hasCPUWordAlignedSize(info.m_valueType) && !needsCPUWordAlignedAddress(info.m_valueType)) {
info.m_position = m_initialFunctionStackSize;
m_initialFunctionStackSize += Walrus::valueSize(info.m_valueType);
}
}
for (size_t i = 0; i < m_preprocessData.m_constantData.size(); i++) {
auto constType = m_preprocessData.m_constantData[i].first.type();
if (!Walrus::hasCPUWordAlignedSize(constType) && !needsCPUWordAlignedAddress(constType)) {
m_preprocessData.m_constantData[i].second = m_initialFunctionStackSize;
m_initialFunctionStackSize += Walrus::valueSize(constType);
}
}

if (m_initialFunctionStackSize % sizeof(size_t)) {
m_initialFunctionStackSize += (sizeof(size_t) - m_initialFunctionStackSize % sizeof(size_t));
}
}
#endif

m_functionStackSizeSoFar = m_initialFunctionStackSize;
m_currentFunction->m_requiredStackSize = m_functionStackSizeSoFar;

// Explicitly initialize local variables if needed
for (size_t i = m_currentFunctionType->param().size(); i < m_localInfo.size(); i++) {
if (m_preprocessData.m_localVariableInfo[i].m_needsExplicitInitOnStartup) {
auto r = resolveLocalOffsetAndSize(i);
if (r.second == 4) {
pushByteCode(Walrus::Const32(r.first, 0), WASMOpcode::I32ConstOpcode);
} else if (r.second == 8) {
pushByteCode(Walrus::Const64(r.first, 0), WASMOpcode::I64ConstOpcode);
auto localPos = m_localInfo[i].m_position;
auto size = Walrus::valueSize(m_localInfo[i].m_valueType);
if (size == 4) {
pushByteCode(Walrus::Const32(localPos, 0), WASMOpcode::I32ConstOpcode);
} else if (size == 8) {
pushByteCode(Walrus::Const64(localPos, 0), WASMOpcode::I64ConstOpcode);
} else {
ASSERT(r.second == 16);
ASSERT(size == 16);
uint8_t empty[16] = {
0,
};
pushByteCode(Walrus::Const128(r.first, empty), WASMOpcode::V128ConstOpcode);
pushByteCode(Walrus::Const128(localPos, empty), WASMOpcode::V128ConstOpcode);
}
}
#if !defined(NDEBUG)
m_currentFunction->m_localDebugData.push_back(m_localInfo[i].m_position);
#endif
}

// init constant space
for (size_t i = 0; i < m_preprocessData.m_constantData.size(); i++) {
const auto& constValue = m_preprocessData.m_constantData[i].first;
auto constType = m_preprocessData.m_constantData[i].first.type();
auto constPos = m_initialFunctionStackSize;
auto constPos = m_preprocessData.m_constantData[i].second;
size_t constSize = Walrus::valueSize(constType);

uint8_t constantBuffer[16];
Expand All @@ -1042,16 +1107,10 @@ class WASMBinaryReader : public wabt::WASMBinaryReaderDelegate {
ASSERT(constSize == 16);
pushByteCode(Walrus::Const128(constPos, constantBuffer), WASMOpcode::V128ConstOpcode);
}

m_initialFunctionStackSize += Walrus::valueStackAllocatedSize(constType);
#if !defined(NDEBUG)
m_currentFunction->m_constantDebugData.pushBack(m_preprocessData.m_constantData[i].first);
m_currentFunction->m_constantDebugData.pushBack(m_preprocessData.m_constantData[i]);
#endif
}

m_functionStackSizeSoFar = m_initialFunctionStackSize;
m_currentFunction->m_requiredStackSize = std::max(
m_currentFunction->m_requiredStackSize, m_functionStackSizeSoFar);
}

virtual void OnOpcode(uint32_t opcode) override
Expand Down Expand Up @@ -1132,13 +1191,11 @@ class WASMBinaryReader : public wabt::WASMBinaryReaderDelegate {
if (!m_inInitExpr) {
m_preprocessData.addConstantData(value);
if (!m_preprocessData.m_inPreprocess) {
size_t pos = m_currentFunction->m_requiredStackSizeDueToParameterAndLocal;
for (size_t i = 0; i < m_preprocessData.m_constantData.size(); i++) {
if (m_preprocessData.m_constantData[i].first == value) {
pushVMStack(value.type(), pos);
pushVMStack(value.type(), m_preprocessData.m_constantData[i].second);
return true;
}
pos += Walrus::valueStackAllocatedSize(m_preprocessData.m_constantData[i].first.type());
}
}
}
Expand Down Expand Up @@ -1186,32 +1243,14 @@ class WASMBinaryReader : public wabt::WASMBinaryReaderDelegate {
pushByteCode(Walrus::Const128(computeExprResultPosition(Walrus::Value::Type::V128), value), WASMOpcode::V128ConstOpcode);
}

// Translates a local index into a (stack offset, value size) pair.
// Indices below the parameter count address the current function type's
// parameters; the remaining indices address m_currentFunction->m_local.
// The offset is the sum of Walrus::valueStackAllocatedSize() over every slot
// that precedes the requested one; declared locals start counting from
// paramStackSize(). The size is Walrus::valueSize() of the requested slot.
std::pair<uint32_t, uint32_t> resolveLocalOffsetAndSize(Index localIndex)
{
    const size_t paramCount = m_currentFunctionType->param().size();

    if (localIndex >= paramCount) {
        // Declared local: rebase the index onto m_local and begin right after
        // the parameter area.
        const Index declaredIndex = static_cast<Index>(localIndex - paramCount);
        size_t localOffset = m_currentFunctionType->paramStackSize();
        for (Index slot = 0; slot < declaredIndex; slot++) {
            localOffset += Walrus::valueStackAllocatedSize(m_currentFunction->m_local[slot]);
        }
        return std::make_pair(localOffset, Walrus::valueSize(m_currentFunction->m_local[declaredIndex]));
    }

    // Parameter: accumulate the allocated sizes of the parameters before it.
    size_t paramOffset = 0;
    for (Index slot = 0; slot < localIndex; slot++) {
        paramOffset += Walrus::valueStackAllocatedSize(m_currentFunctionType->param()[slot]);
    }
    return std::make_pair(paramOffset, Walrus::valueSize(m_currentFunctionType->param()[localIndex]));
}

size_t computeExprResultPosition(Walrus::Value::Type type)
{
if (!m_preprocessData.m_inPreprocess) {
// if there is local.set code ahead,
// we can use local variable position as expr target position
auto localSetInfo = readAheadLocalGetIfExists();
if (localSetInfo.first) {
auto pos = resolveLocalOffsetAndSize(localSetInfo.first.value()).first;
auto pos = m_localInfo[localSetInfo.first.value()].m_position;
// skip local.set opcode
*m_readerOffsetPointer += localSetInfo.second;
return pos;
Expand All @@ -1221,35 +1260,9 @@ class WASMBinaryReader : public wabt::WASMBinaryReaderDelegate {
return pushVMStack(type);
}

// Inverse lookup: given a stack position, returns the index of the parameter
// or declared local whose slot begins at exactly that position.
// Positions up to and including paramStackSize() are matched against the
// parameter slots; larger positions are matched against m_local, and the
// parameter count is added to the result.
// NOTE(review): neither scan has an upper bound; each one stops only when the
// running offset equals the requested position, so `pos` is expected to be
// the start of a slot.
Index resolveLocalIndexFromStackPosition(size_t pos)
{
    ASSERT(pos < m_initialFunctionStackSize);

    const size_t paramAreaSize = m_currentFunctionType->paramStackSize();

    if (pos <= paramAreaSize) {
        size_t slotStart = 0;
        for (Index paramIdx = 0;; paramIdx++) {
            if (slotStart == pos) {
                return paramIdx;
            }
            slotStart += Walrus::valueStackAllocatedSize(m_currentFunctionType->param()[paramIdx]);
        }
    }

    // Position lies in the declared-local area: make it relative to that area.
    const size_t relativePos = pos - paramAreaSize;
    size_t slotStart = 0;
    for (Index localIdx = 0;; localIdx++) {
        if (slotStart == relativePos) {
            return localIdx + m_currentFunctionType->param().size();
        }
        slotStart += Walrus::valueStackAllocatedSize(m_currentFunction->m_local[localIdx]);
    }
}

virtual void OnLocalGetExpr(Index localIndex) override
{
auto r = resolveLocalOffsetAndSize(localIndex);
auto localPos = m_localInfo[localIndex].m_position;
auto localValueType = m_localInfo[localIndex].m_valueType;

bool canUseDirectReference = true;
Expand All @@ -1264,38 +1277,38 @@ class WASMBinaryReader : public wabt::WASMBinaryReaderDelegate {
}

if (canUseDirectReference) {
pushVMStack(localValueType, r.first, localIndex);
pushVMStack(localValueType, localPos, localIndex);
} else {
auto pos = m_functionStackSizeSoFar;
pushVMStack(localValueType, pos, localIndex);
generateMoveCodeIfNeeds(r.first, pos, localValueType);
generateMoveCodeIfNeeds(localPos, pos, localValueType);
}
}

virtual void OnLocalSetExpr(Index localIndex) override
{
auto r = resolveLocalOffsetAndSize(localIndex);
auto localPos = m_localInfo[localIndex].m_position;

ASSERT(m_localInfo[localIndex].m_valueType == peekVMStackValueType());
auto src = popVMStackInfo();
generateMoveCodeIfNeeds(src.position(), r.first, src.valueType());
generateMoveCodeIfNeeds(src.position(), localPos, src.valueType());
m_preprocessData.addLocalVariableWrite(localIndex);
}

virtual void OnLocalTeeExpr(Index localIndex) override
{
auto valueType = m_localInfo[localIndex].m_valueType;
auto r = resolveLocalOffsetAndSize(localIndex);
auto localPos = m_localInfo[localIndex].m_position;
ASSERT(valueType == peekVMStackValueType());
auto dstInfo = peekVMStackInfo();
generateMoveCodeIfNeeds(dstInfo.position(), r.first, valueType);
generateMoveCodeIfNeeds(dstInfo.position(), localPos, valueType);
m_preprocessData.addLocalVariableWrite(localIndex);
}

virtual void OnGlobalGetExpr(Index index) override
{
auto valueType = m_result.m_globalTypes[index]->type();
auto sz = Walrus::valueStackAllocatedSize(valueType);
auto sz = Walrus::valueSize(valueType);
auto stackPos = computeExprResultPosition(valueType);
if (sz == 4) {
pushByteCode(Walrus::GlobalGet32(stackPos, index), WASMOpcode::GlobalGetOpcode);
Expand All @@ -1313,7 +1326,7 @@ class WASMBinaryReader : public wabt::WASMBinaryReaderDelegate {
auto stackPos = peekVMStack();

ASSERT(peekVMStackValueType() == valueType);
auto sz = Walrus::valueStackAllocatedSize(valueType);
auto sz = Walrus::valueSize(valueType);
if (sz == 4) {
pushByteCode(Walrus::GlobalSet32(stackPos, index), WASMOpcode::GlobalSetOpcode);
} else if (sz == 8) {
Expand Down
Loading