Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

slice stall and dcache write stall #264

Merged
merged 2 commits into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions configs/common/Caches.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ class L2Cache(Cache):
cache_level = 2
enable_wayprediction = False

slice_num = 4

class L3Cache(Cache):
mshrs = 64
tgts_per_mshr = 20
Expand Down
1 change: 0 additions & 1 deletion configs/common/FSConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -670,7 +670,6 @@ def makeBareMetalXiangshanSystem(mem_mode, mdesc=None, cmdline=None, np=1, ruby=
self.iobus = IOXBar()
if not ruby:
self.membus = MemBus()
self.membus.width = 32

self.bridge = Bridge(delay='50ns')
self.bridge.mem_side_port = self.iobus.cpu_side_ports
Expand Down
3 changes: 3 additions & 0 deletions src/cpu/o3/lsq.cc
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,9 @@ LSQ::clearAddresses(Tick time)
bool
LSQ::bankConflictedCheck(Addr vaddr)
{
if (dcacheWriteStall) {
return true;
}
bool now_bank_conflict = false;
// 64KB Dcache 8way 128sets
// [12:6] [5:3] [2:0]
Expand Down
5 changes: 5 additions & 0 deletions src/cpu/o3/lsq.hh
Original file line number Diff line number Diff line change
Expand Up @@ -950,6 +950,10 @@ class LSQ

bool bankConflictedCheck(Addr vaddr);

/**
 * Set or clear the D-cache write stall flag. While the flag is set,
 * bankConflictedCheck() returns true without inspecting the address.
 */
void setDcacheWriteStall(bool stall) { dcacheWriteStall = stall; }

/** Return the current value of the D-cache write stall flag. */
bool getDcacheWriteStall() { return dcacheWriteStall; }

/** Is D-cache blocked? */
bool cacheBlocked() const;
/** Set D-cache blocked status */
Expand All @@ -975,6 +979,7 @@ class LSQ

Tick lastConflictCheckTick;

bool dcacheWriteStall = false;
std::vector<int64_t> l1dBankAddresses;
struct NullStruct {};
boost::compute::detail::lru_cache<uint64_t, NullStruct> recentlyloadAddr;
Expand Down
15 changes: 10 additions & 5 deletions src/cpu/o3/lsq_unit.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1404,16 +1404,20 @@ bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t
void
LSQUnit::storeBufferEvictToCache()
{
if (isStoreBlocked) {
return;
}
if (storeBuffer.size() == 0) {
if (storeBufferFlushing && storeBuffer.size() == 0) [[unlikely]] {
assert(storeBuffer.unsentSize() == 0);
storeBufferFlushing = false;
cpu->activityThisCycle();
}

// A write request stalls the D-cache for one cycle,
// so at most one write request is sent every two cycles.
if (lsq->getDcacheWriteStall()) {
lsq->setDcacheWriteStall(false);
return;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Dcache request of this store may write the DataSram N cycles after arriving at the Dcache (4 if hit, 10+/100+ if miss). I feel that the blocking should be done at that time instead of blocking directly in the next cycle?

}
if (storeBuffer.unsentSize() == 0) {

if (isStoreBlocked || storeBuffer.unsentSize() == 0) {
return;
}

Expand Down Expand Up @@ -1470,6 +1474,7 @@ LSQUnit::storeBufferEvictToCache()
}
DPRINTF(StoreBuffer, "send packet successed\n");
entry->sending = true;
lsq->setDcacheWriteStall(true);
storeBufferWritebackInactive = 0;
} else {
// Timeout
Expand Down
2 changes: 1 addition & 1 deletion src/mem/cache/Cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ class BaseCache(ClockedObject):
cache_level = Param.Unsigned(0, "Cache level (L1 is 1, L2 is 2, etc.)")

tag_load_read_ports = Param.Unsigned(3, "Total tag read ports for load/prefetcher(in L1 Cache)")
slice_num = Param.Int(4, "slice number (-1 is disable)")
slice_num = Param.Int(-1, "slice number (-1 is disable)")

force_hit = Param.Bool(False, "Force some PC to hit in L1")
way_entries = Param.MemorySize(
Expand Down
65 changes: 43 additions & 22 deletions src/mem/cache/base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -383,14 +383,9 @@ BaseCache::handleTimingReqHit(PacketPtr pkt, CacheBlk *blk, Tick request_time, b
this->schedule(new SendTimingRespEvent(this, pkt), request_time - 1);
}
else {
Tick delay = calculateBusyLatenct(request_time, pkt);
cpuSidePort.schedTimingResp(pkt, request_time + delay);
cpuSidePort.schedTimingResp(pkt, request_time);
}
} else {
if (pkt->isEviction()) {
calculateBusyLatenct(curTick(), pkt);
}

DPRINTF(Cache, "%s satisfied %s, no response needed\n", __func__,
pkt->print());

Expand All @@ -400,6 +395,10 @@ BaseCache::handleTimingReqHit(PacketPtr pkt, CacheBlk *blk, Tick request_time, b
// here as well
pendingDelete.reset(pkt);
}

if (cacheLevel != 1) {
calculateSliceBusy(pkt, false);
}
}

void
Expand Down Expand Up @@ -658,6 +657,9 @@ BaseCache::recvTimingReq(PacketPtr pkt)
cpuSidePort.sendCustomSignal(pkt, DcacheRespType::Miss);
}
} else {
if (cacheLevel != 1) {
calculateSliceBusy(pkt);
}
if (cacheLevel == 1 && pkt->needsResponse() && pkt->isRead()) {
// send cache miss signal
cpuSidePort.sendCustomSignal(pkt, DcacheRespType::Miss);
Expand Down Expand Up @@ -1559,25 +1561,32 @@ BaseCache::calculateAccessLatency(const CacheBlk* blk, const uint32_t delay,
return lat;
}

Tick
BaseCache::calculateBusyLatenct(Tick when_ready, PacketPtr pkt)
{
if (sliceNum <= 0) [[likely]] return 0;
Addr baddr = pkt->getAddr() >> ceilLog2(blkSize);
Addr sliceidx = baddr & (sliceNum - 1);
int additional = 1;
int opLatency = additional + (lookupLatency == 1 ? 0 : lookupLatency) + (dataLatency == 1 ? 0 : dataLatency);
Tick op_lat = cyclesToTicks(Cycles(opLatency));
Tick& readytime = sliceReadyTick[sliceidx];
if (when_ready >= readytime + op_lat) {
readytime = when_ready;
return 0;
} else {
readytime = readytime + op_lat;
return readytime + op_lat - when_ready;
/**
 * Record that the slice serving @p pkt is occupied by this access.
 *
 * The slice's ready tick is advanced to the packet's arrival time
 * (current tick plus the packet's header delay) plus the occupancy of
 * the operation. Nothing is recorded when getSliceIdx() returns a
 * negative index.
 *
 * @param pkt       Packet whose address selects the slice.
 * @param isOnlyTag True when the access only occupies the tag array, so
 *                  no data-array latency is charged; false when the
 *                  data array is accessed as well.
 */
void
BaseCache::calculateSliceBusy(PacketPtr pkt, bool isOnlyTag)
{
    const int sliceidx = getSliceIdx(pkt->getAddr());
    if (sliceidx < 0) {
        return;
    }

    const Tick arrival_time = curTick() + pkt->headerDelay;

    // One cycle is always charged. A lookup or data latency of exactly
    // one cycle contributes nothing on top of that.
    const int additional = 1;
    int opLatency = additional + (lookupLatency == 1 ? 0 : lookupLatency);

    // Charge the data array only when it is actually accessed. The
    // previous expression
    //     !isOnlyTag && (dataLatency == 1) ? 0 : dataLatency
    // parsed as (!isOnlyTag && dataLatency == 1) ? 0 : dataLatency, so a
    // tag-only access was always charged the full data latency.
    if (!isOnlyTag) {
        opLatency += (dataLatency == 1 ? 0 : dataLatency);
    }

    const Tick op_lat = cyclesToTicks(Cycles(opLatency));
    Tick& lastReadytime = sliceReadyTick[sliceidx];
    // A request is only expected to be accounted once the slice is free.
    assert(lastReadytime <= arrival_time);
    lastReadytime = arrival_time + op_lat;
}

/**
 * Tell whether a slice is still occupied at the current tick.
 *
 * @param pkt      Unused by the check itself.
 * @param sliceidx Index into sliceReadyTick of the slice to query.
 * @return true while the slice's recorded ready tick is not earlier
 *         than the current tick, false once the current tick has moved
 *         past it.
 */
bool
BaseCache::checkSLiceBusy(PacketPtr pkt, uint32_t sliceidx)
{
    const Tick now = curTick();
    return !(sliceReadyTick[sliceidx] < now);
}

bool
BaseCache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat,
PacketList &writebacks)
Expand Down Expand Up @@ -2990,6 +2999,18 @@ BaseCache::CpuSidePort::tryTiming(PacketPtr pkt)
DPRINTF(TagReadFail, "tryAccessTag fails addr: %lx\n", pkt->getAddr());
return false;
}
int sliceidx = cache->getSliceIdx(pkt->getAddr());
if (sliceidx >= 0 && cache->cacheLevel != 1) {
if (cache->checkSLiceBusy(pkt, sliceidx)) {
//no more buffer
if (sendRetryEvent.scheduled()) {
owner.reschedule(sendRetryEvent, cache->clockEdge());
} else {
owner.schedule(sendRetryEvent, cache->clockEdge());
}
happy-lx marked this conversation as resolved.
Show resolved Hide resolved
return false;
}
}
mustSendRetry = false;
return true;
}
Expand Down
14 changes: 11 additions & 3 deletions src/mem/cache/base.hh
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@

#include <cassert>
#include <cstdint>
#include <queue>
#include <string>

#include "base/addr_range.hh"
Expand Down Expand Up @@ -305,8 +306,6 @@ class BaseCache : public ClockedObject, CacheAccessor

bool mustSendRetry;

private:

void processSendRetry();

EventFunctionWrapper sendRetryEvent;
Expand Down Expand Up @@ -514,7 +513,16 @@ class BaseCache : public ClockedObject, CacheAccessor
Cycles calculateAccessLatency(const CacheBlk* blk, const uint32_t delay,
const Cycles lookup_lat) const;

Tick calculateBusyLatenct(Tick when_ready, PacketPtr pkt);
/**
 * Map an address to the index of the slice that serves it.
 *
 * The block number (address shifted right by log2 of the block size) is
 * masked with sliceNum - 1.
 * NOTE(review): the mask only spreads blocks across every slice when
 * sliceNum is a power of two -- confirm the configuration enforces this.
 *
 * @param addr Address to classify.
 * @return The slice index, or -1 when sliceNum is not positive.
 */
int getSliceIdx(Addr addr) {
    if (sliceNum <= 0) {
        return -1;
    }
    const Addr blockNumber = addr >> ceilLog2(blkSize);
    const Addr sliceIndex = blockNumber & (sliceNum - 1);
    return sliceIndex;
}

void calculateSliceBusy(PacketPtr pkt, bool isOnlyTag = true);

bool checkSLiceBusy(PacketPtr pkt, uint32_t sliceidx);

/**
* Does all the processing necessary to perform the provided request.
Expand Down