cpu-o3: Transform the lsqunit #265
Conversation
Force-pushed from 0894beb to cf13024 (compare)
Transform the load/store execution logic into a multi-stage pipeline. Change-Id: Iaf7558ad75ed8fe2bbf4a776359db113b6126453
Originally, the fence instruction was dispatched to mem's dispatchQueue, but its opType was No_OpClass, which caused it to wait for the integer issue queue IQ2 (IntMisc) to have free entries before it could continue execution. If the instructions following the fence occupy intIQ2, the fence can never execute and the CPU gets stuck. Therefore, change the opType of the fence instruction to MemReadOp to prevent this situation (in fact, the fence will not be dispatched to an IQ). Change-Id: Ie38a901e038db9906c43f78675e69391e847c88b
Now initiateAcc only does the TLB access and is located at s0 of the load/store pipeline. A load accesses the cache and checks for violations at s1, receives the cache response at s2, and writes back at s3. A store updates the SQ and checks for violations at s1, and writes back at s4. AMO operations are now executed using `executeAmo`. Change-Id: Iac678b7de3a690329f279c70fdcd22be4ed22715
This commit covers only normal loads; uncached/AMO loads keep the original process. Change-Id: Idc98ee18a6e94a39774ebba0f772820699b834de
Add a fence before and after the LRSC instruction. Change-Id: I66021d0a5a653d2a7e30cd262166363a84184ed6
Change-Id: Ifc1a586df8beab65772d48a75106155f9e723cba
Adjust cache miss load replay logic: replay all loads that cannot get data at load s2; the cache no longer needs to send `sendCustomSignal` on a miss. Add RAW nuke replay at load s1 and s2. Move most of the writeback logic to load s2 and actually write back at s3. Change-Id: Idfd3480969958826f4820349168f17c9522f791e
Set `EnableLdMissReplay` to True to enable replaying missed loads from the replayQueue. Set `EnablePipeNukeCheck` to True to detect RAW nuke replay in the load pipe. NOTE: if `EnableLdMissReplay` is False, `EnablePipeNukeCheck` can't be set to True. Change-Id: Ic4235bffba01d5dc4c39cec8ae92f2d27b28d98a
Store writes back at S4 by default; when using --ideal-kmhv3, store writes back at S2. Change-Id: I6a318ff6c182daca0ab041840d76575a16e45d82
Change-Id: I5829589df8ca01724ffa4369d23d7e4693e0aea1
Previously, the delay of the write-packet operation did not take into account whether the block was ready. In fact, if the block is not ready, the TimingResp for the write should actually be returned later. Change-Id: I65de8d47e2f24ad4be867e1867cddee06092f22f
Currently, at the xbar, besides sending the actual TimingResp, a Hint signal is sent N cycles in advance (N is set by hint_wakeup_ahead_cycles in Caches.py). This Hint signal first queries the MSHR, finds all associated load instructions, and issues a custom wake-up. Once all custom wake-ups are received, the corresponding load instructions in the replayQueue are woken up. When these awakened instructions reach stage s1 or s2 of the load pipeline, data is forwarded from the bus. The actual TimingResp keeps the data on the bus until the DCache finally writes the data into itself and then clears it. Change-Id: I8960acc14e95c06d8b1a86220f36a181588ff7f4
Force-pushed from cf13024 to 343e77e (compare)
If this instruction is cancelled, the associated wake event should be descheduled. Change-Id: I595541aa5f96163350aa5f6e3825f78520a0e660
@@ -121,9 +135,10 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams &params)
     }

     thread.reserve(numThreads);
+    // TODO: Parameterize the load/store pipeline stages
parameterize this
OK, I will fix this
@@ -1282,7 +1365,9 @@ LSQ::LSQRequest::~LSQRequest()
         std::raise(SIGINT);
     }
     assert(!isAnyOutstandingRequest());
-    _inst->savedRequest = nullptr;
+    if (_inst->savedRequest == this) {
Why is this condition needed?
In the current design, after a load instruction experiences a miss for the first time, it does not continue to track the corresponding request.
Once the miss request retrieves the data, refills the dcache, and returns a response to the LSU, it will discard itself.
When the missed load is replayed to the pipeline, a new request will be generated (the goal is to ensure it re-accesses the DTLB and dcache when necessary). This approach indeed breaks the rule that each instruction corresponds to only one request. If request management is not handled properly, it could potentially lead to functional issues.
However, in order to implement early wake-up and align with the RTL, I feel this approach is actually more friendly in terms of implementation.
src/cpu/o3/lsq_unit.cc
Outdated
}

Fault
LSQUnit::storePipeS2(const DynInstPtr &inst, std::bitset<LdStFlagNum> &flag)
Why are there so many identical functions? They can be simplified.
Yes. These `storePipeS2`, `S3`, and `S4` do not perform any operations and are indeed redundant. I will replace them with a new function.
src/cpu/o3/lsq_unit.cc
Outdated
void
LSQUnit::dumpLoadPipe()
{
    DPRINTF(LSQUnit, "Dumping LoadPipe:\n");
use:
if (debug::LSQUnit)
Yes, using `debug::LSQUnit` is much better for simulation speed; I will fix it.
src/cpu/o3/lsq_unit.cc
Outdated
void
LSQUnit::dumpStorePipe()
{
    DPRINTF(LSQUnit, "Dumping StorePipe:\n");
Same as above
int size;

DynInstPtr insts[MaxWidth];
std::bitset<LdStFlagNum> flags[MaxWidth];
I suggest putting the ld/st flags into the DynInst.
In the current design, where a single instruction may appear in both s0 and s3 of the pipeline (fast replay), I feel storing the state in the TimeBuffer would be easier to manage.
int size;

DynInstPtr insts[MaxWidth];
std::bitset<LdStFlagNum> flags[MaxWidth];
same as above
+ add LdPipeStages and StPipeStages parameters
+ remove redundant storePipeSx code
+ fix dumpLoadStorePipe
Change-Id: Ie8cb7865c3a53265520f11f016dd467c25a3e2a5
Background
In order to align the behavior of the LSU module with the RTL, this PR makes the following modifications:
1. Pipeline Construction
Related commits:
Original design: Load/store operations are delayed by a certain number of cycles, and then all operations (TLB, cache lookup, exception check, etc.) are completed in one cycle using `executeLoad`/`executeStore`.
New design: TimeBuffers are set up according to the corresponding pipeline stages. Instructions are first dispatched to TimeBuffer `s0`, and over time they pass through the stages of the pipeline to complete the corresponding operations.
Code Details
Original design:
- Instructions are delayed `op_latency - 1` cycles before being sent to the corresponding function unit.
  (GEM5/src/cpu/o3/inst_queue.cc, lines 700 to 708 in 2d1995a)
- `executeLoad`/`executeStore` are called to complete the corresponding operations.
  (GEM5/src/cpu/o3/iew.cc, lines 1407 to 1437 in 2d1995a)
- `executeLoad`/`executeStore` completes all operations.
  (GEM5/src/cpu/o3/lsq_unit.cc, lines 906 to 979 in 2d1995a)
  (GEM5/src/cpu/o3/lsq_unit.cc, lines 1003 to 1067 in 2d1995a)
New design:
- `issueToLoadPipe`/`issueToStorePipe` send the instructions to the `s0` of the load/store pipeline.
  (GEM5/src/cpu/o3/inst_queue.cc, lines 711 to 719 in cf13024)
- `loadPipeSx`/`storePipeSx` are the corresponding pipeline TimeBuffers; `loadPipeSx[0]` means the s0 stage of the load pipeline.
  (GEM5/src/cpu/o3/lsq_unit.hh, lines 658 to 682 in cf13024)
  (GEM5/src/cpu/o3/iew.cc, lines 1473 to 1503 in cf13024)
  (GEM5/src/cpu/o3/lsq_unit.cc, lines 1053 to 1081 in cf13024)
- The pipeline advances each cycle (`s1`, `s2`, etc.). The corresponding instructions are fetched from the load/store pipeline at each stage and the operations are executed.
  (GEM5/src/cpu/o3/lsq_unit.cc, lines 1314 to 1367 in cf13024)
  (GEM5/src/cpu/o3/lsq_unit.cc, lines 1493 to 1559 in cf13024)
2. Nuke Replay
Related commit:
Original design: If the load executes first, then since the store in the same cycle hasn't finished (and hasn't updated the SQ), the load can't detect the dependency and thus can't forward data from the store. When the store in the same cycle then executes, a RAW violation is detected, causing the pipeline to be flushed.
RTL behavior: This situation does not occur in the RTL, because a store instruction in pipeline stage s1 (the cycle before the SQ is updated) checks whether there is a load instruction on the load pipeline with a matching address that has not forwarded data from it, and causes that load instruction to be replayed.
New design: When the load reaches `s1` or `s2`, it checks whether there are stores still in `s1` that haven't executed and match the address. If so, the load will be replayed. This is called the pipeline nuke replay. After the store in `s1` completes, the address and data are updated to the SQ, allowing the replayed load to correctly forward data without causing a RAW violation.
Code Details
RTL design: Stores in `s1` query load pipeline `s1`/`s2`, causing the matched load to be replayed.
https://github.com/OpenXiangShan/XiangShan/blob/0051450372ae5a03ce9d36afdbdd34b9a19f4785/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala#L971-L995
https://github.com/OpenXiangShan/XiangShan/blob/0051450372ae5a03ce9d36afdbdd34b9a19f4785/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala#L1228-L1248
https://github.com/OpenXiangShan/XiangShan/blob/0051450372ae5a03ce9d36afdbdd34b9a19f4785/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala#L1357-L1381
Gem5 new design:
- `pipeLineNukeCheck` is where a load in `s1` checks for a pipeline nuke.
  (GEM5/src/cpu/o3/lsq_unit.cc, lines 1159 to 1179 in cf13024)
- `pipeLineNukeCheck` for a load in `s2` checks for a pipeline nuke.
  (GEM5/src/cpu/o3/lsq_unit.cc, lines 1250 to 1258 in cf13024)
  (GEM5/src/cpu/o3/lsq_unit.cc, lines 743 to 762 in cf13024)
- If a load in `s1` or `s2` triggers a pipeline nuke, a fast load replay is performed in `s2`.
  (GEM5/src/cpu/o3/lsq_unit.cc, lines 1273 to 1282 in cf13024)
- The replayed load is flagged as `Nuke`, and the RAW violation check will skip this load.
  (GEM5/src/cpu/o3/lsq_unit.cc, lines 955 to 961 in cf13024)
3. Miss Load Replay
Commits related to this modification:
Original design: A miss load directly writes back after receiving the `TimingResp`, with no limit on the number and no re-entering-pipeline behavior.
New design: A miss load receives a custom `Hint` signal two cycles before receiving the `TimingResp`, which wakes up the relevant instructions and reissues them into the pipeline. The relevant data becomes stable in the `bus` after the `TimingResp` reaches the LSU, and the reissued load will forward data from the `bus`. If data cannot be forwarded, it will access the cache again. After the cache has completely refilled the data into itself, a `Bus_Clear` request is sent to the LSU to clear the corresponding data in the `bus`.
Code details
Original design: `completeDataAccess` performs the write-back after receiving the `TimingResp`.
(GEM5/src/cpu/o3/lsq.cc, lines 1349 to 1362 in 2d1995a)
New design:
- Missed loads enter the cache-miss load replayQueue.
  (GEM5/src/cpu/o3/lsq_unit.cc, lines 1259 to 1271 in cf13024)
- The `Hint` signal is returned a few cycles ahead of the `TimingResp`.
  (GEM5/src/mem/coherent_xbar.cc, lines 518 to 522 in cf13024)
- When the `Hint` reaches L1, it queries the MSHR and sends a wake-up signal to the relevant load requests in the cache block. The loads that receive the wake-up signal are awakened from the replayQueue.
  (GEM5/src/mem/cache/cache.cc, lines 963 to 999 in cf13024)
  (GEM5/src/cpu/o3/lsq.cc, lines 1524 to 1537 in cf13024)
- The `TimingResp` arrives a few cycles after the `Hint` and stabilizes the data on the bus.
  (GEM5/src/cpu/o3/lsq.cc, lines 1434 to 1475 in cf13024)
- The reissued load forwards data from the bus (`forwardFrmBus(load_inst, request)`), and accesses the cache only if no data is found on the bus.
  (GEM5/src/cpu/o3/lsq_unit.cc, lines 2995 to 3023 in cf13024)
  (GEM5/src/cpu/o3/lsq_unit.cc, lines 2995 to 3002 in cf13024)
  (GEM5/src/cpu/o3/lsq_unit.cc, lines 1221 to 1244 in cf13024)
- After the refill completes, a `Bus_Clear` is sent to the LSU to clear the relevant data from the bus.
  (GEM5/src/mem/cache/base.cc, lines 2041 to 2052 in cf13024)
  (GEM5/src/cpu/o3/lsq.cc, lines 586 to 602 in cf13024)
4. Misc
During the alignment process, some functional correctness issues and minor misalignments were also discovered; they are all documented in this section.
4.1 fence opType
Commits related to this modification:
Original design: The fence instruction is dispatched to mem's dispatchQueue, but its opType is `No_OpClass`, which causes it to wait for the integer issue queue IQ2 (IntMisc) to have free entries before it can continue execution. If the instructions following the fence occupy intIQ2, the fence cannot be executed and the CPU deadlocks.
New design: Change the opType of the fence instruction to `MemReadOp` to prevent this situation (in fact, the fence will not be dispatched to an IQ).
4.2 LRSC
Commits related to this modification:
The LR instruction can be executed speculatively, which causes RAW violations in some corner cases.
To reduce complexity and ensure consistency with the RTL design, avoid speculative execution for LR and strictly maintain ordering.
4.3 store writeback
Commits related to this modification:
Previously, the store was written back in s2, but in the RTL design it is written back in s4, so this change aligns the timing of that operation. Write-back timing has a significant impact on high-IPC programs like hmmer (~13%), with earlier write-back being more beneficial for performance.