UoB-HPC · jj16791 · Dec 19, 2023 · Dec 15, 2023 · Dec 15, 2023 · Dec 19, 2023
diff --git a/src/include/simeng/RegisterFileSet.hh b/src/include/simeng/RegisterFileSet.hh
@@ -16,6 +16,10 @@ struct Register {
    * architectural register, depending on point of usage. */
   uint16_t tag;
 
+  /** Whether this register identifier was created as a result of a register
+   * renaming scheme. */
+  bool renamed = false;
+
   /** Check for equality of two register identifiers. */
   bool operator==(const Register& other) const;
 

diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh
@@ -1678,9 +1678,10 @@ class sveHelp {
 
   /** Helper function for SVE instructions store instructions to merge
    * consecutive active elements into blocks to be written.
-   * T represents the type of operands (e.g. for zn.d, T = uint64_t).
+   * T represents the type of the vector elements (e.g. for zn.d, T = uint64_t).
+   * C represents the type of the memory elements (e.g. for st1w, C = uint32_t).
    * Return a vector of RegisterValues.  */
-  template <typename T>
+  template <typename T, typename C = T>
   static std::vector<RegisterValue> sve_merge_store_data(const T* d,
                                                          const uint64_t* p,
                                                          uint16_t vl_bits) {
@@ -1690,26 +1691,26 @@ class sveHelp {
     // Determine how many predicate elements are present per uint64_t.
     uint16_t predsPer64 = (64 / sizeof(T));
 
-    // Determine size of array based on the size of the stored element (This is
-    // the T specifier in sve instructions)
-    std::array<T, 256 / sizeof(T)> mData;
+    // Determine size of array based on the size of the memory elements (this is
+    // the C template parameter).
+    std::array<C, 256 / sizeof(C)> mData;
     uint16_t mdSize = 0;
 
     for (uint16_t x = 0; x < numVecElems; x++) {
       // Determine mask to get predication for active element.
       uint64_t shiftedActive = 1ull << ((x % predsPer64) * sizeof(T));
       if (p[x / predsPer64] & shiftedActive) {
-        mData[mdSize] = d[x];
+        mData[mdSize] = static_cast<C>(d[x]);
         mdSize++;
       } else if (mdSize) {
         outputData.push_back(
-            RegisterValue((char*)mData.data(), mdSize * sizeof(T)));
+            RegisterValue((char*)mData.data(), mdSize * sizeof(C)));
         mdSize = 0;
       }
     }
     if (mdSize) {
       outputData.push_back(
-          RegisterValue((char*)mData.data(), mdSize * sizeof(T)));
+          RegisterValue((char*)mData.data(), mdSize * sizeof(C)));
     }
     return outputData;
   }

diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc
@@ -4740,7 +4740,8 @@ void Instruction::execute() {
         const uint64_t* d = operands[0].getAsVector<uint64_t>();
         const uint64_t* p = operands[1].getAsVector<uint64_t>();
 
-        memoryData = sveHelp::sve_merge_store_data<uint64_t>(d, p, VL_bits);
+        memoryData =
+            sveHelp::sve_merge_store_data<uint64_t, uint32_t>(d, p, VL_bits);
         break;
       }
       case Opcode::AArch64_ST1W_IMM: {  // st1w {zt.s}, pg, [xn{, #imm, mul vl}]

diff --git a/src/lib/pipeline/RegisterAliasTable.cc b/src/lib/pipeline/RegisterAliasTable.cc
@@ -50,7 +50,7 @@ Register RegisterAliasTable::getMapping(Register architectural) const {
          "Invalid register type. Cannot find RAT mapping.");
 
   auto tag = mappingTable_[architectural.type][architectural.tag];
-  return {architectural.type, tag};
+  return {architectural.type, tag, true};
 }
 
 bool RegisterAliasTable::canAllocate(uint8_t type,
@@ -84,7 +84,7 @@ Register RegisterAliasTable::allocate(Register architectural) {
   mappingTable_[architectural.type][architectural.tag] = tag;
   destinationTable_[architectural.type][tag] = architectural.tag;
 
-  return {architectural.type, tag};
+  return {architectural.type, tag, true};
 }
 
 void RegisterAliasTable::commit(Register physical) {
@@ -94,6 +94,9 @@ void RegisterAliasTable::commit(Register physical) {
   freeQueues_[physical.type].push(oldTag);
 }
 void RegisterAliasTable::rewind(Register physical) {
+  assert(physical.renamed &&
+         "Attempted to rewind a physical register which hasn't been subject to "
+         "the register renaming scheme");
   // Find which architectural tag this referred to
   auto destinationTag = destinationTable_[physical.type][physical.tag];
   // Rewind the mapping table to the old physical tag

diff --git a/src/lib/pipeline/ReorderBuffer.cc b/src/lib/pipeline/ReorderBuffer.cc
@@ -169,7 +169,8 @@ void ReorderBuffer::flush(uint64_t afterSeqId) {
     auto destinations = uop->getDestinationRegisters();
     for (int i = destinations.size() - 1; i >= 0; i--) {
       const auto& reg = destinations[i];
-      rat_.rewind(reg);
+      // Only rewind the register if it was renamed
+      if (reg.renamed) rat_.rewind(reg);
     }
     uop->setFlushed();
     // If the instruction is a branch, supply address to branch flushing logic

diff --git a/test/regression/RegressionTest.cc b/test/regression/RegressionTest.cc
@@ -24,6 +24,10 @@ void RegressionTest::run(const char* source, const char* triple,
                          const char* extensions) {
   testing::internal::CaptureStdout();
 
+  // Zero-out process memory from any prior runs
+  if (processMemory_ != nullptr)
+    std::memset(processMemory_, '\0', processMemorySize_);
+
   // Assemble the source to a flat binary
   assemble(source, triple, extensions);
   if (HasFatalFailure()) return;

diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc
@@ -6540,72 +6540,59 @@ TEST_P(InstSve, st1w) {
     EXPECT_EQ(getMemoryValue<uint32_t>((VL / 64) + 16 + (i * 4)), src[i % 4]);
   }
 
-  // 64-bit
-  // initialHeapData_.resize(64);
-  // uint64_t* heap64 = reinterpret_cast<uint64_t*>(initialHeapData_.data());
-  // heap64[0] = 0xDEADBEEFDEADBEEF;
-  // heap64[1] = 0x1234567812345678;
-  // heap64[2] = 0x9876543298765432;
-  // heap64[3] = 0xABCDEF01ABCDEF01;
-  // heap64[4] = 0xDEADBEEFDEADBEEF;
-  // heap64[5] = 0x1234567812345678;
-  // heap64[6] = 0x9876543298765432;
-  // heap64[7] = 0xABCDEF01ABCDEF01;
-
-  // RUN_AARCH64(R"(
-  //   # Get heap address
-  //   mov x0, 0
-  //   mov x8, 214
-  //   svc #0
-
-  //   mov x1, #0
-  //   mov x4, #64
-  //   mov x5, #3
-  //   ptrue p0.d
-  //   ld1w {z0.d}, p0/z, [x0, x1, lsl #3]
-  //   ld1w {z2.d}, p0/z, [x0, x1, lsl #3]
-  //   st1w {z0.d}, p0, [sp, x1, lsl #2]
-  //   st1w {z2.d}, p0, [x4, x5, lsl #2]
-  // )");
-  // CHECK_NEON(0, uint64_t,
-  //            {0xDEADBEEFDEADBEEFu, 0x1234567812345678u,
-  //            0x9876543298765432u,
-  //             0xABCDEF01ABCDEF01u, 0xDEADBEEFDEADBEEFu,
-  //             0x1234567812345678u, 0x9876543298765432u,
-  //             0xABCDEF01ABCDEF01u});
-  // CHECK_NEON(2, uint64_t,
-  //            {0xDEADBEEFDEADBEEFu, 0x1234567812345678u,
-  //            0x9876543298765432u,
-  //             0xABCDEF01ABCDEF01u, 0xDEADBEEFDEADBEEFu,
-  //             0x1234567812345678u, 0x9876543298765432u,
-  //             0xABCDEF01ABCDEF01u});
-
-  // EXPECT_EQ(getMemoryValue<uint32_t>(process_->getStackPointer()),
-  // 0xDEADBEEF);
-  // EXPECT_EQ(getMemoryValue<uint32_t>(process_->getStackPointer()
-  // + 4),
-  //           0x12345678);
-  // EXPECT_EQ(getMemoryValue<uint32_t>(process_->getStackPointer() + 8),
-  //           0x98765432);
-  // EXPECT_EQ(getMemoryValue<uint32_t>(process_->getStackPointer() + 12),
-  //           0xABCDEF01);
-  // EXPECT_EQ(getMemoryValue<uint32_t>(process_->getStackPointer() + 16),
-  //           0xDEADBEEF);
-  // EXPECT_EQ(getMemoryValue<uint32_t>(process_->getStackPointer() + 20),
-  //           0x12345678);
-  // EXPECT_EQ(getMemoryValue<uint32_t>(process_->getStackPointer() + 24),
-  //           0x98765432);
-  // EXPECT_EQ(getMemoryValue<uint32_t>(process_->getStackPointer() + 28),
-  //           0xABCDEF01);
-
-  // EXPECT_EQ(getMemoryValue<uint32_t>(64 + (3 * 4)), 0xDEADBEEF);
-  // EXPECT_EQ(getMemoryValue<uint32_t>(64 + (3 * 4) + 4), 0x12345678);
-  // EXPECT_EQ(getMemoryValue<uint32_t>(64 + (3 * 4) + 8), 0x98765432);
-  // EXPECT_EQ(getMemoryValue<uint32_t>(64 + (3 * 4) + 12), 0xABCDEF01);
-  // EXPECT_EQ(getMemoryValue<uint32_t>(64 + (3 * 4) + 16), 0xDEADBEEF);
-  // EXPECT_EQ(getMemoryValue<uint32_t>(64 + (3 * 4) + 20), 0x12345678);
-  // EXPECT_EQ(getMemoryValue<uint32_t>(64 + (3 * 4) + 24), 0x98765432);
-  // EXPECT_EQ(getMemoryValue<uint32_t>(64 + (3 * 4) + 28), 0xABCDEF01);
+  // 64-bit
+  initialHeapData_.resize(VL / 8);
+  uint64_t* heap64 = reinterpret_cast<uint64_t*>(initialHeapData_.data());
+  std::vector<uint64_t> srcA = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01};
+  std::vector<uint64_t> srcB = {0xDEADBEEFDEADBEEF, 0x1234567812345678,
+                                0x9876543298765432, 0xABCDEF01ABCDEF01};
+  fillHeapCombined(heap64, srcA, srcB, VL / 64);
+
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    sub sp, sp, #4095
+
+    ptrue p0.d
+    mov x2, #0
+    mov x5, #16
+    addvl x2, x2, #1
+    udiv x2, x2, x5
+    mov x3, #2
+    whilelo p1.d, xzr, x2
+
+    mov x1, #0
+    mov x6, #64
+    mov x7, #3
+
+    ld1d {z0.d}, p1/z, [x0, x1, lsl #3]
+    ld1d {z2.d}, p0/z, [x0, x1, lsl #3]
+    st1w {z0.d}, p1, [sp, x1, lsl #2]
+    st1w {z2.d}, p0, [x6, x7, lsl #2]
+  )");
+
+  CHECK_NEON(0, uint64_t, fillNeonCombined<uint64_t>(srcA, {0ull}, VL / 8));
+  CHECK_NEON(2, uint64_t, fillNeonCombined<uint64_t>(srcA, srcB, VL / 8));
+
+  std::array<uint32_t, (256 / sizeof(uint32_t))> srcC =
+      fillNeonCombined<uint32_t>(
+          {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, {0ul}, VL / 16);
+  for (int i = 0; i < (VL / 64); i++) {
+    EXPECT_EQ(
+        getMemoryValue<uint32_t>(process_->getStackPointer() - 4095 + (i * 4)),
+        srcC[i]);
+  }
+
+  std::array<uint32_t, (256 / sizeof(uint32_t))> srcD =
+      fillNeonCombined<uint32_t>(
+          {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01},
+          {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, VL / 16);
+  for (int i = 0; i < (VL / 64); i++) {
+    EXPECT_EQ(getMemoryValue<uint32_t>(64 + (3 + i) * 4), srcD[i]);
+  }
 }
 
 TEST_P(InstSve, str_predicate) {