Snap for 10453563 from 38a36e289b9ebc40e3a2e1f07827612fedbba2e3 to mainline-art-release

Change-Id: I8e70a9746346022c45302742a401561b598825ff
diff --git a/Android.bp b/Android.bp
index ab3fb0d..43955e9 100644
--- a/Android.bp
+++ b/Android.bp
@@ -90,6 +90,7 @@
 
         "-Werror=pointer-to-int-cast",
         "-Werror=int-to-pointer-cast",
+        "-Werror=thread-safety",
         "-Werror=type-limits",
         "-Werror",
 
@@ -117,9 +118,12 @@
         "standalone/flags.cpp",
         "standalone/flags_parser.cpp",
         "standalone/linux.cpp",
+        "standalone/mem_map.cpp",
         "standalone/release.cpp",
         "standalone/report.cpp",
+        "standalone/rss_limit_checker.cpp",
         "standalone/string_utils.cpp",
+        "standalone/timing.cpp",
         "standalone/wrappers_c_bionic.cpp"
     ],
     arch: {
@@ -131,6 +135,11 @@
             cflags: ["-mcrc"],
             srcs: ["standalone/crc32_hw.cpp"],
         },
+        riscv64: {
+            // This is a temporary fix, and should be reverted after
+            // yieldProcessor supports riscv.
+            cflags: ["-Wno-unused-parameter"],
+        },
         x86_64: {
             cflags: ["-msse4.2"],
             srcs: ["standalone/crc32_hw.cpp"],
@@ -172,10 +181,40 @@
 cc_library_static {
     name: "libscudo_for_testing",
     defaults: ["libscudo_defaults"],
+    cflags: [
+        "-DSCUDO_DEBUG",
+    ],
+}
+
+cc_defaults {
+    name: "scudo_unit_tests_default",
+    static_libs: ["libscudo_for_testing"],
+    include_dirs: [
+        "external/scudo/standalone",
+        "external/scudo/standalone/include",
+    ],
+    cflags: [
+        "-fno-emulated-tls",
+        // In memtag_test.cpp, some tests are skipped with GTEST_SKIP() so that
+        // they won't be run. However, those skipped tests may contain
+        // unreachable code paths that confuse this compiler check. Given that
+        // the check loses little value here, disable it only in the tests.
+        "-Wno-unreachable-code-loop-increment",
+        "-Wno-unused-parameter",
+        "-DSCUDO_DEBUG",
+    ],
+    target: {
+        bionic: {
+            header_libs: ["bionic_libc_platform_headers"],
+        },
+    },
+    test_suites: ["general-tests"],
+    bootstrap: true,
 }
 
 cc_test {
     name: "scudo_unit_tests",
+    defaults: ["scudo_unit_tests_default"],
     // Temporarily disabled on host due to a 15-20s per-test timeout,
     // which is currently exceeded by ScudoCombinedTest.BasicCombined.
     host_supported: false,
@@ -188,6 +227,7 @@
         "standalone/tests/flags_test.cpp",
         "standalone/tests/list_test.cpp",
         "standalone/tests/map_test.cpp",
+        "standalone/tests/memtag_test.cpp",
         "standalone/tests/mutex_test.cpp",
         "standalone/tests/primary_test.cpp",
         "standalone/tests/quarantine_test.cpp",
@@ -198,25 +238,20 @@
         "standalone/tests/size_class_map_test.cpp",
         "standalone/tests/stats_test.cpp",
         "standalone/tests/strings_test.cpp",
+        "standalone/tests/timing_test.cpp",
         "standalone/tests/tsd_test.cpp",
         "standalone/tests/vector_test.cpp",
     ],
-    static_libs: ["libscudo_for_testing"],
-    include_dirs: [
-        "external/scudo/standalone",
-        "external/scudo/standalone/include",
+}
+
+cc_test {
+    name: "scudo_hooks_unit_tests",
+    defaults: ["scudo_unit_tests_default"],
+    host_supported: true,
+    srcs: [
+      "standalone/tests/scudo_hooks_test.cpp",
+      "standalone/tests/scudo_unit_test_main.cpp",
     ],
-    cflags: [
-        "-Wno-unused-parameter",
-        "-fno-emulated-tls",
-    ],
-    target: {
-        bionic: {
-            header_libs: ["bionic_libc_platform_headers"],
-        },
-    },
-    test_suites: ["general-tests"],
-    bootstrap: true,
 }
 
 cc_fuzz {
diff --git a/OWNERS b/OWNERS
index 45e67e7..4f31bde 100644
--- a/OWNERS
+++ b/OWNERS
@@ -1,3 +1,3 @@
 cferris@google.com
 enh@google.com
-kostyak@google.com
+chiahungduan@google.com
diff --git a/TEST_MAPPING b/TEST_MAPPING
index a8f41fb..32f13f0 100644
--- a/TEST_MAPPING
+++ b/TEST_MAPPING
@@ -4,6 +4,9 @@
       "name": "scudo_unit_tests"
     },
     {
+      "name": "scudo_hooks_unit_tests"
+    },
+    {
       "name": "memunreachable_unit_test"
     },
     {
diff --git a/standalone/allocator_config.h b/standalone/allocator_config.h
index e6f46b5..d06f6df 100644
--- a/standalone/allocator_config.h
+++ b/standalone/allocator_config.h
@@ -26,7 +26,7 @@
 // allocator.
 //
 // struct ExampleConfig {
-//   // SizeClasMmap to use with the Primary.
+//   // SizeClassMap to use with the Primary.
 //   using SizeClassMap = DefaultSizeClassMap;
 //   // Indicates possible support for Memory Tagging.
 //   static const bool MaySupportMemoryTagging = false;
@@ -34,6 +34,14 @@
 //   typedef SizeClassAllocator64<ExampleConfig> Primary;
 //   // Log2 of the size of a size class region, as used by the Primary.
 //   static const uptr PrimaryRegionSizeLog = 30U;
+//   // Log2 of the size of a block group, as used by the Primary. Each group
+//   // covers a range of memory addresses; blocks within that range belong to
+//   // the same group. In general, a single region uses a 1 or 2 MB group
+//   // size, while multiple regions use a group size equal to the region size,
+//   // because the region size is usually smaller than 1 MB.
+//   // A smaller value gives finer-grained control of memory usage, but the
+//   // trade-off is that deallocation may take longer.
+//   static const uptr PrimaryGroupSizeLog = 20U;
 //   // Defines the type and scale of a compact pointer. A compact pointer can
 //   // be understood as the offset of a pointer within the region it belongs
 //   // to, in increments of a power-of-2 scale.
@@ -65,6 +73,7 @@
 #if SCUDO_CAN_USE_PRIMARY64
   typedef SizeClassAllocator64<DefaultConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 32U;
+  static const uptr PrimaryGroupSizeLog = 21U;
   typedef uptr PrimaryCompactPtrT;
   static const uptr PrimaryCompactPtrScale = 0;
   static const bool PrimaryEnableRandomOffset = true;
@@ -72,6 +81,7 @@
 #else
   typedef SizeClassAllocator32<DefaultConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 19U;
+  static const uptr PrimaryGroupSizeLog = 19U;
   typedef uptr PrimaryCompactPtrT;
 #endif
   static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
@@ -96,11 +106,13 @@
   static const uptr PrimaryRegionSizeLog = 28U;
   typedef u32 PrimaryCompactPtrT;
   static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
+  static const uptr PrimaryGroupSizeLog = 20U;
   static const bool PrimaryEnableRandomOffset = true;
   static const uptr PrimaryMapSizeIncrement = 1UL << 18;
 #else
   typedef SizeClassAllocator32<AndroidConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 18U;
+  static const uptr PrimaryGroupSizeLog = 18U;
   typedef uptr PrimaryCompactPtrT;
 #endif
   static const s32 PrimaryMinReleaseToOsIntervalMs = 1000;
@@ -127,11 +139,13 @@
   static const uptr PrimaryRegionSizeLog = 27U;
   typedef u32 PrimaryCompactPtrT;
   static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
+  static const uptr PrimaryGroupSizeLog = 18U;
   static const bool PrimaryEnableRandomOffset = true;
   static const uptr PrimaryMapSizeIncrement = 1UL << 18;
 #else
   typedef SizeClassAllocator32<AndroidSvelteConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 16U;
+  static const uptr PrimaryGroupSizeLog = 16U;
   typedef uptr PrimaryCompactPtrT;
 #endif
   static const s32 PrimaryMinReleaseToOsIntervalMs = 1000;
@@ -155,7 +169,14 @@
   static const bool MaySupportMemoryTagging = false;
 
   typedef SizeClassAllocator64<FuchsiaConfig> Primary;
+// Support 39-bit VMA for riscv-64
+#if SCUDO_RISCV64
+  static const uptr PrimaryRegionSizeLog = 28U;
+  static const uptr PrimaryGroupSizeLog = 19U;
+#else
   static const uptr PrimaryRegionSizeLog = 30U;
+  static const uptr PrimaryGroupSizeLog = 21U;
+#endif
   typedef u32 PrimaryCompactPtrT;
   static const bool PrimaryEnableRandomOffset = true;
   static const uptr PrimaryMapSizeIncrement = 1UL << 18;
@@ -175,6 +196,7 @@
   typedef SizeClassAllocator64<TrustyConfig> Primary;
   // Some apps have 1 page of heap total so small regions are necessary.
   static const uptr PrimaryRegionSizeLog = 10U;
+  static const uptr PrimaryGroupSizeLog = 10U;
   typedef u32 PrimaryCompactPtrT;
   static const bool PrimaryEnableRandomOffset = false;
   // Trusty is extremely memory-constrained so minimally round up map calls.
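The PrimaryGroupSizeLog constants added above control how the Primary partitions blocks into groups. A minimal sketch of the arithmetic, not part of the patch (the real code keys groups off compact pointers, which is not shown here):

#include <cstdint>

// Sketch only: a group spans 1 << PrimaryGroupSizeLog bytes, and two blocks
// fall into the same group when they share the same group index.
constexpr uintptr_t GroupSizeLog = 20; // 1 MB groups, as in AndroidConfig
constexpr uintptr_t GroupSize = uintptr_t{1} << GroupSizeLog;

constexpr uintptr_t groupIndex(uintptr_t Addr) { return Addr >> GroupSizeLog; }

static_assert(GroupSize == 1024 * 1024, "2^20 bytes is 1 MB");
static_assert(groupIndex(GroupSize - 1) == 0, "last byte of the first group");
static_assert(groupIndex(GroupSize) == 1, "first byte of the second group");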
diff --git a/standalone/checksum.h b/standalone/checksum.h
index 0f787ce..f8eda81 100644
--- a/standalone/checksum.h
+++ b/standalone/checksum.h
@@ -20,7 +20,8 @@
 #if defined(__CRC32__)
 // NB: clang has <crc32intrin.h> but GCC does not
 #include <smmintrin.h>
-#define CRC32_INTRINSIC FIRST_32_SECOND_64(__builtin_ia32_crc32si, __builtin_ia32_crc32di)
+#define CRC32_INTRINSIC                                                        \
+  FIRST_32_SECOND_64(__builtin_ia32_crc32si, __builtin_ia32_crc32di)
 #elif defined(__SSE4_2__)
 #include <smmintrin.h>
 #define CRC32_INTRINSIC FIRST_32_SECOND_64(_mm_crc32_u32, _mm_crc32_u64)
diff --git a/standalone/chunk.h b/standalone/chunk.h
index 0581420..32874a8 100644
--- a/standalone/chunk.h
+++ b/standalone/chunk.h
@@ -42,7 +42,8 @@
       Checksum = computeBSDChecksum(Checksum, Array[I]);
     return Checksum;
   }
-#endif // defined(__CRC32__) || defined(__SSE4_2__) || defined(__ARM_FEATURE_CRC32)
+#endif // defined(__CRC32__) || defined(__SSE4_2__) ||
+       // defined(__ARM_FEATURE_CRC32)
 }
 
 namespace Chunk {
@@ -84,7 +85,7 @@
 constexpr uptr ChecksumMask = (1UL << 16) - 1;
 
 constexpr uptr getHeaderSize() {
-  return roundUpTo(sizeof(PackedHeader), 1U << SCUDO_MIN_ALIGNMENT_LOG);
+  return roundUp(sizeof(PackedHeader), 1U << SCUDO_MIN_ALIGNMENT_LOG);
 }
 
 inline AtomicPackedHeader *getAtomicHeader(void *Ptr) {
diff --git a/standalone/combined.h b/standalone/combined.h
index 365720d..0066056 100644
--- a/standalone/combined.h
+++ b/standalone/combined.h
@@ -18,6 +18,7 @@
 #include "options.h"
 #include "quarantine.h"
 #include "report.h"
+#include "rss_limit_checker.h"
 #include "secondary.h"
 #include "stack_depot.h"
 #include "string_utils.h"
@@ -147,6 +148,9 @@
     initFlags();
     reportUnrecognizedFlags();
 
+    RssChecker.init(scudo::getFlags()->soft_rss_limit_mb,
+                    scudo::getFlags()->hard_rss_limit_mb);
+
     // Store some flags locally.
     if (getFlags()->may_return_null)
       Primary.Options.set(OptionBit::MayReturnNull);
@@ -173,6 +177,8 @@
     Quarantine.init(
         static_cast<uptr>(getFlags()->quarantine_size_kb << 10),
         static_cast<uptr>(getFlags()->thread_local_quarantine_size_kb << 10));
+
+    initRingBuffer();
   }
 
   // Initialize the embedded GWP-ASan instance. Requires the main allocator to
@@ -185,6 +191,7 @@
         getFlags()->GWP_ASAN_MaxSimultaneousAllocations;
     Opt.SampleRate = getFlags()->GWP_ASAN_SampleRate;
     Opt.InstallSignalHandlers = getFlags()->GWP_ASAN_InstallSignalHandlers;
+    Opt.Recoverable = getFlags()->GWP_ASAN_Recoverable;
     // Embedded GWP-ASan is locked through the Scudo atfork handler (via
     // Allocator::disable calling GWPASan.disable). Disable GWP-ASan's atfork
     // handler.
@@ -196,7 +203,8 @@
       gwp_asan::segv_handler::installSignalHandlers(
           &GuardedAlloc, Printf,
           gwp_asan::backtrace::getPrintBacktraceFunction(),
-          gwp_asan::backtrace::getSegvBacktraceFunction());
+          gwp_asan::backtrace::getSegvBacktraceFunction(),
+          Opt.Recoverable);
 
     GuardedAllocSlotSize =
         GuardedAlloc.getAllocatorState()->maximumAllocationSize();
@@ -231,6 +239,7 @@
   }
 
   TSDRegistryT *getTSDRegistry() { return &TSDRegistry; }
+  QuarantineT *getQuarantine() { return &Quarantine; }
 
   // The Cache must be provided zero-initialized.
   void initCache(CacheT *Cache) { Cache->init(&Stats, &Primary); }
@@ -241,11 +250,18 @@
   // - unlinking the local stats from the global ones (destroying the cache does
   //   the last two items).
   void commitBack(TSD<ThisT> *TSD) {
-    Quarantine.drain(&TSD->QuarantineCache,
-                     QuarantineCallback(*this, TSD->Cache));
-    TSD->Cache.destroy(&Stats);
+    Quarantine.drain(&TSD->getQuarantineCache(),
+                     QuarantineCallback(*this, TSD->getCache()));
+    TSD->getCache().destroy(&Stats);
   }
 
+  void drainCache(TSD<ThisT> *TSD) {
+    Quarantine.drainAndRecycle(&TSD->getQuarantineCache(),
+                               QuarantineCallback(*this, TSD->getCache()));
+    TSD->getCache().drain();
+  }
+  void drainCaches() { TSDRegistry.drainCaches(this); }
+
   ALWAYS_INLINE void *getHeaderTaggedPointer(void *Ptr) {
     if (!allocatorSupportsMemoryTagging<Params>())
       return Ptr;
@@ -297,7 +313,7 @@
 
   NOINLINE void *allocate(uptr Size, Chunk::Origin Origin,
                           uptr Alignment = MinAlignment,
-                          bool ZeroContents = false) {
+                          bool ZeroContents = false) NO_THREAD_SAFETY_ANALYSIS {
     initThreadMaybe();
 
     const Options Options = Primary.Options.load();
@@ -334,7 +350,7 @@
     // to be sure that there will be an address in the block that will satisfy
     // the alignment.
     const uptr NeededSize =
-        roundUpTo(Size, MinAlignment) +
+        roundUp(Size, MinAlignment) +
         ((Alignment > MinAlignment) ? Alignment : Chunk::getHeaderSize());
 
     // Takes care of extravagantly large sizes as well as integer overflows.
@@ -346,6 +362,19 @@
     }
     DCHECK_LE(Size, NeededSize);
 
+    switch (RssChecker.getRssLimitExceeded()) {
+    case RssLimitChecker::Neither:
+      break;
+    case RssLimitChecker::Soft:
+      if (Options.get(OptionBit::MayReturnNull))
+        return nullptr;
+      reportSoftRSSLimit(RssChecker.getSoftRssLimit());
+      break;
+    case RssLimitChecker::Hard:
+      reportHardRSSLimit(RssChecker.getHardRssLimit());
+      break;
+    }
+
     void *Block = nullptr;
     uptr ClassId = 0;
     uptr SecondaryBlockEnd = 0;
@@ -354,23 +383,24 @@
       DCHECK_NE(ClassId, 0U);
       bool UnlockRequired;
       auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired);
-      Block = TSD->Cache.allocate(ClassId);
+      Block = TSD->getCache().allocate(ClassId);
       // If the allocation failed, the most likely reason with a 32-bit primary
       // is the region being full. In that event, retry in each successively
       // larger class until it fits. If it fails to fit in the largest class,
       // fallback to the Secondary.
       if (UNLIKELY(!Block)) {
         while (ClassId < SizeClassMap::LargestClassId && !Block)
-          Block = TSD->Cache.allocate(++ClassId);
+          Block = TSD->getCache().allocate(++ClassId);
         if (!Block)
           ClassId = 0;
       }
       if (UnlockRequired)
         TSD->unlock();
     }
-    if (UNLIKELY(ClassId == 0))
+    if (UNLIKELY(ClassId == 0)) {
       Block = Secondary.allocate(Options, Size, Alignment, &SecondaryBlockEnd,
                                  FillContents);
+    }
 
     if (UNLIKELY(!Block)) {
       if (Options.get(OptionBit::MayReturnNull))
@@ -380,7 +410,7 @@
 
     const uptr BlockUptr = reinterpret_cast<uptr>(Block);
     const uptr UnalignedUserPtr = BlockUptr + Chunk::getHeaderSize();
-    const uptr UserPtr = roundUpTo(UnalignedUserPtr, Alignment);
+    const uptr UserPtr = roundUp(UnalignedUserPtr, Alignment);
 
     void *Ptr = reinterpret_cast<void *>(UserPtr);
     void *TaggedPtr = Ptr;
@@ -439,7 +469,7 @@
             PrevUserPtr == UserPtr &&
             (TaggedUserPtr = loadTag(UserPtr)) != UserPtr) {
           uptr PrevEnd = TaggedUserPtr + Header.SizeOrUnusedBytes;
-          const uptr NextPage = roundUpTo(TaggedUserPtr, getPageSizeCached());
+          const uptr NextPage = roundUp(TaggedUserPtr, getPageSizeCached());
           if (NextPage < PrevEnd && loadTag(NextPage) != NextPage)
             PrevEnd = NextPage;
           TaggedPtr = reinterpret_cast<void *>(TaggedUserPtr);
@@ -452,8 +482,8 @@
             // was freed, it would not have been retagged and thus zeroed, and
             // therefore it needs to be zeroed now.
             memset(TaggedPtr, 0,
-                   Min(Size, roundUpTo(PrevEnd - TaggedUserPtr,
-                                       archMemoryTagGranuleSize())));
+                   Min(Size, roundUp(PrevEnd - TaggedUserPtr,
+                                     archMemoryTagGranuleSize())));
           } else if (Size) {
             // Clear any stack metadata that may have previously been stored in
             // the chunk data.
@@ -666,6 +696,8 @@
     void *NewPtr = allocate(NewSize, Chunk::Origin::Malloc, Alignment);
     if (LIKELY(NewPtr)) {
       memcpy(NewPtr, OldTaggedPtr, Min(NewSize, OldSize));
+      if (UNLIKELY(&__scudo_deallocate_hook))
+        __scudo_deallocate_hook(OldTaggedPtr);
       quarantineOrDeallocateChunk(Options, OldTaggedPtr, &OldHeader, OldSize);
     }
     return NewPtr;
@@ -674,7 +706,7 @@
   // TODO(kostyak): disable() is currently best-effort. There are some small
   //                windows of time when an allocation could still succeed after
   //                this function finishes. We will revisit that later.
-  void disable() {
+  void disable() NO_THREAD_SAFETY_ANALYSIS {
     initThreadMaybe();
 #ifdef GWP_ASAN_HOOKS
     GuardedAlloc.disable();
@@ -686,7 +718,7 @@
     Secondary.disable();
   }
 
-  void enable() {
+  void enable() NO_THREAD_SAFETY_ANALYSIS {
     initThreadMaybe();
     Secondary.enable();
     Primary.enable();
@@ -705,9 +737,7 @@
   // sizing purposes.
   uptr getStats(char *Buffer, uptr Size) {
     ScopedString Str;
-    disable();
     const uptr Length = getStats(&Str) + 1;
-    enable();
     if (Length < Size)
       Size = Length;
     if (Buffer && Size) {
@@ -719,15 +749,15 @@
 
   void printStats() {
     ScopedString Str;
-    disable();
     getStats(&Str);
-    enable();
     Str.output();
   }
 
-  void releaseToOS() {
+  void releaseToOS(ReleaseToOS ReleaseType) {
     initThreadMaybe();
-    Primary.releaseToOS();
+    if (ReleaseType == ReleaseToOS::ForceAll)
+      drainCaches();
+    Primary.releaseToOS(ReleaseType);
     Secondary.releaseToOS();
   }
 
@@ -856,6 +886,13 @@
            Header.State == Chunk::State::Allocated;
   }
 
+  void setRssLimitsTestOnly(int SoftRssLimitMb, int HardRssLimitMb,
+                            bool MayReturnNull) {
+    RssChecker.init(SoftRssLimitMb, HardRssLimitMb);
+    if (MayReturnNull)
+      Primary.Options.set(OptionBit::MayReturnNull);
+  }
+
   bool useMemoryTaggingTestOnly() const {
     return useMemoryTagging<Params>(Primary.Options.load());
   }
@@ -875,6 +912,10 @@
 
   void setTrackAllocationStacks(bool Track) {
     initThreadMaybe();
+    if (getFlags()->allocation_ring_buffer_size == 0) {
+      DCHECK(!Primary.Options.load().get(OptionBit::TrackAllocationStacks));
+      return;
+    }
     if (Track)
       Primary.Options.set(OptionBit::TrackAllocationStacks);
     else
@@ -906,11 +947,29 @@
     return PrimaryT::getRegionInfoArraySize();
   }
 
-  const char *getRingBufferAddress() const {
-    return reinterpret_cast<const char *>(&RingBuffer);
+  const char *getRingBufferAddress() {
+    initThreadMaybe();
+    return RawRingBuffer;
   }
 
-  static uptr getRingBufferSize() { return sizeof(RingBuffer); }
+  uptr getRingBufferSize() {
+    initThreadMaybe();
+    auto *RingBuffer = getRingBuffer();
+    return RingBuffer ? ringBufferSizeInBytes(RingBuffer->Size) : 0;
+  }
+
+  static bool setRingBufferSizeForBuffer(char *Buffer, size_t Size) {
+    // Need at least one entry.
+    if (Size < sizeof(AllocationRingBuffer) +
+                   sizeof(typename AllocationRingBuffer::Entry)) {
+      return false;
+    }
+    AllocationRingBuffer *RingBuffer =
+        reinterpret_cast<AllocationRingBuffer *>(Buffer);
+    RingBuffer->Size = (Size - sizeof(AllocationRingBuffer)) /
+                       sizeof(typename AllocationRingBuffer::Entry);
+    return true;
+  }
 
   static const uptr MaxTraceSize = 64;
 
@@ -994,6 +1053,7 @@
   QuarantineT Quarantine;
   TSDRegistryT TSDRegistry;
   pthread_once_t PostInitNonce = PTHREAD_ONCE_INIT;
+  RssLimitChecker RssChecker;
 
 #ifdef GWP_ASAN_HOOKS
   gwp_asan::GuardedPoolAllocator GuardedAlloc;
@@ -1013,14 +1073,13 @@
     };
 
     atomic_uptr Pos;
-#ifdef SCUDO_FUZZ
-    static const uptr NumEntries = 2;
-#else
-    static const uptr NumEntries = 32768;
-#endif
-    Entry Entries[NumEntries];
+    u32 Size;
+    // An array of Size (at least one) elements of type Entry immediately
+    // follows this struct.
   };
-  AllocationRingBuffer RingBuffer = {};
+  // Pointer to a memory-mapped area that starts with the AllocationRingBuffer
+  // struct and is immediately followed by Size elements of type Entry.
+  char *RawRingBuffer = {};
 
   // The following might get optimized out by the compiler.
   NOINLINE void performSanityChecks() {
@@ -1076,7 +1135,8 @@
   }
 
   void quarantineOrDeallocateChunk(Options Options, void *TaggedPtr,
-                                   Chunk::UnpackedHeader *Header, uptr Size) {
+                                   Chunk::UnpackedHeader *Header,
+                                   uptr Size) NO_THREAD_SAFETY_ANALYSIS {
     void *Ptr = getHeaderTaggedPointer(TaggedPtr);
     Chunk::UnpackedHeader NewHeader = *Header;
     // If the quarantine is disabled, the actual size of a chunk is 0 or larger
@@ -1118,7 +1178,7 @@
       if (LIKELY(ClassId)) {
         bool UnlockRequired;
         auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired);
-        TSD->Cache.deallocate(ClassId, BlockBegin);
+        TSD->getCache().deallocate(ClassId, BlockBegin);
         if (UnlockRequired)
           TSD->unlock();
       } else {
@@ -1130,8 +1190,8 @@
     } else {
       bool UnlockRequired;
       auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired);
-      Quarantine.put(&TSD->QuarantineCache,
-                     QuarantineCallback(*this, TSD->Cache), Ptr, Size);
+      Quarantine.put(&TSD->getQuarantineCache(),
+                     QuarantineCallback(*this, TSD->getCache()), Ptr, Size);
       if (UnlockRequired)
         TSD->unlock();
     }
@@ -1191,15 +1251,15 @@
 
   void resizeTaggedChunk(uptr OldPtr, uptr NewPtr, uptr NewSize,
                          uptr BlockEnd) {
-    uptr RoundOldPtr = roundUpTo(OldPtr, archMemoryTagGranuleSize());
+    uptr RoundOldPtr = roundUp(OldPtr, archMemoryTagGranuleSize());
     uptr RoundNewPtr;
     if (RoundOldPtr >= NewPtr) {
       // If the allocation is shrinking we just need to set the tag past the end
       // of the allocation to 0. See explanation in storeEndMarker() above.
-      RoundNewPtr = roundUpTo(NewPtr, archMemoryTagGranuleSize());
+      RoundNewPtr = roundUp(NewPtr, archMemoryTagGranuleSize());
     } else {
       // Set the memory tag of the region
-      // [RoundOldPtr, roundUpTo(NewPtr, archMemoryTagGranuleSize()))
+      // [RoundOldPtr, roundUp(NewPtr, archMemoryTagGranuleSize()))
       // to the pointer tag stored in OldPtr.
       RoundNewPtr = storeTags(RoundOldPtr, NewPtr);
     }
@@ -1217,9 +1277,9 @@
   void storeRingBufferEntry(void *Ptr, u32 AllocationTrace, u32 AllocationTid,
                             uptr AllocationSize, u32 DeallocationTrace,
                             u32 DeallocationTid) {
-    uptr Pos = atomic_fetch_add(&RingBuffer.Pos, 1, memory_order_relaxed);
+    uptr Pos = atomic_fetch_add(&getRingBuffer()->Pos, 1, memory_order_relaxed);
     typename AllocationRingBuffer::Entry *Entry =
-        &RingBuffer.Entries[Pos % AllocationRingBuffer::NumEntries];
+        getRingBufferEntry(RawRingBuffer, Pos % getRingBuffer()->Size);
 
     // First invalidate our entry so that we don't attempt to interpret a
     // partially written state in getSecondaryErrorInfo(). The fences below
@@ -1363,12 +1423,14 @@
                                      const char *RingBufferPtr) {
     auto *RingBuffer =
         reinterpret_cast<const AllocationRingBuffer *>(RingBufferPtr);
+    if (!RingBuffer || RingBuffer->Size == 0)
+      return;
     uptr Pos = atomic_load_relaxed(&RingBuffer->Pos);
 
-    for (uptr I = Pos - 1; I != Pos - 1 - AllocationRingBuffer::NumEntries &&
-                           NextErrorReport != NumErrorReports;
+    for (uptr I = Pos - 1;
+         I != Pos - 1 - RingBuffer->Size && NextErrorReport != NumErrorReports;
          --I) {
-      auto *Entry = &RingBuffer->Entries[I % AllocationRingBuffer::NumEntries];
+      auto *Entry = getRingBufferEntry(RingBufferPtr, I % RingBuffer->Size);
       uptr EntryPtr = atomic_load_relaxed(&Entry->Ptr);
       if (!EntryPtr)
         continue;
@@ -1431,8 +1493,49 @@
     Primary.getStats(Str);
     Secondary.getStats(Str);
     Quarantine.getStats(Str);
+    TSDRegistry.getStats(Str);
     return Str->length();
   }
+
+  static typename AllocationRingBuffer::Entry *
+  getRingBufferEntry(char *RawRingBuffer, uptr N) {
+    return &reinterpret_cast<typename AllocationRingBuffer::Entry *>(
+        &RawRingBuffer[sizeof(AllocationRingBuffer)])[N];
+  }
+  static const typename AllocationRingBuffer::Entry *
+  getRingBufferEntry(const char *RawRingBuffer, uptr N) {
+    return &reinterpret_cast<const typename AllocationRingBuffer::Entry *>(
+        &RawRingBuffer[sizeof(AllocationRingBuffer)])[N];
+  }
+
+  void initRingBuffer() {
+    u32 AllocationRingBufferSize =
+        static_cast<u32>(getFlags()->allocation_ring_buffer_size);
+    if (AllocationRingBufferSize < 1)
+      return;
+    MapPlatformData Data = {};
+    RawRingBuffer = static_cast<char *>(
+        map(/*Addr=*/nullptr,
+            roundUp(ringBufferSizeInBytes(AllocationRingBufferSize),
+                    getPageSizeCached()),
+            "AllocatorRingBuffer", /*Flags=*/0, &Data));
+    auto *RingBuffer = reinterpret_cast<AllocationRingBuffer *>(RawRingBuffer);
+    RingBuffer->Size = AllocationRingBufferSize;
+    static_assert(sizeof(AllocationRingBuffer) %
+                          alignof(typename AllocationRingBuffer::Entry) ==
+                      0,
+                  "invalid alignment");
+  }
+
+  static constexpr size_t ringBufferSizeInBytes(u32 AllocationRingBufferSize) {
+    return sizeof(AllocationRingBuffer) +
+           AllocationRingBufferSize *
+               sizeof(typename AllocationRingBuffer::Entry);
+  }
+
+  inline AllocationRingBuffer *getRingBuffer() {
+    return reinterpret_cast<AllocationRingBuffer *>(RawRingBuffer);
+  }
 };
 
 } // namespace scudo
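With this change the ring buffer is no longer a fixed-size member: it is a mapped region laid out as a small header immediately followed by Size entries, sized by ringBufferSizeInBytes(). A minimal sketch of that layout, with illustrative field types rather than the exact Scudo ones:

#include <cstddef>
#include <cstdint>

// Sketch of the mapped ring-buffer layout: a header followed, in the same
// mapping, by N fixed-size entries. Field types here are illustrative only.
struct Header { uint64_t Pos; uint32_t Size; };
struct Entry { uint64_t Ptr, AllocationTrace, AllocationTid; };

constexpr size_t ringBufferSizeInBytes(uint32_t N) {
  return sizeof(Header) + N * sizeof(Entry);
}

// Entry I lives right after the header, mirroring getRingBufferEntry() above.
inline Entry *getEntry(char *Raw, size_t I) {
  return reinterpret_cast<Entry *>(Raw + sizeof(Header)) + I;
}

static_assert(ringBufferSizeInBytes(1) == sizeof(Header) + sizeof(Entry),
              "one header plus one entry");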
diff --git a/standalone/common.cpp b/standalone/common.cpp
index 666f954..9f14fae 100644
--- a/standalone/common.cpp
+++ b/standalone/common.cpp
@@ -35,4 +35,8 @@
   die();
 }
 
+#if !SCUDO_LINUX
+uptr GetRSS() { return 0; }
+#endif
+
 } // namespace scudo
diff --git a/standalone/common.h b/standalone/common.h
index bc3dfec..82e6cf4 100644
--- a/standalone/common.h
+++ b/standalone/common.h
@@ -27,17 +27,31 @@
   return D;
 }
 
-inline constexpr uptr roundUpTo(uptr X, uptr Boundary) {
+inline constexpr bool isPowerOfTwo(uptr X) { return (X & (X - 1)) == 0; }
+
+inline constexpr uptr roundUp(uptr X, uptr Boundary) {
+  DCHECK(isPowerOfTwo(Boundary));
   return (X + Boundary - 1) & ~(Boundary - 1);
 }
+inline constexpr uptr roundUpSlow(uptr X, uptr Boundary) {
+  return ((X + Boundary - 1) / Boundary) * Boundary;
+}
 
-inline constexpr uptr roundDownTo(uptr X, uptr Boundary) {
+inline constexpr uptr roundDown(uptr X, uptr Boundary) {
+  DCHECK(isPowerOfTwo(Boundary));
   return X & ~(Boundary - 1);
 }
+inline constexpr uptr roundDownSlow(uptr X, uptr Boundary) {
+  return (X / Boundary) * Boundary;
+}
 
 inline constexpr bool isAligned(uptr X, uptr Alignment) {
+  DCHECK(isPowerOfTwo(Alignment));
   return (X & (Alignment - 1)) == 0;
 }
+inline constexpr bool isAlignedSlow(uptr X, uptr Alignment) {
+  return X % Alignment == 0;
+}
 
 template <class T> constexpr T Min(T A, T B) { return A < B ? A : B; }
 
@@ -49,14 +63,12 @@
   B = Tmp;
 }
 
-inline bool isPowerOfTwo(uptr X) { return (X & (X - 1)) == 0; }
-
 inline uptr getMostSignificantSetBitIndex(uptr X) {
   DCHECK_NE(X, 0U);
   return SCUDO_WORDSIZE - 1U - static_cast<uptr>(__builtin_clzl(X));
 }
 
-inline uptr roundUpToPowerOfTwo(uptr Size) {
+inline uptr roundUpPowerOfTwo(uptr Size) {
   DCHECK(Size);
   if (isPowerOfTwo(Size))
     return Size;
@@ -101,7 +113,7 @@
 
 // Hardware specific inlinable functions.
 
-inline void yieldProcessor(u8 Count) {
+inline void yieldProcessor(UNUSED u8 Count) {
 #if defined(__i386__) || defined(__x86_64__)
   __asm__ __volatile__("" ::: "memory");
   for (u8 I = 0; I < Count; I++)
@@ -132,7 +144,12 @@
 
 const char *getEnv(const char *Name);
 
+uptr GetRSS();
+
 u64 getMonotonicTime();
+// Gets the time faster but with less accuracy. Can call getMonotonicTime
+// if no fast version is available.
+u64 getMonotonicTimeFast();
 
 u32 getThreadID();
 
@@ -147,6 +164,7 @@
 #define MAP_NOACCESS (1U << 1)
 #define MAP_RESIZABLE (1U << 2)
 #define MAP_MEMTAG (1U << 3)
+#define MAP_PRECOMMIT (1U << 4)
 
 // Our platform memory mapping use is restricted to 3 scenarios:
 // - reserve memory at a random address (MAP_NOACCESS);
@@ -197,6 +215,13 @@
   MaxTSDsCount,         // Number of usable TSDs for the shared registry.
 };
 
+enum class ReleaseToOS : u8 {
+  Normal, // Follow the normal rules for releasing pages to the OS
+  Force,  // Force release pages to the OS, but avoid cases that take too long.
+  ForceAll, // Force release every page possible regardless of how long it will
+            // take.
+};
+
 constexpr unsigned char PatternFillByte = 0xAB;
 
 enum FillContentsMode {
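A minimal, self-contained sketch of the renamed rounding helpers above (roundUpTo becomes roundUp, roundDownTo becomes roundDown): the fast variants assume a power-of-two boundary and mask, while the Slow variants divide and so accept any boundary.

#include <cstdint>

// Sketch only: same arithmetic as the helpers in common.h.
constexpr uintptr_t roundUp(uintptr_t X, uintptr_t Boundary) {
  return (X + Boundary - 1) & ~(Boundary - 1);
}
constexpr uintptr_t roundUpSlow(uintptr_t X, uintptr_t Boundary) {
  return ((X + Boundary - 1) / Boundary) * Boundary;
}

static_assert(roundUp(13, 8) == 16, "13 rounded up to a multiple of 8");
static_assert(roundUpSlow(13, 6) == 18, "works for non-power-of-two boundaries");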
diff --git a/standalone/crc32_hw.cpp b/standalone/crc32_hw.cpp
index d13c615..73f2ae0 100644
--- a/standalone/crc32_hw.cpp
+++ b/standalone/crc32_hw.cpp
@@ -14,6 +14,7 @@
 u32 computeHardwareCRC32(u32 Crc, uptr Data) {
   return static_cast<u32>(CRC32_INTRINSIC(Crc, Data));
 }
-#endif // defined(__CRC32__) || defined(__SSE4_2__) || defined(__ARM_FEATURE_CRC32)
+#endif // defined(__CRC32__) || defined(__SSE4_2__) ||
+       // defined(__ARM_FEATURE_CRC32)
 
 } // namespace scudo
diff --git a/standalone/flags.inc b/standalone/flags.inc
index 690d889..c1f153b 100644
--- a/standalone/flags.inc
+++ b/standalone/flags.inc
@@ -45,3 +45,15 @@
 SCUDO_FLAG(int, release_to_os_interval_ms, SCUDO_ANDROID ? INT32_MIN : 5000,
            "Interval (in milliseconds) at which to attempt release of unused "
            "memory to the OS. Negative values disable the feature.")
+
+SCUDO_FLAG(int, hard_rss_limit_mb, 0,
+           "Hard RSS Limit in Mb. If non-zero, once the limit is achieved, "
+           "abort the process")
+
+SCUDO_FLAG(int, soft_rss_limit_mb, 0,
+           "Soft RSS Limit in Mb. If non-zero, once the limit is reached, all "
+           "subsequent calls will fail or return NULL until the RSS goes below "
+           "the soft limit")
+
+SCUDO_FLAG(int, allocation_ring_buffer_size, 32768,
+           "Entries to keep in the allocation ring buffer for scudo.")
diff --git a/standalone/fuchsia.cpp b/standalone/fuchsia.cpp
index 3b473bc..0788c41 100644
--- a/standalone/fuchsia.cpp
+++ b/standalone/fuchsia.cpp
@@ -17,7 +17,9 @@
 #include <lib/sync/mutex.h> // for sync_mutex_t
 #include <stdlib.h>         // for getenv()
 #include <zircon/compiler.h>
+#include <zircon/process.h>
 #include <zircon/sanitizer.h>
+#include <zircon/status.h>
 #include <zircon/syscalls.h>
 
 namespace scudo {
@@ -30,6 +32,16 @@
 // with ZX_HANDLE_INVALID.
 static_assert(ZX_HANDLE_INVALID == 0, "");
 
+static void NORETURN dieOnError(zx_status_t Status, const char *FnName,
+                                uptr Size) {
+  char Error[128];
+  formatString(Error, sizeof(Error),
+               "SCUDO ERROR: %s failed with size %zuKB (%s)", FnName,
+               Size >> 10, zx_status_get_string(Status));
+  outputRaw(Error);
+  die();
+}
+
 static void *allocateVmar(uptr Size, MapPlatformData *Data, bool AllowNoMem) {
   // Only scenario so far.
   DCHECK(Data);
@@ -41,7 +53,7 @@
       Size, &Data->Vmar, &Data->VmarBase);
   if (UNLIKELY(Status != ZX_OK)) {
     if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem)
-      dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY ? Size : 0);
+      dieOnError(Status, "zx_vmar_allocate", Size);
     return nullptr;
   }
   return reinterpret_cast<void *>(Data->VmarBase);
@@ -56,8 +68,9 @@
   if (Flags & MAP_NOACCESS)
     return allocateVmar(Size, Data, AllowNoMem);
 
-  const zx_handle_t Vmar = Data ? Data->Vmar : _zx_vmar_root_self();
-  CHECK_NE(Vmar, ZX_HANDLE_INVALID);
+  const zx_handle_t Vmar = (Data && Data->Vmar != ZX_HANDLE_INVALID)
+                               ? Data->Vmar
+                               : _zx_vmar_root_self();
 
   zx_status_t Status;
   zx_handle_t Vmo;
@@ -71,7 +84,7 @@
     Status = _zx_vmo_set_size(Vmo, VmoSize + Size);
     if (Status != ZX_OK) {
       if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem)
-        dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY ? Size : 0);
+        dieOnError(Status, "zx_vmo_set_size", VmoSize + Size);
       return nullptr;
     }
   } else {
@@ -79,7 +92,7 @@
     Status = _zx_vmo_create(Size, ZX_VMO_RESIZABLE, &Vmo);
     if (UNLIKELY(Status != ZX_OK)) {
       if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem)
-        dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY ? Size : 0);
+        dieOnError(Status, "zx_vmo_create", Size);
       return nullptr;
     }
     _zx_object_set_property(Vmo, ZX_PROP_NAME, Name, strlen(Name));
@@ -88,11 +101,24 @@
   uintptr_t P;
   zx_vm_option_t MapFlags =
       ZX_VM_PERM_READ | ZX_VM_PERM_WRITE | ZX_VM_ALLOW_FAULTS;
+  if (Addr)
+    DCHECK(Data);
   const uint64_t Offset =
       Addr ? reinterpret_cast<uintptr_t>(Addr) - Data->VmarBase : 0;
   if (Offset)
     MapFlags |= ZX_VM_SPECIFIC;
   Status = _zx_vmar_map(Vmar, MapFlags, Offset, Vmo, VmoSize, Size, &P);
+  if (UNLIKELY(Status != ZX_OK)) {
+    if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem)
+      dieOnError(Status, "zx_vmar_map", Size);
+    return nullptr;
+  }
+
+  if (Flags & MAP_PRECOMMIT) {
+    Status = _zx_vmar_op_range(Vmar, ZX_VMAR_OP_COMMIT, P, Size,
+                               /*buffer=*/nullptr, /*buffer_size=*/0);
+  }
+
   // No need to track the Vmo if we don't intend on resizing it. Close it.
   if (Flags & MAP_RESIZABLE) {
     DCHECK(Data);
@@ -105,9 +131,10 @@
   }
   if (UNLIKELY(Status != ZX_OK)) {
     if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem)
-      dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY ? Size : 0);
+      dieOnError(Status, "zx_vmar_op_range", Size);
     return nullptr;
   }
+
   if (Data)
     Data->VmoSize += Size;
 
@@ -123,11 +150,13 @@
     CHECK_EQ(_zx_vmar_destroy(Vmar), ZX_OK);
     CHECK_EQ(_zx_handle_close(Vmar), ZX_OK);
   } else {
-    const zx_handle_t Vmar = Data ? Data->Vmar : _zx_vmar_root_self();
+    const zx_handle_t Vmar = (Data && Data->Vmar != ZX_HANDLE_INVALID)
+                                 ? Data->Vmar
+                                 : _zx_vmar_root_self();
     const zx_status_t Status =
         _zx_vmar_unmap(Vmar, reinterpret_cast<uintptr_t>(Addr), Size);
     if (UNLIKELY(Status != ZX_OK))
-      dieOnMapUnmapError();
+      dieOnError(Status, "zx_vmar_unmap", Size);
   }
   if (Data) {
     if (Data->Vmo != ZX_HANDLE_INVALID)
@@ -142,12 +171,15 @@
       (Flags & MAP_NOACCESS) ? 0 : (ZX_VM_PERM_READ | ZX_VM_PERM_WRITE);
   DCHECK(Data);
   DCHECK_NE(Data->Vmar, ZX_HANDLE_INVALID);
-  if (_zx_vmar_protect(Data->Vmar, Prot, Addr, Size) != ZX_OK)
-    dieOnMapUnmapError();
+  const zx_status_t Status = _zx_vmar_protect(Data->Vmar, Prot, Addr, Size);
+  if (Status != ZX_OK)
+    dieOnError(Status, "zx_vmar_protect", Size);
 }
 
 void releasePagesToOS(UNUSED uptr BaseAddress, uptr Offset, uptr Size,
                       MapPlatformData *Data) {
+  // TODO: DCHECK the BaseAddress is consistent with the data in
+  // MapPlatformData.
   DCHECK(Data);
   DCHECK_NE(Data->Vmar, ZX_HANDLE_INVALID);
   DCHECK_NE(Data->Vmo, ZX_HANDLE_INVALID);
@@ -177,7 +209,10 @@
   sync_mutex_unlock(&M);
 }
 
+void HybridMutex::assertHeldImpl() __TA_NO_THREAD_SAFETY_ANALYSIS {}
+
 u64 getMonotonicTime() { return _zx_clock_get_monotonic(); }
+u64 getMonotonicTimeFast() { return _zx_clock_get_monotonic(); }
 
 u32 getNumberOfCPUs() { return _zx_system_get_num_cpus(); }
 
diff --git a/standalone/fuchsia.h b/standalone/fuchsia.h
index d6993f8..c1dfd76 100644
--- a/standalone/fuchsia.h
+++ b/standalone/fuchsia.h
@@ -13,7 +13,8 @@
 
 #if SCUDO_FUCHSIA
 
-#include <zircon/process.h>
+#include <stdint.h>
+#include <zircon/types.h>
 
 namespace scudo {
 
diff --git a/standalone/fuzz/get_error_info_fuzzer.cpp b/standalone/fuzz/get_error_info_fuzzer.cpp
index 078e44b..7445645 100644
--- a/standalone/fuzz/get_error_info_fuzzer.cpp
+++ b/standalone/fuzz/get_error_info_fuzzer.cpp
@@ -46,15 +46,14 @@
   }
 
   std::string RingBufferBytes = FDP.ConsumeRemainingBytesAsString();
-  std::vector<char> RingBuffer(AllocatorT::getRingBufferSize(), 0);
-  for (size_t i = 0; i < RingBufferBytes.length() && i < RingBuffer.size();
-       ++i) {
-    RingBuffer[i] = RingBufferBytes[i];
-  }
+  // Bail out if the fuzzer-provided ring buffer is too short to be usable.
+  if (!AllocatorT::setRingBufferSizeForBuffer(RingBufferBytes.data(),
+                                              RingBufferBytes.size()))
+    return 0;
 
   scudo_error_info ErrorInfo;
   AllocatorT::getErrorInfo(&ErrorInfo, FaultAddr, StackDepot.data(),
-                           RegionInfo.data(), RingBuffer.data(), Memory,
+                           RegionInfo.data(), RingBufferBytes.data(), Memory,
                            MemoryTags, MemoryAddr, MemorySize);
   return 0;
 }
diff --git a/standalone/include/scudo/interface.h b/standalone/include/scudo/interface.h
index 9b9a846..3c083ed 100644
--- a/standalone/include/scudo/interface.h
+++ b/standalone/include/scudo/interface.h
@@ -14,7 +14,7 @@
 
 extern "C" {
 
-__attribute__((weak)) const char *__scudo_default_options();
+__attribute__((weak)) const char *__scudo_default_options(void);
 
 // Post-allocation & pre-deallocation hooks.
 // They must be thread-safe and not use heap related functions.
@@ -101,14 +101,14 @@
   struct scudo_error_report reports[3];
 };
 
-const char *__scudo_get_stack_depot_addr();
-size_t __scudo_get_stack_depot_size();
+const char *__scudo_get_stack_depot_addr(void);
+size_t __scudo_get_stack_depot_size(void);
 
-const char *__scudo_get_region_info_addr();
-size_t __scudo_get_region_info_size();
+const char *__scudo_get_region_info_addr(void);
+size_t __scudo_get_region_info_size(void);
 
-const char *__scudo_get_ring_buffer_addr();
-size_t __scudo_get_ring_buffer_size();
+const char *__scudo_get_ring_buffer_addr(void);
+size_t __scudo_get_ring_buffer_size(void);
 
 #ifndef M_DECAY_TIME
 #define M_DECAY_TIME -100
@@ -118,6 +118,10 @@
 #define M_PURGE -101
 #endif
 
+#ifndef M_PURGE_ALL
+#define M_PURGE_ALL -104
+#endif
+
 // Tune the allocator's choice of memory tags to make it more likely that
 // a certain class of memory errors will be detected. The value argument should
 // be one of the M_MEMTAG_TUNING_* constants below.
diff --git a/standalone/internal_defs.h b/standalone/internal_defs.h
index 621fc9c..27c6b45 100644
--- a/standalone/internal_defs.h
+++ b/standalone/internal_defs.h
@@ -133,25 +133,25 @@
 #else
 #define DCHECK(A)                                                              \
   do {                                                                         \
-  } while (false)
+  } while (false && (A))
 #define DCHECK_EQ(A, B)                                                        \
   do {                                                                         \
-  } while (false)
+  } while (false && (A) == (B))
 #define DCHECK_NE(A, B)                                                        \
   do {                                                                         \
-  } while (false)
+  } while (false && (A) != (B))
 #define DCHECK_LT(A, B)                                                        \
   do {                                                                         \
-  } while (false)
+  } while (false && (A) < (B))
 #define DCHECK_LE(A, B)                                                        \
   do {                                                                         \
-  } while (false)
+  } while (false && (A) <= (B))
 #define DCHECK_GT(A, B)                                                        \
   do {                                                                         \
-  } while (false)
+  } while (false && (A) > (B))
 #define DCHECK_GE(A, B)                                                        \
   do {                                                                         \
-  } while (false)
+  } while (false && (A) >= (B))
 #endif
 
 // The superfluous die() call effectively makes this macro NORETURN.
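The no-op DCHECK bodies now reference their arguments. A minimal sketch of why: the `false &&` short-circuit keeps the expression parsed (so variables used only in DCHECKs don't trigger unused warnings and stale expressions still fail to compile) while guaranteeing it is never evaluated at run time.

// Sketch only: a stand-in macro mirroring the change above.
#define DCHECK_SKETCH(A)                                                       \
  do {                                                                         \
  } while (false && (A))

int main() {
  int OnlyCheckedInDebug = 42;           // referenced below, so no unused warning
  DCHECK_SKETCH(OnlyCheckedInDebug > 0); // parsed, but never evaluated
  return 0;
}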
diff --git a/standalone/linux.cpp b/standalone/linux.cpp
index c77c1bb..e285d8a 100644
--- a/standalone/linux.cpp
+++ b/standalone/linux.cpp
@@ -11,6 +11,7 @@
 #if SCUDO_LINUX
 
 #include "common.h"
+#include "internal_defs.h"
 #include "linux.h"
 #include "mutex.h"
 #include "string_utils.h"
@@ -19,6 +20,7 @@
 #include <fcntl.h>
 #include <linux/futex.h>
 #include <sched.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/mman.h>
@@ -127,6 +129,10 @@
   }
 }
 
+void HybridMutex::assertHeldImpl() {
+  CHECK(atomic_load(&M, memory_order_acquire) != Unlocked);
+}
+
 u64 getMonotonicTime() {
   timespec TS;
   clock_gettime(CLOCK_MONOTONIC, &TS);
@@ -134,6 +140,17 @@
          static_cast<u64>(TS.tv_nsec);
 }
 
+u64 getMonotonicTimeFast() {
+#if defined(CLOCK_MONOTONIC_COARSE)
+  timespec TS;
+  clock_gettime(CLOCK_MONOTONIC_COARSE, &TS);
+  return static_cast<u64>(TS.tv_sec) * (1000ULL * 1000 * 1000) +
+         static_cast<u64>(TS.tv_nsec);
+#else
+  return getMonotonicTime();
+#endif
+}
+
 u32 getNumberOfCPUs() {
   cpu_set_t CPUs;
   // sched_getaffinity can fail for a variety of legitimate reasons (lack of
@@ -180,6 +197,39 @@
 extern "C" WEAK int async_safe_write_log(int pri, const char *tag,
                                          const char *msg);
 
+static uptr GetRSSFromBuffer(const char *Buf) {
+  // The format of the file is:
+  // 1084 89 69 11 0 79 0
+  // We need the second number which is RSS in pages.
+  const char *Pos = Buf;
+  // Skip the first number.
+  while (*Pos >= '0' && *Pos <= '9')
+    Pos++;
+  // Skip whitespaces.
+  while (!(*Pos >= '0' && *Pos <= '9') && *Pos != 0)
+    Pos++;
+  // Read the number.
+  u64 Rss = 0;
+  for (; *Pos >= '0' && *Pos <= '9'; Pos++)
+    Rss = Rss * 10 + static_cast<u64>(*Pos) - '0';
+  return static_cast<uptr>(Rss * getPageSizeCached());
+}
+
+uptr GetRSS() {
+  // TODO: We currently use sanitizer_common's GetRSS which reads the
+  // RSS from /proc/self/statm by default. We might want to
+  // call getrusage directly, even if it's less accurate.
+  auto Fd = open("/proc/self/statm", O_RDONLY);
+  char Buf[64];
+  s64 Len = read(Fd, Buf, sizeof(Buf) - 1);
+  close(Fd);
+  if (Len <= 0)
+    return 0;
+  Buf[Len] = 0;
+
+  return GetRSSFromBuffer(Buf);
+}
+
 void outputRaw(const char *Buffer) {
   if (&async_safe_write_log) {
     constexpr s32 AndroidLogInfo = 4;
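GetRSSFromBuffer() pulls the second field of /proc/self/statm (resident pages) and scales it by the page size. A small worked example of that arithmetic, assuming a 4 KB page size (the real code uses getPageSizeCached()):

#include <cassert>
#include <cstdint>

int main() {
  // Second field of the sample line "1084 89 69 11 0 79 0" is resident pages.
  const uint64_t ResidentPages = 89;
  const uint64_t PageSize = 4096;             // assumption for this sketch
  assert(ResidentPages * PageSize == 364544); // RSS in bytes
  return 0;
}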
diff --git a/standalone/list.h b/standalone/list.h
index 1ac93c2..0137667 100644
--- a/standalone/list.h
+++ b/standalone/list.h
@@ -110,6 +110,18 @@
     Size--;
   }
 
+  // Insert X next to Prev
+  void insert(T *Prev, T *X) {
+    DCHECK(!empty());
+    DCHECK_NE(Prev, nullptr);
+    DCHECK_NE(X, nullptr);
+    X->Next = Prev->Next;
+    Prev->Next = X;
+    if (Last == Prev)
+      Last = X;
+    ++Size;
+  }
+
   void extract(T *Prev, T *X) {
     DCHECK(!empty());
     DCHECK_NE(Prev, nullptr);
diff --git a/standalone/local_cache.h b/standalone/local_cache.h
index f46645f..c97095d 100644
--- a/standalone/local_cache.h
+++ b/standalone/local_cache.h
@@ -10,8 +10,11 @@
 #define SCUDO_LOCAL_CACHE_H_
 
 #include "internal_defs.h"
+#include "list.h"
+#include "platform.h"
 #include "report.h"
 #include "stats.h"
+#include "string_utils.h"
 
 namespace scudo {
 
@@ -20,12 +23,18 @@
   typedef typename SizeClassAllocator::CompactPtrT CompactPtrT;
 
   struct TransferBatch {
-    static const u32 MaxNumCached = SizeClassMap::MaxNumCachedHint;
-    void setFromArray(CompactPtrT *Array, u32 N) {
+    static const u16 MaxNumCached = SizeClassMap::MaxNumCachedHint;
+    void setFromArray(CompactPtrT *Array, u16 N) {
       DCHECK_LE(N, MaxNumCached);
       Count = N;
       memcpy(Batch, Array, sizeof(Batch[0]) * Count);
     }
+    void appendFromArray(CompactPtrT *Array, u16 N) {
+      DCHECK_LE(N, MaxNumCached - Count);
+      memcpy(Batch + Count, Array, sizeof(Batch[0]) * N);
+      // u16 will be promoted to int by arithmetic type conversion.
+      Count = static_cast<u16>(Count + N);
+    }
     void clear() { Count = 0; }
     void add(CompactPtrT P) {
       DCHECK_LT(Count, MaxNumCached);
@@ -34,21 +43,43 @@
     void copyToArray(CompactPtrT *Array) const {
       memcpy(Array, Batch, sizeof(Batch[0]) * Count);
     }
-    u32 getCount() const { return Count; }
-    CompactPtrT get(u32 I) const {
+    u16 getCount() const { return Count; }
+    CompactPtrT get(u16 I) const {
       DCHECK_LE(I, Count);
       return Batch[I];
     }
-    static u32 getMaxCached(uptr Size) {
+    static u16 getMaxCached(uptr Size) {
       return Min(MaxNumCached, SizeClassMap::getMaxCachedHint(Size));
     }
     TransferBatch *Next;
 
   private:
-    u32 Count;
     CompactPtrT Batch[MaxNumCached];
+    u16 Count;
   };
 
+  // A BatchGroup is used to collect blocks. Each group has a group id to
+  // identify the address range that its blocks belong to.
+  struct BatchGroup {
+    // `Next` is used by IntrusiveList.
+    BatchGroup *Next;
+    // The compact base address of each group
+    uptr CompactPtrGroupBase;
+    // Cache value of TransferBatch::getMaxCached()
+    u16 MaxCachedPerBatch;
+    // Number of blocks pushed into this group. This is an increment-only
+    // counter.
+    uptr PushedBlocks;
+    // This is used to track how many bytes are not in-use since last time we
+    // tried to release pages.
+    uptr BytesInBGAtLastCheckpoint;
+    // Blocks are managed by TransferBatch in a list.
+    SinglyLinkedList<TransferBatch> Batches;
+  };
+
+  static_assert(sizeof(BatchGroup) <= sizeof(TransferBatch),
+                "BatchGroup uses the same class size as TransferBatch");
+
   void init(GlobalStats *S, SizeClassAllocator *A) {
     DCHECK(isEmpty());
     Stats.init();
@@ -120,17 +151,49 @@
   TransferBatch *createBatch(uptr ClassId, void *B) {
     if (ClassId != BatchClassId)
       B = allocate(BatchClassId);
+    if (UNLIKELY(!B))
+      reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
     return reinterpret_cast<TransferBatch *>(B);
   }
 
+  BatchGroup *createGroup() {
+    void *Ptr = allocate(BatchClassId);
+    if (UNLIKELY(!Ptr))
+      reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
+    return reinterpret_cast<BatchGroup *>(Ptr);
+  }
+
   LocalStats &getStats() { return Stats; }
 
+  void getStats(ScopedString *Str) {
+    bool EmptyCache = true;
+    for (uptr I = 0; I < NumClasses; ++I) {
+      if (PerClassArray[I].Count == 0)
+        continue;
+
+      EmptyCache = false;
+      // The size of BatchClass is set to 0 intentionally. See the comment in
+      // initCache() for more details.
+      const uptr ClassSize = I == BatchClassId
+                                 ? SizeClassAllocator::getSizeByClassId(I)
+                                 : PerClassArray[I].ClassSize;
+      // Note that the string utils don't support printing u16 thus we cast it
+      // to a common use type uptr.
+      Str->append("    %02zu (%6zu): cached: %4zu max: %4zu\n", I, ClassSize,
+                  static_cast<uptr>(PerClassArray[I].Count),
+                  static_cast<uptr>(PerClassArray[I].MaxCount));
+    }
+
+    if (EmptyCache)
+      Str->append("    No block is cached.\n");
+  }
+
 private:
   static const uptr NumClasses = SizeClassMap::NumClasses;
   static const uptr BatchClassId = SizeClassMap::BatchClassId;
-  struct PerClass {
-    u32 Count;
-    u32 MaxCount;
+  struct alignas(SCUDO_CACHE_LINE_SIZE) PerClass {
+    u16 Count;
+    u16 MaxCount;
     // Note: ClassSize is zero for the transfer batch.
     uptr ClassSize;
     CompactPtrT Chunks[2 * TransferBatch::MaxNumCached];
@@ -150,7 +213,7 @@
     for (uptr I = 0; I < NumClasses; I++) {
       PerClass *P = &PerClassArray[I];
       const uptr Size = SizeClassAllocator::getSizeByClassId(I);
-      P->MaxCount = 2 * TransferBatch::getMaxCached(Size);
+      P->MaxCount = static_cast<u16>(2 * TransferBatch::getMaxCached(Size));
       if (I != BatchClassId) {
         P->ClassSize = Size;
       } else {
@@ -180,16 +243,12 @@
   }
 
   NOINLINE void drain(PerClass *C, uptr ClassId) {
-    const u32 Count = Min(C->MaxCount / 2, C->Count);
-    TransferBatch *B =
-        createBatch(ClassId, Allocator->decompactPtr(ClassId, C->Chunks[0]));
-    if (UNLIKELY(!B))
-      reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
-    B->setFromArray(&C->Chunks[0], Count);
-    C->Count -= Count;
-    for (uptr I = 0; I < C->Count; I++)
+    const u16 Count = Min(static_cast<u16>(C->MaxCount / 2), C->Count);
+    Allocator->pushBlocks(this, ClassId, &C->Chunks[0], Count);
+    // u16 will be promoted to int by arithmetic type conversion.
+    C->Count = static_cast<u16>(C->Count - Count);
+    for (u16 I = 0; I < C->Count; I++)
       C->Chunks[I] = C->Chunks[I + Count];
-    Allocator->pushBatch(ClassId, B);
   }
 };
 
diff --git a/standalone/mem_map.cpp b/standalone/mem_map.cpp
new file mode 100644
index 0000000..115cc34
--- /dev/null
+++ b/standalone/mem_map.cpp
@@ -0,0 +1,84 @@
+//===-- mem_map.cpp ---------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mem_map.h"
+
+#include "common.h"
+
+namespace scudo {
+
+bool MemMapDefault::mapImpl(uptr Addr, uptr Size, const char *Name,
+                            uptr Flags) {
+  void *MappedAddr =
+      ::scudo::map(reinterpret_cast<void *>(Addr), Size, Name, Flags, &Data);
+  if (MappedAddr == nullptr)
+    return false;
+  Base = reinterpret_cast<uptr>(MappedAddr);
+  MappedBase = Base;
+  Capacity = Size;
+  return true;
+}
+
+void MemMapDefault::unmapImpl(uptr Addr, uptr Size) {
+  if (Size == Capacity) {
+    Base = MappedBase = Capacity = 0;
+  } else {
+    if (Base == Addr) {
+      Base = Addr + Size;
+      MappedBase = MappedBase == 0 ? Base : Max(MappedBase, Base);
+    }
+    Capacity -= Size;
+  }
+
+  ::scudo::unmap(reinterpret_cast<void *>(Addr), Size, UNMAP_ALL, &Data);
+}
+
+bool MemMapDefault::remapImpl(uptr Addr, uptr Size, const char *Name,
+                              uptr Flags) {
+  void *RemappedPtr =
+      ::scudo::map(reinterpret_cast<void *>(Addr), Size, Name, Flags, &Data);
+  const uptr RemappedAddr = reinterpret_cast<uptr>(RemappedPtr);
+  MappedBase = MappedBase == 0 ? RemappedAddr : Min(MappedBase, RemappedAddr);
+  return RemappedAddr == Addr;
+}
+
+void MemMapDefault::releaseAndZeroPagesToOSImpl(uptr From, uptr Size) {
+  DCHECK_NE(MappedBase, 0U);
+  DCHECK_GE(From, MappedBase);
+  return ::scudo::releasePagesToOS(MappedBase, From - MappedBase, Size, &Data);
+}
+
+void MemMapDefault::setMemoryPermissionImpl(uptr Addr, uptr Size, uptr Flags) {
+  return ::scudo::setMemoryPermission(Addr, Size, Flags);
+}
+
+void ReservedMemoryDefault::releaseImpl() {
+  ::scudo::unmap(reinterpret_cast<void *>(Base), Capacity, UNMAP_ALL, &Data);
+}
+
+bool ReservedMemoryDefault::createImpl(uptr Addr, uptr Size, const char *Name,
+                                       uptr Flags) {
+  void *Reserved = ::scudo::map(reinterpret_cast<void *>(Addr), Size, Name,
+                                Flags | MAP_NOACCESS, &Data);
+  if (Reserved == nullptr)
+    return false;
+
+  Base = reinterpret_cast<uptr>(Reserved);
+  Capacity = Size;
+
+  return true;
+}
+
+ReservedMemoryDefault::MemMapT ReservedMemoryDefault::dispatchImpl(uptr Addr,
+                                                                   uptr Size) {
+  ReservedMemoryDefault::MemMapT NewMap(Addr, Size);
+  NewMap.setMapPlatformData(Data);
+  return NewMap;
+}
+
+} // namespace scudo
diff --git a/standalone/mem_map.h b/standalone/mem_map.h
new file mode 100644
index 0000000..0b27fa8
--- /dev/null
+++ b/standalone/mem_map.h
@@ -0,0 +1,89 @@
+//===-- mem_map.h -----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_MEM_MAP_H_
+#define SCUDO_MEM_MAP_H_
+
+#include "mem_map_base.h"
+
+#include "common.h"
+#include "internal_defs.h"
+
+// TODO: This is only used for `MapPlatformData`. Remove these includes when we
+// have all three platform specific `MemMap` and `ReservedMemory`
+// implementations.
+#include "fuchsia.h"
+#include "linux.h"
+#include "trusty.h"
+
+namespace scudo {
+
+// This will be deprecated once every allocator is supported by each
+// platform's `MemMap` implementation.
+class MemMapDefault final : public MemMapBase<MemMapDefault> {
+public:
+  constexpr MemMapDefault() = default;
+  MemMapDefault(uptr Base, uptr Capacity) : Base(Base), Capacity(Capacity) {}
+
+  // Impls for base functions.
+  bool mapImpl(uptr Addr, uptr Size, const char *Name, uptr Flags);
+  void unmapImpl(uptr Addr, uptr Size);
+  bool remapImpl(uptr Addr, uptr Size, const char *Name, uptr Flags);
+  void setMemoryPermissionImpl(uptr Addr, uptr Size, uptr Flags);
+  void releasePagesToOSImpl(uptr From, uptr Size) {
+    return releaseAndZeroPagesToOSImpl(From, Size);
+  }
+  void releaseAndZeroPagesToOSImpl(uptr From, uptr Size);
+  uptr getBaseImpl() { return Base; }
+  uptr getCapacityImpl() { return Capacity; }
+
+  void setMapPlatformData(MapPlatformData &NewData) { Data = NewData; }
+
+private:
+  uptr Base = 0;
+  uptr Capacity = 0;
+  uptr MappedBase = 0;
+  MapPlatformData Data = {};
+};
+
+// This will be deprecated once every allocator is supported by each
+// platform's `MemMap` implementation.
+class ReservedMemoryDefault final
+    : public ReservedMemory<ReservedMemoryDefault, MemMapDefault> {
+public:
+  constexpr ReservedMemoryDefault() = default;
+
+  bool createImpl(uptr Addr, uptr Size, const char *Name, uptr Flags);
+  void releaseImpl();
+  MemMapT dispatchImpl(uptr Addr, uptr Size);
+  uptr getBaseImpl() { return Base; }
+  uptr getCapacityImpl() { return Capacity; }
+
+private:
+  uptr Base = 0;
+  uptr Capacity = 0;
+  MapPlatformData Data = {};
+};
+
+#if SCUDO_LINUX
+using ReservedMemoryT = ReservedMemoryDefault;
+using MemMapT = ReservedMemoryT::MemMapT;
+#elif SCUDO_FUCHSIA
+using ReservedMemoryT = ReservedMemoryDefault;
+using MemMapT = ReservedMemoryT::MemMapT;
+#elif SCUDO_TRUSTY
+using ReservedMemoryT = ReservedMemoryDefault;
+using MemMapT = ReservedMemoryT::MemMapT;
+#else
+#error                                                                         \
+    "Unsupported platform, please implement the ReservedMemory for your platform!"
+#endif
+
+} // namespace scudo
+
+#endif // SCUDO_MEM_MAP_H_
diff --git a/standalone/mem_map_base.h b/standalone/mem_map_base.h
new file mode 100644
index 0000000..0560f41
--- /dev/null
+++ b/standalone/mem_map_base.h
@@ -0,0 +1,130 @@
+//===-- mem_map_base.h ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_MEM_MAP_BASE_H_
+#define SCUDO_MEM_MAP_BASE_H_
+
+#include "common.h"
+
+namespace scudo {
+
+// In Scudo, every memory operation is fulfilled through a platform-specific
+// `MemMap` instance. The essential APIs are listed in `MemMapBase` below.
+// This uses CRTP, so each implementation has to provide all of the
+// 'Impl'-suffixed functions.
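+//
+// Purely illustrative sketch (not part of this change): a platform-specific
+// implementation is expected to look roughly like the following, where
+// `MemMapFoo` is a made-up name (see `MemMapDefault` in mem_map.h for the
+// current default implementation):
+//
+//   class MemMapFoo final : public MemMapBase<MemMapFoo> {
+//   public:
+//     bool mapImpl(uptr Addr, uptr Size, const char *Name, uptr Flags);
+//     void unmapImpl(uptr Addr, uptr Size);
+//     bool remapImpl(uptr Addr, uptr Size, const char *Name, uptr Flags);
+//     void setMemoryPermissionImpl(uptr Addr, uptr Size, uptr Flags);
+//     void releasePagesToOSImpl(uptr From, uptr Size);
+//     void releaseAndZeroPagesToOSImpl(uptr From, uptr Size);
+//     uptr getBaseImpl();
+//     uptr getCapacityImpl();
+//   };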
+template <class Derived> class MemMapBase {
+public:
+  constexpr MemMapBase() = default;
+
+  // This is used to map a new set of contiguous pages. Note that the `Addr` is
+  // only a suggestion to the system.
+  bool map(uptr Addr, uptr Size, const char *Name, uptr Flags = 0) {
+    DCHECK(!isAllocated());
+    return invokeImpl(&Derived::mapImpl, Addr, Size, Name, Flags);
+  }
+
+  // This is used to unmap partial/full pages from the beginning or the end.
+  // I.e., the remaining pages are expected to stay contiguous.
+  void unmap(uptr Addr, uptr Size) {
+    DCHECK(isAllocated());
+    DCHECK((Addr == getBase()) || (Addr + Size == getBase() + getCapacity()));
+    invokeImpl(&Derived::unmapImpl, Addr, Size);
+  }
+
+  // This is used to remap a mapped range (either from map() or dispatched from
+  // ReservedMemory). For example, we have reserved several pages and then we
+  // want to remap them with different accessibility.
+  bool remap(uptr Addr, uptr Size, const char *Name, uptr Flags = 0) {
+    DCHECK(isAllocated());
+    DCHECK((Addr >= getBase()) || (Addr + Size <= getBase() + getCapacity()));
+    return invokeImpl(&Derived::remapImpl, Addr, Size, Name, Flags);
+  }
+
+  // This is used to update the pages' access permissions. For example,
+  // marking pages as having no read/write permission.
+  void setMemoryPermission(uptr Addr, uptr Size, uptr Flags) {
+    DCHECK(isAllocated());
+    DCHECK((Addr >= getBase()) || (Addr + Size <= getBase() + getCapacity()));
+    return static_cast<Derived *>(this)->setMemoryPermissionImpl(Addr, Size,
+                                                                 Flags);
+  }
+
+  // Suggest releasing a set of contiguous physical pages back to the OS. Note
+  // that only physical pages are supposed to be released. Any release of
+  // virtual pages may lead to undefined behavior.
+  void releasePagesToOS(uptr From, uptr Size) {
+    DCHECK(isAllocated());
+    DCHECK((From >= getBase()) || (From + Size <= getBase() + getCapacity()));
+    invokeImpl(&Derived::releasePagesToOSImpl, From, Size);
+  }
+  // This is similar to the above one except that any subsequent access to the
+  // released pages will be satisfied with zero-filled pages.
+  void releaseAndZeroPagesToOS(uptr From, uptr Size) {
+    DCHECK(isAllocated());
+    DCHECK((From >= getBase()) || (From + Size <= getBase() + getCapacity()));
+    invokeImpl(&Derived::releaseAndZeroPagesToOSImpl, From, Size);
+  }
+
+  uptr getBase() { return invokeImpl(&Derived::getBaseImpl); }
+  uptr getCapacity() { return invokeImpl(&Derived::getCapacityImpl); }
+
+  bool isAllocated() { return getBase() != 0U; }
+
+protected:
+  template <typename R, typename... Args>
+  R invokeImpl(R (Derived::*MemFn)(Args...), Args... args) {
+    return (static_cast<Derived *>(this)->*MemFn)(args...);
+  }
+};
+
+// `ReservedMemory` is a special memory handle which can be viewed as a page
+// allocator. `ReservedMemory` reserves a contiguous range of pages and later
+// page requests can be fulfilled at designated addresses within the range.
+// This is used when we want to ensure the virtual addresses of a MemMap fall
+// in a known range. This also uses CRTP, so each implementation has to
+// provide all of the 'Impl'-suffixed functions.
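+//
+// A minimal usage sketch (purely illustrative; the sizes and the "example"
+// name below are made up):
+//
+//   ReservedMemoryT Reserved;  // Platform alias, see mem_map.h.
+//   if (!Reserved.create(/*Addr=*/0U, /*Size=*/1UL << 20, "example"))
+//     return;
+//   // Hand out a sub-range as a MemMap, then remap it with the desired
+//   // accessibility.
+//   ReservedMemoryT::MemMapT M =
+//       Reserved.dispatch(Reserved.getBase(), /*Size=*/1UL << 16);
+//   M.remap(M.getBase(), M.getCapacity(), "example");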
+template <class Derived, typename MemMapTy> class ReservedMemory {
+public:
+  using MemMapT = MemMapTy;
+  constexpr ReservedMemory() = default;
+
+  // Reserve a chunk of memory at a suggested address.
+  bool create(uptr Addr, uptr Size, const char *Name, uptr Flags = 0) {
+    DCHECK(!isCreated());
+    return invokeImpl(&Derived::createImpl, Addr, Size, Name, Flags);
+  }
+
+  // Release the entire reserved memory.
+  void release() {
+    DCHECK(isCreated());
+    invokeImpl(&Derived::releaseImpl);
+  }
+
+  // Dispatch a sub-range of reserved memory. Note that any fragmentation of
+  // the reserved pages is managed by each implementation.
+  MemMapT dispatch(uptr Addr, uptr Size) {
+    DCHECK(isCreated());
+    DCHECK((Addr >= getBase()) || (Addr + Size <= getBase() + getCapacity()));
+    return invokeImpl(&Derived::dispatchImpl, Addr, Size);
+  }
+
+  uptr getBase() { return invokeImpl(&Derived::getBaseImpl); }
+  uptr getCapacity() { return invokeImpl(&Derived::getCapacityImpl); }
+
+  bool isCreated() { return getBase() != 0U; }
+
+protected:
+  template <typename R, typename... Args>
+  R invokeImpl(R (Derived::*MemFn)(Args...), Args... args) {
+    return (static_cast<Derived *>(this)->*MemFn)(args...);
+  }
+};
+
+} // namespace scudo
+
+#endif // SCUDO_MEM_MAP_BASE_H_
diff --git a/standalone/memtag.h b/standalone/memtag.h
index 7578aff..7f14a30 100644
--- a/standalone/memtag.h
+++ b/standalone/memtag.h
@@ -18,7 +18,8 @@
 
 namespace scudo {
 
-#if (__clang_major__ >= 12 && defined(__aarch64__)) || defined(SCUDO_FUZZ)
+#if (__clang_major__ >= 12 && defined(__aarch64__) && !defined(__ILP32__)) ||  \
+    defined(SCUDO_FUZZ)
 
 // We assume that Top-Byte Ignore is enabled if the architecture supports memory
 // tagging. Not all operating systems enable TBI, so we only claim architectural
@@ -57,7 +58,7 @@
 
 #endif
 
-#if __clang_major__ >= 12 && defined(__aarch64__)
+#if __clang_major__ >= 12 && defined(__aarch64__) && !defined(__ILP32__)
 
 #if SCUDO_LINUX
 
diff --git a/standalone/mutex.h b/standalone/mutex.h
index c8504c0..05340de 100644
--- a/standalone/mutex.h
+++ b/standalone/mutex.h
@@ -11,6 +11,7 @@
 
 #include "atomic_helpers.h"
 #include "common.h"
+#include "thread_annotations.h"
 
 #include <string.h>
 
@@ -20,10 +21,10 @@
 
 namespace scudo {
 
-class HybridMutex {
+class CAPABILITY("mutex") HybridMutex {
 public:
-  bool tryLock();
-  NOINLINE void lock() {
+  bool tryLock() TRY_ACQUIRE(true);
+  NOINLINE void lock() ACQUIRE() {
     if (LIKELY(tryLock()))
       return;
       // The compiler may try to fully unroll the loop, ending up in a
@@ -40,9 +41,20 @@
     }
     lockSlow();
   }
-  void unlock();
+  void unlock() RELEASE();
+
+  // TODO(chiahungduan): In general, we may want to assert the owner of the
+  // lock as well. Given the current uses of HybridMutex, it's acceptable
+  // without asserting the owner. Re-evaluate this when we have scenarios which
+  // require a more fine-grained lock granularity.
+  ALWAYS_INLINE void assertHeld() ASSERT_CAPABILITY(this) {
+    if (SCUDO_DEBUG)
+      assertHeldImpl();
+  }
 
 private:
+  void assertHeldImpl();
+
   static constexpr u8 NumberOfTries = 8U;
   static constexpr u8 NumberOfYields = 8U;
 
@@ -52,13 +64,13 @@
   sync_mutex_t M = {};
 #endif
 
-  void lockSlow();
+  void lockSlow() ACQUIRE();
 };
 
-class ScopedLock {
+class SCOPED_CAPABILITY ScopedLock {
 public:
-  explicit ScopedLock(HybridMutex &M) : Mutex(M) { Mutex.lock(); }
-  ~ScopedLock() { Mutex.unlock(); }
+  explicit ScopedLock(HybridMutex &M) ACQUIRE(M) : Mutex(M) { Mutex.lock(); }
+  ~ScopedLock() RELEASE() { Mutex.unlock(); }
 
 private:
   HybridMutex &Mutex;
diff --git a/standalone/platform.h b/standalone/platform.h
index db4217d..aae3b9a 100644
--- a/standalone/platform.h
+++ b/standalone/platform.h
@@ -37,6 +37,12 @@
 #define SCUDO_TRUSTY 0
 #endif
 
+#if defined(__riscv) && (__riscv_xlen == 64)
+#define SCUDO_RISCV64 1
+#else
+#define SCUDO_RISCV64 0
+#endif
+
 #if defined(__LP64__)
 #define SCUDO_WORDSIZE 64U
 #else
diff --git a/standalone/primary32.h b/standalone/primary32.h
index 326c10a..b3d6e53 100644
--- a/standalone/primary32.h
+++ b/standalone/primary32.h
@@ -18,6 +18,7 @@
 #include "report.h"
 #include "stats.h"
 #include "string_utils.h"
+#include "thread_annotations.h"
 
 namespace scudo {
 
@@ -43,6 +44,7 @@
 public:
   typedef typename Config::PrimaryCompactPtrT CompactPtrT;
   typedef typename Config::SizeClassMap SizeClassMap;
+  static const uptr GroupSizeLog = Config::PrimaryGroupSizeLog;
   // The bytemap can only track UINT8_MAX - 1 classes.
   static_assert(SizeClassMap::LargestClassId <= (UINT8_MAX - 1), "");
   // Regions should be large enough to hold the largest Block.
@@ -51,6 +53,7 @@
   typedef SizeClassAllocator32<Config> ThisT;
   typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
   typedef typename CacheT::TransferBatch TransferBatch;
+  typedef typename CacheT::BatchGroup BatchGroup;
 
   static uptr getSizeByClassId(uptr ClassId) {
     return (ClassId == SizeClassMap::BatchClassId)
@@ -60,7 +63,7 @@
 
   static bool canAllocate(uptr Size) { return Size <= SizeClassMap::MaxSize; }
 
-  void init(s32 ReleaseToOsInterval) {
+  void init(s32 ReleaseToOsInterval) NO_THREAD_SAFETY_ANALYSIS {
     if (SCUDO_FUCHSIA)
       reportError("SizeClassAllocator32 is not supported on Fuchsia");
 
@@ -70,7 +73,7 @@
     DCHECK(isAligned(reinterpret_cast<uptr>(this), alignof(ThisT)));
     PossibleRegions.init();
     u32 Seed;
-    const u64 Time = getMonotonicTime();
+    const u64 Time = getMonotonicTimeFast();
     if (!getRandom(reinterpret_cast<void *>(&Seed), sizeof(Seed)))
       Seed = static_cast<u32>(
           Time ^ (reinterpret_cast<uptr>(SizeClassInfoArray) >> 6));
@@ -85,18 +88,26 @@
   }
 
   void unmapTestOnly() {
-    while (NumberOfStashedRegions > 0)
-      unmap(reinterpret_cast<void *>(RegionsStash[--NumberOfStashedRegions]),
-            RegionSize);
+    {
+      ScopedLock L(RegionsStashMutex);
+      while (NumberOfStashedRegions > 0) {
+        unmap(reinterpret_cast<void *>(RegionsStash[--NumberOfStashedRegions]),
+              RegionSize);
+      }
+    }
+
     uptr MinRegionIndex = NumRegions, MaxRegionIndex = 0;
     for (uptr I = 0; I < NumClasses; I++) {
       SizeClassInfo *Sci = getSizeClassInfo(I);
+      ScopedLock L(Sci->Mutex);
       if (Sci->MinRegionIndex < MinRegionIndex)
         MinRegionIndex = Sci->MinRegionIndex;
       if (Sci->MaxRegionIndex > MaxRegionIndex)
         MaxRegionIndex = Sci->MaxRegionIndex;
       *Sci = {};
     }
+
+    ScopedLock L(ByteMapMutex);
     for (uptr I = MinRegionIndex; I < MaxRegionIndex; I++)
       if (PossibleRegions[I])
         unmap(reinterpret_cast<void *>(I * RegionSize), RegionSize);
@@ -111,35 +122,80 @@
     return reinterpret_cast<void *>(static_cast<uptr>(CompactPtr));
   }
 
+  uptr compactPtrGroupBase(CompactPtrT CompactPtr) {
+    const uptr Mask = (static_cast<uptr>(1) << GroupSizeLog) - 1;
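+    // E.g., with GroupSizeLog == 20 (1 MiB groups), Mask == 0xFFFFF and a
+    // compact pointer of 0x12345678 belongs to group base 0x12300000. (The
+    // numbers are purely illustrative; GroupSizeLog is config-dependent.)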
+    return CompactPtr & ~Mask;
+  }
+
+  uptr decompactGroupBase(uptr CompactPtrGroupBase) {
+    return CompactPtrGroupBase;
+  }
+
   TransferBatch *popBatch(CacheT *C, uptr ClassId) {
     DCHECK_LT(ClassId, NumClasses);
     SizeClassInfo *Sci = getSizeClassInfo(ClassId);
     ScopedLock L(Sci->Mutex);
-    TransferBatch *B = Sci->FreeList.front();
-    if (B) {
-      Sci->FreeList.pop_front();
-    } else {
-      B = populateFreeList(C, ClassId, Sci);
-      if (UNLIKELY(!B))
+    TransferBatch *B = popBatchImpl(C, ClassId, Sci);
+    if (UNLIKELY(!B)) {
+      if (UNLIKELY(!populateFreeList(C, ClassId, Sci)))
         return nullptr;
+      B = popBatchImpl(C, ClassId, Sci);
+      // If `populateFreeList` succeeded, we are supposed to get free blocks.
+      DCHECK_NE(B, nullptr);
     }
-    DCHECK_GT(B->getCount(), 0);
     Sci->Stats.PoppedBlocks += B->getCount();
     return B;
   }
 
-  void pushBatch(uptr ClassId, TransferBatch *B) {
+  // Push the array of free blocks to the designated batch group.
+  void pushBlocks(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size) {
     DCHECK_LT(ClassId, NumClasses);
-    DCHECK_GT(B->getCount(), 0);
+    DCHECK_GT(Size, 0);
+
     SizeClassInfo *Sci = getSizeClassInfo(ClassId);
+    if (ClassId == SizeClassMap::BatchClassId) {
+      ScopedLock L(Sci->Mutex);
+      // Constructing a batch group in the free list will use two blocks in
+      // BatchClassId. If we are pushing BatchClassId blocks, we will use the
+      // blocks in the array directly (we can't delegate to the local cache,
+      // which would cause a recursive allocation). However, the number of free
+      // blocks may be less than two. Therefore, populate the free list before
+      // inserting the blocks.
+      if (Size == 1 && !populateFreeList(C, ClassId, Sci))
+        return;
+      pushBlocksImpl(C, ClassId, Sci, Array, Size);
+      Sci->Stats.PushedBlocks += Size;
+      return;
+    }
+
+    // TODO(chiahungduan): Consider not doing grouping if the group size is not
+    // larger than the block size by a certain factor.
+
+    // Sort the blocks so that blocks belonging to the same group can be pushed
+    // together.
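+    // This is an insertion sort keyed on the group base; `SameGroup` remains
+    // true only if every block in `Array` belongs to the same group.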
+    bool SameGroup = true;
+    for (u32 I = 1; I < Size; ++I) {
+      if (compactPtrGroupBase(Array[I - 1]) != compactPtrGroupBase(Array[I]))
+        SameGroup = false;
+      CompactPtrT Cur = Array[I];
+      u32 J = I;
+      while (J > 0 &&
+             compactPtrGroupBase(Cur) < compactPtrGroupBase(Array[J - 1])) {
+        Array[J] = Array[J - 1];
+        --J;
+      }
+      Array[J] = Cur;
+    }
+
     ScopedLock L(Sci->Mutex);
-    Sci->FreeList.push_front(B);
-    Sci->Stats.PushedBlocks += B->getCount();
+    pushBlocksImpl(C, ClassId, Sci, Array, Size, SameGroup);
+
+    Sci->Stats.PushedBlocks += Size;
     if (ClassId != SizeClassMap::BatchClassId)
       releaseToOSMaybe(Sci, ClassId);
   }
 
-  void disable() {
+  void disable() NO_THREAD_SAFETY_ANALYSIS {
     // The BatchClassId must be locked last since other classes can use it.
     for (sptr I = static_cast<sptr>(NumClasses) - 1; I >= 0; I--) {
       if (static_cast<uptr>(I) == SizeClassMap::BatchClassId)
@@ -148,11 +204,11 @@
     }
     getSizeClassInfo(SizeClassMap::BatchClassId)->Mutex.lock();
     RegionsStashMutex.lock();
-    PossibleRegions.disable();
+    ByteMapMutex.lock();
   }
 
-  void enable() {
-    PossibleRegions.enable();
+  void enable() NO_THREAD_SAFETY_ANALYSIS {
+    ByteMapMutex.unlock();
     RegionsStashMutex.unlock();
     getSizeClassInfo(SizeClassMap::BatchClassId)->Mutex.unlock();
     for (uptr I = 0; I < NumClasses; I++) {
@@ -166,12 +222,20 @@
     uptr MinRegionIndex = NumRegions, MaxRegionIndex = 0;
     for (uptr I = 0; I < NumClasses; I++) {
       SizeClassInfo *Sci = getSizeClassInfo(I);
+      // TODO: Calling `iterateOverBlocks` requires disabling
+      // SizeClassAllocator32. We may consider locking each region on demand
+      // only.
+      Sci->Mutex.assertHeld();
       if (Sci->MinRegionIndex < MinRegionIndex)
         MinRegionIndex = Sci->MinRegionIndex;
       if (Sci->MaxRegionIndex > MaxRegionIndex)
         MaxRegionIndex = Sci->MaxRegionIndex;
     }
-    for (uptr I = MinRegionIndex; I <= MaxRegionIndex; I++)
+
+    // SizeClassAllocator32 is disabled, i.e., ByteMapMutex is held.
+    ByteMapMutex.assertHeld();
+
+    for (uptr I = MinRegionIndex; I <= MaxRegionIndex; I++) {
       if (PossibleRegions[I] &&
           (PossibleRegions[I] - 1U) != SizeClassMap::BatchClassId) {
         const uptr BlockSize = getSizeByClassId(PossibleRegions[I] - 1U);
@@ -180,6 +244,7 @@
         for (uptr Block = From; Block < To; Block += BlockSize)
           Callback(Block);
       }
+    }
   }
 
   void getStats(ScopedString *Str) {
@@ -189,6 +254,7 @@
     uptr PushedBlocks = 0;
     for (uptr I = 0; I < NumClasses; I++) {
       SizeClassInfo *Sci = getSizeClassInfo(I);
+      ScopedLock L(Sci->Mutex);
       TotalMapped += Sci->AllocatedUser;
       PoppedBlocks += Sci->Stats.PoppedBlocks;
       PushedBlocks += Sci->Stats.PushedBlocks;
@@ -196,8 +262,11 @@
     Str->append("Stats: SizeClassAllocator32: %zuM mapped in %zu allocations; "
                 "remains %zu\n",
                 TotalMapped >> 20, PoppedBlocks, PoppedBlocks - PushedBlocks);
-    for (uptr I = 0; I < NumClasses; I++)
-      getStats(Str, I, 0);
+    for (uptr I = 0; I < NumClasses; I++) {
+      SizeClassInfo *Sci = getSizeClassInfo(I);
+      ScopedLock L(Sci->Mutex);
+      getStats(Str, I, Sci, 0);
+    }
   }
 
   bool setOption(Option O, sptr Value) {
@@ -212,14 +281,14 @@
     return true;
   }
 
-  uptr releaseToOS() {
+  uptr releaseToOS(ReleaseToOS ReleaseType) {
     uptr TotalReleasedBytes = 0;
     for (uptr I = 0; I < NumClasses; I++) {
       if (I == SizeClassMap::BatchClassId)
         continue;
       SizeClassInfo *Sci = getSizeClassInfo(I);
       ScopedLock L(Sci->Mutex);
-      TotalReleasedBytes += releaseToOSMaybe(Sci, I, /*Force=*/true);
+      TotalReleasedBytes += releaseToOSMaybe(Sci, I, ReleaseType);
     }
     return TotalReleasedBytes;
   }
@@ -248,7 +317,7 @@
   };
 
   struct ReleaseToOsInfo {
-    uptr PushedBlocksAtLastRelease;
+    uptr BytesInFreeListAtLastCheckpoint;
     uptr RangesReleased;
     uptr LastReleasedBytes;
     u64 LastReleaseAtNs;
@@ -256,17 +325,17 @@
 
   struct alignas(SCUDO_CACHE_LINE_SIZE) SizeClassInfo {
     HybridMutex Mutex;
-    SinglyLinkedList<TransferBatch> FreeList;
-    uptr CurrentRegion;
-    uptr CurrentRegionAllocated;
-    SizeClassStats Stats;
+    SinglyLinkedList<BatchGroup> FreeList GUARDED_BY(Mutex);
+    uptr CurrentRegion GUARDED_BY(Mutex);
+    uptr CurrentRegionAllocated GUARDED_BY(Mutex);
+    SizeClassStats Stats GUARDED_BY(Mutex);
     u32 RandState;
-    uptr AllocatedUser;
+    uptr AllocatedUser GUARDED_BY(Mutex);
     // Lowest & highest region index allocated for this size class, to avoid
     // looping through the whole NumRegions.
-    uptr MinRegionIndex;
-    uptr MaxRegionIndex;
-    ReleaseToOsInfo ReleaseInfo;
+    uptr MinRegionIndex GUARDED_BY(Mutex);
+    uptr MaxRegionIndex GUARDED_BY(Mutex);
+    ReleaseToOsInfo ReleaseInfo GUARDED_BY(Mutex);
   };
   static_assert(sizeof(SizeClassInfo) % SCUDO_CACHE_LINE_SIZE == 0, "");
 
@@ -291,17 +360,22 @@
       else
         MapSize = RegionSize;
     } else {
-      Region = roundUpTo(MapBase, RegionSize);
+      Region = roundUp(MapBase, RegionSize);
       unmap(reinterpret_cast<void *>(MapBase), Region - MapBase);
       MapSize = RegionSize;
     }
     const uptr End = Region + MapSize;
     if (End != MapEnd)
       unmap(reinterpret_cast<void *>(End), MapEnd - End);
+
+    DCHECK_EQ(Region % RegionSize, 0U);
+    static_assert(Config::PrimaryRegionSizeLog == GroupSizeLog,
+                  "Memory group should be the same size as Region");
+
     return Region;
   }
 
-  uptr allocateRegion(SizeClassInfo *Sci, uptr ClassId) {
+  uptr allocateRegion(SizeClassInfo *Sci, uptr ClassId) REQUIRES(Sci->Mutex) {
     DCHECK_LT(ClassId, NumClasses);
     uptr Region = 0;
     {
@@ -318,6 +392,7 @@
         Sci->MinRegionIndex = RegionIndex;
       if (RegionIndex > Sci->MaxRegionIndex)
         Sci->MaxRegionIndex = RegionIndex;
+      ScopedLock L(ByteMapMutex);
       PossibleRegions.set(RegionIndex, static_cast<u8>(ClassId + 1U));
     }
     return Region;
@@ -328,8 +403,231 @@
     return &SizeClassInfoArray[ClassId];
   }
 
-  NOINLINE TransferBatch *populateFreeList(CacheT *C, uptr ClassId,
-                                           SizeClassInfo *Sci) {
+  // Push the blocks to their batch group. The layout will be like,
+  //
+  // FreeList - > BG -> BG -> BG
+  //              |     |     |
+  //              v     v     v
+  //              TB    TB    TB
+  //              |
+  //              v
+  //              TB
+  //
+  // Each BatchGroup (BG) is associated with a unique group id and its free
+  // blocks are managed by a list of TransferBatches (TB). To reduce insertion
+  // time, BGs are kept sorted and the input `Array` is expected to be sorted
+  // as well, so that the sorted property can be maintained cheaply.
+  // Use `SameGroup=true` to indicate that all blocks in the array are from the
+  // same group; then we skip checking the group id of each block.
+  //
+  // The region mutex needs to be held while calling this method.
+  void pushBlocksImpl(CacheT *C, uptr ClassId, SizeClassInfo *Sci,
+                      CompactPtrT *Array, u32 Size, bool SameGroup = false)
+      REQUIRES(Sci->Mutex) {
+    DCHECK_GT(Size, 0U);
+
+    auto CreateGroup = [&](uptr CompactPtrGroupBase) {
+      BatchGroup *BG = nullptr;
+      TransferBatch *TB = nullptr;
+      if (ClassId == SizeClassMap::BatchClassId) {
+        DCHECK_GE(Size, 2U);
+
+        // Free blocks are recorded by TransferBatches in the freelist, and
+        // blocks of BatchClassId are included. In order not to use additional
+        // memory to record blocks of BatchClassId, they are self-contained,
+        // i.e., a TransferBatch may record its own block address. See the
+        // figure below:
+        //
+        // TransferBatch at 0xABCD
+        // +----------------------------+
+        // | Free blocks' addr          |
+        // | +------+------+------+     |
+        // | |0xABCD|...   |...   |     |
+        // | +------+------+------+     |
+        // +----------------------------+
+        //
+        // The safety of manipulating a TransferBatch is kept by the invariant,
+        //
+        //   The unit of each pop-block request is a whole TransferBatch.
+        //   Returning only part of the blocks in a TransferBatch is not
+        //   allowed.
+        //
+        // This ensures that a TransferBatch won't leak its own address while
+        // it's still holding other valid data.
+        //
+        // Besides, BatchGroup uses the same size-class as TransferBatch does
+        // and its address is recorded in a TransferBatch too. To maintain
+        // safety, the invariant to keep is,
+        //
+        //   The BatchGroup's own address is always recorded in the last
+        //   TransferBatch of the freelist (which also implies that the
+        //   freelist should only be updated with push_front). Once the last
+        //   TransferBatch is popped, the BatchGroup becomes invalid.
+        //
+        // As a result, the blocks used by BatchGroup and TransferBatch are
+        // reusable and don't need additional space for them.
+        BG = reinterpret_cast<BatchGroup *>(
+            decompactPtr(ClassId, Array[Size - 1]));
+        BG->Batches.clear();
+
+        TB = reinterpret_cast<TransferBatch *>(
+            decompactPtr(ClassId, Array[Size - 2]));
+        TB->clear();
+
+        // Append the blocks used by BatchGroup and TransferBatch immediately to
+        // ensure that they are in the last TransferBatch.
+        TB->appendFromArray(Array + Size - 2, 2);
+        Size -= 2;
+      } else {
+        BG = C->createGroup();
+        BG->Batches.clear();
+
+        TB = C->createBatch(ClassId, nullptr);
+        TB->clear();
+      }
+
+      BG->CompactPtrGroupBase = CompactPtrGroupBase;
+      // TODO(chiahungduan): Avoid the use of push_back() in `Batches`.
+      BG->Batches.push_front(TB);
+      BG->PushedBlocks = 0;
+      BG->BytesInBGAtLastCheckpoint = 0;
+      BG->MaxCachedPerBatch =
+          TransferBatch::getMaxCached(getSizeByClassId(ClassId));
+
+      return BG;
+    };
+
+    auto InsertBlocks = [&](BatchGroup *BG, CompactPtrT *Array, u32 Size) {
+      SinglyLinkedList<TransferBatch> &Batches = BG->Batches;
+      TransferBatch *CurBatch = Batches.front();
+      DCHECK_NE(CurBatch, nullptr);
+
+      for (u32 I = 0; I < Size;) {
+        DCHECK_GE(BG->MaxCachedPerBatch, CurBatch->getCount());
+        u16 UnusedSlots =
+            static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount());
+        if (UnusedSlots == 0) {
+          CurBatch = C->createBatch(
+              ClassId,
+              reinterpret_cast<void *>(decompactPtr(ClassId, Array[I])));
+          CurBatch->clear();
+          Batches.push_front(CurBatch);
+          UnusedSlots = BG->MaxCachedPerBatch;
+        }
+        // `UnusedSlots` is u16 so the result will also fit in u16.
+        u16 AppendSize = static_cast<u16>(Min<u32>(UnusedSlots, Size - I));
+        CurBatch->appendFromArray(&Array[I], AppendSize);
+        I += AppendSize;
+      }
+
+      BG->PushedBlocks += Size;
+    };
+
+    BatchGroup *Cur = Sci->FreeList.front();
+
+    if (ClassId == SizeClassMap::BatchClassId) {
+      if (Cur == nullptr) {
+        // Don't need to classify BatchClassId.
+        Cur = CreateGroup(/*CompactPtrGroupBase=*/0);
+        Sci->FreeList.push_front(Cur);
+      }
+      InsertBlocks(Cur, Array, Size);
+      return;
+    }
+
+    // In the following, `Cur` always points to the BatchGroup for blocks that
+    // will be pushed next. `Prev` is the element right before `Cur`.
+    BatchGroup *Prev = nullptr;
+
+    while (Cur != nullptr &&
+           compactPtrGroupBase(Array[0]) > Cur->CompactPtrGroupBase) {
+      Prev = Cur;
+      Cur = Cur->Next;
+    }
+
+    if (Cur == nullptr ||
+        compactPtrGroupBase(Array[0]) != Cur->CompactPtrGroupBase) {
+      Cur = CreateGroup(compactPtrGroupBase(Array[0]));
+      if (Prev == nullptr)
+        Sci->FreeList.push_front(Cur);
+      else
+        Sci->FreeList.insert(Prev, Cur);
+    }
+
+    // All the blocks are from the same group, just push without checking group
+    // id.
+    if (SameGroup) {
+      for (u32 I = 0; I < Size; ++I)
+        DCHECK_EQ(compactPtrGroupBase(Array[I]), Cur->CompactPtrGroupBase);
+
+      InsertBlocks(Cur, Array, Size);
+      return;
+    }
+
+    // The blocks are sorted by group id. Determine each segment of blocks that
+    // belong to the same group and push the segment to its group together.
+    u32 Count = 1;
+    for (u32 I = 1; I < Size; ++I) {
+      if (compactPtrGroupBase(Array[I - 1]) != compactPtrGroupBase(Array[I])) {
+        DCHECK_EQ(compactPtrGroupBase(Array[I - 1]), Cur->CompactPtrGroupBase);
+        InsertBlocks(Cur, Array + I - Count, Count);
+
+        while (Cur != nullptr &&
+               compactPtrGroupBase(Array[I]) > Cur->CompactPtrGroupBase) {
+          Prev = Cur;
+          Cur = Cur->Next;
+        }
+
+        if (Cur == nullptr ||
+            compactPtrGroupBase(Array[I]) != Cur->CompactPtrGroupBase) {
+          Cur = CreateGroup(compactPtrGroupBase(Array[I]));
+          DCHECK_NE(Prev, nullptr);
+          Sci->FreeList.insert(Prev, Cur);
+        }
+
+        Count = 1;
+      } else {
+        ++Count;
+      }
+    }
+
+    InsertBlocks(Cur, Array + Size - Count, Count);
+  }
+
+  // Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest
+  // group id will be considered first.
+  //
+  // The region mutex needs to be held while calling this method.
+  TransferBatch *popBatchImpl(CacheT *C, uptr ClassId, SizeClassInfo *Sci)
+      REQUIRES(Sci->Mutex) {
+    if (Sci->FreeList.empty())
+      return nullptr;
+
+    SinglyLinkedList<TransferBatch> &Batches = Sci->FreeList.front()->Batches;
+    DCHECK(!Batches.empty());
+
+    TransferBatch *B = Batches.front();
+    Batches.pop_front();
+    DCHECK_NE(B, nullptr);
+    DCHECK_GT(B->getCount(), 0U);
+
+    if (Batches.empty()) {
+      BatchGroup *BG = Sci->FreeList.front();
+      Sci->FreeList.pop_front();
+
+      // We don't keep a BatchGroup with zero blocks, to avoid empty-checking
+      // while allocating. Note that the block used to construct the BatchGroup
+      // is recorded as a free block in the last element of BatchGroup::Batches,
+      // which means that once we pop the last TransferBatch, the block is
+      // implicitly deallocated.
+      if (ClassId != SizeClassMap::BatchClassId)
+        C->deallocate(SizeClassMap::BatchClassId, BG);
+    }
+
+    return B;
+  }
+
+  NOINLINE bool populateFreeList(CacheT *C, uptr ClassId, SizeClassInfo *Sci)
+      REQUIRES(Sci->Mutex) {
     uptr Region;
     uptr Offset;
     // If the size-class currently has a region associated to it, use it. The
@@ -344,14 +642,14 @@
       DCHECK_EQ(Sci->CurrentRegionAllocated, 0U);
       Region = allocateRegion(Sci, ClassId);
       if (UNLIKELY(!Region))
-        return nullptr;
+        return false;
       C->getStats().add(StatMapped, RegionSize);
       Sci->CurrentRegion = Region;
       Offset = 0;
     }
 
     const uptr Size = getSizeByClassId(ClassId);
-    const u32 MaxCount = TransferBatch::getMaxCached(Size);
+    const u16 MaxCount = TransferBatch::getMaxCached(Size);
     DCHECK_GT(MaxCount, 0U);
     // The maximum number of blocks we should carve in the region is dictated
     // by the maximum number of batches we want to fill, and the amount of
@@ -374,23 +672,29 @@
     uptr P = Region + Offset;
     for (u32 I = 0; I < NumberOfBlocks; I++, P += Size)
       ShuffleArray[I] = reinterpret_cast<CompactPtrT>(P);
-    // No need to shuffle the batches size class.
-    if (ClassId != SizeClassMap::BatchClassId)
-      shuffle(ShuffleArray, NumberOfBlocks, &Sci->RandState);
-    for (u32 I = 0; I < NumberOfBlocks;) {
-      TransferBatch *B =
-          C->createBatch(ClassId, reinterpret_cast<void *>(ShuffleArray[I]));
-      if (UNLIKELY(!B))
-        return nullptr;
-      const u32 N = Min(MaxCount, NumberOfBlocks - I);
-      B->setFromArray(&ShuffleArray[I], N);
-      Sci->FreeList.push_back(B);
-      I += N;
+
+    if (ClassId != SizeClassMap::BatchClassId) {
+      u32 N = 1;
+      uptr CurGroup = compactPtrGroupBase(ShuffleArray[0]);
+      for (u32 I = 1; I < NumberOfBlocks; I++) {
+        if (UNLIKELY(compactPtrGroupBase(ShuffleArray[I]) != CurGroup)) {
+          shuffle(ShuffleArray + I - N, N, &Sci->RandState);
+          pushBlocksImpl(C, ClassId, Sci, ShuffleArray + I - N, N,
+                         /*SameGroup=*/true);
+          N = 1;
+          CurGroup = compactPtrGroupBase(ShuffleArray[I]);
+        } else {
+          ++N;
+        }
+      }
+
+      shuffle(ShuffleArray + NumberOfBlocks - N, N, &Sci->RandState);
+      pushBlocksImpl(C, ClassId, Sci, &ShuffleArray[NumberOfBlocks - N], N,
+                     /*SameGroup=*/true);
+    } else {
+      pushBlocksImpl(C, ClassId, Sci, ShuffleArray, NumberOfBlocks,
+                     /*SameGroup=*/true);
     }
-    TransferBatch *B = Sci->FreeList.front();
-    Sci->FreeList.pop_front();
-    DCHECK(B);
-    DCHECK_GT(B->getCount(), 0);
 
     const uptr AllocatedUser = Size * NumberOfBlocks;
     C->getStats().add(StatFree, AllocatedUser);
@@ -406,11 +710,11 @@
     }
     Sci->AllocatedUser += AllocatedUser;
 
-    return B;
+    return true;
   }
 
-  void getStats(ScopedString *Str, uptr ClassId, uptr Rss) {
-    SizeClassInfo *Sci = getSizeClassInfo(ClassId);
+  void getStats(ScopedString *Str, uptr ClassId, SizeClassInfo *Sci, uptr Rss)
+      REQUIRES(Sci->Mutex) {
     if (Sci->AllocatedUser == 0)
       return;
     const uptr InUse = Sci->Stats.PoppedBlocks - Sci->Stats.PushedBlocks;
@@ -423,7 +727,8 @@
   }
 
   NOINLINE uptr releaseToOSMaybe(SizeClassInfo *Sci, uptr ClassId,
-                                 bool Force = false) {
+                                 ReleaseToOS ReleaseType = ReleaseToOS::Normal)
+      REQUIRES(Sci->Mutex) {
     const uptr BlockSize = getSizeByClassId(ClassId);
     const uptr PageSize = getPageSizeCached();
 
@@ -431,33 +736,60 @@
     const uptr BytesInFreeList =
         Sci->AllocatedUser -
         (Sci->Stats.PoppedBlocks - Sci->Stats.PushedBlocks) * BlockSize;
-    if (BytesInFreeList < PageSize)
-      return 0; // No chance to release anything.
-    const uptr BytesPushed =
-        (Sci->Stats.PushedBlocks - Sci->ReleaseInfo.PushedBlocksAtLastRelease) *
-        BlockSize;
-    if (BytesPushed < PageSize)
-      return 0; // Nothing new to release.
 
+    if (UNLIKELY(BytesInFreeList == 0))
+      return 0;
+
+    bool MaySkip = false;
+
+    if (BytesInFreeList <= Sci->ReleaseInfo.BytesInFreeListAtLastCheckpoint) {
+      Sci->ReleaseInfo.BytesInFreeListAtLastCheckpoint = BytesInFreeList;
+      MaySkip = true;
+    }
+
+    // Always update `BytesInFreeListAtLastCheckpoint` with the smallest value
+    // so that we won't underestimate the releasable pages. For example, the
+    // following is the region usage,
+    //
+    //  BytesInFreeListAtLastCheckpoint   AllocatedUser
+    //                v                         v
+    //  |--------------------------------------->
+    //         ^                   ^
+    //  BytesInFreeList     ReleaseThreshold
+    //
+    // In general, if we have collected enough bytes and the amount of free
+    // bytes meets the ReleaseThreshold, we will try to do page release. If we
+    // don't update `BytesInFreeListAtLastCheckpoint` when the current
+    // `BytesInFreeList` is smaller, we may take a longer time to wait for
+    // enough freed blocks because we miss the bytes between
+    // (BytesInFreeListAtLastCheckpoint - BytesInFreeList).
+    const uptr PushedBytesDelta =
+        BytesInFreeList - Sci->ReleaseInfo.BytesInFreeListAtLastCheckpoint;
+    if (PushedBytesDelta < PageSize)
+      MaySkip = true;
+
+    const bool CheckDensity =
+        BlockSize < PageSize / 16U && ReleaseType != ReleaseToOS::ForceAll;
     // Releasing smaller blocks is expensive, so we want to make sure that a
     // significant amount of bytes are free, and that there has been a good
     // amount of batches pushed to the freelist before attempting to release.
-    if (BlockSize < PageSize / 16U) {
-      if (!Force && BytesPushed < Sci->AllocatedUser / 16U)
-        return 0;
-      // We want 8x% to 9x% free bytes (the larger the block, the lower the %).
-      if ((BytesInFreeList * 100U) / Sci->AllocatedUser <
-          (100U - 1U - BlockSize / 16U))
-        return 0;
+    if (CheckDensity) {
+      if (ReleaseType == ReleaseToOS::Normal &&
+          PushedBytesDelta < Sci->AllocatedUser / 16U) {
+        MaySkip = true;
+      }
     }
 
-    if (!Force) {
+    if (MaySkip && ReleaseType != ReleaseToOS::ForceAll)
+      return 0;
+
+    if (ReleaseType == ReleaseToOS::Normal) {
       const s32 IntervalMs = atomic_load_relaxed(&ReleaseToOsIntervalMs);
       if (IntervalMs < 0)
         return 0;
       if (Sci->ReleaseInfo.LastReleaseAtNs +
               static_cast<u64>(IntervalMs) * 1000000 >
-          getMonotonicTime()) {
+          getMonotonicTimeFast()) {
         return 0; // Memory was returned recently.
       }
     }
@@ -469,37 +801,115 @@
     uptr TotalReleasedBytes = 0;
     const uptr Base = First * RegionSize;
     const uptr NumberOfRegions = Last - First + 1U;
+    const uptr GroupSize = (1U << GroupSizeLog);
+    const uptr CurGroupBase =
+        compactPtrGroupBase(compactPtr(ClassId, Sci->CurrentRegion));
+
     ReleaseRecorder Recorder(Base);
-    auto SkipRegion = [this, First, ClassId](uptr RegionIndex) {
-      return (PossibleRegions[First + RegionIndex] - 1U) != ClassId;
-    };
+    PageReleaseContext Context(BlockSize, NumberOfRegions,
+                               /*ReleaseSize=*/RegionSize);
+
     auto DecompactPtr = [](CompactPtrT CompactPtr) {
       return reinterpret_cast<uptr>(CompactPtr);
     };
-    releaseFreeMemoryToOS(Sci->FreeList, RegionSize, NumberOfRegions, BlockSize,
-                          &Recorder, DecompactPtr, SkipRegion);
+    for (BatchGroup &BG : Sci->FreeList) {
+      const uptr GroupBase = decompactGroupBase(BG.CompactPtrGroupBase);
+      // The `GroupSize` may not be divisible by `BlockSize`, which means there
+      // may be unused space at the end of the region. Exclude that space to
+      // avoid an unused page map entry.
+      uptr AllocatedGroupSize = GroupBase == CurGroupBase
+                                    ? Sci->CurrentRegionAllocated
+                                    : roundDownSlow(GroupSize, BlockSize);
+      if (AllocatedGroupSize == 0)
+        continue;
+
+      // TransferBatches are pushed in front of BG.Batches. The first one may
+      // not be full.
+      const uptr NumBlocks = (BG.Batches.size() - 1) * BG.MaxCachedPerBatch +
+                             BG.Batches.front()->getCount();
+      const uptr BytesInBG = NumBlocks * BlockSize;
+
+      if (ReleaseType != ReleaseToOS::ForceAll &&
+          BytesInBG <= BG.BytesInBGAtLastCheckpoint) {
+        BG.BytesInBGAtLastCheckpoint = BytesInBG;
+        continue;
+      }
+      const uptr PushedBytesDelta = BytesInBG - BG.BytesInBGAtLastCheckpoint;
+      if (PushedBytesDelta < PageSize)
+        continue;
+
+      // Given the randomness property, we try to release the pages only if the
+      // bytes used by free blocks exceed a certain proportion of the allocated
+      // space.
+      if (CheckDensity && (BytesInBG * 100U) / AllocatedGroupSize <
+                              (100U - 1U - BlockSize / 16U)) {
+        continue;
+      }
+
+      // TODO: Consider updating this after page release if `ReleaseRecorder`
+      // can tell the released bytes in each group.
+      BG.BytesInBGAtLastCheckpoint = BytesInBG;
+
+      const uptr MaxContainedBlocks = AllocatedGroupSize / BlockSize;
+      const uptr RegionIndex = (GroupBase - Base) / RegionSize;
+
+      if (NumBlocks == MaxContainedBlocks) {
+        for (const auto &It : BG.Batches)
+          for (u16 I = 0; I < It.getCount(); ++I)
+            DCHECK_EQ(compactPtrGroupBase(It.get(I)), BG.CompactPtrGroupBase);
+
+        const uptr To = GroupBase + AllocatedGroupSize;
+        Context.markRangeAsAllCounted(GroupBase, To, GroupBase, RegionIndex,
+                                      AllocatedGroupSize);
+      } else {
+        DCHECK_LT(NumBlocks, MaxContainedBlocks);
+
+        // Note that we don't always visit the blocks in each BatchGroup, so we
+        // may miss the chance of releasing certain pages that cross
+        // BatchGroups.
+        Context.markFreeBlocksInRegion(BG.Batches, DecompactPtr, GroupBase,
+                                       RegionIndex, AllocatedGroupSize,
+                                       /*MayContainLastBlockInRegion=*/true);
+      }
+
+      // We may not be able to do the page release in the rare case that the
+      // PageMap allocation fails.
+      if (UNLIKELY(!Context.hasBlockMarked()))
+        return 0;
+    }
+
+    if (!Context.hasBlockMarked())
+      return 0;
+
+    auto SkipRegion = [this, First, ClassId](uptr RegionIndex) {
+      ScopedLock L(ByteMapMutex);
+      return (PossibleRegions[First + RegionIndex] - 1U) != ClassId;
+    };
+    releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
+
     if (Recorder.getReleasedRangesCount() > 0) {
-      Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks;
+      Sci->ReleaseInfo.BytesInFreeListAtLastCheckpoint = BytesInFreeList;
       Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
       Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes();
       TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes;
     }
-    Sci->ReleaseInfo.LastReleaseAtNs = getMonotonicTime();
+    Sci->ReleaseInfo.LastReleaseAtNs = getMonotonicTimeFast();
 
     return TotalReleasedBytes;
   }
 
   SizeClassInfo SizeClassInfoArray[NumClasses] = {};
 
+  HybridMutex ByteMapMutex;
   // Track the regions in use, 0 is unused, otherwise store ClassId + 1.
-  ByteMap PossibleRegions = {};
+  ByteMap PossibleRegions GUARDED_BY(ByteMapMutex) = {};
   atomic_s32 ReleaseToOsIntervalMs = {};
   // Unless several threads request regions simultaneously from different size
   // classes, the stash rarely contains more than 1 entry.
   static constexpr uptr MaxStashedRegions = 4;
   HybridMutex RegionsStashMutex;
-  uptr NumberOfStashedRegions = 0;
-  uptr RegionsStash[MaxStashedRegions] = {};
+  uptr NumberOfStashedRegions GUARDED_BY(RegionsStashMutex) = 0;
+  uptr RegionsStash[MaxStashedRegions] GUARDED_BY(RegionsStashMutex) = {};
 };
 
 } // namespace scudo
diff --git a/standalone/primary64.h b/standalone/primary64.h
index 14784ee..d3a1aea 100644
--- a/standalone/primary64.h
+++ b/standalone/primary64.h
@@ -13,11 +13,13 @@
 #include "common.h"
 #include "list.h"
 #include "local_cache.h"
+#include "mem_map.h"
 #include "memtag.h"
 #include "options.h"
 #include "release.h"
 #include "stats.h"
 #include "string_utils.h"
+#include "thread_annotations.h"
 
 namespace scudo {
 
@@ -45,84 +47,219 @@
 public:
   typedef typename Config::PrimaryCompactPtrT CompactPtrT;
   static const uptr CompactPtrScale = Config::PrimaryCompactPtrScale;
+  static const uptr GroupSizeLog = Config::PrimaryGroupSizeLog;
+  static const uptr GroupScale = GroupSizeLog - CompactPtrScale;
   typedef typename Config::SizeClassMap SizeClassMap;
   typedef SizeClassAllocator64<Config> ThisT;
   typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
   typedef typename CacheT::TransferBatch TransferBatch;
+  typedef typename CacheT::BatchGroup BatchGroup;
 
   static uptr getSizeByClassId(uptr ClassId) {
     return (ClassId == SizeClassMap::BatchClassId)
-               ? roundUpTo(sizeof(TransferBatch), 1U << CompactPtrScale)
+               ? roundUp(sizeof(TransferBatch), 1U << CompactPtrScale)
                : SizeClassMap::getSizeByClassId(ClassId);
   }
 
   static bool canAllocate(uptr Size) { return Size <= SizeClassMap::MaxSize; }
 
-  void init(s32 ReleaseToOsInterval) {
+  void init(s32 ReleaseToOsInterval) NO_THREAD_SAFETY_ANALYSIS {
     DCHECK(isAligned(reinterpret_cast<uptr>(this), alignof(ThisT)));
-    DCHECK_EQ(PrimaryBase, 0U);
+
+    const uptr PageSize = getPageSizeCached();
+    const uptr GroupSize = (1U << GroupSizeLog);
+    const uptr PagesInGroup = GroupSize / PageSize;
+    const uptr MinSizeClass = getSizeByClassId(1);
+    // When trying to release pages back to the OS, visiting smaller size
+    // classes is expensive. Therefore, we only try to release smaller size
+    // classes when the amount of free blocks goes over a certain threshold (see
+    // the comment in releaseToOSMaybe() for more details). For example, for
+    // size class 32, we only do the release when the size of free blocks is
+    // greater than 97% of pages in a group. However, this may introduce another
+    // issue: the number of free blocks may bounce between 97% ~ 100%, which
+    // means we may try many page releases but only release very few pages
+    // (less than 3% of a group). Even though we have `ReleaseToOsIntervalMs`,
+    // which slightly reduces the frequency of these calls, it is better to
+    // have another guard to mitigate this issue.
+    //
+    // Here we add another constraint on the minimum size requirement. The
+    // constraint is determined by the size of in-use blocks in the minimal size
+    // class. Take size class 32 as an example,
+    //
+    //   +-     one memory group      -+
+    //   +----------------------+------+
+    //   |  97% of free blocks  |      |
+    //   +----------------------+------+
+    //                           \    /
+    //                      3% in-use blocks
+    //
+    //   * The release size threshold is 97%.
+    //
+    // The 3% size in a group is about 7 pages. For two consecutive calls to
+    // releaseToOSMaybe(), we require the difference in `PushedBlocks` to be
+    // greater than 7 pages. This mitigates the page-release thrashing which is
+    // caused by memory usage bouncing around the threshold. The smallest size
+    // class takes the longest time to do a page release, so we use the size of
+    // its in-use blocks as a heuristic.
+    SmallerBlockReleasePageDelta =
+        PagesInGroup * (1 + MinSizeClass / 16U) / 100;
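+    // For example, with 4 KiB pages, a 1 MiB group (PagesInGroup = 256) and
+    // MinSizeClass = 32, the delta is 256 * (1 + 32 / 16) / 100 = 7 pages,
+    // matching the "about 7 pages" estimate above. (These numbers are only
+    // illustrative; the actual values depend on the configuration.)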
+
     // Reserve the space required for the Primary.
-    PrimaryBase = reinterpret_cast<uptr>(
-        map(nullptr, PrimarySize, nullptr, MAP_NOACCESS, &Data));
+    CHECK(ReservedMemory.create(/*Addr=*/0U, PrimarySize,
+                                "scudo:primary_reserve"));
+    PrimaryBase = ReservedMemory.getBase();
+    DCHECK_NE(PrimaryBase, 0U);
 
     u32 Seed;
-    const u64 Time = getMonotonicTime();
+    const u64 Time = getMonotonicTimeFast();
     if (!getRandom(reinterpret_cast<void *>(&Seed), sizeof(Seed)))
       Seed = static_cast<u32>(Time ^ (PrimaryBase >> 12));
-    const uptr PageSize = getPageSizeCached();
+
     for (uptr I = 0; I < NumClasses; I++) {
       RegionInfo *Region = getRegionInfo(I);
       // The actual start of a region is offset by a random number of pages
       // when PrimaryEnableRandomOffset is set.
-      Region->RegionBeg = getRegionBaseByClassId(I) +
+      Region->RegionBeg = (PrimaryBase + (I << Config::PrimaryRegionSizeLog)) +
                           (Config::PrimaryEnableRandomOffset
                                ? ((getRandomModN(&Seed, 16) + 1) * PageSize)
                                : 0);
       Region->RandState = getRandomU32(&Seed);
+      // Releasing small blocks is expensive, set a higher threshold to avoid
+      // frequent page releases.
+      if (isSmallBlock(getSizeByClassId(I)))
+        Region->TryReleaseThreshold = PageSize * SmallerBlockReleasePageDelta;
+      else
+        Region->TryReleaseThreshold = PageSize;
       Region->ReleaseInfo.LastReleaseAtNs = Time;
     }
+    shuffle(RegionInfoArray, NumClasses, &Seed);
+
     setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
   }
 
-  void unmapTestOnly() {
+  void unmapTestOnly() NO_THREAD_SAFETY_ANALYSIS {
     for (uptr I = 0; I < NumClasses; I++) {
       RegionInfo *Region = getRegionInfo(I);
       *Region = {};
     }
     if (PrimaryBase)
-      unmap(reinterpret_cast<void *>(PrimaryBase), PrimarySize, UNMAP_ALL,
-            &Data);
+      ReservedMemory.release();
     PrimaryBase = 0U;
   }
 
   TransferBatch *popBatch(CacheT *C, uptr ClassId) {
     DCHECK_LT(ClassId, NumClasses);
     RegionInfo *Region = getRegionInfo(ClassId);
-    ScopedLock L(Region->Mutex);
-    TransferBatch *B = Region->FreeList.front();
-    if (B) {
-      Region->FreeList.pop_front();
-    } else {
-      B = populateFreeList(C, ClassId, Region);
-      if (UNLIKELY(!B))
-        return nullptr;
+    bool PrintStats = false;
+    {
+      ScopedLock L(Region->Mutex);
+      TransferBatch *B = popBatchImpl(C, ClassId, Region);
+      if (LIKELY(B)) {
+        Region->Stats.PoppedBlocks += B->getCount();
+        return B;
+      }
+
+      const bool RegionIsExhausted = Region->Exhausted;
+      if (UNLIKELY(RegionIsExhausted ||
+                   !populateFreeList(C, ClassId, Region))) {
+        PrintStats = !RegionIsExhausted && Region->Exhausted;
+      } else {
+        B = popBatchImpl(C, ClassId, Region);
+        // If `populateFreeList` succeeded, we are supposed to get free blocks.
+        DCHECK_NE(B, nullptr);
+        Region->Stats.PoppedBlocks += B->getCount();
+        return B;
+      }
     }
-    DCHECK_GT(B->getCount(), 0);
-    Region->Stats.PoppedBlocks += B->getCount();
-    return B;
+
+    // Note that `getStats()` requires locking each region so we can't call it
+    // while holding the Region->Mutex above.
+    if (UNLIKELY(PrintStats)) {
+      ScopedString Str;
+      getStats(&Str);
+      Str.append(
+          "Scudo OOM: The process has exhausted %zuM for size class %zu.\n",
+          RegionSize >> 20, getSizeByClassId(ClassId));
+      Str.output();
+    }
+    return nullptr;
   }
 
-  void pushBatch(uptr ClassId, TransferBatch *B) {
-    DCHECK_GT(B->getCount(), 0);
+  // Push the array of free blocks to the designated batch group.
+  void pushBlocks(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size) {
+    DCHECK_LT(ClassId, NumClasses);
+    DCHECK_GT(Size, 0);
+
     RegionInfo *Region = getRegionInfo(ClassId);
+    if (ClassId == SizeClassMap::BatchClassId) {
+      bool PrintStats = false;
+      {
+        ScopedLock L(Region->Mutex);
+        // Constructing a batch group in the free list will use two blocks in
+        // BatchClassId. If we are pushing BatchClassId blocks, we will use the
+        // blocks in the array directly (we can't delegate to the local cache,
+        // which would cause a recursive allocation). However, the number of
+        // free blocks may be less than two. Therefore, populate the free list
+        // before inserting the blocks.
+        const bool NeedToRefill = Size == 1U && Region->FreeList.empty();
+        // If BatchClass has been exhausted, the program should have been
+        // aborted.
+        DCHECK(!Region->Exhausted);
+
+        if (UNLIKELY(
+                NeedToRefill &&
+                !populateFreeList(C, SizeClassMap::BatchClassId, Region))) {
+          PrintStats = true;
+        } else {
+          pushBlocksImpl(C, SizeClassMap::BatchClassId, Region, Array, Size);
+          Region->Stats.PushedBlocks += Size;
+        }
+      }
+
+      // Note that `getStats()` requires locking each region so we can't call
+      // it while holding the Region->Mutex above.
+      if (UNLIKELY(PrintStats)) {
+        ScopedString Str;
+        getStats(&Str);
+        Str.append(
+            "Scudo OOM: The process has exhausted %zuM for size class %zu.\n",
+            RegionSize >> 20, getSizeByClassId(ClassId));
+        Str.output();
+        // Theoretically, BatchClass shouldn't be used up. Abort immediately
+        // when it happens.
+        reportOutOfBatchClass();
+      }
+
+      return;
+    }
+
+    // TODO(chiahungduan): Consider not doing grouping if the group size is not
+    // larger than the block size by a certain factor.
+
+    // Sort the blocks so that blocks belonging to the same group can be pushed
+    // together.
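+    // This is an insertion sort keyed on the compacted group id; `SameGroup`
+    // remains true only if every block in `Array` belongs to the same group.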
+    bool SameGroup = true;
+    for (u32 I = 1; I < Size; ++I) {
+      if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I]))
+        SameGroup = false;
+      CompactPtrT Cur = Array[I];
+      u32 J = I;
+      while (J > 0 && compactPtrGroup(Cur) < compactPtrGroup(Array[J - 1])) {
+        Array[J] = Array[J - 1];
+        --J;
+      }
+      Array[J] = Cur;
+    }
+
     ScopedLock L(Region->Mutex);
-    Region->FreeList.push_front(B);
-    Region->Stats.PushedBlocks += B->getCount();
+    pushBlocksImpl(C, ClassId, Region, Array, Size, SameGroup);
+
+    Region->Stats.PushedBlocks += Size;
     if (ClassId != SizeClassMap::BatchClassId)
       releaseToOSMaybe(Region, ClassId);
   }
 
-  void disable() {
+  void disable() NO_THREAD_SAFETY_ANALYSIS {
     // The BatchClassId must be locked last since other classes can use it.
     for (sptr I = static_cast<sptr>(NumClasses) - 1; I >= 0; I--) {
       if (static_cast<uptr>(I) == SizeClassMap::BatchClassId)
@@ -132,7 +269,7 @@
     getRegionInfo(SizeClassMap::BatchClassId)->Mutex.lock();
   }
 
-  void enable() {
+  void enable() NO_THREAD_SAFETY_ANALYSIS {
     getRegionInfo(SizeClassMap::BatchClassId)->Mutex.unlock();
     for (uptr I = 0; I < NumClasses; I++) {
       if (I == SizeClassMap::BatchClassId)
@@ -145,7 +282,11 @@
     for (uptr I = 0; I < NumClasses; I++) {
       if (I == SizeClassMap::BatchClassId)
         continue;
-      const RegionInfo *Region = getRegionInfo(I);
+      RegionInfo *Region = getRegionInfo(I);
+      // TODO: Calling `iterateOverBlocks` requires disabling
+      // SizeClassAllocator64. We may consider locking each region on demand
+      // only.
+      Region->Mutex.assertHeld();
       const uptr BlockSize = getSizeByClassId(I);
       const uptr From = Region->RegionBeg;
       const uptr To = From + Region->AllocatedUser;
@@ -161,6 +302,7 @@
     uptr PushedBlocks = 0;
     for (uptr I = 0; I < NumClasses; I++) {
       RegionInfo *Region = getRegionInfo(I);
+      ScopedLock L(Region->Mutex);
       if (Region->MappedUser)
         TotalMapped += Region->MappedUser;
       PoppedBlocks += Region->Stats.PoppedBlocks;
@@ -171,8 +313,11 @@
                 TotalMapped >> 20, 0U, PoppedBlocks,
                 PoppedBlocks - PushedBlocks);
 
-    for (uptr I = 0; I < NumClasses; I++)
-      getStats(Str, I, 0);
+    for (uptr I = 0; I < NumClasses; I++) {
+      RegionInfo *Region = getRegionInfo(I);
+      ScopedLock L(Region->Mutex);
+      getStats(Str, I, Region, 0);
+    }
   }
 
   bool setOption(Option O, sptr Value) {
@@ -187,14 +332,14 @@
     return true;
   }
 
-  uptr releaseToOS() {
+  uptr releaseToOS(ReleaseToOS ReleaseType) {
     uptr TotalReleasedBytes = 0;
     for (uptr I = 0; I < NumClasses; I++) {
       if (I == SizeClassMap::BatchClassId)
         continue;
       RegionInfo *Region = getRegionInfo(I);
       ScopedLock L(Region->Mutex);
-      TotalReleasedBytes += releaseToOSMaybe(Region, I, /*Force=*/true);
+      TotalReleasedBytes += releaseToOSMaybe(Region, I, ReleaseType);
     }
     return TotalReleasedBytes;
   }
@@ -206,9 +351,6 @@
   static uptr getRegionInfoArraySize() { return sizeof(RegionInfoArray); }
 
   uptr getCompactPtrBaseByClassId(uptr ClassId) {
-    // If we are not compacting pointers, base everything off of 0.
-    if (sizeof(CompactPtrT) == sizeof(uptr) && CompactPtrScale == 0)
-      return 0;
     return getRegionInfo(ClassId)->RegionBeg;
   }
 
@@ -223,15 +365,23 @@
         decompactPtrInternal(getCompactPtrBaseByClassId(ClassId), CompactPtr));
   }
 
-  static BlockInfo findNearestBlock(const char *RegionInfoData, uptr Ptr) {
+  static BlockInfo findNearestBlock(const char *RegionInfoData,
+                                    uptr Ptr) NO_THREAD_SAFETY_ANALYSIS {
     const RegionInfo *RegionInfoArray =
         reinterpret_cast<const RegionInfo *>(RegionInfoData);
+
     uptr ClassId;
     uptr MinDistance = -1UL;
     for (uptr I = 0; I != NumClasses; ++I) {
       if (I == SizeClassMap::BatchClassId)
         continue;
       uptr Begin = RegionInfoArray[I].RegionBeg;
+      // TODO(chiahungduan): In fact, we need to lock the RegionInfo::Mutex.
+      // However, the RegionInfoData is passed with a const qualifier and
+      // locking the mutex requires modifying RegionInfoData, which means we
+      // need to remove the const qualifier. This may lead to another undefined
+      // behavior (the first one is accessing `AllocatedUser` without locking).
+      // It's better to pass `RegionInfoData` as `void *` so that we can lock
+      // the mutex properly.
       uptr End = Begin + RegionInfoArray[I].AllocatedUser;
       if (Begin > End || End - Begin < SizeClassMap::getSizeByClassId(I))
         continue;
@@ -284,7 +434,7 @@
   };
 
   struct ReleaseToOsInfo {
-    uptr PushedBlocksAtLastRelease;
+    uptr BytesInFreeListAtLastCheckpoint;
     uptr RangesReleased;
     uptr LastReleasedBytes;
     u64 LastReleaseAtNs;
@@ -292,15 +442,20 @@
 
   struct UnpaddedRegionInfo {
     HybridMutex Mutex;
-    SinglyLinkedList<TransferBatch> FreeList;
+    SinglyLinkedList<BatchGroup> FreeList GUARDED_BY(Mutex);
+    // This is initialized before thread creation.
     uptr RegionBeg = 0;
-    RegionStats Stats = {};
-    u32 RandState = 0;
-    uptr MappedUser = 0;    // Bytes mapped for user memory.
-    uptr AllocatedUser = 0; // Bytes allocated for user memory.
-    MapPlatformData Data = {};
-    ReleaseToOsInfo ReleaseInfo = {};
-    bool Exhausted = false;
+    RegionStats Stats GUARDED_BY(Mutex) = {};
+    u32 RandState GUARDED_BY(Mutex) = 0;
+    // Bytes mapped for user memory.
+    uptr MappedUser GUARDED_BY(Mutex) = 0;
+    // Bytes allocated for user memory.
+    uptr AllocatedUser GUARDED_BY(Mutex) = 0;
+    // The minimum size of pushed blocks to trigger page release.
+    uptr TryReleaseThreshold GUARDED_BY(Mutex) = 0;
+    MemMapT MemMap = {};
+    ReleaseToOsInfo ReleaseInfo GUARDED_BY(Mutex) = {};
+    bool Exhausted GUARDED_BY(Mutex) = false;
   };
   struct RegionInfo : UnpaddedRegionInfo {
     char Padding[SCUDO_CACHE_LINE_SIZE -
@@ -308,8 +463,13 @@
   };
   static_assert(sizeof(RegionInfo) % SCUDO_CACHE_LINE_SIZE == 0, "");
 
+  // TODO: `PrimaryBase` can be obtained from ReservedMemory. This needs to be
+  // deprecated.
   uptr PrimaryBase = 0;
-  MapPlatformData Data = {};
+  ReservedMemoryT ReservedMemory = {};
+  // The minimum size of pushed blocks required before we try to release the
+  // pages in that size class.
+  uptr SmallerBlockReleasePageDelta = 0;
   atomic_s32 ReleaseToOsIntervalMs = {};
   alignas(SCUDO_CACHE_LINE_SIZE) RegionInfo RegionInfoArray[NumClasses];
 
@@ -318,8 +478,10 @@
     return &RegionInfoArray[ClassId];
   }
 
-  uptr getRegionBaseByClassId(uptr ClassId) const {
-    return PrimaryBase + (ClassId << Config::PrimaryRegionSizeLog);
+  uptr getRegionBaseByClassId(uptr ClassId) {
+    return roundDown(getRegionInfo(ClassId)->RegionBeg - PrimaryBase,
+                     RegionSize) +
+           PrimaryBase;
   }
 
   static CompactPtrT compactPtrInternal(uptr Base, uptr Ptr) {
@@ -330,10 +492,248 @@
     return Base + (static_cast<uptr>(CompactPtr) << CompactPtrScale);
   }
 
-  NOINLINE TransferBatch *populateFreeList(CacheT *C, uptr ClassId,
-                                           RegionInfo *Region) {
+  static uptr compactPtrGroup(CompactPtrT CompactPtr) {
+    const uptr Mask = (static_cast<uptr>(1) << GroupScale) - 1;
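+    // Unlike SizeClassAllocator32, the group id here stays in the compacted
+    // (scaled) domain: GroupScale = GroupSizeLog - CompactPtrScale, so the
+    // masked value has to go through decompactGroupBase() to become an
+    // address.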
+    return static_cast<uptr>(CompactPtr) & ~Mask;
+  }
+  static uptr decompactGroupBase(uptr Base, uptr CompactPtrGroupBase) {
+    DCHECK_EQ(CompactPtrGroupBase % (static_cast<uptr>(1) << (GroupScale)), 0U);
+    return Base + (CompactPtrGroupBase << CompactPtrScale);
+  }
+
+  ALWAYS_INLINE static bool isSmallBlock(uptr BlockSize) {
+    const uptr PageSize = getPageSizeCached();
+    return BlockSize < PageSize / 16U;
+  }
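// A minimal standalone sketch of the grouping math used by compactPtrGroup()
// and decompactGroupBase() above, assuming a compact pointer of the form
// (Ptr - Base) >> CompactPtrScale and GroupScale == GroupSizeLog -
// CompactPtrScale. The constants and names below are illustrative only, not
// the scudo configuration.
#include <cassert>
#include <cstdint>

namespace sketch {
using uptr = uintptr_t;

constexpr uptr CompactPtrScale = 4; // e.g. blocks are 16-byte aligned
constexpr uptr GroupSizeLog = 20;   // e.g. 1 MiB groups
constexpr uptr GroupScale = GroupSizeLog - CompactPtrScale;

uptr compactPtr(uptr Base, uptr Ptr) { return (Ptr - Base) >> CompactPtrScale; }

// Clearing the low GroupScale bits maps every block in the same group to the
// same compact group base.
uptr compactPtrGroup(uptr CompactPtr) {
  const uptr Mask = (static_cast<uptr>(1) << GroupScale) - 1;
  return CompactPtr & ~Mask;
}

// Undo the compaction to get the group's base address inside the region.
uptr decompactGroupBase(uptr Base, uptr CompactGroupBase) {
  return Base + (CompactGroupBase << CompactPtrScale);
}
} // namespace sketch

int main() {
  using namespace sketch;
  const uptr Base = 0x700000000000;
  // Two blocks inside the same 1 MiB group share a compact group base.
  assert(compactPtrGroup(compactPtr(Base, Base + 0x10)) ==
         compactPtrGroup(compactPtr(Base, Base + 0xFFFF0)));
  // The decompacted group base is GroupSize-aligned relative to Base.
  const uptr G = decompactGroupBase(
      Base, compactPtrGroup(compactPtr(Base, Base + 0x123450)));
  assert((G - Base) % (static_cast<uptr>(1) << GroupSizeLog) == 0);
  return 0;
}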
+
+  // Push the blocks to their batch group. The layout is like:
+  //
+  // FreeList -> BG -> BG -> BG
+  //              |     |     |
+  //              v     v     v
+  //              TB    TB    TB
+  //              |
+  //              v
+  //              TB
+  //
+  // Each BatchGroup (BG) is associated with a unique group id and its free
+  // blocks are managed by a list of TransferBatches (TB). To reduce the cost of
+  // inserting blocks, BGs are kept sorted and the input `Array` is expected to
+  // be sorted as well, so that maintaining the sorted order stays cheap.
+  // Use `SameGroup=true` to indicate that all blocks in the array are from the
+  // same group; then we can skip checking the group id of each block.
+  //
+  // The region mutex needs to be held while calling this method.
+  void pushBlocksImpl(CacheT *C, uptr ClassId, RegionInfo *Region,
+                      CompactPtrT *Array, u32 Size, bool SameGroup = false)
+      REQUIRES(Region->Mutex) {
+    DCHECK_GT(Size, 0U);
+
+    auto CreateGroup = [&](uptr CompactPtrGroupBase) {
+      BatchGroup *BG = nullptr;
+      TransferBatch *TB = nullptr;
+      if (ClassId == SizeClassMap::BatchClassId) {
+        DCHECK_GE(Size, 2U);
+
+        // Free blocks are recorded by TransferBatches in the freelist,
+        // including the blocks of BatchClassId itself. In order not to use
+        // additional memory to record blocks of BatchClassId, they are
+        // self-contained, i.e., a TransferBatch may record its own block
+        // address. See the figure below:
+        //
+        // TransferBatch at 0xABCD
+        // +----------------------------+
+        // | Free blocks' addr          |
+        // | +------+------+------+     |
+        // | |0xABCD|...   |...   |     |
+        // | +------+------+------+     |
+        // +----------------------------+
+        //
+        // The safety of manipulating a TransferBatch is kept by the invariant:
+        //
+        //   The unit of each pop-block request is a TransferBatch. Returning
+        //   only part of the blocks in a TransferBatch is not allowed.
+        //
+        // This ensures that a TransferBatch won't leak its own address while
+        // it's still holding other valid data.
+        //
+        // Besides, BatchGroup uses the same size class as TransferBatch does
+        // and its address is recorded in a TransferBatch too. To maintain
+        // safety, the invariant to keep is:
+        //
+        //   The BatchGroup's own address is always recorded in the last
+        //   TransferBatch of the freelist (which also implies that the freelist
+        //   should only be updated with push_front). Once the last
+        //   TransferBatch is popped, the BatchGroup becomes invalid.
+        //
+        // As a result, the blocks used by BatchGroup and TransferBatch are
+        // reusable and no additional space is needed for them.
+        BG = reinterpret_cast<BatchGroup *>(
+            decompactPtr(ClassId, Array[Size - 1]));
+        BG->Batches.clear();
+
+        TB = reinterpret_cast<TransferBatch *>(
+            decompactPtr(ClassId, Array[Size - 2]));
+        TB->clear();
+
+        // Append the blocks used by BatchGroup and TransferBatch immediately so
+        // that we ensure they are in the last TransferBatch.
+        TB->appendFromArray(Array + Size - 2, 2);
+        Size -= 2;
+      } else {
+        BG = C->createGroup();
+        BG->Batches.clear();
+
+        TB = C->createBatch(ClassId, nullptr);
+        TB->clear();
+      }
+
+      BG->CompactPtrGroupBase = CompactPtrGroupBase;
+      // TODO(chiahungduan): Avoid the use of push_back() in `Batches`.
+      BG->Batches.push_front(TB);
+      BG->PushedBlocks = 0;
+      BG->BytesInBGAtLastCheckpoint = 0;
+      BG->MaxCachedPerBatch =
+          TransferBatch::getMaxCached(getSizeByClassId(ClassId));
+
+      return BG;
+    };
+
+    auto InsertBlocks = [&](BatchGroup *BG, CompactPtrT *Array, u32 Size) {
+      SinglyLinkedList<TransferBatch> &Batches = BG->Batches;
+      TransferBatch *CurBatch = Batches.front();
+      DCHECK_NE(CurBatch, nullptr);
+
+      for (u32 I = 0; I < Size;) {
+        DCHECK_GE(BG->MaxCachedPerBatch, CurBatch->getCount());
+        u16 UnusedSlots =
+            static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount());
+        if (UnusedSlots == 0) {
+          CurBatch = C->createBatch(
+              ClassId,
+              reinterpret_cast<void *>(decompactPtr(ClassId, Array[I])));
+          CurBatch->clear();
+          Batches.push_front(CurBatch);
+          UnusedSlots = BG->MaxCachedPerBatch;
+        }
+        // `UnusedSlots` is u16 so the result also fits in u16.
+        u16 AppendSize = static_cast<u16>(Min<u32>(UnusedSlots, Size - I));
+        CurBatch->appendFromArray(&Array[I], AppendSize);
+        I += AppendSize;
+      }
+
+      BG->PushedBlocks += Size;
+    };
+
+    BatchGroup *Cur = Region->FreeList.front();
+
+    if (ClassId == SizeClassMap::BatchClassId) {
+      if (Cur == nullptr) {
+        // Don't need to classify BatchClassId.
+        Cur = CreateGroup(/*CompactPtrGroupBase=*/0);
+        Region->FreeList.push_front(Cur);
+      }
+      InsertBlocks(Cur, Array, Size);
+      return;
+    }
+
+    // In the following, `Cur` always points to the BatchGroup for blocks that
+    // will be pushed next. `Prev` is the element right before `Cur`.
+    BatchGroup *Prev = nullptr;
+
+    while (Cur != nullptr &&
+           compactPtrGroup(Array[0]) > Cur->CompactPtrGroupBase) {
+      Prev = Cur;
+      Cur = Cur->Next;
+    }
+
+    if (Cur == nullptr ||
+        compactPtrGroup(Array[0]) != Cur->CompactPtrGroupBase) {
+      Cur = CreateGroup(compactPtrGroup(Array[0]));
+      if (Prev == nullptr)
+        Region->FreeList.push_front(Cur);
+      else
+        Region->FreeList.insert(Prev, Cur);
+    }
+
+    // All the blocks are from the same group, just push without checking group
+    // id.
+    if (SameGroup) {
+      for (u32 I = 0; I < Size; ++I)
+        DCHECK_EQ(compactPtrGroup(Array[I]), Cur->CompactPtrGroupBase);
+
+      InsertBlocks(Cur, Array, Size);
+      return;
+    }
+
+    // The blocks are sorted by group id. Determine each segment of blocks with
+    // the same group id and push them to their group together.
+    u32 Count = 1;
+    for (u32 I = 1; I < Size; ++I) {
+      if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I])) {
+        DCHECK_EQ(compactPtrGroup(Array[I - 1]), Cur->CompactPtrGroupBase);
+        InsertBlocks(Cur, Array + I - Count, Count);
+
+        while (Cur != nullptr &&
+               compactPtrGroup(Array[I]) > Cur->CompactPtrGroupBase) {
+          Prev = Cur;
+          Cur = Cur->Next;
+        }
+
+        if (Cur == nullptr ||
+            compactPtrGroup(Array[I]) != Cur->CompactPtrGroupBase) {
+          Cur = CreateGroup(compactPtrGroup(Array[I]));
+          DCHECK_NE(Prev, nullptr);
+          Region->FreeList.insert(Prev, Cur);
+        }
+
+        Count = 1;
+      } else {
+        ++Count;
+      }
+    }
+
+    InsertBlocks(Cur, Array + Size - Count, Count);
+  }
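// A minimal standalone sketch of the sorted-insert walk used by pushBlocksImpl()
// above: keep a singly linked list of groups sorted by key, track `Prev`/`Cur`,
// and either reuse an existing node or splice a new one in after `Prev`. Types
// and names here are illustrative, not the scudo ones.
#include <cassert>
#include <cstdint>

struct Group {
  uint64_t Key;        // plays the role of CompactPtrGroupBase
  unsigned Blocks = 0; // stands in for the Batches list
  Group *Next = nullptr;
};

// Returns the group for `Key`, creating and splicing in a new node (allocated
// by the caller-provided factory) if no group with that key exists yet.
template <typename FactoryT>
Group *findOrInsert(Group *&Head, uint64_t Key, FactoryT MakeGroup) {
  Group *Prev = nullptr;
  Group *Cur = Head;
  while (Cur != nullptr && Key > Cur->Key) {
    Prev = Cur;
    Cur = Cur->Next;
  }
  if (Cur != nullptr && Cur->Key == Key)
    return Cur;
  Group *G = MakeGroup(Key);
  G->Next = Cur;
  if (Prev == nullptr)
    Head = G; // the equivalent of FreeList.push_front()
  else
    Prev->Next = G; // the equivalent of FreeList.insert(Prev, G)
  return G;
}

int main() {
  Group *Head = nullptr;
  auto Make = [](uint64_t K) { return new Group{K}; };
  findOrInsert(Head, 30, Make)->Blocks += 1;
  findOrInsert(Head, 10, Make)->Blocks += 1;
  findOrInsert(Head, 20, Make)->Blocks += 1;
  findOrInsert(Head, 10, Make)->Blocks += 1; // reuses the existing group
  // The list stays sorted by key: 10 -> 20 -> 30.
  for (Group *G = Head; G && G->Next; G = G->Next)
    assert(G->Key < G->Next->Key);
  assert(Head->Blocks == 2);
  while (Head) { // cleanup
    Group *N = Head->Next;
    delete Head;
    Head = N;
  }
  return 0;
}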
+
+  // Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest
+  // group id will be considered first.
+  //
+  // The region mutex needs to be held while calling this method.
+  TransferBatch *popBatchImpl(CacheT *C, uptr ClassId, RegionInfo *Region)
+      REQUIRES(Region->Mutex) {
+    if (Region->FreeList.empty())
+      return nullptr;
+
+    SinglyLinkedList<TransferBatch> &Batches =
+        Region->FreeList.front()->Batches;
+    DCHECK(!Batches.empty());
+
+    TransferBatch *B = Batches.front();
+    Batches.pop_front();
+    DCHECK_NE(B, nullptr);
+    DCHECK_GT(B->getCount(), 0U);
+
+    if (Batches.empty()) {
+      BatchGroup *BG = Region->FreeList.front();
+      Region->FreeList.pop_front();
+
+      // We don't keep a BatchGroup with zero blocks to avoid empty-checking
+      // while allocating. Note that the block used for constructing the
+      // BatchGroup is recorded as a free block in the last element of
+      // BatchGroup::Batches, which means that once we pop the last
+      // TransferBatch, the block is implicitly deallocated.
+      if (ClassId != SizeClassMap::BatchClassId)
+        C->deallocate(SizeClassMap::BatchClassId, BG);
+    }
+
+    return B;
+  }
+
+  NOINLINE bool populateFreeList(CacheT *C, uptr ClassId, RegionInfo *Region)
+      REQUIRES(Region->Mutex) {
     const uptr Size = getSizeByClassId(ClassId);
-    const u32 MaxCount = TransferBatch::getMaxCached(Size);
+    const u16 MaxCount = TransferBatch::getMaxCached(Size);
 
     const uptr RegionBeg = Region->RegionBeg;
     const uptr MappedUser = Region->MappedUser;
@@ -342,29 +742,32 @@
     if (TotalUserBytes > MappedUser) {
       // Do the mmap for the user memory.
       const uptr MapSize =
-          roundUpTo(TotalUserBytes - MappedUser, MapSizeIncrement);
+          roundUp(TotalUserBytes - MappedUser, MapSizeIncrement);
       const uptr RegionBase = RegionBeg - getRegionBaseByClassId(ClassId);
       if (UNLIKELY(RegionBase + MappedUser + MapSize > RegionSize)) {
-        if (!Region->Exhausted) {
-          Region->Exhausted = true;
-          ScopedString Str;
-          getStats(&Str);
-          Str.append(
-              "Scudo OOM: The process has exhausted %zuM for size class %zu.\n",
-              RegionSize >> 20, Size);
-          Str.output();
-        }
-        return nullptr;
+        Region->Exhausted = true;
+        return false;
       }
-      if (MappedUser == 0)
-        Region->Data = Data;
-      if (UNLIKELY(!map(
-              reinterpret_cast<void *>(RegionBeg + MappedUser), MapSize,
-              "scudo:primary",
+      // TODO: Consider allocating MemMap in init().
+      if (!Region->MemMap.isAllocated()) {
+        // TODO: Ideally, a region should reserve RegionSize because the memory
+        // between `RegionBeg` and the region base still belongs to a region;
+        // it's just not used. In order to make it work on every platform (some
+        // of them don't support `remap()` across the unused range), dispatch
+        // from `RegionBeg` for now.
+        const uptr ReserveSize =
+            RegionSize - (RegionBeg - getRegionBaseByClassId(ClassId));
+        Region->MemMap = ReservedMemory.dispatch(RegionBeg, ReserveSize);
+      }
+      DCHECK(Region->MemMap.isAllocated());
+
+      if (UNLIKELY(!Region->MemMap.remap(
+              RegionBeg + MappedUser, MapSize, "scudo:primary",
               MAP_ALLOWNOMEM | MAP_RESIZABLE |
-                  (useMemoryTagging<Config>(Options.load()) ? MAP_MEMTAG : 0),
-              &Region->Data)))
-        return nullptr;
+                  (useMemoryTagging<Config>(Options.load()) ? MAP_MEMTAG
+                                                            : 0)))) {
+        return false;
+      }
       Region->MappedUser += MapSize;
       C->getStats().add(StatMapped, MapSize);
     }
@@ -383,34 +786,39 @@
     uptr P = RegionBeg + Region->AllocatedUser;
     for (u32 I = 0; I < NumberOfBlocks; I++, P += Size)
       ShuffleArray[I] = compactPtrInternal(CompactPtrBase, P);
-    // No need to shuffle the batches size class.
-    if (ClassId != SizeClassMap::BatchClassId)
-      shuffle(ShuffleArray, NumberOfBlocks, &Region->RandState);
-    for (u32 I = 0; I < NumberOfBlocks;) {
-      TransferBatch *B =
-          C->createBatch(ClassId, reinterpret_cast<void *>(decompactPtrInternal(
-                                      CompactPtrBase, ShuffleArray[I])));
-      if (UNLIKELY(!B))
-        return nullptr;
-      const u32 N = Min(MaxCount, NumberOfBlocks - I);
-      B->setFromArray(&ShuffleArray[I], N);
-      Region->FreeList.push_back(B);
-      I += N;
+
+    if (ClassId != SizeClassMap::BatchClassId) {
+      u32 N = 1;
+      uptr CurGroup = compactPtrGroup(ShuffleArray[0]);
+      for (u32 I = 1; I < NumberOfBlocks; I++) {
+        if (UNLIKELY(compactPtrGroup(ShuffleArray[I]) != CurGroup)) {
+          shuffle(ShuffleArray + I - N, N, &Region->RandState);
+          pushBlocksImpl(C, ClassId, Region, ShuffleArray + I - N, N,
+                         /*SameGroup=*/true);
+          N = 1;
+          CurGroup = compactPtrGroup(ShuffleArray[I]);
+        } else {
+          ++N;
+        }
+      }
+
+      shuffle(ShuffleArray + NumberOfBlocks - N, N, &Region->RandState);
+      pushBlocksImpl(C, ClassId, Region, &ShuffleArray[NumberOfBlocks - N], N,
+                     /*SameGroup=*/true);
+    } else {
+      pushBlocksImpl(C, ClassId, Region, ShuffleArray, NumberOfBlocks,
+                     /*SameGroup=*/true);
     }
-    TransferBatch *B = Region->FreeList.front();
-    Region->FreeList.pop_front();
-    DCHECK(B);
-    DCHECK_GT(B->getCount(), 0);
 
     const uptr AllocatedUser = Size * NumberOfBlocks;
     C->getStats().add(StatFree, AllocatedUser);
     Region->AllocatedUser += AllocatedUser;
 
-    return B;
+    return true;
   }
 
-  void getStats(ScopedString *Str, uptr ClassId, uptr Rss) {
-    RegionInfo *Region = getRegionInfo(ClassId);
+  void getStats(ScopedString *Str, uptr ClassId, RegionInfo *Region, uptr Rss)
+      REQUIRES(Region->Mutex) {
     if (Region->MappedUser == 0)
       return;
     const uptr InUse = Region->Stats.PoppedBlocks - Region->Stats.PushedBlocks;
@@ -427,7 +835,8 @@
   }
 
   NOINLINE uptr releaseToOSMaybe(RegionInfo *Region, uptr ClassId,
-                                 bool Force = false) {
+                                 ReleaseToOS ReleaseType = ReleaseToOS::Normal)
+      REQUIRES(Region->Mutex) {
     const uptr BlockSize = getSizeByClassId(ClassId);
     const uptr PageSize = getPageSizeCached();
 
@@ -435,53 +844,373 @@
     const uptr BytesInFreeList =
         Region->AllocatedUser -
         (Region->Stats.PoppedBlocks - Region->Stats.PushedBlocks) * BlockSize;
-    if (BytesInFreeList < PageSize)
-      return 0; // No chance to release anything.
-    const uptr BytesPushed = (Region->Stats.PushedBlocks -
-                              Region->ReleaseInfo.PushedBlocksAtLastRelease) *
-                             BlockSize;
-    if (BytesPushed < PageSize)
-      return 0; // Nothing new to release.
 
+    if (UNLIKELY(BytesInFreeList == 0))
+      return 0;
+
+    bool MaySkip = false;
+
+    // Always update `BytesInFreeListAtLastCheckpoint` with the smallest value
+    // so that we won't underestimate the releasable pages. For example, the
+    // following is the region usage,
+    //
+    //  BytesInFreeListAtLastCheckpoint   AllocatedUser
+    //                v                         v
+    //  |--------------------------------------->
+    //         ^                   ^
+    //  BytesInFreeList     ReleaseThreshold
+    //
+    // In general, if we have collected enough bytes and the amount of free
+    // bytes meets the ReleaseThreshold, we will try to do page release. If we
+    // don't update `BytesInFreeListAtLastCheckpoint` when the current
+    // `BytesInFreeList` is smaller, we may take a longer time to wait for
+    // enough freed blocks because we miss the byte delta
+    // (BytesInFreeListAtLastCheckpoint - BytesInFreeList).
+    if (BytesInFreeList <=
+        Region->ReleaseInfo.BytesInFreeListAtLastCheckpoint) {
+      Region->ReleaseInfo.BytesInFreeListAtLastCheckpoint = BytesInFreeList;
+      MaySkip = true;
+    }
+
+    const uptr RegionPushedBytesDelta =
+        BytesInFreeList - Region->ReleaseInfo.BytesInFreeListAtLastCheckpoint;
+    if (RegionPushedBytesDelta < PageSize)
+      MaySkip = true;
+
+    const bool CheckDensity = isSmallBlock(BlockSize);
     // Releasing smaller blocks is expensive, so we want to make sure that a
     // significant amount of bytes are free, and that there has been a good
     // amount of batches pushed to the freelist before attempting to release.
-    if (BlockSize < PageSize / 16U) {
-      if (!Force && BytesPushed < Region->AllocatedUser / 16U)
-        return 0;
-      // We want 8x% to 9x% free bytes (the larger the block, the lower the %).
-      if ((BytesInFreeList * 100U) / Region->AllocatedUser <
-          (100U - 1U - BlockSize / 16U))
-        return 0;
+    if (CheckDensity) {
+      if (ReleaseType == ReleaseToOS::Normal &&
+          RegionPushedBytesDelta < Region->TryReleaseThreshold) {
+        MaySkip = true;
+      }
     }
 
-    if (!Force) {
+    if (MaySkip && ReleaseType != ReleaseToOS::ForceAll)
+      return 0;
+
+    if (ReleaseType == ReleaseToOS::Normal) {
       const s32 IntervalMs = atomic_load_relaxed(&ReleaseToOsIntervalMs);
       if (IntervalMs < 0)
         return 0;
       if (Region->ReleaseInfo.LastReleaseAtNs +
               static_cast<u64>(IntervalMs) * 1000000 >
-          getMonotonicTime()) {
+          getMonotonicTimeFast()) {
         return 0; // Memory was returned recently.
       }
     }
 
-    ReleaseRecorder Recorder(Region->RegionBeg, &Region->Data);
+    const uptr GroupSize = (1U << GroupSizeLog);
+    const uptr AllocatedUserEnd = Region->AllocatedUser + Region->RegionBeg;
     const uptr CompactPtrBase = getCompactPtrBaseByClassId(ClassId);
     auto DecompactPtr = [CompactPtrBase](CompactPtrT CompactPtr) {
       return decompactPtrInternal(CompactPtrBase, CompactPtr);
     };
+
+    // Instead of always preparing the PageMap for the entire region, we only do
+    // it for the range of releasing groups. To do that, the free-block marking
+    // process includes visiting BatchGroups twice.
+
+    // The first visit is to determine the range of BatchGroups we are going to
+    // release, and we will extract those BatchGroups and push them into
+    // `GroupToRelease`.
+    SinglyLinkedList<BatchGroup> GroupToRelease;
+    GroupToRelease.clear();
+
+    // This is only used for debugging to ensure the consistency of the number
+    // of groups.
+    uptr NumberOfBatchGroups = Region->FreeList.size();
+
+    // We are examining each group and will take the minimum distance to the
+    // release threshold as the next Region::TryReleaseThreshold. Note that if
+    // the size of free blocks has reached the release threshold, the distance
+    // to the next release will be PageSize * SmallerBlockReleasePageDelta. See
+    // the comment on `SmallerBlockReleasePageDelta` for more details.
+    uptr MinDistToThreshold = GroupSize;
+
+    for (BatchGroup *BG = Region->FreeList.front(), *Prev = nullptr;
+         BG != nullptr;) {
+      // Group boundary is always GroupSize-aligned from CompactPtr base. The
+      // layout of memory groups is like,
+      //
+      //     (CompactPtrBase)
+      // #1 CompactPtrGroupBase   #2 CompactPtrGroupBase            ...
+      //           |                       |                       |
+      //           v                       v                       v
+      //           +-----------------------+-----------------------+
+      //            \                     / \                     /
+      //             ---   GroupSize   ---   ---   GroupSize   ---
+      //
+      // After decompacting the CompactPtrGroupBase, we expect the alignment
+      // property to hold as well.
+      const uptr BatchGroupBase =
+          decompactGroupBase(CompactPtrBase, BG->CompactPtrGroupBase);
+      DCHECK_LE(Region->RegionBeg, BatchGroupBase);
+      DCHECK_GE(AllocatedUserEnd, BatchGroupBase);
+      DCHECK_EQ((Region->RegionBeg - BatchGroupBase) % GroupSize, 0U);
+      const uptr BatchGroupEnd = BatchGroupBase + GroupSize;
+      const uptr AllocatedGroupSize = AllocatedUserEnd >= BatchGroupEnd
+                                          ? GroupSize
+                                          : AllocatedUserEnd - BatchGroupBase;
+      if (AllocatedGroupSize == 0) {
+        Prev = BG;
+        BG = BG->Next;
+        continue;
+      }
+
+      // TransferBatches are pushed to the front of BG.Batches. The first one
+      // may not have all of its cached slots used.
+      const uptr NumBlocks = (BG->Batches.size() - 1) * BG->MaxCachedPerBatch +
+                             BG->Batches.front()->getCount();
+      const uptr BytesInBG = NumBlocks * BlockSize;
+
+      if (ReleaseType != ReleaseToOS::ForceAll &&
+          BytesInBG <= BG->BytesInBGAtLastCheckpoint) {
+        BG->BytesInBGAtLastCheckpoint = BytesInBG;
+        Prev = BG;
+        BG = BG->Next;
+        continue;
+      }
+
+      const uptr PushedBytesDelta = BytesInBG - BG->BytesInBGAtLastCheckpoint;
+
+      // Given the randomness property, we try to release the pages only if the
+      // bytes used by free blocks exceed a certain proportion of the group
+      // size. Note that this heuristic only applies when all the spaces in a
+      // BatchGroup are allocated.
+      if (CheckDensity) {
+        const uptr ReleaseThreshold =
+            (AllocatedGroupSize * (100 - 1U - BlockSize / 16U)) / 100U;
+        const bool HighDensity = BytesInBG >= ReleaseThreshold;
+        const bool MayHaveReleasedAll = NumBlocks >= (GroupSize / BlockSize);
+        // If all blocks in the group are released, we will do range marking
+        // which is fast. Otherwise, we will wait until we have accumulated
+        // a certain amount of free memory.
+        const bool ReachReleaseDelta =
+            MayHaveReleasedAll
+                ? true
+                : PushedBytesDelta >= PageSize * SmallerBlockReleasePageDelta;
+
+        if (!HighDensity) {
+          DCHECK_LE(BytesInBG, ReleaseThreshold);
+          // The following shows the usage of a memory group:
+          //
+          //     BytesInBG             ReleaseThreshold
+          //  /             \                 v
+          //  +---+---------------------------+-----+
+          //  |   |         |                 |     |
+          //  +---+---------------------------+-----+
+          //       \        /                       ^
+          //    PushedBytesDelta                 GroupEnd
+          MinDistToThreshold =
+              Min(MinDistToThreshold,
+                  ReleaseThreshold - BytesInBG + PushedBytesDelta);
+        } else {
+          // If it reaches high density at this round, the next attempt to
+          // release will be based on SmallerBlockReleasePageDelta.
+          MinDistToThreshold =
+              Min(MinDistToThreshold, PageSize * SmallerBlockReleasePageDelta);
+        }
+
+        if (!HighDensity || !ReachReleaseDelta) {
+          Prev = BG;
+          BG = BG->Next;
+          continue;
+        }
+      }
+
+      // If `BG` is the first BatchGroup in the list, we only need to advance
+      // `BG` and call FreeList::pop_front(). No update is needed for `Prev`.
+      //
+      //         (BG)   (BG->Next)
+      // Prev     Cur      BG
+      //   |       |       |
+      //   v       v       v
+      //  nil     +--+    +--+
+      //          |X | -> |  | -> ...
+      //          +--+    +--+
+      //
+      // Otherwise, `Prev` will be used to extract the `Cur` from the
+      // `FreeList`.
+      //
+      //         (BG)   (BG->Next)
+      // Prev     Cur      BG
+      //   |       |       |
+      //   v       v       v
+      //  +--+    +--+    +--+
+      //  |  | -> |X | -> |  | -> ...
+      //  +--+    +--+    +--+
+      //
+      // After FreeList::extract(),
+      //
+      // Prev     Cur       BG
+      //   |       |        |
+      //   v       v        v
+      //  +--+    +--+     +--+
+      //  |  |-+  |X |  +->|  | -> ...
+      //  +--+ |  +--+  |  +--+
+      //       +--------+
+      //
+      // Note that we need to advance before pushing this BatchGroup to
+      // GroupToRelease because it's a destructive operation.
+
+      BatchGroup *Cur = BG;
+      BG = BG->Next;
+
+      // Ideally, we may want to update this only after successful release.
+      // However, for smaller blocks, each block marking is a costly operation.
+      // Therefore, we update it earlier.
+      // TODO: Consider updating this after page release if `ReleaseRecorder`
+      // can tell the released bytes in each group.
+      Cur->BytesInBGAtLastCheckpoint = BytesInBG;
+
+      if (Prev != nullptr)
+        Region->FreeList.extract(Prev, Cur);
+      else
+        Region->FreeList.pop_front();
+      GroupToRelease.push_back(Cur);
+    }
+
+    // Only small blocks have the adaptive `TryReleaseThreshold`.
+    if (isSmallBlock(BlockSize)) {
+      // If MinDistToThreshold is not updated, that means each memory group
+      // may have pushed less than a page's worth of bytes. In that case, just
+      // set it back to normal.
+      if (MinDistToThreshold == GroupSize)
+        MinDistToThreshold = PageSize * SmallerBlockReleasePageDelta;
+      Region->TryReleaseThreshold = MinDistToThreshold;
+    }
+
+    if (GroupToRelease.empty())
+      return 0;
+
+    const uptr ReleaseBase = decompactGroupBase(
+        CompactPtrBase, GroupToRelease.front()->CompactPtrGroupBase);
+    const uptr LastGroupEnd =
+        Min(decompactGroupBase(CompactPtrBase,
+                               GroupToRelease.back()->CompactPtrGroupBase) +
+                GroupSize,
+            AllocatedUserEnd);
+    // The last block may straddle the group boundary. Round up to BlockSize to
+    // get the exact range.
+    const uptr ReleaseEnd =
+        roundUpSlow(LastGroupEnd - Region->RegionBeg, BlockSize) +
+        Region->RegionBeg;
+    const uptr ReleaseRangeSize = ReleaseEnd - ReleaseBase;
+    const uptr ReleaseOffset = ReleaseBase - Region->RegionBeg;
+
+    RegionReleaseRecorder<MemMapT> Recorder(&Region->MemMap, Region->RegionBeg,
+                                            ReleaseOffset);
+    PageReleaseContext Context(BlockSize, /*NumberOfRegions=*/1U,
+                               ReleaseRangeSize, ReleaseOffset);
+    // In a rare case, we may not be able to do the page release because the
+    // PageMap allocation fails.
+    if (UNLIKELY(!Context.ensurePageMapAllocated()))
+      return 0;
+
+    for (BatchGroup &BG : GroupToRelease) {
+      const uptr BatchGroupBase =
+          decompactGroupBase(CompactPtrBase, BG.CompactPtrGroupBase);
+      const uptr BatchGroupEnd = BatchGroupBase + GroupSize;
+      const uptr AllocatedGroupSize = AllocatedUserEnd >= BatchGroupEnd
+                                          ? GroupSize
+                                          : AllocatedUserEnd - BatchGroupBase;
+      const uptr BatchGroupUsedEnd = BatchGroupBase + AllocatedGroupSize;
+      const bool MayContainLastBlockInRegion =
+          BatchGroupUsedEnd == AllocatedUserEnd;
+      const bool BlockAlignedWithUsedEnd =
+          (BatchGroupUsedEnd - Region->RegionBeg) % BlockSize == 0;
+
+      uptr MaxContainedBlocks = AllocatedGroupSize / BlockSize;
+      if (!BlockAlignedWithUsedEnd)
+        ++MaxContainedBlocks;
+
+      const uptr NumBlocks = (BG.Batches.size() - 1) * BG.MaxCachedPerBatch +
+                             BG.Batches.front()->getCount();
+
+      if (NumBlocks == MaxContainedBlocks) {
+        for (const auto &It : BG.Batches)
+          for (u16 I = 0; I < It.getCount(); ++I)
+            DCHECK_EQ(compactPtrGroup(It.get(I)), BG.CompactPtrGroupBase);
+
+        Context.markRangeAsAllCounted(BatchGroupBase, BatchGroupUsedEnd,
+                                      Region->RegionBeg, /*RegionIndex=*/0,
+                                      Region->AllocatedUser);
+      } else {
+        DCHECK_LT(NumBlocks, MaxContainedBlocks);
+        // Note that we don't always visit blocks in each BatchGroup, so we
+        // may miss the chance of releasing certain pages that cross
+        // BatchGroups.
+        Context.markFreeBlocksInRegion(
+            BG.Batches, DecompactPtr, Region->RegionBeg, /*RegionIndex=*/0,
+            Region->AllocatedUser, MayContainLastBlockInRegion);
+      }
+    }
+
+    DCHECK(Context.hasBlockMarked());
+
     auto SkipRegion = [](UNUSED uptr RegionIndex) { return false; };
-    releaseFreeMemoryToOS(Region->FreeList, Region->AllocatedUser, 1U,
-                          BlockSize, &Recorder, DecompactPtr, SkipRegion);
+    releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
 
     if (Recorder.getReleasedRangesCount() > 0) {
-      Region->ReleaseInfo.PushedBlocksAtLastRelease =
-          Region->Stats.PushedBlocks;
+      Region->ReleaseInfo.BytesInFreeListAtLastCheckpoint = BytesInFreeList;
       Region->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
       Region->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes();
     }
-    Region->ReleaseInfo.LastReleaseAtNs = getMonotonicTime();
+    Region->ReleaseInfo.LastReleaseAtNs = getMonotonicTimeFast();
+
+    // Merge GroupToRelease back to the Region::FreeList. Note that both
+    // `Region->FreeList` and `GroupToRelease` are sorted.
+    for (BatchGroup *BG = Region->FreeList.front(), *Prev = nullptr;;) {
+      if (BG == nullptr || GroupToRelease.empty()) {
+        if (!GroupToRelease.empty())
+          Region->FreeList.append_back(&GroupToRelease);
+        break;
+      }
+
+      DCHECK_NE(BG->CompactPtrGroupBase,
+                GroupToRelease.front()->CompactPtrGroupBase);
+
+      if (BG->CompactPtrGroupBase <
+          GroupToRelease.front()->CompactPtrGroupBase) {
+        Prev = BG;
+        BG = BG->Next;
+        continue;
+      }
+
+      // At this point, `BG` is the first BatchGroup with a CompactPtrGroupBase
+      // larger than that of the first element in `GroupToRelease`. We need to
+      // insert `GroupToRelease::front()` (which is `Cur` below) before `BG`.
+      //
+      //   1. If `Prev` is nullptr, we simply push `Cur` to the front of
+      //      FreeList.
+      //   2. Otherwise, use `insert()` which inserts an element next to `Prev`.
+      //
+      // Afterwards, we don't need to advance `BG` because the order between
+      // `BG` and the new `GroupToRelease::front()` hasn't been checked.
+      BatchGroup *Cur = GroupToRelease.front();
+      GroupToRelease.pop_front();
+      if (Prev == nullptr)
+        Region->FreeList.push_front(Cur);
+      else
+        Region->FreeList.insert(Prev, Cur);
+      DCHECK_EQ(Cur->Next, BG);
+      Prev = Cur;
+    }
+
+    DCHECK_EQ(Region->FreeList.size(), NumberOfBatchGroups);
+    (void)NumberOfBatchGroups;
+
+    if (SCUDO_DEBUG) {
+      BatchGroup *Prev = Region->FreeList.front();
+      for (BatchGroup *Cur = Prev->Next; Cur != nullptr;
+           Prev = Cur, Cur = Cur->Next) {
+        CHECK_LT(Prev->CompactPtrGroupBase, Cur->CompactPtrGroupBase);
+      }
+    }
+
     return Recorder.getReleasedBytes();
   }
 };
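// A minimal standalone sketch of the checkpoint bookkeeping performed by
// releaseToOSMaybe() above: the checkpoint is clamped down to the current
// freelist size, and a release is only attempted when the bytes pushed since
// the last checkpoint reach a threshold. Names and numbers are illustrative.
#include <cassert>
#include <cstdint>

using uptr = uintptr_t;

// Returns true when enough bytes have been pushed since the last checkpoint.
// `Checkpoint` mirrors ReleaseToOsInfo::BytesInFreeListAtLastCheckpoint.
bool pushedEnoughSinceCheckpoint(uptr BytesInFreeList, uptr &Checkpoint,
                                 uptr Threshold) {
  if (BytesInFreeList == 0)
    return false;
  // Always keep the checkpoint at the smallest observed freelist size so that
  // the releasable amount isn't underestimated after a burst of allocations.
  if (BytesInFreeList <= Checkpoint) {
    Checkpoint = BytesInFreeList;
    return false;
  }
  return BytesInFreeList - Checkpoint >= Threshold;
}

int main() {
  uptr Checkpoint = 0;
  const uptr Threshold = 4096; // say, one page

  assert(!pushedEnoughSinceCheckpoint(1024, Checkpoint, Threshold));
  assert(pushedEnoughSinceCheckpoint(5000, Checkpoint, Threshold));
  Checkpoint = 5000; // what the real code does after a successful release

  // Allocations shrink the freelist; the checkpoint follows it down.
  assert(!pushedEnoughSinceCheckpoint(2000, Checkpoint, Threshold));
  assert(Checkpoint == 2000);

  // About 4.1 KiB pushed since the checkpoint, which crosses the threshold.
  assert(pushedEnoughSinceCheckpoint(6100, Checkpoint, Threshold));
  return 0;
}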
diff --git a/standalone/quarantine.h b/standalone/quarantine.h
index 2d231c3..b5f8db0 100644
--- a/standalone/quarantine.h
+++ b/standalone/quarantine.h
@@ -12,6 +12,7 @@
 #include "list.h"
 #include "mutex.h"
 #include "string_utils.h"
+#include "thread_annotations.h"
 
 namespace scudo {
 
@@ -172,7 +173,7 @@
   typedef QuarantineCache<Callback> CacheT;
   using ThisT = GlobalQuarantine<Callback, Node>;
 
-  void init(uptr Size, uptr CacheSize) {
+  void init(uptr Size, uptr CacheSize) NO_THREAD_SAFETY_ANALYSIS {
     DCHECK(isAligned(reinterpret_cast<uptr>(this), alignof(ThisT)));
     DCHECK_EQ(atomic_load_relaxed(&MaxSize), 0U);
     DCHECK_EQ(atomic_load_relaxed(&MinSize), 0U);
@@ -191,22 +192,31 @@
   uptr getMaxSize() const { return atomic_load_relaxed(&MaxSize); }
   uptr getCacheSize() const { return atomic_load_relaxed(&MaxCacheSize); }
 
+  // This is supposed to be used in tests only.
+  bool isEmpty() {
+    ScopedLock L(CacheMutex);
+    return Cache.getSize() == 0U;
+  }
+
   void put(CacheT *C, Callback Cb, Node *Ptr, uptr Size) {
     C->enqueue(Cb, Ptr, Size);
     if (C->getSize() > getCacheSize())
       drain(C, Cb);
   }
 
-  void NOINLINE drain(CacheT *C, Callback Cb) {
+  void NOINLINE drain(CacheT *C, Callback Cb) EXCLUDES(CacheMutex) {
+    bool needRecycle = false;
     {
       ScopedLock L(CacheMutex);
       Cache.transfer(C);
+      needRecycle = Cache.getSize() > getMaxSize();
     }
-    if (Cache.getSize() > getMaxSize() && RecycleMutex.tryLock())
+
+    if (needRecycle && RecycleMutex.tryLock())
       recycle(atomic_load_relaxed(&MinSize), Cb);
   }
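// A minimal standalone sketch of the pattern applied in drain() above: read
// the guarded state while the lock is held, remember the decision in a local,
// and act on it after the lock is released. Uses std::mutex and made-up names
// purely for illustration.
#include <cassert>
#include <mutex>

class DrainSketch {
public:
  void push(int N) {
    bool NeedFlush = false;
    {
      std::lock_guard<std::mutex> L(Mutex);
      Pending += N;
      // Decide under the lock; `Pending` must not be read after unlocking.
      NeedFlush = Pending > Limit;
    }
    if (NeedFlush)
      flush();
  }

  int flushed() const { return Flushed; }

private:
  void flush() {
    std::lock_guard<std::mutex> L(Mutex);
    Flushed += Pending;
    Pending = 0;
  }

  static constexpr int Limit = 10;
  mutable std::mutex Mutex;
  int Pending = 0; // guarded by Mutex
  int Flushed = 0;
};

int main() {
  DrainSketch D;
  D.push(6);
  assert(D.flushed() == 0); // below the limit, nothing flushed yet
  D.push(6);
  assert(D.flushed() == 12); // decision made under the lock, flush done outside
  return 0;
}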
 
-  void NOINLINE drainAndRecycle(CacheT *C, Callback Cb) {
+  void NOINLINE drainAndRecycle(CacheT *C, Callback Cb) EXCLUDES(CacheMutex) {
     {
       ScopedLock L(CacheMutex);
       Cache.transfer(C);
@@ -215,20 +225,21 @@
     recycle(0, Cb);
   }
 
-  void getStats(ScopedString *Str) const {
+  void getStats(ScopedString *Str) EXCLUDES(CacheMutex) {
+    ScopedLock L(CacheMutex);
     // It assumes that the world is stopped, just as the allocator's printStats.
     Cache.getStats(Str);
     Str->append("Quarantine limits: global: %zuK; thread local: %zuK\n",
                 getMaxSize() >> 10, getCacheSize() >> 10);
   }
 
-  void disable() {
+  void disable() NO_THREAD_SAFETY_ANALYSIS {
     // RecycleMutex must be locked 1st since we grab CacheMutex within recycle.
     RecycleMutex.lock();
     CacheMutex.lock();
   }
 
-  void enable() {
+  void enable() NO_THREAD_SAFETY_ANALYSIS {
     CacheMutex.unlock();
     RecycleMutex.unlock();
   }
@@ -236,13 +247,14 @@
 private:
   // Read-only data.
   alignas(SCUDO_CACHE_LINE_SIZE) HybridMutex CacheMutex;
-  CacheT Cache;
+  CacheT Cache GUARDED_BY(CacheMutex);
   alignas(SCUDO_CACHE_LINE_SIZE) HybridMutex RecycleMutex;
   atomic_uptr MinSize = {};
   atomic_uptr MaxSize = {};
   alignas(SCUDO_CACHE_LINE_SIZE) atomic_uptr MaxCacheSize = {};
 
-  void NOINLINE recycle(uptr MinSize, Callback Cb) {
+  void NOINLINE recycle(uptr MinSize, Callback Cb) RELEASE(RecycleMutex)
+      EXCLUDES(CacheMutex) {
     CacheT Tmp;
     Tmp.init();
     {
diff --git a/standalone/release.cpp b/standalone/release.cpp
index 5d7c6c5..938bb41 100644
--- a/standalone/release.cpp
+++ b/standalone/release.cpp
@@ -10,7 +10,7 @@
 
 namespace scudo {
 
-HybridMutex PackedCounterArray::Mutex = {};
-uptr PackedCounterArray::StaticBuffer[PackedCounterArray::StaticBufferCount];
+BufferPool<RegionPageMap::StaticBufferCount, RegionPageMap::StaticBufferSize>
+    RegionPageMap::Buffers;
 
 } // namespace scudo
diff --git a/standalone/release.h b/standalone/release.h
index 293a8bc..9ffc88d 100644
--- a/standalone/release.h
+++ b/standalone/release.h
@@ -11,14 +11,46 @@
 
 #include "common.h"
 #include "list.h"
+#include "mem_map.h"
 #include "mutex.h"
+#include "thread_annotations.h"
 
 namespace scudo {
 
+template <typename MemMapT> class RegionReleaseRecorder {
+public:
+  RegionReleaseRecorder(MemMapT *RegionMemMap, uptr Base, uptr Offset = 0)
+      : RegionMemMap(RegionMemMap), Base(Base), Offset(Offset) {}
+
+  uptr getReleasedRangesCount() const { return ReleasedRangesCount; }
+
+  uptr getReleasedBytes() const { return ReleasedBytes; }
+
+  uptr getBase() const { return Base; }
+
+  // Releases the [From, To) range of pages back to the OS. Note that `From`
+  // and `To` are offsets from `Base` + Offset.
+  void releasePageRangeToOS(uptr From, uptr To) {
+    const uptr Size = To - From;
+    RegionMemMap->releasePagesToOS(getBase() + Offset + From, Size);
+    ReleasedRangesCount++;
+    ReleasedBytes += Size;
+  }
+
+private:
+  uptr ReleasedRangesCount = 0;
+  uptr ReleasedBytes = 0;
+  MemMapT *RegionMemMap = nullptr;
+  uptr Base = 0;
+  // The release offset from Base. This is used when we know a given range after
+  // Base will not be released.
+  uptr Offset = 0;
+};
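// A small standalone usage sketch of the recorder pattern above with a mock
// memory map: the [From, To) range passed to releasePageRangeToOS() is
// relative to Base + Offset. The recorder here is a simplified copy for
// illustration and MockMemMap is made up.
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using uptr = uintptr_t;

struct MockMemMap {
  std::vector<std::pair<uptr, uptr>> Released; // (start address, size)
  void releasePagesToOS(uptr From, uptr Size) {
    Released.emplace_back(From, Size);
  }
};

template <typename MemMapT> class RecorderSketch {
public:
  RecorderSketch(MemMapT *M, uptr Base, uptr Offset)
      : M(M), Base(Base), Offset(Offset) {}
  void releasePageRangeToOS(uptr From, uptr To) {
    M->releasePagesToOS(Base + Offset + From, To - From);
    ReleasedBytes += To - From;
  }
  uptr getReleasedBytes() const { return ReleasedBytes; }

private:
  MemMapT *M;
  uptr Base, Offset;
  uptr ReleasedBytes = 0;
};

int main() {
  MockMemMap Map;
  // The region starts at 0x1000; the first 0x2000 bytes are known not to be
  // released, so the recorder is given that as the Offset.
  RecorderSketch<MockMemMap> Recorder(&Map, /*Base=*/0x1000, /*Offset=*/0x2000);
  Recorder.releasePageRangeToOS(/*From=*/0x0, /*To=*/0x3000);
  assert(Map.Released.size() == 1);
  assert(Map.Released[0].first == 0x3000);  // Base + Offset + From
  assert(Map.Released[0].second == 0x3000); // To - From
  assert(Recorder.getReleasedBytes() == 0x3000);
  return 0;
}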
+
 class ReleaseRecorder {
 public:
-  ReleaseRecorder(uptr Base, MapPlatformData *Data = nullptr)
-      : Base(Base), Data(Data) {}
+  ReleaseRecorder(uptr Base, uptr Offset = 0, MapPlatformData *Data = nullptr)
+      : Base(Base), Offset(Offset), Data(Data) {}
 
   uptr getReleasedRangesCount() const { return ReleasedRangesCount; }
 
@@ -29,7 +61,7 @@
   // Releases [From, To) range of pages back to OS.
   void releasePageRangeToOS(uptr From, uptr To) {
     const uptr Size = To - From;
-    releasePagesToOS(Base, From, Size, Data);
+    releasePagesToOS(Base, From + Offset, Size, Data);
     ReleasedRangesCount++;
     ReleasedBytes += Size;
   }
@@ -37,31 +69,158 @@
 private:
   uptr ReleasedRangesCount = 0;
   uptr ReleasedBytes = 0;
+  // The starting address to release. Note that we may want to combine (Base +
+  // Offset) as a new Base. However, the Base is retrieved from
+  // `MapPlatformData` on Fuchsia, which means it's not aware of the Offset.
+  // Therefore, store them separately to make it work on all the platforms.
   uptr Base = 0;
+  // The release offset from Base. This is used when we know a given range after
+  // Base will not be released.
+  uptr Offset = 0;
   MapPlatformData *Data = nullptr;
 };
 
-// A packed array of Counters. Each counter occupies 2^N bits, enough to store
-// counter's MaxValue. Ctor will try to use a static buffer first, and if that
-// fails (the buffer is too small or already locked), will allocate the
+// A buffer pool which holds a fixed number of static buffers for fast buffer
+// allocation. If the request size is greater than `StaticBufferSize`, it'll
+// delegate the allocation to map().
+template <uptr StaticBufferCount, uptr StaticBufferSize> class BufferPool {
+public:
+  // Preserve 1 bit in the `Mask` so that we don't need to do a zero check while
+  // extracting the least significant set bit from the `Mask`.
+  static_assert(StaticBufferCount < SCUDO_WORDSIZE, "");
+  static_assert(isAligned(StaticBufferSize, SCUDO_CACHE_LINE_SIZE), "");
+
+  // Return a buffer which is at least `BufferSize`.
+  uptr *getBuffer(const uptr BufferSize) {
+    if (UNLIKELY(BufferSize > StaticBufferSize))
+      return getDynamicBuffer(BufferSize);
+
+    uptr index;
+    {
+      // TODO: In general, we expect this operation to be fast so the waiting
+      // thread won't be put to sleep. The HybridMutex does implement
+      // busy-waiting, but we may want to review the performance and see if
+      // we need an explicit spin lock here.
+      ScopedLock L(Mutex);
+      index = getLeastSignificantSetBitIndex(Mask);
+      if (index < StaticBufferCount)
+        Mask ^= static_cast<uptr>(1) << index;
+    }
+
+    if (index >= StaticBufferCount)
+      return getDynamicBuffer(BufferSize);
+
+    const uptr Offset = index * StaticBufferSize;
+    memset(&RawBuffer[Offset], 0, StaticBufferSize);
+    return &RawBuffer[Offset];
+  }
+
+  void releaseBuffer(uptr *Buffer, const uptr BufferSize) {
+    const uptr index = getStaticBufferIndex(Buffer, BufferSize);
+    if (index < StaticBufferCount) {
+      ScopedLock L(Mutex);
+      DCHECK_EQ((Mask & (static_cast<uptr>(1) << index)), 0U);
+      Mask |= static_cast<uptr>(1) << index;
+    } else {
+      unmap(reinterpret_cast<void *>(Buffer),
+            roundUp(BufferSize, getPageSizeCached()));
+    }
+  }
+
+  bool isStaticBufferTestOnly(uptr *Buffer, uptr BufferSize) {
+    return getStaticBufferIndex(Buffer, BufferSize) < StaticBufferCount;
+  }
+
+private:
+  uptr getStaticBufferIndex(uptr *Buffer, uptr BufferSize) {
+    if (UNLIKELY(BufferSize > StaticBufferSize))
+      return StaticBufferCount;
+
+    const uptr BufferBase = reinterpret_cast<uptr>(Buffer);
+    const uptr RawBufferBase = reinterpret_cast<uptr>(RawBuffer);
+
+    if (BufferBase < RawBufferBase ||
+        BufferBase >= RawBufferBase + sizeof(RawBuffer)) {
+      return StaticBufferCount;
+    }
+
+    DCHECK_LE(BufferSize, StaticBufferSize);
+    DCHECK_LE(BufferBase + BufferSize, RawBufferBase + sizeof(RawBuffer));
+    DCHECK_EQ((BufferBase - RawBufferBase) % StaticBufferSize, 0U);
+
+    const uptr index =
+        (BufferBase - RawBufferBase) / (StaticBufferSize * sizeof(uptr));
+    DCHECK_LT(index, StaticBufferCount);
+    return index;
+  }
+
+  uptr *getDynamicBuffer(const uptr BufferSize) {
+    // When using a heap-based buffer, precommit the pages backing the
+    // Vmar by passing |MAP_PRECOMMIT| flag. This allows an optimization
+    // where page fault exceptions are skipped as the allocated memory
+    // is accessed. So far, this is only enabled on Fuchsia. It hasn't proven a
+    // performance benefit on other platforms.
+    const uptr MmapFlags = MAP_ALLOWNOMEM | (SCUDO_FUCHSIA ? MAP_PRECOMMIT : 0);
+    return reinterpret_cast<uptr *>(
+        map(nullptr, roundUp(BufferSize, getPageSizeCached()), "scudo:counters",
+            MmapFlags, &MapData));
+  }
+
+  HybridMutex Mutex;
+  // '1' means the buffer at that index is free. '0' means it is in use.
+  uptr Mask GUARDED_BY(Mutex) = ~static_cast<uptr>(0);
+  uptr RawBuffer[StaticBufferCount * StaticBufferSize] GUARDED_BY(Mutex);
+  [[no_unique_address]] MapPlatformData MapData = {};
+};
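// A minimal standalone sketch of the bitmask bookkeeping used by BufferPool
// above: a set bit in `Mask` marks a free static slot, the least significant
// set bit is handed out, and releasing a slot sets its bit again. std::mutex,
// the fixed slot size, and all names here are illustrative only.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <mutex>

class StaticSlotPool {
public:
  static constexpr unsigned SlotCount = 4;
  static constexpr unsigned SlotWords = 128;

  // Returns a zeroed slot, or nullptr if every slot is in use (the real
  // BufferPool falls back to map() in that case).
  uint64_t *acquire() {
    unsigned Index;
    {
      std::lock_guard<std::mutex> L(Mutex);
      if (Mask == 0)
        return nullptr;
      Index = static_cast<unsigned>(__builtin_ctzll(Mask));
      Mask &= Mask - 1; // clear the least significant set bit: slot is in use
    }
    uint64_t *Slot = &Raw[Index * SlotWords];
    std::memset(Slot, 0, SlotWords * sizeof(uint64_t));
    return Slot;
  }

  void release(uint64_t *Slot) {
    const unsigned Index = static_cast<unsigned>((Slot - Raw) / SlotWords);
    std::lock_guard<std::mutex> L(Mutex);
    Mask |= static_cast<uint64_t>(1) << Index; // mark the slot free again
  }

private:
  std::mutex Mutex;
  uint64_t Mask = (static_cast<uint64_t>(1) << SlotCount) - 1; // all free
  uint64_t Raw[SlotCount * SlotWords] = {};
};

int main() {
  StaticSlotPool Pool;
  uint64_t *A = Pool.acquire();
  uint64_t *B = Pool.acquire();
  assert(A != nullptr && B != nullptr && A != B);
  Pool.release(A);
  // The freed slot is handed out again (lowest free index first).
  assert(Pool.acquire() == A);
  return 0;
}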
+
+// A Region page map is used to record the usage of pages in the regions. It
+// implements a packed array of Counters. Each counter occupies 2^N bits, enough
+// to store counter's MaxValue. Ctor will try to use a static buffer first, and
+// if that fails (the buffer is too small or already locked), will allocate the
 // required Buffer via map(). The caller is expected to check whether the
 // initialization was successful by checking isAllocated() result. For
 // performance's sake, none of the accessors check the validity of the
 // arguments. It is assumed that Index is always in [0, N) and the value is not
 // incremented past MaxValue.
-class PackedCounterArray {
+class RegionPageMap {
 public:
-  PackedCounterArray(uptr NumberOfRegions, uptr CountersPerRegion,
-                     uptr MaxValue)
-      : Regions(NumberOfRegions), NumCounters(CountersPerRegion) {
-    DCHECK_GT(Regions, 0);
-    DCHECK_GT(NumCounters, 0);
+  RegionPageMap()
+      : Regions(0),
+        NumCounters(0),
+        CounterSizeBitsLog(0),
+        CounterMask(0),
+        PackingRatioLog(0),
+        BitOffsetMask(0),
+        SizePerRegion(0),
+        BufferSize(0),
+        Buffer(nullptr) {}
+  RegionPageMap(uptr NumberOfRegions, uptr CountersPerRegion, uptr MaxValue) {
+    reset(NumberOfRegions, CountersPerRegion, MaxValue);
+  }
+  ~RegionPageMap() {
+    if (!isAllocated())
+      return;
+    Buffers.releaseBuffer(Buffer, BufferSize);
+    Buffer = nullptr;
+  }
+
+  // The lock of `StaticBuffer` is acquired conditionally and there's no easy
+  // way to specify the thread-safety attribute properly in the current code
+  // structure. Besides, it's the only place we may want to check thread safety.
+  // Therefore, it's fine to bypass the thread-safety analysis for now.
+  void reset(uptr NumberOfRegion, uptr CountersPerRegion, uptr MaxValue) {
+    DCHECK_GT(NumberOfRegion, 0);
+    DCHECK_GT(CountersPerRegion, 0);
     DCHECK_GT(MaxValue, 0);
+
+    Regions = NumberOfRegion;
+    NumCounters = CountersPerRegion;
+
     constexpr uptr MaxCounterBits = sizeof(*Buffer) * 8UL;
     // Rounding counter storage size up to the power of two allows for using
     // bit shifts calculating particular counter's Index and offset.
     const uptr CounterSizeBits =
-        roundUpToPowerOfTwo(getMostSignificantSetBitIndex(MaxValue) + 1);
+        roundUpPowerOfTwo(getMostSignificantSetBitIndex(MaxValue) + 1);
     DCHECK_LE(CounterSizeBits, MaxCounterBits);
     CounterSizeBitsLog = getLog2(CounterSizeBits);
     CounterMask = ~(static_cast<uptr>(0)) >> (MaxCounterBits - CounterSizeBits);
@@ -72,27 +231,10 @@
     BitOffsetMask = PackingRatio - 1;
 
     SizePerRegion =
-        roundUpTo(NumCounters, static_cast<uptr>(1U) << PackingRatioLog) >>
+        roundUp(NumCounters, static_cast<uptr>(1U) << PackingRatioLog) >>
         PackingRatioLog;
     BufferSize = SizePerRegion * sizeof(*Buffer) * Regions;
-    if (BufferSize <= (StaticBufferCount * sizeof(Buffer[0])) &&
-        Mutex.tryLock()) {
-      Buffer = &StaticBuffer[0];
-      memset(Buffer, 0, BufferSize);
-    } else {
-      Buffer = reinterpret_cast<uptr *>(
-          map(nullptr, roundUpTo(BufferSize, getPageSizeCached()),
-              "scudo:counters", MAP_ALLOWNOMEM));
-    }
-  }
-  ~PackedCounterArray() {
-    if (!isAllocated())
-      return;
-    if (Buffer == &StaticBuffer[0])
-      Mutex.unlock();
-    else
-      unmap(reinterpret_cast<void *>(Buffer),
-            roundUpTo(BufferSize, getPageSizeCached()));
+    Buffer = Buffers.getBuffer(BufferSize);
   }
 
   bool isAllocated() const { return !!Buffer; }
@@ -112,10 +254,22 @@
     const uptr Index = I >> PackingRatioLog;
     const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog;
     DCHECK_LT(BitOffset, SCUDO_WORDSIZE);
+    DCHECK_EQ(isAllCounted(Region, I), false);
     Buffer[Region * SizePerRegion + Index] += static_cast<uptr>(1U)
                                               << BitOffset;
   }
 
+  void incN(uptr Region, uptr I, uptr N) const {
+    DCHECK_GT(N, 0U);
+    DCHECK_LE(N, CounterMask);
+    DCHECK_LE(get(Region, I), CounterMask - N);
+    const uptr Index = I >> PackingRatioLog;
+    const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog;
+    DCHECK_LT(BitOffset, SCUDO_WORDSIZE);
+    DCHECK_EQ(isAllCounted(Region, I), false);
+    Buffer[Region * SizePerRegion + Index] += N << BitOffset;
+  }
+
   void incRange(uptr Region, uptr From, uptr To) const {
     DCHECK_LE(From, To);
     const uptr Top = Min(To + 1, NumCounters);
@@ -123,13 +277,43 @@
       inc(Region, I);
   }
 
+  // Set the counter to the max value. Note that the max number of blocks in a
+  // page may vary. To provide an easier way to tell if all the blocks are
+  // counted for different pages, set the counter to the same max value to
+  // denote the all-counted status.
+  void setAsAllCounted(uptr Region, uptr I) const {
+    DCHECK_LE(get(Region, I), CounterMask);
+    const uptr Index = I >> PackingRatioLog;
+    const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog;
+    DCHECK_LT(BitOffset, SCUDO_WORDSIZE);
+    Buffer[Region * SizePerRegion + Index] |= CounterMask << BitOffset;
+  }
+  void setAsAllCountedRange(uptr Region, uptr From, uptr To) const {
+    DCHECK_LE(From, To);
+    const uptr Top = Min(To + 1, NumCounters);
+    for (uptr I = From; I < Top; I++)
+      setAsAllCounted(Region, I);
+  }
+
+  bool updateAsAllCountedIf(uptr Region, uptr I, uptr MaxCount) {
+    const uptr Count = get(Region, I);
+    if (Count == CounterMask)
+      return true;
+    if (Count == MaxCount) {
+      setAsAllCounted(Region, I);
+      return true;
+    }
+    return false;
+  }
+  bool isAllCounted(uptr Region, uptr I) const {
+    return get(Region, I) == CounterMask;
+  }
+
   uptr getBufferSize() const { return BufferSize; }
 
-  static const uptr StaticBufferCount = 2048U;
-
 private:
-  const uptr Regions;
-  const uptr NumCounters;
+  uptr Regions;
+  uptr NumCounters;
   uptr CounterSizeBitsLog;
   uptr CounterMask;
   uptr PackingRatioLog;
@@ -139,17 +323,20 @@
   uptr BufferSize;
   uptr *Buffer;
 
-  static HybridMutex Mutex;
-  static uptr StaticBuffer[StaticBufferCount];
+  // We may consider making this configurable if there are cases which may
+  // benefit from this.
+  static const uptr StaticBufferCount = 2U;
+  static const uptr StaticBufferSize = 512U;
+  static BufferPool<StaticBufferCount, StaticBufferSize> Buffers;
 };
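// A minimal standalone sketch of the packed-counter layout used by
// RegionPageMap above: each counter gets a power-of-two number of bits wide
// enough for MaxValue, several counters are packed per 64-bit word, and
// inc()/get() locate a counter by word index and bit offset. The fixed word
// size and names are illustrative only.
#include <cassert>
#include <cstdint>
#include <vector>

class PackedCounters {
public:
  PackedCounters(size_t NumCounters, uint64_t MaxValue) {
    // Number of bits needed for MaxValue, rounded up to a power of two so that
    // the index and offset can be computed with shifts (MaxValue assumed small).
    size_t Bits = 1;
    while ((uint64_t(1) << Bits) - 1 < MaxValue)
      Bits <<= 1;
    CounterSizeBitsLog = log2of(Bits);
    CounterMask = (Bits == 64) ? ~uint64_t(0) : (uint64_t(1) << Bits) - 1;
    PackingRatioLog = log2of(64 >> CounterSizeBitsLog);
    BitOffsetMask = (size_t(1) << PackingRatioLog) - 1;
    Buffer.resize((NumCounters + BitOffsetMask) >> PackingRatioLog, 0);
  }

  void inc(size_t I) {
    Buffer[I >> PackingRatioLog] +=
        uint64_t(1) << ((I & BitOffsetMask) << CounterSizeBitsLog);
  }
  uint64_t get(size_t I) const {
    return (Buffer[I >> PackingRatioLog] >>
            ((I & BitOffsetMask) << CounterSizeBitsLog)) &
           CounterMask;
  }

private:
  static size_t log2of(size_t X) {
    size_t L = 0;
    while ((size_t(1) << L) < X)
      ++L;
    return L;
  }
  size_t CounterSizeBitsLog, PackingRatioLog, BitOffsetMask;
  uint64_t CounterMask;
  std::vector<uint64_t> Buffer;
};

int main() {
  // Up to 5 blocks per page -> 3 bits -> rounded to 4 bits -> 16 counters/word.
  PackedCounters Counters(/*NumCounters=*/100, /*MaxValue=*/5);
  Counters.inc(0);
  Counters.inc(17);
  Counters.inc(17);
  assert(Counters.get(0) == 1 && Counters.get(17) == 2 && Counters.get(3) == 0);
  return 0;
}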
 
 template <class ReleaseRecorderT> class FreePagesRangeTracker {
 public:
-  explicit FreePagesRangeTracker(ReleaseRecorderT *Recorder)
+  explicit FreePagesRangeTracker(ReleaseRecorderT &Recorder)
       : Recorder(Recorder), PageSizeLog(getLog2(getPageSizeCached())) {}
 
-  void processNextPage(bool Freed) {
-    if (Freed) {
+  void processNextPage(bool Released) {
+    if (Released) {
       if (!InRange) {
         CurrentRangeStatePage = CurrentPage;
         InRange = true;
@@ -170,113 +357,270 @@
 private:
   void closeOpenedRange() {
     if (InRange) {
-      Recorder->releasePageRangeToOS((CurrentRangeStatePage << PageSizeLog),
-                                     (CurrentPage << PageSizeLog));
+      Recorder.releasePageRangeToOS((CurrentRangeStatePage << PageSizeLog),
+                                    (CurrentPage << PageSizeLog));
       InRange = false;
     }
   }
 
-  ReleaseRecorderT *const Recorder;
+  ReleaseRecorderT &Recorder;
   const uptr PageSizeLog;
   bool InRange = false;
   uptr CurrentPage = 0;
   uptr CurrentRangeStatePage = 0;
 };
 
-template <class TransferBatchT, class ReleaseRecorderT, typename DecompactPtrT,
-          typename SkipRegionT>
-NOINLINE void
-releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList,
-                      uptr RegionSize, uptr NumberOfRegions, uptr BlockSize,
-                      ReleaseRecorderT *Recorder, DecompactPtrT DecompactPtr,
-                      SkipRegionT SkipRegion) {
-  const uptr PageSize = getPageSizeCached();
-
-  // Figure out the number of chunks per page and whether we can take a fast
-  // path (the number of chunks per page is the same for all pages).
-  uptr FullPagesBlockCountMax;
-  bool SameBlockCountPerPage;
-  if (BlockSize <= PageSize) {
-    if (PageSize % BlockSize == 0) {
-      // Same number of chunks per page, no cross overs.
-      FullPagesBlockCountMax = PageSize / BlockSize;
-      SameBlockCountPerPage = true;
-    } else if (BlockSize % (PageSize % BlockSize) == 0) {
-      // Some chunks are crossing page boundaries, which means that the page
-      // contains one or two partial chunks, but all pages contain the same
-      // number of chunks.
-      FullPagesBlockCountMax = PageSize / BlockSize + 1;
-      SameBlockCountPerPage = true;
+struct PageReleaseContext {
+  PageReleaseContext(uptr BlockSize, uptr NumberOfRegions, uptr ReleaseSize,
+                     uptr ReleaseOffset = 0)
+      : BlockSize(BlockSize), NumberOfRegions(NumberOfRegions) {
+    PageSize = getPageSizeCached();
+    if (BlockSize <= PageSize) {
+      if (PageSize % BlockSize == 0) {
+        // Same number of chunks per page, no cross overs.
+        FullPagesBlockCountMax = PageSize / BlockSize;
+        SameBlockCountPerPage = true;
+      } else if (BlockSize % (PageSize % BlockSize) == 0) {
+        // Some chunks are crossing page boundaries, which means that the page
+        // contains one or two partial chunks, but all pages contain the same
+        // number of chunks.
+        FullPagesBlockCountMax = PageSize / BlockSize + 1;
+        SameBlockCountPerPage = true;
+      } else {
+        // Some chunks are crossing page boundaries, which means that the page
+        // contains one or two partial chunks.
+        FullPagesBlockCountMax = PageSize / BlockSize + 2;
+        SameBlockCountPerPage = false;
+      }
     } else {
-      // Some chunks are crossing page boundaries, which means that the page
-      // contains one or two partial chunks.
-      FullPagesBlockCountMax = PageSize / BlockSize + 2;
-      SameBlockCountPerPage = false;
-    }
-  } else {
-    if (BlockSize % PageSize == 0) {
-      // One chunk covers multiple pages, no cross overs.
-      FullPagesBlockCountMax = 1;
-      SameBlockCountPerPage = true;
-    } else {
-      // One chunk covers multiple pages, Some chunks are crossing page
-      // boundaries. Some pages contain one chunk, some contain two.
-      FullPagesBlockCountMax = 2;
-      SameBlockCountPerPage = false;
-    }
-  }
-
-  const uptr PagesCount = roundUpTo(RegionSize, PageSize) / PageSize;
-  PackedCounterArray Counters(NumberOfRegions, PagesCount,
-                              FullPagesBlockCountMax);
-  if (!Counters.isAllocated())
-    return;
-
-  const uptr PageSizeLog = getLog2(PageSize);
-  const uptr RoundedRegionSize = PagesCount << PageSizeLog;
-  const uptr RoundedSize = NumberOfRegions * RoundedRegionSize;
-
-  // Iterate over free chunks and count how many free chunks affect each
-  // allocated page.
-  if (BlockSize <= PageSize && PageSize % BlockSize == 0) {
-    // Each chunk affects one page only.
-    for (const auto &It : FreeList) {
-      for (u32 I = 0; I < It.getCount(); I++) {
-        const uptr P = DecompactPtr(It.get(I)) - Recorder->getBase();
-        if (P >= RoundedSize)
-          continue;
-        const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize;
-        const uptr PInRegion = P - RegionIndex * RegionSize;
-        Counters.inc(RegionIndex, PInRegion >> PageSizeLog);
+      if (BlockSize % PageSize == 0) {
+        // One chunk covers multiple pages, no cross overs.
+        FullPagesBlockCountMax = 1;
+        SameBlockCountPerPage = true;
+      } else {
+        // One chunk covers multiple pages, Some chunks are crossing page
+        // boundaries. Some pages contain one chunk, some contain two.
+        FullPagesBlockCountMax = 2;
+        SameBlockCountPerPage = false;
       }
     }
-  } else {
-    // In all other cases chunks might affect more than one page.
-    DCHECK_GE(RegionSize, BlockSize);
-    const uptr LastBlockInRegion = ((RegionSize / BlockSize) - 1U) * BlockSize;
-    for (const auto &It : FreeList) {
-      for (u32 I = 0; I < It.getCount(); I++) {
-        const uptr P = DecompactPtr(It.get(I)) - Recorder->getBase();
-        if (P >= RoundedSize)
-          continue;
-        const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize;
-        uptr PInRegion = P - RegionIndex * RegionSize;
-        Counters.incRange(RegionIndex, PInRegion >> PageSizeLog,
-                          (PInRegion + BlockSize - 1) >> PageSizeLog);
-        // The last block in a region might straddle a page, so if it's
-        // free, we mark the following "pretend" memory block(s) as free.
-        if (PInRegion == LastBlockInRegion) {
-          PInRegion += BlockSize;
-          while (PInRegion < RoundedRegionSize) {
-            Counters.incRange(RegionIndex, PInRegion >> PageSizeLog,
-                              (PInRegion + BlockSize - 1) >> PageSizeLog);
-            PInRegion += BlockSize;
-          }
+
+    // TODO: For multiple regions, it's more complicated to support partial
+    // region marking (which includes the complexity of how to handle the last
+    // block in a region). We may consider this after markFreeBlocks() accepts
+    // only free blocks from the same region.
+    if (NumberOfRegions != 1)
+      DCHECK_EQ(ReleaseOffset, 0U);
+
+    PagesCount = roundUp(ReleaseSize, PageSize) / PageSize;
+    PageSizeLog = getLog2(PageSize);
+    ReleasePageOffset = ReleaseOffset >> PageSizeLog;
+  }
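// A standalone sketch of the chunks-per-page classification done in the
// constructor above: given BlockSize and PageSize, compute the maximum number
// of blocks that can touch a page and whether that count is the same for every
// page. Purely illustrative.
#include <cassert>
#include <cstdint>

struct BlockCountPerPage {
  uintptr_t FullPagesBlockCountMax;
  bool SameBlockCountPerPage;
};

BlockCountPerPage classify(uintptr_t BlockSize, uintptr_t PageSize) {
  if (BlockSize <= PageSize) {
    if (PageSize % BlockSize == 0)
      return {PageSize / BlockSize, true};     // no block crosses a page
    if (BlockSize % (PageSize % BlockSize) == 0)
      return {PageSize / BlockSize + 1, true}; // crossings, but a fixed count
    return {PageSize / BlockSize + 2, false};  // count varies page to page
  }
  if (BlockSize % PageSize == 0)
    return {1, true};                          // one block spans whole pages
  return {2, false};                           // some pages see two blocks
}

int main() {
  const uintptr_t PageSize = 4096;
  assert(classify(64, PageSize).FullPagesBlockCountMax == 64);  // 4096 / 64
  assert(classify(48, PageSize).FullPagesBlockCountMax == 86);  // 85 + 1
  assert(classify(48, PageSize).SameBlockCountPerPage);
  assert(classify(96, PageSize).FullPagesBlockCountMax == 44);  // 42 + 2
  assert(!classify(96, PageSize).SameBlockCountPerPage);
  assert(classify(8192, PageSize).FullPagesBlockCountMax == 1);
  assert(!classify(12304, PageSize).SameBlockCountPerPage);
  return 0;
}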
+
+  // PageMap is lazily allocated when markFreeBlocks() is invoked.
+  bool hasBlockMarked() const {
+    return PageMap.isAllocated();
+  }
+
+  bool ensurePageMapAllocated() {
+    if (PageMap.isAllocated())
+      return true;
+    PageMap.reset(NumberOfRegions, PagesCount, FullPagesBlockCountMax);
+    // TODO: Log some message when we fail on PageMap allocation.
+    return PageMap.isAllocated();
+  }
+
+  // Mark all the blocks in the given range [From, To). Instead of visiting all
+  // the blocks, we will just mark the pages as all counted. Note that `From`
+  // and `To` have to be page aligned, with one exception: if `To` is equal to
+  // RegionSize, it doesn't need to be page aligned.
+  bool markRangeAsAllCounted(uptr From, uptr To, uptr Base,
+                             const uptr RegionIndex, const uptr RegionSize) {
+    DCHECK_LT(From, To);
+    DCHECK_LE(To, Base + RegionSize);
+    DCHECK_EQ(From % PageSize, 0U);
+    DCHECK_LE(To - From, RegionSize);
+
+    if (!ensurePageMapAllocated())
+      return false;
+
+    uptr FromInRegion = From - Base;
+    uptr ToInRegion = To - Base;
+    uptr FirstBlockInRange = roundUpSlow(FromInRegion, BlockSize);
+
+    // The straddling block sits across the entire range.
+    if (FirstBlockInRange >= ToInRegion)
+      return true;
+
+    // The first block may not sit at the first page in the range; move
+    // `FromInRegion` to the page of the first block.
+    FromInRegion = roundDown(FirstBlockInRange, PageSize);
+
+    // When the first block is not aligned to the range boundary, there is a
+    // block sitting across `From`, which looks like:
+    //
+    //   From                                             To
+    //     V                                               V
+    //     +-----------------------------------------------+
+    //  +-----+-----+-----+-----+
+    //  |     |     |     |     | ...
+    //  +-----+-----+-----+-----+
+    //     |-    first page     -||-    second page    -||- ...
+    //
+    // Therefore, we can't just mark the first page as all counted. Instead, we
+    // increment the number of blocks in the first page in the page map and
+    // then round up the `From` to the next page.
+    if (FirstBlockInRange != FromInRegion) {
+      DCHECK_GT(FromInRegion + PageSize, FirstBlockInRange);
+      uptr NumBlocksInFirstPage =
+          (FromInRegion + PageSize - FirstBlockInRange + BlockSize - 1) /
+          BlockSize;
+      PageMap.incN(RegionIndex, getPageIndex(FromInRegion),
+                   NumBlocksInFirstPage);
+      FromInRegion = roundUp(FromInRegion + 1, PageSize);
+    }
+
+    uptr LastBlockInRange = roundDownSlow(ToInRegion - 1, BlockSize);
+
+    // Note that LastBlockInRange may be smaller than `FromInRegion` at this
+    // point because the range may contain only one block.
+
+    // When the last block sits across `To`, we can't just mark the pages
+    // occupied by the last block as all counted. Instead, we increment the
+    // counters of those pages by 1. The exception is that if it's the last
+    // block in the region, it's fine to mark those pages as all counted.
+    if (LastBlockInRange + BlockSize != RegionSize) {
+      DCHECK_EQ(ToInRegion % PageSize, 0U);
+      // The case below is like,
+      //
+      //   From                                      To
+      //     V                                        V
+      //     +----------------------------------------+
+      //                          +-----+-----+-----+-----+
+      //                          |     |     |     |     | ...
+      //                          +-----+-----+-----+-----+
+      //                    ... -||-    last page    -||-    next page    -|
+      //
+      // The last block is not aligned to `To`, so we need to increment the
+      // counter of the `next page` by 1.
+      if (LastBlockInRange + BlockSize != ToInRegion) {
+        PageMap.incRange(RegionIndex, getPageIndex(ToInRegion),
+                         getPageIndex(LastBlockInRange + BlockSize - 1));
+      }
+    } else {
+      ToInRegion = RegionSize;
+    }
+
+    // After handling the first page and the last block, it's safe to mark the
+    // remaining pages within the range [From, To) as all counted.
+    if (FromInRegion < ToInRegion) {
+      PageMap.setAsAllCountedRange(RegionIndex, getPageIndex(FromInRegion),
+                                   getPageIndex(ToInRegion - 1));
+    }
+
+    return true;
+  }
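
To make the first-page bookkeeping above concrete, here is a minimal standalone sketch (not part of the patch) that replays the arithmetic with hypothetical sizes; `roundDown`/`roundUpSlow` are reimplemented locally and the 4 KiB page / 48-byte block numbers are only for illustration.

    #include <cstdint>
    #include <cstdio>

    using uptr = uintptr_t;

    static uptr roundDown(uptr X, uptr Boundary) { return X & ~(Boundary - 1); }
    static uptr roundUpSlow(uptr X, uptr Boundary) {
      return ((X + Boundary - 1) / Boundary) * Boundary;
    }

    int main() {
      // Hypothetical sizes: 4 KiB pages, 48-byte blocks, and a range whose
      // `From` sits at the second page of the region.
      const uptr PageSize = 4096, BlockSize = 48;
      uptr FromInRegion = 4096;

      // First block whose start lies inside the range.
      const uptr FirstBlockInRange = roundUpSlow(FromInRegion, BlockSize); // 4128
      // Move `FromInRegion` to the page on which that block starts.
      FromInRegion = roundDown(FirstBlockInRange, PageSize); // stays 4096
      if (FirstBlockInRange != FromInRegion) {
        // The block straddling `From` is accounted to the preceding page, so
        // only the 85 blocks starting in [4128, 8192) are counted here.
        const uptr NumBlocksInFirstPage =
            (FromInRegion + PageSize - FirstBlockInRange + BlockSize - 1) /
            BlockSize;
        printf("blocks counted on the first page: %llu\n",
               (unsigned long long)NumBlocksInFirstPage); // prints 85
      }
      return 0;
    }
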
+
+  template <class TransferBatchT, typename DecompactPtrT>
+  bool markFreeBlocksInRegion(const IntrusiveList<TransferBatchT> &FreeList,
+                              DecompactPtrT DecompactPtr, const uptr Base,
+                              const uptr RegionIndex, const uptr RegionSize,
+                              bool MayContainLastBlockInRegion) {
+    if (!ensurePageMapAllocated())
+      return false;
+
+    if (MayContainLastBlockInRegion) {
+      const uptr LastBlockInRegion =
+          ((RegionSize / BlockSize) - 1U) * BlockSize;
+      // The last block in a region may not use the entire page, so we mark the
+      // following "pretend" memory block(s) as free in advance.
+      //
+      //     Region Boundary
+      //         v
+      //  -----+-----------------------+
+      //       |      Last Page        | <- Rounded Region Boundary
+      //  -----+-----------------------+
+      //   |-----||- trailing blocks  -|
+      //      ^
+      //   last block
+      const uptr RoundedRegionSize = roundUp(RegionSize, PageSize);
+      const uptr TrailingBlockBase = LastBlockInRegion + BlockSize;
+      // If the difference between `RoundedRegionSize` and
+      // `TrailingBlockBase` is larger than a page, that implies the reported
+      // `RegionSize` may not be accurate.
+      DCHECK_LT(RoundedRegionSize - TrailingBlockBase, PageSize);
+
+      // Only the last page touched by the last block needs the trailing blocks
+      // marked. Note that if the last "pretend" block straddles the boundary,
+      // we still have to count it so that the logic of counting the number of
+      // blocks on a page stays consistent.
+      uptr NumTrailingBlocks =
+          (roundUpSlow(RoundedRegionSize - TrailingBlockBase, BlockSize) +
+           BlockSize - 1) /
+          BlockSize;
+      if (NumTrailingBlocks > 0) {
+        PageMap.incN(RegionIndex, getPageIndex(TrailingBlockBase),
+                     NumTrailingBlocks);
+      }
+    }
+
+    // Iterate over free chunks and count how many free chunks affect each
+    // allocated page.
+    if (BlockSize <= PageSize && PageSize % BlockSize == 0) {
+      // Each chunk affects one page only.
+      for (const auto &It : FreeList) {
+        for (u16 I = 0; I < It.getCount(); I++) {
+          const uptr PInRegion = DecompactPtr(It.get(I)) - Base;
+          DCHECK_LT(PInRegion, RegionSize);
+          PageMap.inc(RegionIndex, getPageIndex(PInRegion));
+        }
+      }
+    } else {
+      // In all other cases chunks might affect more than one page.
+      DCHECK_GE(RegionSize, BlockSize);
+      for (const auto &It : FreeList) {
+        for (u16 I = 0; I < It.getCount(); I++) {
+          const uptr PInRegion = DecompactPtr(It.get(I)) - Base;
+          PageMap.incRange(RegionIndex, getPageIndex(PInRegion),
+                           getPageIndex(PInRegion + BlockSize - 1));
         }
       }
     }
+
+    return true;
   }
 
+  uptr getPageIndex(uptr P) { return (P >> PageSizeLog) - ReleasePageOffset; }
+
+  uptr BlockSize;
+  uptr NumberOfRegions;
+  // For partial region marking, the pages at the front of the region do not
+  // need to be counted.
+  uptr ReleasePageOffset;
+  uptr PageSize;
+  uptr PagesCount;
+  uptr PageSizeLog;
+  uptr FullPagesBlockCountMax;
+  bool SameBlockCountPerPage;
+  RegionPageMap PageMap;
+};
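
And a similarly hedged sketch of the trailing "pretend" block count and the page-index offset used by getPageIndex(); the 1 MiB region, 176-byte block and 64-page release offset below are made-up numbers, not values taken from the allocator.

    #include <cstdint>
    #include <cstdio>

    using uptr = uintptr_t;

    static uptr roundUp(uptr X, uptr B) { return (X + B - 1) & ~(B - 1); }
    static uptr roundUpSlow(uptr X, uptr B) { return ((X + B - 1) / B) * B; }

    int main() {
      // Made-up layout: a 1 MiB region, 4 KiB pages and 176-byte blocks.
      const uptr RegionSize = 1 << 20, PageSize = 4096, BlockSize = 176;
      const uptr PageSizeLog = 12;

      // Last real block and the first "pretend" block that follows it.
      const uptr LastBlockInRegion = ((RegionSize / BlockSize) - 1U) * BlockSize;
      const uptr TrailingBlockBase = LastBlockInRegion + BlockSize;
      const uptr RoundedRegionSize = roundUp(RegionSize, PageSize);

      // Pretend blocks needed to cover the tail of the last page; a final
      // block straddling the rounded boundary is still counted.
      const uptr NumTrailingBlocks =
          (roundUpSlow(RoundedRegionSize - TrailingBlockBase, BlockSize) +
           BlockSize - 1) /
          BlockSize;

      // With a release offset of 64 pages, the pages in front are not tracked
      // and the page index is simply shifted by that offset.
      const uptr ReleasePageOffset = 64;
      const uptr PageIndexOfTrailing =
          (TrailingBlockBase >> PageSizeLog) - ReleasePageOffset;

      printf("pretend blocks: %llu, page index: %llu\n",
             (unsigned long long)NumTrailingBlocks,
             (unsigned long long)PageIndexOfTrailing); // 1 and 191
      return 0;
    }
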
+
+// Try to release the pages which don't have any block still in use, i.e., all
+// the blocks on them are free. The `PageMap` records the number of free blocks
+// in each page.
+template <class ReleaseRecorderT, typename SkipRegionT>
+NOINLINE void
+releaseFreeMemoryToOS(PageReleaseContext &Context,
+                      ReleaseRecorderT &Recorder, SkipRegionT SkipRegion) {
+  const uptr PageSize = Context.PageSize;
+  const uptr BlockSize = Context.BlockSize;
+  const uptr PagesCount = Context.PagesCount;
+  const uptr NumberOfRegions = Context.NumberOfRegions;
+  const uptr ReleasePageOffset = Context.ReleasePageOffset;
+  const uptr FullPagesBlockCountMax = Context.FullPagesBlockCountMax;
+  const bool SameBlockCountPerPage = Context.SameBlockCountPerPage;
+  RegionPageMap &PageMap = Context.PageMap;
+
   // Iterate over pages detecting ranges of pages with chunk Counters equal
   // to the expected number of chunks for the particular page.
   FreePagesRangeTracker<ReleaseRecorderT> RangeTracker(Recorder);
@@ -287,9 +631,11 @@
         RangeTracker.skipPages(PagesCount);
         continue;
       }
-      for (uptr J = 0; J < PagesCount; J++)
-        RangeTracker.processNextPage(Counters.get(I, J) ==
-                                     FullPagesBlockCountMax);
+      for (uptr J = 0; J < PagesCount; J++) {
+        const bool CanRelease =
+            PageMap.updateAsAllCountedIf(I, J, FullPagesBlockCountMax);
+        RangeTracker.processNextPage(CanRelease);
+      }
     }
   } else {
     // Slow path, go through the pages keeping count how many chunks affect
@@ -308,6 +654,10 @@
       }
       uptr PrevPageBoundary = 0;
       uptr CurrentBoundary = 0;
+      if (ReleasePageOffset > 0) {
+        PrevPageBoundary = ReleasePageOffset * PageSize;
+        CurrentBoundary = roundUpSlow(PrevPageBoundary, BlockSize);
+      }
       for (uptr J = 0; J < PagesCount; J++) {
         const uptr PageBoundary = PrevPageBoundary + PageSize;
         uptr BlocksPerPage = Pn;
@@ -321,7 +671,9 @@
           }
         }
         PrevPageBoundary = PageBoundary;
-        RangeTracker.processNextPage(Counters.get(I, J) == BlocksPerPage);
+        const bool CanRelease =
+            PageMap.updateAsAllCountedIf(I, J, BlocksPerPage);
+        RangeTracker.processNextPage(CanRelease);
       }
     }
   }
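
The two loops above reduce to one predicate: a page may be handed to the release recorder once its free-block counter equals the number of blocks expected on a full page (the slow path merely recomputes that expectation per page when blocks straddle page boundaries). A standalone sketch of the fast-path predicate, assuming a same-block-count-per-page layout with made-up sizes:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using uptr = uintptr_t;

    // Returns the indices of pages whose free-block counter reached the number
    // of blocks a full page holds, i.e. pages with no block still in use.
    static std::vector<uptr>
    releasablePages(const std::vector<uptr> &FreeBlocksPerPage,
                    uptr FullPagesBlockCountMax) {
      std::vector<uptr> Releasable;
      for (uptr I = 0; I < FreeBlocksPerPage.size(); I++)
        if (FreeBlocksPerPage[I] == FullPagesBlockCountMax)
          Releasable.push_back(I);
      return Releasable;
    }

    int main() {
      // Hypothetical layout: 4 KiB pages and 512-byte blocks, so 8 blocks/page.
      const uptr FullPagesBlockCountMax = 8;
      const std::vector<uptr> Counters = {8, 5, 8, 0};
      for (uptr I : releasablePages(Counters, FullPagesBlockCountMax))
        printf("page %llu can be released\n", (unsigned long long)I); // 0, 2
      return 0;
    }
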
diff --git a/standalone/report.cpp b/standalone/report.cpp
index 561c7c5..16eae8c 100644
--- a/standalone/report.cpp
+++ b/standalone/report.cpp
@@ -36,6 +36,18 @@
 
 inline void NORETURN trap() { __builtin_trap(); }
 
+void NORETURN reportSoftRSSLimit(uptr RssLimitMb) {
+  ScopedErrorReport Report;
+  Report.append("Soft RSS limit of %zu MB exhausted, current RSS is %zu MB\n",
+                RssLimitMb, GetRSS() >> 20);
+}
+
+void NORETURN reportHardRSSLimit(uptr RssLimitMb) {
+  ScopedErrorReport Report;
+  Report.append("Hard RSS limit of %zu MB exhausted, current RSS is %zu MB\n",
+                RssLimitMb, GetRSS() >> 20);
+}
+
 // This could potentially be called recursively if a CHECK fails in the reports.
 void NORETURN reportCheckFailed(const char *File, int Line,
                                 const char *Condition, u64 Value1, u64 Value2) {
@@ -100,6 +112,11 @@
                 UserSize, TotalSize, MaxSize);
 }
 
+void NORETURN reportOutOfBatchClass() {
+  ScopedErrorReport Report;
+  Report.append("BatchClass region is used up, can't hold any free block\n");
+}
+
 void NORETURN reportOutOfMemory(uptr RequestedSize) {
   ScopedErrorReport Report;
   Report.append("out of memory trying to allocate %zu bytes\n", RequestedSize);
diff --git a/standalone/report.h b/standalone/report.h
index 14e4e79..3a78ab6 100644
--- a/standalone/report.h
+++ b/standalone/report.h
@@ -32,7 +32,10 @@
 void NORETURN reportAlignmentTooBig(uptr Alignment, uptr MaxAlignment);
 void NORETURN reportAllocationSizeTooBig(uptr UserSize, uptr TotalSize,
                                          uptr MaxSize);
+void NORETURN reportOutOfBatchClass();
 void NORETURN reportOutOfMemory(uptr RequestedSize);
+void NORETURN reportSoftRSSLimit(uptr RssLimitMb);
+void NORETURN reportHardRSSLimit(uptr RssLimitMb);
 enum class AllocatorAction : u8 {
   Recycling,
   Deallocating,
diff --git a/standalone/rss_limit_checker.cpp b/standalone/rss_limit_checker.cpp
new file mode 100644
index 0000000..f428386
--- /dev/null
+++ b/standalone/rss_limit_checker.cpp
@@ -0,0 +1,37 @@
+//===-- rss_limit_checker.cpp ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "rss_limit_checker.h"
+#include "atomic_helpers.h"
+#include "string_utils.h"
+
+namespace scudo {
+
+void RssLimitChecker::check(u64 NextCheck) {
+  // The interval for the checks is 250ms.
+  static constexpr u64 CheckInterval = 250 * 1000000;
+
+  // Early return in case another thread already did the calculation.
+  if (!atomic_compare_exchange_strong(&RssNextCheckAtNS, &NextCheck,
+                                      getMonotonicTime() + CheckInterval,
+                                      memory_order_relaxed)) {
+    return;
+  }
+
+  const uptr CurrentRssMb = GetRSS() >> 20;
+
+  RssLimitExceeded Result = RssLimitExceeded::Neither;
+  if (UNLIKELY(HardRssLimitMb && HardRssLimitMb < CurrentRssMb))
+    Result = RssLimitExceeded::Hard;
+  else if (UNLIKELY(SoftRssLimitMb && SoftRssLimitMb < CurrentRssMb))
+    Result = RssLimitExceeded::Soft;
+
+  atomic_store_relaxed(&RssLimitStatus, static_cast<u8>(Result));
+}
+
+} // namespace scudo
diff --git a/standalone/rss_limit_checker.h b/standalone/rss_limit_checker.h
new file mode 100644
index 0000000..29dc063
--- /dev/null
+++ b/standalone/rss_limit_checker.h
@@ -0,0 +1,63 @@
+//===-- rss_limit_checker.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_RSS_LIMIT_CHECKER_H_
+#define SCUDO_RSS_LIMIT_CHECKER_H_
+
+#include "atomic_helpers.h"
+#include "common.h"
+#include "internal_defs.h"
+
+namespace scudo {
+
+class RssLimitChecker {
+public:
+  enum RssLimitExceeded {
+    Neither,
+    Soft,
+    Hard,
+  };
+
+  void init(int SoftRssLimitMb, int HardRssLimitMb) {
+    CHECK_GE(SoftRssLimitMb, 0);
+    CHECK_GE(HardRssLimitMb, 0);
+    this->SoftRssLimitMb = static_cast<uptr>(SoftRssLimitMb);
+    this->HardRssLimitMb = static_cast<uptr>(HardRssLimitMb);
+  }
+
+  // Opportunistic RSS limit check. This will update the RSS limit status, if
+  // it can, every 250ms; otherwise it just returns the current status.
+  RssLimitExceeded getRssLimitExceeded() {
+    if (!HardRssLimitMb && !SoftRssLimitMb)
+      return RssLimitExceeded::Neither;
+
+    u64 NextCheck = atomic_load_relaxed(&RssNextCheckAtNS);
+    u64 Now = getMonotonicTime();
+
+    if (UNLIKELY(Now >= NextCheck))
+      check(NextCheck);
+
+    return static_cast<RssLimitExceeded>(atomic_load_relaxed(&RssLimitStatus));
+  }
+
+  uptr getSoftRssLimit() const { return SoftRssLimitMb; }
+  uptr getHardRssLimit() const { return HardRssLimitMb; }
+
+private:
+  void check(u64 NextCheck);
+
+  uptr SoftRssLimitMb = 0;
+  uptr HardRssLimitMb = 0;
+
+  atomic_u64 RssNextCheckAtNS = {};
+  atomic_u8 RssLimitStatus = {};
+};
+
+} // namespace scudo
+
+#endif // SCUDO_RSS_LIMIT_CHECKER_H_
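
For reference, the same opportunistic throttling pattern can be written with the standard library; this is a hedged sketch, not the scudo implementation, and `readCurrentRssMb()` is a placeholder for whatever RSS query the platform provides.

    #include <atomic>
    #include <chrono>
    #include <cstdint>

    // Placeholder for a platform RSS query (e.g. parsing /proc/self/statm);
    // returning 0 keeps the sketch self-contained.
    static uint64_t readCurrentRssMb() { return 0; }

    class ThrottledRssCheck {
    public:
      bool softLimitExceeded() {
        const uint64_t Now = nowNs();
        uint64_t Expected = NextCheckNs.load(std::memory_order_relaxed);
        if (Now >= Expected &&
            NextCheckNs.compare_exchange_strong(Expected, Now + IntervalNs,
                                                std::memory_order_relaxed)) {
          // Only the thread that wins the exchange pays for the RSS query;
          // every other caller just reads the cached status.
          Exceeded.store(readCurrentRssMb() > SoftLimitMb,
                         std::memory_order_relaxed);
        }
        return Exceeded.load(std::memory_order_relaxed);
      }

    private:
      static uint64_t nowNs() {
        using namespace std::chrono;
        return static_cast<uint64_t>(
            duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
                .count());
      }
      static constexpr uint64_t IntervalNs = 250ull * 1000 * 1000; // 250ms
      static constexpr uint64_t SoftLimitMb = 2048; // example limit only
      std::atomic<uint64_t> NextCheckNs{0};
      std::atomic<bool> Exceeded{false};
    };

    int main() {
      ThrottledRssCheck Check;
      return Check.softLimitExceeded() ? 1 : 0;
    }
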
diff --git a/standalone/secondary.h b/standalone/secondary.h
index 2d177576..94009f5 100644
--- a/standalone/secondary.h
+++ b/standalone/secondary.h
@@ -12,11 +12,13 @@
 #include "chunk.h"
 #include "common.h"
 #include "list.h"
+#include "mem_map.h"
 #include "memtag.h"
 #include "mutex.h"
 #include "options.h"
 #include "stats.h"
 #include "string_utils.h"
+#include "thread_annotations.h"
 
 namespace scudo {
 
@@ -36,9 +38,7 @@
   LargeBlock::Header *Next;
   uptr CommitBase;
   uptr CommitSize;
-  uptr MapBase;
-  uptr MapSize;
-  [[no_unique_address]] MapPlatformData Data;
+  MemMapT MemMap;
 };
 
 static_assert(sizeof(Header) % (1U << SCUDO_MIN_ALIGNMENT_LOG) == 0, "");
@@ -65,8 +65,11 @@
 } // namespace LargeBlock
 
 static void unmap(LargeBlock::Header *H) {
-  MapPlatformData Data = H->Data;
-  unmap(reinterpret_cast<void *>(H->MapBase), H->MapSize, UNMAP_ALL, &Data);
+  // Note that `H->MemMap` is stored on the pages it manages. Take over the
+  // ownership before unmap() so that nothing following the unmap() touches
+  // inaccessible pages.
+  MemMapT MemMap = H->MemMap;
+  MemMap.unmap(MemMap.getBase(), MemMap.getCapacity());
 }
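
The ownership note above is an instance of a general pattern: when the descriptor of a mapping lives inside the mapping itself, it has to be copied out before the pages are unmapped. A minimal POSIX sketch of that pattern (mmap/munmap here, not the scudo MemMapT API):

    #include <sys/mman.h>
    #include <cstddef>

    // The descriptor of the mapping is stored at the start of the mapping.
    struct MapHeader {
      void *Base;
      size_t Size;
    };

    static void destroyMapping(MapHeader *H) {
      // Copy the descriptor to the stack first: after munmap() the header
      // memory is gone, so reading H->Base or H->Size afterwards would be a
      // use-after-unmap.
      MapHeader Local = *H;
      munmap(Local.Base, Local.Size);
    }

    int main() {
      const size_t Size = 1 << 16;
      void *Base = mmap(nullptr, Size, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (Base == MAP_FAILED)
        return 1;
      MapHeader *H = static_cast<MapHeader *>(Base);
      H->Base = Base;
      H->Size = Size;
      destroyMapping(H);
      return 0;
    }
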
 
 class MapAllocatorNoCache {
@@ -96,20 +99,19 @@
 
 template <typename Config>
 void mapSecondary(Options Options, uptr CommitBase, uptr CommitSize,
-                  uptr AllocPos, uptr Flags, MapPlatformData *Data) {
+                  uptr AllocPos, uptr Flags, MemMapT &MemMap) {
   const uptr MaxUnusedCacheBytes = MaxUnusedCachePages * getPageSizeCached();
   if (useMemoryTagging<Config>(Options) && CommitSize > MaxUnusedCacheBytes) {
     const uptr UntaggedPos = Max(AllocPos, CommitBase + MaxUnusedCacheBytes);
-    map(reinterpret_cast<void *>(CommitBase), UntaggedPos - CommitBase,
-        "scudo:secondary", MAP_RESIZABLE | MAP_MEMTAG | Flags, Data);
-    map(reinterpret_cast<void *>(UntaggedPos),
-        CommitBase + CommitSize - UntaggedPos, "scudo:secondary",
-        MAP_RESIZABLE | Flags, Data);
+    MemMap.remap(CommitBase, UntaggedPos - CommitBase, "scudo:secondary",
+                 MAP_RESIZABLE | MAP_MEMTAG | Flags);
+    MemMap.remap(UntaggedPos, CommitBase + CommitSize - UntaggedPos,
+                 "scudo:secondary", MAP_RESIZABLE | Flags);
   } else {
-    map(reinterpret_cast<void *>(CommitBase), CommitSize, "scudo:secondary",
+    const uptr RemapFlags =
         MAP_RESIZABLE | (useMemoryTagging<Config>(Options) ? MAP_MEMTAG : 0) |
-            Flags,
-        Data);
+        Flags;
+    MemMap.remap(CommitBase, CommitSize, "scudo:secondary", RemapFlags);
   }
 }
 
@@ -133,7 +135,7 @@
                     Config::SecondaryCacheEntriesArraySize,
                 "");
 
-  void init(s32 ReleaseToOsInterval) {
+  void init(s32 ReleaseToOsInterval) NO_THREAD_SAFETY_ANALYSIS {
     DCHECK_EQ(EntriesCount, 0U);
     setOption(Option::MaxCacheEntriesCount,
               static_cast<sptr>(Config::SecondaryCacheDefaultMaxEntriesCount));
@@ -142,7 +144,7 @@
     setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
   }
 
-  void store(Options Options, LargeBlock::Header *H) {
+  void store(Options Options, LargeBlock::Header *H) EXCLUDES(Mutex) {
     if (!canCache(H->CommitSize))
       return unmap(H);
 
@@ -154,10 +156,8 @@
     CachedBlock Entry;
     Entry.CommitBase = H->CommitBase;
     Entry.CommitSize = H->CommitSize;
-    Entry.MapBase = H->MapBase;
-    Entry.MapSize = H->MapSize;
     Entry.BlockBegin = reinterpret_cast<uptr>(H + 1);
-    Entry.Data = H->Data;
+    Entry.MemMap = H->MemMap;
     Entry.Time = Time;
     if (useMemoryTagging<Config>(Options)) {
       if (Interval == 0 && !SCUDO_FUCHSIA) {
@@ -167,13 +167,13 @@
         // on top so we just do the two syscalls there.
         Entry.Time = 0;
         mapSecondary<Config>(Options, Entry.CommitBase, Entry.CommitSize,
-                             Entry.CommitBase, MAP_NOACCESS, &Entry.Data);
+                             Entry.CommitBase, MAP_NOACCESS, Entry.MemMap);
       } else {
-        setMemoryPermission(Entry.CommitBase, Entry.CommitSize, MAP_NOACCESS,
-                            &Entry.Data);
+        Entry.MemMap.setMemoryPermission(Entry.CommitBase, Entry.CommitSize,
+                                         MAP_NOACCESS);
       }
     } else if (Interval == 0) {
-      releasePagesToOS(Entry.CommitBase, 0, Entry.CommitSize, &Entry.Data);
+      Entry.MemMap.releasePagesToOS(Entry.CommitBase, Entry.CommitSize);
       Entry.Time = 0;
     }
     do {
@@ -222,12 +222,11 @@
     else if (Interval >= 0)
       releaseOlderThan(Time - static_cast<u64>(Interval) * 1000000);
     if (!EntryCached)
-      unmap(reinterpret_cast<void *>(Entry.MapBase), Entry.MapSize, UNMAP_ALL,
-            &Entry.Data);
+      Entry.MemMap.unmap(Entry.MemMap.getBase(), Entry.MemMap.getCapacity());
   }
 
   bool retrieve(Options Options, uptr Size, uptr Alignment,
-                LargeBlock::Header **H, bool *Zeroed) {
+                LargeBlock::Header **H, bool *Zeroed) EXCLUDES(Mutex) {
     const uptr PageSize = getPageSizeCached();
     const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount);
     bool Found = false;
@@ -243,45 +242,46 @@
           continue;
         const uptr CommitSize = Entries[I].CommitSize;
         const uptr AllocPos =
-            roundDownTo(CommitBase + CommitSize - Size, Alignment);
+            roundDown(CommitBase + CommitSize - Size, Alignment);
         HeaderPos =
             AllocPos - Chunk::getHeaderSize() - LargeBlock::getHeaderSize();
         if (HeaderPos > CommitBase + CommitSize)
           continue;
         if (HeaderPos < CommitBase ||
-            AllocPos > CommitBase + PageSize * MaxUnusedCachePages)
+            AllocPos > CommitBase + PageSize * MaxUnusedCachePages) {
           continue;
+        }
         Found = true;
         Entry = Entries[I];
         Entries[I].CommitBase = 0;
+        EntriesCount--;
         break;
       }
     }
-    if (Found) {
-      *H = reinterpret_cast<LargeBlock::Header *>(
-          LargeBlock::addHeaderTag<Config>(HeaderPos));
-      *Zeroed = Entry.Time == 0;
-      if (useMemoryTagging<Config>(Options))
-        setMemoryPermission(Entry.CommitBase, Entry.CommitSize, 0, &Entry.Data);
-      uptr NewBlockBegin = reinterpret_cast<uptr>(*H + 1);
-      if (useMemoryTagging<Config>(Options)) {
-        if (*Zeroed)
-          storeTags(LargeBlock::addHeaderTag<Config>(Entry.CommitBase),
-                    NewBlockBegin);
-        else if (Entry.BlockBegin < NewBlockBegin)
-          storeTags(Entry.BlockBegin, NewBlockBegin);
-        else
-          storeTags(untagPointer(NewBlockBegin),
-                    untagPointer(Entry.BlockBegin));
+    if (!Found)
+      return false;
+
+    *H = reinterpret_cast<LargeBlock::Header *>(
+        LargeBlock::addHeaderTag<Config>(HeaderPos));
+    *Zeroed = Entry.Time == 0;
+    if (useMemoryTagging<Config>(Options))
+      Entry.MemMap.setMemoryPermission(Entry.CommitBase, Entry.CommitSize, 0);
+    uptr NewBlockBegin = reinterpret_cast<uptr>(*H + 1);
+    if (useMemoryTagging<Config>(Options)) {
+      if (*Zeroed) {
+        storeTags(LargeBlock::addHeaderTag<Config>(Entry.CommitBase),
+                  NewBlockBegin);
+      } else if (Entry.BlockBegin < NewBlockBegin) {
+        storeTags(Entry.BlockBegin, NewBlockBegin);
+      } else {
+        storeTags(untagPointer(NewBlockBegin),
+                  untagPointer(Entry.BlockBegin));
       }
-      (*H)->CommitBase = Entry.CommitBase;
-      (*H)->CommitSize = Entry.CommitSize;
-      (*H)->MapBase = Entry.MapBase;
-      (*H)->MapSize = Entry.MapSize;
-      (*H)->Data = Entry.Data;
-      EntriesCount--;
     }
-    return Found;
+    (*H)->CommitBase = Entry.CommitBase;
+    (*H)->CommitSize = Entry.CommitSize;
+    (*H)->MemMap = Entry.MemMap;
+    return true;
   }
 
   bool canCache(uptr Size) {
@@ -315,67 +315,62 @@
 
   void releaseToOS() { releaseOlderThan(UINT64_MAX); }
 
-  void disableMemoryTagging() {
+  void disableMemoryTagging() EXCLUDES(Mutex) {
     ScopedLock L(Mutex);
     for (u32 I = 0; I != Config::SecondaryCacheQuarantineSize; ++I) {
       if (Quarantine[I].CommitBase) {
-        unmap(reinterpret_cast<void *>(Quarantine[I].MapBase),
-              Quarantine[I].MapSize, UNMAP_ALL, &Quarantine[I].Data);
+        MemMapT &MemMap = Quarantine[I].MemMap;
+        MemMap.unmap(MemMap.getBase(), MemMap.getCapacity());
         Quarantine[I].CommitBase = 0;
       }
     }
     const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount);
-    for (u32 I = 0; I < MaxCount; I++)
-      if (Entries[I].CommitBase)
-        setMemoryPermission(Entries[I].CommitBase, Entries[I].CommitSize, 0,
-                            &Entries[I].Data);
+    for (u32 I = 0; I < MaxCount; I++) {
+      if (Entries[I].CommitBase) {
+        Entries[I].MemMap.setMemoryPermission(Entries[I].CommitBase,
+                                              Entries[I].CommitSize, 0);
+      }
+    }
     QuarantinePos = -1U;
   }
 
-  void disable() { Mutex.lock(); }
+  void disable() NO_THREAD_SAFETY_ANALYSIS { Mutex.lock(); }
 
-  void enable() { Mutex.unlock(); }
+  void enable() NO_THREAD_SAFETY_ANALYSIS { Mutex.unlock(); }
 
   void unmapTestOnly() { empty(); }
 
 private:
   void empty() {
-    struct {
-      void *MapBase;
-      uptr MapSize;
-      MapPlatformData Data;
-    } MapInfo[Config::SecondaryCacheEntriesArraySize];
+    MemMapT MapInfo[Config::SecondaryCacheEntriesArraySize];
     uptr N = 0;
     {
       ScopedLock L(Mutex);
       for (uptr I = 0; I < Config::SecondaryCacheEntriesArraySize; I++) {
         if (!Entries[I].CommitBase)
           continue;
-        MapInfo[N].MapBase = reinterpret_cast<void *>(Entries[I].MapBase);
-        MapInfo[N].MapSize = Entries[I].MapSize;
-        MapInfo[N].Data = Entries[I].Data;
+        MapInfo[N] = Entries[I].MemMap;
         Entries[I].CommitBase = 0;
         N++;
       }
       EntriesCount = 0;
       IsFullEvents = 0;
     }
-    for (uptr I = 0; I < N; I++)
-      unmap(MapInfo[I].MapBase, MapInfo[I].MapSize, UNMAP_ALL,
-            &MapInfo[I].Data);
+    for (uptr I = 0; I < N; I++) {
+      MemMapT &MemMap = MapInfo[I];
+      MemMap.unmap(MemMap.getBase(), MemMap.getCapacity());
+    }
   }
 
   struct CachedBlock {
-    uptr CommitBase;
-    uptr CommitSize;
-    uptr MapBase;
-    uptr MapSize;
-    uptr BlockBegin;
-    [[no_unique_address]] MapPlatformData Data;
-    u64 Time;
+    uptr CommitBase = 0;
+    uptr CommitSize = 0;
+    uptr BlockBegin = 0;
+    MemMapT MemMap = {};
+    u64 Time = 0;
   };
 
-  void releaseIfOlderThan(CachedBlock &Entry, u64 Time) {
+  void releaseIfOlderThan(CachedBlock &Entry, u64 Time) REQUIRES(Mutex) {
     if (!Entry.CommitBase || !Entry.Time)
       return;
     if (Entry.Time > Time) {
@@ -383,11 +378,11 @@
         OldestTime = Entry.Time;
       return;
     }
-    releasePagesToOS(Entry.CommitBase, 0, Entry.CommitSize, &Entry.Data);
+    Entry.MemMap.releasePagesToOS(Entry.CommitBase, Entry.CommitSize);
     Entry.Time = 0;
   }
 
-  void releaseOlderThan(u64 Time) {
+  void releaseOlderThan(u64 Time) EXCLUDES(Mutex) {
     ScopedLock L(Mutex);
     if (!EntriesCount || OldestTime == 0 || OldestTime > Time)
       return;
@@ -399,22 +394,24 @@
   }
 
   HybridMutex Mutex;
-  u32 EntriesCount = 0;
-  u32 QuarantinePos = 0;
+  u32 EntriesCount GUARDED_BY(Mutex) = 0;
+  u32 QuarantinePos GUARDED_BY(Mutex) = 0;
   atomic_u32 MaxEntriesCount = {};
   atomic_uptr MaxEntrySize = {};
-  u64 OldestTime = 0;
-  u32 IsFullEvents = 0;
+  u64 OldestTime GUARDED_BY(Mutex) = 0;
+  u32 IsFullEvents GUARDED_BY(Mutex) = 0;
   atomic_s32 ReleaseToOsIntervalMs = {};
 
-  CachedBlock Entries[Config::SecondaryCacheEntriesArraySize] = {};
+  CachedBlock
+      Entries[Config::SecondaryCacheEntriesArraySize] GUARDED_BY(Mutex) = {};
   NonZeroLengthArray<CachedBlock, Config::SecondaryCacheQuarantineSize>
-      Quarantine = {};
+      Quarantine GUARDED_BY(Mutex) = {};
 };
 
 template <typename Config> class MapAllocator {
 public:
-  void init(GlobalStats *S, s32 ReleaseToOsInterval = -1) {
+  void init(GlobalStats *S,
+            s32 ReleaseToOsInterval = -1) NO_THREAD_SAFETY_ANALYSIS {
     DCHECK_EQ(AllocatedBytes, 0U);
     DCHECK_EQ(FreedBytes, 0U);
     Cache.init(ReleaseToOsInterval);
@@ -438,19 +435,21 @@
     return getBlockEnd(Ptr) - reinterpret_cast<uptr>(Ptr);
   }
 
-  void getStats(ScopedString *Str) const;
+  void getStats(ScopedString *Str);
 
-  void disable() {
+  void disable() NO_THREAD_SAFETY_ANALYSIS {
     Mutex.lock();
     Cache.disable();
   }
 
-  void enable() {
+  void enable() NO_THREAD_SAFETY_ANALYSIS {
     Cache.enable();
     Mutex.unlock();
   }
 
   template <typename F> void iterateOverBlocks(F Callback) const {
+    Mutex.assertHeld();
+
     for (const auto &H : InUseBlocks) {
       uptr Ptr = reinterpret_cast<uptr>(&H) + LargeBlock::getHeaderSize();
       if (allocatorSupportsMemoryTagging<Config>())
@@ -472,14 +471,14 @@
 private:
   typename Config::SecondaryCache Cache;
 
-  HybridMutex Mutex;
-  DoublyLinkedList<LargeBlock::Header> InUseBlocks;
-  uptr AllocatedBytes = 0;
-  uptr FreedBytes = 0;
-  uptr LargestSize = 0;
-  u32 NumberOfAllocs = 0;
-  u32 NumberOfFrees = 0;
-  LocalStats Stats;
+  mutable HybridMutex Mutex;
+  DoublyLinkedList<LargeBlock::Header> InUseBlocks GUARDED_BY(Mutex);
+  uptr AllocatedBytes GUARDED_BY(Mutex) = 0;
+  uptr FreedBytes GUARDED_BY(Mutex) = 0;
+  uptr LargestSize GUARDED_BY(Mutex) = 0;
+  u32 NumberOfAllocs GUARDED_BY(Mutex) = 0;
+  u32 NumberOfFrees GUARDED_BY(Mutex) = 0;
+  LocalStats Stats GUARDED_BY(Mutex);
 };
 
 // As with the Primary, the size passed to this function includes any desired
@@ -502,9 +501,9 @@
   Alignment = Max(Alignment, uptr(1U) << SCUDO_MIN_ALIGNMENT_LOG);
   const uptr PageSize = getPageSizeCached();
   uptr RoundedSize =
-      roundUpTo(roundUpTo(Size, Alignment) + LargeBlock::getHeaderSize() +
-                    Chunk::getHeaderSize(),
-                PageSize);
+      roundUp(roundUp(Size, Alignment) + LargeBlock::getHeaderSize() +
+                  Chunk::getHeaderSize(),
+              PageSize);
   if (Alignment > PageSize)
     RoundedSize += Alignment - PageSize;
 
@@ -523,23 +522,26 @@
       if (FillContents && !Zeroed)
         memset(Ptr, FillContents == ZeroFill ? 0 : PatternFillByte,
                BlockEnd - PtrInt);
-      const uptr BlockSize = BlockEnd - HInt;
       {
         ScopedLock L(Mutex);
         InUseBlocks.push_back(H);
-        AllocatedBytes += BlockSize;
+        AllocatedBytes += H->CommitSize;
         NumberOfAllocs++;
-        Stats.add(StatAllocated, BlockSize);
-        Stats.add(StatMapped, H->MapSize);
+        Stats.add(StatAllocated, H->CommitSize);
+        Stats.add(StatMapped, H->MemMap.getCapacity());
       }
       return Ptr;
     }
   }
 
-  MapPlatformData Data = {};
+  ReservedMemoryT ReservedMemory;
   const uptr MapSize = RoundedSize + 2 * PageSize;
-  uptr MapBase = reinterpret_cast<uptr>(
-      map(nullptr, MapSize, nullptr, MAP_NOACCESS | MAP_ALLOWNOMEM, &Data));
+  ReservedMemory.create(/*Addr=*/0U, MapSize, nullptr, MAP_ALLOWNOMEM);
+
+  // Take entire ownership of the reserved region.
+  MemMapT MemMap = ReservedMemory.dispatch(ReservedMemory.getBase(),
+                                           ReservedMemory.getCapacity());
+  uptr MapBase = MemMap.getBase();
   if (UNLIKELY(!MapBase))
     return nullptr;
   uptr CommitBase = MapBase + PageSize;
@@ -551,27 +553,27 @@
     // For alignments greater than or equal to a page, the user pointer (e.g.
     // the pointer that is returned by the C or C++ allocation APIs) ends up on
     // a page boundary, and our headers will live in the preceding page.
-    CommitBase = roundUpTo(MapBase + PageSize + 1, Alignment) - PageSize;
+    CommitBase = roundUp(MapBase + PageSize + 1, Alignment) - PageSize;
     const uptr NewMapBase = CommitBase - PageSize;
     DCHECK_GE(NewMapBase, MapBase);
     // We only trim the extra memory on 32-bit platforms: 64-bit platforms
     // are less constrained memory wise, and that saves us two syscalls.
     if (SCUDO_WORDSIZE == 32U && NewMapBase != MapBase) {
-      unmap(reinterpret_cast<void *>(MapBase), NewMapBase - MapBase, 0, &Data);
+      MemMap.unmap(MapBase, NewMapBase - MapBase);
       MapBase = NewMapBase;
     }
     const uptr NewMapEnd =
-        CommitBase + PageSize + roundUpTo(Size, PageSize) + PageSize;
+        CommitBase + PageSize + roundUp(Size, PageSize) + PageSize;
     DCHECK_LE(NewMapEnd, MapEnd);
     if (SCUDO_WORDSIZE == 32U && NewMapEnd != MapEnd) {
-      unmap(reinterpret_cast<void *>(NewMapEnd), MapEnd - NewMapEnd, 0, &Data);
+      MemMap.unmap(NewMapEnd, MapEnd - NewMapEnd);
       MapEnd = NewMapEnd;
     }
   }
 
   const uptr CommitSize = MapEnd - PageSize - CommitBase;
-  const uptr AllocPos = roundDownTo(CommitBase + CommitSize - Size, Alignment);
-  mapSecondary<Config>(Options, CommitBase, CommitSize, AllocPos, 0, &Data);
+  const uptr AllocPos = roundDown(CommitBase + CommitSize - Size, Alignment);
+  mapSecondary<Config>(Options, CommitBase, CommitSize, AllocPos, 0, MemMap);
   const uptr HeaderPos =
       AllocPos - Chunk::getHeaderSize() - LargeBlock::getHeaderSize();
   LargeBlock::Header *H = reinterpret_cast<LargeBlock::Header *>(
@@ -579,11 +581,9 @@
   if (useMemoryTagging<Config>(Options))
     storeTags(LargeBlock::addHeaderTag<Config>(CommitBase),
               reinterpret_cast<uptr>(H + 1));
-  H->MapBase = MapBase;
-  H->MapSize = MapEnd - MapBase;
   H->CommitBase = CommitBase;
   H->CommitSize = CommitSize;
-  H->Data = Data;
+  H->MemMap = MemMap;
   if (BlockEndPtr)
     *BlockEndPtr = CommitBase + CommitSize;
   {
@@ -594,13 +594,14 @@
       LargestSize = CommitSize;
     NumberOfAllocs++;
     Stats.add(StatAllocated, CommitSize);
-    Stats.add(StatMapped, H->MapSize);
+    Stats.add(StatMapped, H->MemMap.getCapacity());
   }
   return reinterpret_cast<void *>(HeaderPos + LargeBlock::getHeaderSize());
 }
 
 template <typename Config>
-void MapAllocator<Config>::deallocate(Options Options, void *Ptr) {
+void MapAllocator<Config>::deallocate(Options Options, void *Ptr)
+    EXCLUDES(Mutex) {
   LargeBlock::Header *H = LargeBlock::getHeader<Config>(Ptr);
   const uptr CommitSize = H->CommitSize;
   {
@@ -609,13 +610,14 @@
     FreedBytes += CommitSize;
     NumberOfFrees++;
     Stats.sub(StatAllocated, CommitSize);
-    Stats.sub(StatMapped, H->MapSize);
+    Stats.sub(StatMapped, H->MemMap.getCapacity());
   }
   Cache.store(Options, H);
 }
 
 template <typename Config>
-void MapAllocator<Config>::getStats(ScopedString *Str) const {
+void MapAllocator<Config>::getStats(ScopedString *Str) EXCLUDES(Mutex) {
+  ScopedLock L(Mutex);
   Str->append("Stats: MapAllocator: allocated %u times (%zuK), freed %u times "
               "(%zuK), remains %u (%zuK) max %zuM\n",
               NumberOfAllocs, AllocatedBytes >> 10, NumberOfFrees,
diff --git a/standalone/size_class_map.h b/standalone/size_class_map.h
index 6b06095..7665624 100644
--- a/standalone/size_class_map.h
+++ b/standalone/size_class_map.h
@@ -23,7 +23,7 @@
 }
 
 template <typename Config> struct SizeClassMapBase {
-  static u32 getMaxCachedHint(uptr Size) {
+  static u16 getMaxCachedHint(uptr Size) {
     DCHECK_NE(Size, 0);
     u32 N;
     // Force a 32-bit division if the template parameters allow for it.
@@ -31,7 +31,10 @@
       N = static_cast<u32>((1UL << Config::MaxBytesCachedLog) / Size);
     else
       N = (1U << Config::MaxBytesCachedLog) / static_cast<u32>(Size);
-    return Max(1U, Min(Config::MaxNumCachedHint, N));
+
+    // Note that Config::MaxNumCachedHint is u16 so the result is guaranteed to
+    // fit in u16.
+    return static_cast<u16>(Max(1U, Min<u32>(Config::MaxNumCachedHint, N)));
   }
 };
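
As a quick sanity check of the clamping above, here is a hedged standalone version of the hint computation with made-up parameters (a 1 KiB cache budget and a hint cap of 14); it is not the scudo template, just the same arithmetic.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Same clamping as above, with made-up parameters: a 1 KiB cache budget
    // (MaxBytesCachedLog = 10) and MaxNumCachedHint = 14.
    static uint16_t maxCachedHint(uint32_t Size) {
      const uint32_t MaxBytesCached = 1u << 10;
      const uint16_t MaxNumCachedHint = 14;
      const uint32_t N = MaxBytesCached / Size;
      // Cache at least one block, at most MaxNumCachedHint; the result always
      // fits in u16 because MaxNumCachedHint is u16.
      return static_cast<uint16_t>(
          std::max<uint32_t>(1, std::min<uint32_t>(MaxNumCachedHint, N)));
    }

    int main() {
      printf("%u\n", (unsigned)maxCachedHint(176));  // 1024 / 176 = 5
      printf("%u\n", (unsigned)maxCachedHint(32));   // 32 blocks, clamped to 14
      printf("%u\n", (unsigned)maxCachedHint(4096)); // 0 blocks, clamped up to 1
      return 0;
    }
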
 
@@ -65,7 +68,7 @@
   static const uptr M = (1UL << S) - 1;
 
 public:
-  static const u32 MaxNumCachedHint = Config::MaxNumCachedHint;
+  static const u16 MaxNumCachedHint = Config::MaxNumCachedHint;
 
   static const uptr MaxSize = (1UL << Config::MaxSizeLog) + Config::SizeDelta;
   static const uptr NumClasses =
@@ -99,7 +102,7 @@
     return MidClass + 1 + scaledLog2(Size - 1, Config::MidSizeLog, S);
   }
 
-  static u32 getMaxCachedHint(uptr Size) {
+  static u16 getMaxCachedHint(uptr Size) {
     DCHECK_LE(Size, MaxSize);
     return Base::getMaxCachedHint(Size);
   }
@@ -178,7 +181,7 @@
   static constexpr LSBTable LTable = {};
 
 public:
-  static const u32 MaxNumCachedHint = Config::MaxNumCachedHint;
+  static const u16 MaxNumCachedHint = Config::MaxNumCachedHint;
 
   static const uptr NumClasses = ClassesSize + 1;
   static_assert(NumClasses < 256, "");
@@ -212,7 +215,7 @@
     return SzTable.Tab[scaledLog2(Size - 1, Config::MidSizeLog, S)];
   }
 
-  static u32 getMaxCachedHint(uptr Size) {
+  static u16 getMaxCachedHint(uptr Size) {
     DCHECK_LE(Size, MaxSize);
     return Base::getMaxCachedHint(Size);
   }
@@ -223,7 +226,7 @@
   static const uptr MinSizeLog = 5;
   static const uptr MidSizeLog = 8;
   static const uptr MaxSizeLog = 17;
-  static const u32 MaxNumCachedHint = 14;
+  static const u16 MaxNumCachedHint = 14;
   static const uptr MaxBytesCachedLog = 10;
   static const uptr SizeDelta = 0;
 };
@@ -235,7 +238,7 @@
   static const uptr MinSizeLog = 5;
   static const uptr MidSizeLog = 8;
   static const uptr MaxSizeLog = 17;
-  static const u32 MaxNumCachedHint = 10;
+  static const u16 MaxNumCachedHint = 12;
   static const uptr MaxBytesCachedLog = 10;
   static const uptr SizeDelta = Chunk::getHeaderSize();
 };
@@ -248,7 +251,7 @@
   static const uptr MinSizeLog = 4;
   static const uptr MidSizeLog = 6;
   static const uptr MaxSizeLog = 16;
-  static const u32 MaxNumCachedHint = 13;
+  static const u16 MaxNumCachedHint = 13;
   static const uptr MaxBytesCachedLog = 13;
 
   static constexpr u32 Classes[] = {
@@ -263,7 +266,7 @@
   static const uptr MinSizeLog = 4;
   static const uptr MidSizeLog = 7;
   static const uptr MaxSizeLog = 16;
-  static const u32 MaxNumCachedHint = 14;
+  static const u16 MaxNumCachedHint = 14;
   static const uptr MaxBytesCachedLog = 13;
 
   static constexpr u32 Classes[] = {
@@ -292,7 +295,7 @@
   static const uptr MinSizeLog = 4;
   static const uptr MidSizeLog = 8;
   static const uptr MaxSizeLog = 14;
-  static const u32 MaxNumCachedHint = 13;
+  static const u16 MaxNumCachedHint = 13;
   static const uptr MaxBytesCachedLog = 10;
   static const uptr SizeDelta = Chunk::getHeaderSize();
 #else
@@ -300,7 +303,7 @@
   static const uptr MinSizeLog = 3;
   static const uptr MidSizeLog = 7;
   static const uptr MaxSizeLog = 14;
-  static const u32 MaxNumCachedHint = 14;
+  static const u16 MaxNumCachedHint = 14;
   static const uptr MaxBytesCachedLog = 10;
   static const uptr SizeDelta = Chunk::getHeaderSize();
 #endif
@@ -315,7 +318,7 @@
   static const uptr MinSizeLog = 7;
   static const uptr MidSizeLog = 7;
   static const uptr MaxSizeLog = 7;
-  static const u32 MaxNumCachedHint = 8;
+  static const u16 MaxNumCachedHint = 12;
   static const uptr MaxBytesCachedLog = 10;
   static const uptr SizeDelta = 0;
 };
diff --git a/standalone/stats.h b/standalone/stats.h
index be5bf2d..658b758 100644
--- a/standalone/stats.h
+++ b/standalone/stats.h
@@ -12,6 +12,7 @@
 #include "atomic_helpers.h"
 #include "list.h"
 #include "mutex.h"
+#include "thread_annotations.h"
 
 #include <string.h>
 
@@ -60,19 +61,19 @@
 public:
   void init() { LocalStats::init(); }
 
-  void link(LocalStats *S) {
+  void link(LocalStats *S) EXCLUDES(Mutex) {
     ScopedLock L(Mutex);
     StatsList.push_back(S);
   }
 
-  void unlink(LocalStats *S) {
+  void unlink(LocalStats *S) EXCLUDES(Mutex) {
     ScopedLock L(Mutex);
     StatsList.remove(S);
     for (uptr I = 0; I < StatCount; I++)
       add(static_cast<StatType>(I), S->get(static_cast<StatType>(I)));
   }
 
-  void get(uptr *S) const {
+  void get(uptr *S) const EXCLUDES(Mutex) {
     ScopedLock L(Mutex);
     for (uptr I = 0; I < StatCount; I++)
       S[I] = LocalStats::get(static_cast<StatType>(I));
@@ -85,15 +86,15 @@
       S[I] = static_cast<sptr>(S[I]) >= 0 ? S[I] : 0;
   }
 
-  void lock() { Mutex.lock(); }
-  void unlock() { Mutex.unlock(); }
+  void lock() ACQUIRE(Mutex) { Mutex.lock(); }
+  void unlock() RELEASE(Mutex) { Mutex.unlock(); }
 
-  void disable() { lock(); }
-  void enable() { unlock(); }
+  void disable() ACQUIRE(Mutex) { lock(); }
+  void enable() RELEASE(Mutex) { unlock(); }
 
 private:
   mutable HybridMutex Mutex;
-  DoublyLinkedList<LocalStats> StatsList;
+  DoublyLinkedList<LocalStats> StatsList GUARDED_BY(Mutex);
 };
 
 } // namespace scudo
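
The GUARDED_BY/EXCLUDES/ACQUIRE/RELEASE annotations added throughout this patch come from Clang's thread-safety analysis (enabled with -Werror=thread-safety). Below is a minimal, self-contained sketch of how such annotations are wired up; the macro names are local stand-ins, not scudo's thread_annotations.h.

    #include <mutex>

    // Portable wrappers: on non-Clang compilers the annotations expand to
    // nothing, so the sketch still compiles.
    #if defined(__clang__)
    #define THREAD_ANNOTATION(x) __attribute__((x))
    #else
    #define THREAD_ANNOTATION(x)
    #endif
    #define CAPABILITY(x) THREAD_ANNOTATION(capability(x))
    #define GUARDED_BY(x) THREAD_ANNOTATION(guarded_by(x))
    #define ACQUIRE(...) THREAD_ANNOTATION(acquire_capability(__VA_ARGS__))
    #define RELEASE(...) THREAD_ANNOTATION(release_capability(__VA_ARGS__))

    class CAPABILITY("mutex") Mutex {
    public:
      void lock() ACQUIRE() { M.lock(); }
      void unlock() RELEASE() { M.unlock(); }

    private:
      std::mutex M;
    };

    class Counter {
    public:
      void increment() {
        Mu.lock();
        Value++; // OK: the analysis sees that `Mu` is held here.
        Mu.unlock();
      }
      // A method reading `Value` without holding `Mu` would be flagged when
      // building with -Wthread-safety (and fail under -Werror=thread-safety).

    private:
      Mutex Mu;
      int Value GUARDED_BY(Mu) = 0;
    };

    int main() {
      Counter C;
      C.increment();
      return 0;
    }
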
diff --git a/standalone/string_utils.cpp b/standalone/string_utils.cpp
index 13fdb9c..7e516f9 100644
--- a/standalone/string_utils.cpp
+++ b/standalone/string_utils.cpp
@@ -195,6 +195,28 @@
           appendChar(&Buffer, BufferEnd, static_cast<char>(va_arg(Args, int)));
       break;
     }
+    // In Scudo, `s64`/`u64` are supposed to use `lld` and `llu` respectively.
+    // However, `-Wformat` doesn't know we have a different parser for those
+    // placeholders and keeps complaining about a type mismatch on 64-bit
+    // platforms, which use `ld`/`lu` for `s64`/`u64`. Therefore, to silence
+    // the warning, we use `PRId64`/`PRIu64` when printing `s64`/`u64` and
+    // handle the resulting `ld`/`lu` placeholders here.
+    case 'l': {
+      ++Cur;
+      RAW_CHECK(*Cur == 'd' || *Cur == 'u');
+
+      if (*Cur == 'd') {
+        DVal = va_arg(Args, s64);
+        Res +=
+            appendSignedDecimal(&Buffer, BufferEnd, DVal, Width, PadWithZero);
+      } else {
+        UVal = va_arg(Args, u64);
+        Res += appendUnsigned(&Buffer, BufferEnd, UVal, 10, Width, PadWithZero,
+                              false);
+      }
+
+      break;
+    }
     case '%': {
       RAW_CHECK_MSG(!HaveFlags, PrintfFormatsHelp);
       Res += appendChar(&Buffer, BufferEnd, '%');
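
A toy illustration of the dispatch added above: a tiny varargs helper that accepts `%ld`/`%lu` but always reads a 64-bit argument, mirroring the convention that callers pass `s64`/`u64` for those placeholders. This is only a sketch, not the Scudo formatter.

    #include <cstdarg>
    #include <cstdint>
    #include <cstdio>

    // Reads the value for a "%ld"/"%lu" placeholder as a 64-bit integer,
    // regardless of how wide `long` is on the target.
    static void printOne(const char *Spec, ...) {
      va_list Args;
      va_start(Args, Spec);
      if (Spec[0] == '%' && Spec[1] == 'l') {
        if (Spec[2] == 'd') {
          const int64_t V = va_arg(Args, int64_t);
          printf("%lld\n", (long long)V);
        } else if (Spec[2] == 'u') {
          const uint64_t V = va_arg(Args, uint64_t);
          printf("%llu\n", (unsigned long long)V);
        }
      }
      va_end(Args);
    }

    int main() {
      // Callers must pass a 64-bit argument for these placeholders.
      printOne("%ld", (int64_t)-42);
      printOne("%lu", (uint64_t)42);
      return 0;
    }
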
diff --git a/standalone/string_utils.h b/standalone/string_utils.h
index dd6ff78..4190119 100644
--- a/standalone/string_utils.h
+++ b/standalone/string_utils.h
@@ -28,6 +28,7 @@
   void append(const char *Format, va_list Args);
   void append(const char *Format, ...) FORMAT(2, 3);
   void output() const { outputRaw(String.data()); }
+  void reserve(size_t Size) { String.reserve(Size + 1); }
 
 private:
   Vector<char> String;
diff --git a/standalone/tests/combined_test.cpp b/standalone/tests/combined_test.cpp
index 94d97df..44ba639 100644
--- a/standalone/tests/combined_test.cpp
+++ b/standalone/tests/combined_test.cpp
@@ -10,7 +10,9 @@
 #include "tests/scudo_unit_test.h"
 
 #include "allocator_config.h"
+#include "chunk.h"
 #include "combined.h"
+#include "mem_map.h"
 
 #include <condition_variable>
 #include <memory>
@@ -38,7 +40,7 @@
   if (Alignment < MinAlignment)
     Alignment = MinAlignment;
   const scudo::uptr NeededSize =
-      scudo::roundUpTo(Size, MinAlignment) +
+      scudo::roundUp(Size, MinAlignment) +
       ((Alignment > MinAlignment) ? Alignment : scudo::Chunk::getHeaderSize());
   return AllocatorT::PrimaryT::canAllocate(NeededSize);
 }
@@ -47,7 +49,7 @@
 void checkMemoryTaggingMaybe(AllocatorT *Allocator, void *P, scudo::uptr Size,
                              scudo::uptr Alignment) {
   const scudo::uptr MinAlignment = 1UL << SCUDO_MIN_ALIGNMENT_LOG;
-  Size = scudo::roundUpTo(Size, MinAlignment);
+  Size = scudo::roundUp(Size, MinAlignment);
   if (Allocator->useMemoryTaggingTestOnly())
     EXPECT_DEATH(
         {
@@ -91,7 +93,7 @@
     Allocator = std::make_unique<AllocatorT>();
   }
   ~ScudoCombinedTest() {
-    Allocator->releaseToOS();
+    Allocator->releaseToOS(scudo::ReleaseToOS::Force);
     UseQuarantine = true;
   }
 
@@ -152,7 +154,7 @@
   for (scudo::uptr AlignLog = MinAlignLog; AlignLog <= 16U; AlignLog++) {
     const scudo::uptr Align = 1U << AlignLog;
     for (scudo::sptr Delta = -32; Delta <= 32; Delta++) {
-      if (static_cast<scudo::sptr>(1U << SizeLog) + Delta <= 0)
+      if (static_cast<scudo::sptr>(1U << SizeLog) + Delta < 0)
         continue;
       const scudo::uptr Size = (1U << SizeLog) + Delta;
       void *P = Allocator->allocate(Size, Origin, Align);
@@ -165,6 +167,8 @@
       Allocator->deallocate(P, Origin, Size);
     }
   }
+
+  Allocator->printStats();
 }
 
 #define SCUDO_MAKE_BASIC_TEST(SizeLog)                                         \
@@ -411,7 +415,7 @@
     reinterpret_cast<char *>(P)[2048] = 0xaa;
     Allocator->deallocate(P, Origin);
 
-    Allocator->releaseToOS();
+    Allocator->releaseToOS(scudo::ReleaseToOS::Force);
   }
 }
 
@@ -434,7 +438,7 @@
   EXPECT_NE(Stats.find("Stats: Quarantine"), std::string::npos);
 }
 
-SCUDO_TYPED_TEST(ScudoCombinedTest, CacheDrain) {
+SCUDO_TYPED_TEST(ScudoCombinedTest, CacheDrain) NO_THREAD_SAFETY_ANALYSIS {
   auto *Allocator = this->Allocator.get();
 
   std::vector<void *> V;
@@ -446,9 +450,31 @@
 
   bool UnlockRequired;
   auto *TSD = Allocator->getTSDRegistry()->getTSDAndLock(&UnlockRequired);
-  EXPECT_TRUE(!TSD->Cache.isEmpty());
-  TSD->Cache.drain();
-  EXPECT_TRUE(TSD->Cache.isEmpty());
+  EXPECT_TRUE(!TSD->getCache().isEmpty());
+  TSD->getCache().drain();
+  EXPECT_TRUE(TSD->getCache().isEmpty());
+  if (UnlockRequired)
+    TSD->unlock();
+}
+
+SCUDO_TYPED_TEST(ScudoCombinedTest, ForceCacheDrain) NO_THREAD_SAFETY_ANALYSIS {
+  auto *Allocator = this->Allocator.get();
+
+  std::vector<void *> V;
+  for (scudo::uptr I = 0; I < 64U; I++)
+    V.push_back(Allocator->allocate(
+        rand() % (TypeParam::Primary::SizeClassMap::MaxSize / 2U), Origin));
+  for (auto P : V)
+    Allocator->deallocate(P, Origin);
+
+  // `ForceAll` will also drain the caches.
+  Allocator->releaseToOS(scudo::ReleaseToOS::ForceAll);
+
+  bool UnlockRequired;
+  auto *TSD = Allocator->getTSDRegistry()->getTSDAndLock(&UnlockRequired);
+  EXPECT_TRUE(TSD->getCache().isEmpty());
+  EXPECT_EQ(TSD->getQuarantineCache().getSize(), 0U);
+  EXPECT_TRUE(Allocator->getQuarantine()->isEmpty());
   if (UnlockRequired)
     TSD->unlock();
 }
@@ -487,18 +513,19 @@
   }
   for (auto &T : Threads)
     T.join();
-  Allocator->releaseToOS();
+  Allocator->releaseToOS(scudo::ReleaseToOS::Force);
 }
 
 // Test that multiple instantiations of the allocator have not messed up the
 // process's signal handlers (GWP-ASan used to do this).
 TEST(ScudoCombinedDeathTest, SKIP_ON_FUCHSIA(testSEGV)) {
   const scudo::uptr Size = 4 * scudo::getPageSizeCached();
-  scudo::MapPlatformData Data = {};
-  void *P = scudo::map(nullptr, Size, "testSEGV", MAP_NOACCESS, &Data);
-  EXPECT_NE(P, nullptr);
+  scudo::ReservedMemoryT ReservedMemory;
+  ASSERT_TRUE(ReservedMemory.create(/*Addr=*/0U, Size, "testSEGV"));
+  void *P = reinterpret_cast<void *>(ReservedMemory.getBase());
+  ASSERT_NE(P, nullptr);
   EXPECT_DEATH(memset(P, 0xaa, Size), "");
-  scudo::unmap(P, Size, UNMAP_ALL, &Data);
+  ReservedMemory.release();
 }
 
 struct DeathSizeClassConfig {
@@ -506,12 +533,12 @@
   static const scudo::uptr MinSizeLog = 10;
   static const scudo::uptr MidSizeLog = 10;
   static const scudo::uptr MaxSizeLog = 13;
-  static const scudo::u32 MaxNumCachedHint = 4;
+  static const scudo::u16 MaxNumCachedHint = 8;
   static const scudo::uptr MaxBytesCachedLog = 12;
   static const scudo::uptr SizeDelta = 0;
 };
 
-static const scudo::uptr DeathRegionSizeLog = 20U;
+static const scudo::uptr DeathRegionSizeLog = 21U;
 struct DeathConfig {
   static const bool MaySupportMemoryTagging = false;
 
@@ -525,6 +552,7 @@
   static const scudo::uptr PrimaryCompactPtrScale = 0;
   static const bool PrimaryEnableRandomOffset = true;
   static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
+  static const scudo::uptr PrimaryGroupSizeLog = 18;
 
   typedef scudo::MapAllocatorNoCache SecondaryCache;
   template <class A> using TSDRegistryT = scudo::TSDRegistrySharedT<A, 1U, 1U>;
@@ -599,7 +627,7 @@
 // operation without issue.
 SCUDO_TYPED_TEST(ScudoCombinedTest, ReleaseToOS) {
   auto *Allocator = this->Allocator.get();
-  Allocator->releaseToOS();
+  Allocator->releaseToOS(scudo::ReleaseToOS::Force);
 }
 
 SCUDO_TYPED_TEST(ScudoCombinedTest, OddEven) {
@@ -699,3 +727,85 @@
       Allocator->deallocate(Ptrs[i], Origin);
   }
 }
+
+SCUDO_TYPED_TEST(ScudoCombinedTest, RingBufferSize) {
+  auto *Allocator = this->Allocator.get();
+  auto Size = Allocator->getRingBufferSize();
+  if (Size > 0)
+    EXPECT_EQ(Allocator->getRingBufferAddress()[Size - 1], '\0');
+}
+
+SCUDO_TYPED_TEST(ScudoCombinedTest, RingBufferAddress) {
+  auto *Allocator = this->Allocator.get();
+  auto *Addr = Allocator->getRingBufferAddress();
+  EXPECT_NE(Addr, nullptr);
+  EXPECT_EQ(Addr, Allocator->getRingBufferAddress());
+}
+
+#if SCUDO_CAN_USE_PRIMARY64
+#if SCUDO_TRUSTY
+
+// TrustyConfig is designed for a domain-specific allocator. Add a basic test
+// which covers only simple operations and ensure the configuration is able to
+// compile.
+TEST(ScudoCombinedTest, BasicTrustyConfig) {
+  using AllocatorT = scudo::Allocator<scudo::TrustyConfig>;
+  auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
+
+  for (scudo::uptr ClassId = 1U;
+       ClassId <= scudo::TrustyConfig::SizeClassMap::LargestClassId;
+       ClassId++) {
+    const scudo::uptr Size =
+        scudo::TrustyConfig::SizeClassMap::getSizeByClassId(ClassId);
+    void *p = Allocator->allocate(Size - scudo::Chunk::getHeaderSize(), Origin);
+    ASSERT_NE(p, nullptr);
+    free(p);
+  }
+
+  bool UnlockRequired;
+  auto *TSD = Allocator->getTSDRegistry()->getTSDAndLock(&UnlockRequired);
+  TSD->getCache().drain();
+
+  Allocator->releaseToOS(scudo::ReleaseToOS::Force);
+}
+
+#endif
+#endif
+
+#if SCUDO_LINUX
+
+SCUDO_TYPED_TEST(ScudoCombinedTest, SoftRssLimit) {
+  auto *Allocator = this->Allocator.get();
+  Allocator->setRssLimitsTestOnly(1, 0, true);
+
+  size_t Megabyte = 1024 * 1024;
+  size_t ChunkSize = 16;
+  size_t Error = 256;
+
+  std::vector<void *> Ptrs;
+  for (size_t index = 0; index < Megabyte + Error; index += ChunkSize) {
+    void *Ptr = Allocator->allocate(ChunkSize, Origin);
+    Ptrs.push_back(Ptr);
+  }
+
+  EXPECT_EQ(nullptr, Allocator->allocate(ChunkSize, Origin));
+
+  for (void *Ptr : Ptrs)
+    Allocator->deallocate(Ptr, Origin);
+}
+
+SCUDO_TYPED_TEST(ScudoCombinedTest, HardRssLimit) {
+  auto *Allocator = this->Allocator.get();
+  Allocator->setRssLimitsTestOnly(0, 1, false);
+
+  size_t Megabyte = 1024 * 1024;
+
+  EXPECT_DEATH(
+      {
+        disableDebuggerdMaybe();
+        Allocator->allocate(Megabyte, Origin);
+      },
+      "");
+}
+
+#endif
diff --git a/standalone/tests/common_test.cpp b/standalone/tests/common_test.cpp
index 711e3b2..b1e55e8 100644
--- a/standalone/tests/common_test.cpp
+++ b/standalone/tests/common_test.cpp
@@ -10,6 +10,7 @@
 #include "tests/scudo_unit_test.h"
 
 #include "common.h"
+#include "mem_map.h"
 #include <algorithm>
 #include <fstream>
 
@@ -34,39 +35,64 @@
   const uptr Size = 1ull << 30;
   const uptr Threshold = Size >> 3;
 
-  MapPlatformData Data = {};
-  void *P = map(nullptr, Size, "ResidentMemorySize", 0, &Data);
-  ASSERT_NE(nullptr, P);
+  MemMapT MemMap;
+  ASSERT_TRUE(MemMap.map(/*Addr=*/0U, Size, "ResidentMemorySize"));
+  ASSERT_NE(MemMap.getBase(), 0U);
+  void *P = reinterpret_cast<void *>(MemMap.getBase());
   EXPECT_LT(getResidentMemorySize(), OnStart + Threshold);
 
   memset(P, 1, Size);
   EXPECT_GT(getResidentMemorySize(), OnStart + Size - Threshold);
 
-  releasePagesToOS((uptr)P, 0, Size, &Data);
+  MemMap.releasePagesToOS(MemMap.getBase(), Size);
   EXPECT_LT(getResidentMemorySize(), OnStart + Threshold);
 
   memset(P, 1, Size);
   EXPECT_GT(getResidentMemorySize(), OnStart + Size - Threshold);
 
-  unmap(P, Size, 0, &Data);
+  MemMap.unmap(MemMap.getBase(), Size);
 }
 
 TEST(ScudoCommonTest, Zeros) {
   const uptr Size = 1ull << 20;
 
-  MapPlatformData Data = {};
-  uptr *P = reinterpret_cast<uptr *>(map(nullptr, Size, "Zeros", 0, &Data));
-  const ptrdiff_t N = Size / sizeof(*P);
-  ASSERT_NE(nullptr, P);
+  MemMapT MemMap;
+  ASSERT_TRUE(MemMap.map(/*Addr=*/0U, Size, "Zeros"));
+  ASSERT_NE(MemMap.getBase(), 0U);
+  uptr *P = reinterpret_cast<uptr *>(MemMap.getBase());
+  const ptrdiff_t N = Size / sizeof(uptr);
   EXPECT_EQ(std::count(P, P + N, 0), N);
 
   memset(P, 1, Size);
   EXPECT_EQ(std::count(P, P + N, 0), 0);
 
-  releasePagesToOS((uptr)P, 0, Size, &Data);
+  MemMap.releasePagesToOS(MemMap.getBase(), Size);
   EXPECT_EQ(std::count(P, P + N, 0), N);
 
-  unmap(P, Size, 0, &Data);
+  MemMap.unmap(MemMap.getBase(), Size);
 }
 
+#if 0
+// This test is temporarily disabled because it may not work as expected. E.g.,
+// it doesn't dirty the pages, so the pages may not be committed, and it may
+// only work in a single-threaded environment. As a result, this test is flaky
+// and impacts many test scenarios.
+TEST(ScudoCommonTest, GetRssFromBuffer) {
+  constexpr int64_t AllocSize = 10000000;
+  constexpr int64_t Error = 3000000;
+  constexpr size_t Runs = 10;
+
+  int64_t Rss = scudo::GetRSS();
+  EXPECT_GT(Rss, 0);
+
+  std::vector<std::unique_ptr<char[]>> Allocs(Runs);
+  for (auto &Alloc : Allocs) {
+    Alloc.reset(new char[AllocSize]());
+    int64_t Prev = Rss;
+    Rss = scudo::GetRSS();
+    EXPECT_LE(std::abs(Rss - AllocSize - Prev), Error);
+  }
+}
+#endif
+
 } // namespace scudo
diff --git a/standalone/tests/list_test.cpp b/standalone/tests/list_test.cpp
index 8e139916..140ca02 100644
--- a/standalone/tests/list_test.cpp
+++ b/standalone/tests/list_test.cpp
@@ -161,6 +161,10 @@
   setList(&L1, X);
   checkList(&L1, X);
 
+  setList(&L1, X, Y);
+  L1.insert(X, Z);
+  checkList(&L1, X, Z, Y);
+
   setList(&L1, X, Y, Z);
   setList(&L2, A, B, C);
   L1.append_back(&L2);
diff --git a/standalone/tests/map_test.cpp b/standalone/tests/map_test.cpp
index ff05258..06a56f8 100644
--- a/standalone/tests/map_test.cpp
+++ b/standalone/tests/map_test.cpp
@@ -9,6 +9,7 @@
 #include "tests/scudo_unit_test.h"
 
 #include "common.h"
+#include "mem_map.h"
 
 #include <string.h>
 #include <unistd.h>
@@ -22,11 +23,15 @@
 
 TEST(ScudoMapDeathTest, MapNoAccessUnmap) {
   const scudo::uptr Size = 4 * scudo::getPageSizeCached();
-  scudo::MapPlatformData Data = {};
-  void *P = scudo::map(nullptr, Size, MappingName, MAP_NOACCESS, &Data);
-  EXPECT_NE(P, nullptr);
-  EXPECT_DEATH(memset(P, 0xaa, Size), "");
-  scudo::unmap(P, Size, UNMAP_ALL, &Data);
+  scudo::ReservedMemoryT ReservedMemory;
+
+  ASSERT_TRUE(ReservedMemory.create(/*Addr=*/0U, Size, MappingName));
+  EXPECT_NE(ReservedMemory.getBase(), 0U);
+  EXPECT_DEATH(
+      memset(reinterpret_cast<void *>(ReservedMemory.getBase()), 0xaa, Size),
+      "");
+
+  ReservedMemory.release();
 }
 
 TEST(ScudoMapDeathTest, MapUnmap) {
@@ -36,11 +41,13 @@
         // Repeat a few times to avoid missing the crash if it's mmapped by
         // unrelated code.
         for (int i = 0; i < 10; ++i) {
-          void *P = scudo::map(nullptr, Size, MappingName, 0, nullptr);
-          if (!P)
+          scudo::MemMapT MemMap;
+          MemMap.map(/*Addr=*/0U, Size, MappingName);
+          scudo::uptr P = MemMap.getBase();
+          if (P == 0U)
             continue;
-          scudo::unmap(P, Size, 0, nullptr);
-          memset(P, 0xbb, Size);
+          MemMap.unmap(MemMap.getBase(), Size);
+          memset(reinterpret_cast<void *>(P), 0xbb, Size);
         }
       },
       "");
@@ -49,30 +56,36 @@
 TEST(ScudoMapDeathTest, MapWithGuardUnmap) {
   const scudo::uptr PageSize = scudo::getPageSizeCached();
   const scudo::uptr Size = 4 * PageSize;
-  scudo::MapPlatformData Data = {};
-  void *P = scudo::map(nullptr, Size + 2 * PageSize, MappingName, MAP_NOACCESS,
-                       &Data);
-  EXPECT_NE(P, nullptr);
-  void *Q =
-      reinterpret_cast<void *>(reinterpret_cast<scudo::uptr>(P) + PageSize);
-  EXPECT_EQ(scudo::map(Q, Size, MappingName, 0, &Data), Q);
-  memset(Q, 0xaa, Size);
-  EXPECT_DEATH(memset(Q, 0xaa, Size + 1), "");
-  scudo::unmap(P, Size + 2 * PageSize, UNMAP_ALL, &Data);
+  scudo::ReservedMemoryT ReservedMemory;
+  ASSERT_TRUE(
+      ReservedMemory.create(/*Addr=*/0U, Size + 2 * PageSize, MappingName));
+  ASSERT_NE(ReservedMemory.getBase(), 0U);
+
+  scudo::MemMapT MemMap =
+      ReservedMemory.dispatch(ReservedMemory.getBase(), Size + 2 * PageSize);
+  ASSERT_TRUE(MemMap.isAllocated());
+  scudo::uptr Q = MemMap.getBase() + PageSize;
+  ASSERT_TRUE(MemMap.remap(Q, Size, MappingName));
+  memset(reinterpret_cast<void *>(Q), 0xaa, Size);
+  EXPECT_DEATH(memset(reinterpret_cast<void *>(Q), 0xaa, Size + 1), "");
+  MemMap.unmap(MemMap.getBase(), MemMap.getCapacity());
 }
 
 TEST(ScudoMapTest, MapGrowUnmap) {
   const scudo::uptr PageSize = scudo::getPageSizeCached();
   const scudo::uptr Size = 4 * PageSize;
-  scudo::MapPlatformData Data = {};
-  void *P = scudo::map(nullptr, Size, MappingName, MAP_NOACCESS, &Data);
-  EXPECT_NE(P, nullptr);
-  void *Q =
-      reinterpret_cast<void *>(reinterpret_cast<scudo::uptr>(P) + PageSize);
-  EXPECT_EQ(scudo::map(Q, PageSize, MappingName, 0, &Data), Q);
-  memset(Q, 0xaa, PageSize);
-  Q = reinterpret_cast<void *>(reinterpret_cast<scudo::uptr>(Q) + PageSize);
-  EXPECT_EQ(scudo::map(Q, PageSize, MappingName, 0, &Data), Q);
-  memset(Q, 0xbb, PageSize);
-  scudo::unmap(P, Size, UNMAP_ALL, &Data);
+  scudo::ReservedMemoryT ReservedMemory;
+  ReservedMemory.create(/*Addr=*/0U, Size, MappingName);
+  ASSERT_TRUE(ReservedMemory.isCreated());
+
+  scudo::MemMapT MemMap =
+      ReservedMemory.dispatch(ReservedMemory.getBase(), Size);
+  ASSERT_TRUE(MemMap.isAllocated());
+  scudo::uptr Q = MemMap.getBase() + PageSize;
+  ASSERT_TRUE(MemMap.remap(Q, PageSize, MappingName));
+  memset(reinterpret_cast<void *>(Q), 0xaa, PageSize);
+  Q += PageSize;
+  ASSERT_TRUE(MemMap.remap(Q, PageSize, MappingName));
+  memset(reinterpret_cast<void *>(Q), 0xbb, PageSize);
+  MemMap.unmap(MemMap.getBase(), MemMap.getCapacity());
 }
diff --git a/standalone/tests/memtag_test.cpp b/standalone/tests/memtag_test.cpp
index 283edaa..d4c39aa 100644
--- a/standalone/tests/memtag_test.cpp
+++ b/standalone/tests/memtag_test.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "common.h"
+#include "mem_map.h"
 #include "memtag.h"
 #include "platform.h"
 #include "tests/scudo_unit_test.h"
@@ -45,20 +46,24 @@
       GTEST_SKIP() << "Memory tagging is not supported";
 
     BufferSize = getPageSizeCached();
-    Buffer = reinterpret_cast<u8 *>(
-        map(nullptr, BufferSize, "MemtagTest", MAP_MEMTAG, &Data));
-    Addr = reinterpret_cast<uptr>(Buffer);
+    ASSERT_FALSE(MemMap.isAllocated());
+    ASSERT_TRUE(MemMap.map(/*Addr=*/0U, BufferSize, "MemtagTest", MAP_MEMTAG));
+    ASSERT_NE(MemMap.getBase(), 0U);
+    Addr = MemMap.getBase();
+    Buffer = reinterpret_cast<u8 *>(Addr);
     EXPECT_TRUE(isAligned(Addr, archMemoryTagGranuleSize()));
     EXPECT_EQ(Addr, untagPointer(Addr));
   }
 
   void TearDown() override {
-    if (Buffer)
-      unmap(Buffer, BufferSize, 0, &Data);
+    if (Buffer) {
+      ASSERT_TRUE(MemMap.isAllocated());
+      MemMap.unmap(MemMap.getBase(), MemMap.getCapacity());
+    }
   }
 
   uptr BufferSize = 0;
-  MapPlatformData Data = {};
+  scudo::MemMapT MemMap = {};
   u8 *Buffer = nullptr;
   uptr Addr = 0;
 };
@@ -163,7 +168,7 @@
     uptr TaggedBegin = addFixedTag(NoTagBegin, Tag);
     uptr TaggedEnd = addFixedTag(NoTagEnd, Tag);
 
-    EXPECT_EQ(roundUpTo(TaggedEnd, archMemoryTagGranuleSize()),
+    EXPECT_EQ(roundUp(TaggedEnd, archMemoryTagGranuleSize()),
               storeTags(TaggedBegin, TaggedEnd));
 
     uptr LoadPtr = Addr;
@@ -179,7 +184,7 @@
     EXPECT_EQ(LoadPtr, loadTag(LoadPtr));
 
     // Reset tags without using StoreTags.
-    releasePagesToOS(Addr, 0, BufferSize, &Data);
+    MemMap.releasePagesToOS(Addr, BufferSize);
   }
 }
 
diff --git a/standalone/tests/mutex_test.cpp b/standalone/tests/mutex_test.cpp
index d3242a3..c3efeab 100644
--- a/standalone/tests/mutex_test.cpp
+++ b/standalone/tests/mutex_test.cpp
@@ -99,3 +99,10 @@
   for (scudo::u32 I = 0; I < NumberOfThreads; I++)
     pthread_join(Threads[I], 0);
 }
+
+TEST(ScudoMutexTest, MutexAssertHeld) {
+  scudo::HybridMutex M;
+  M.lock();
+  M.assertHeld();
+  M.unlock();
+}
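
The assertHeld() exercised above pairs naturally with the thread-safety annotations introduced in this change. A small hypothetical sketch (the Counters type is made up for illustration, not a Scudo class):

// A helper that requires its caller to hold the lock can document and check
// that expectation at runtime via assertHeld().
struct Counters {
  scudo::HybridMutex M;
  scudo::uptr Value = 0;
  void incLocked() {
    M.assertHeld(); // The caller must already hold M.
    ++Value;
  }
};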
diff --git a/standalone/tests/primary_test.cpp b/standalone/tests/primary_test.cpp
index 283e297..51a7038 100644
--- a/standalone/tests/primary_test.cpp
+++ b/standalone/tests/primary_test.cpp
@@ -12,8 +12,11 @@
 #include "primary64.h"
 #include "size_class_map.h"
 
+#include <algorithm>
+#include <chrono>
 #include <condition_variable>
 #include <mutex>
+#include <random>
 #include <stdlib.h>
 #include <thread>
 #include <vector>
@@ -24,6 +27,7 @@
 
 struct TestConfig1 {
   static const scudo::uptr PrimaryRegionSizeLog = 18U;
+  static const scudo::uptr PrimaryGroupSizeLog = 18U;
   static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
   static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
   static const bool MaySupportMemoryTagging = false;
@@ -40,6 +44,7 @@
 #else
   static const scudo::uptr PrimaryRegionSizeLog = 24U;
 #endif
+  static const scudo::uptr PrimaryGroupSizeLog = 20U;
   static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
   static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
   static const bool MaySupportMemoryTagging = false;
@@ -56,6 +61,7 @@
 #else
   static const scudo::uptr PrimaryRegionSizeLog = 24U;
 #endif
+  static const scudo::uptr PrimaryGroupSizeLog = 20U;
   static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
   static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
   static const bool MaySupportMemoryTagging = true;
@@ -65,6 +71,23 @@
   static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
 };
 
+struct TestConfig4 {
+#if defined(__mips__)
+  // Unable to allocate greater size on QEMU-user.
+  static const scudo::uptr PrimaryRegionSizeLog = 23U;
+#else
+  static const scudo::uptr PrimaryRegionSizeLog = 24U;
+#endif
+  static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
+  static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
+  static const bool MaySupportMemoryTagging = true;
+  static const scudo::uptr PrimaryCompactPtrScale = 3U;
+  static const scudo::uptr PrimaryGroupSizeLog = 20U;
+  typedef scudo::u32 PrimaryCompactPtrT;
+  static const bool PrimaryEnableRandomOffset = true;
+  static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
+};
+
 template <typename BaseConfig, typename SizeClassMapT>
 struct Config : public BaseConfig {
   using SizeClassMap = SizeClassMapT;
@@ -100,7 +123,8 @@
 #define SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME)                              \
   SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig1)                            \
   SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig2)                            \
-  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig3)
+  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig3)                            \
+  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig4)
 #endif
 
 #define SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TYPE)                             \
@@ -137,7 +161,7 @@
       Cache.deallocate(ClassId, Pointers[J]);
   }
   Cache.destroy(nullptr);
-  Allocator->releaseToOS();
+  Allocator->releaseToOS(scudo::ReleaseToOS::Force);
   scudo::ScopedString Str;
   Allocator->getStats(&Str);
   Str.output();
@@ -145,7 +169,7 @@
 
 struct SmallRegionsConfig {
   using SizeClassMap = scudo::DefaultSizeClassMap;
-  static const scudo::uptr PrimaryRegionSizeLog = 20U;
+  static const scudo::uptr PrimaryRegionSizeLog = 21U;
   static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
   static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
   static const bool MaySupportMemoryTagging = false;
@@ -153,6 +177,7 @@
   static const scudo::uptr PrimaryCompactPtrScale = 0;
   static const bool PrimaryEnableRandomOffset = true;
   static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
+  static const scudo::uptr PrimaryGroupSizeLog = 20U;
 };
 
 // The 64-bit SizeClassAllocator can be easily OOM'd with small region sizes.
@@ -170,22 +195,27 @@
   std::vector<TransferBatch *> Batches;
   const scudo::uptr ClassId = Primary::SizeClassMap::LargestClassId;
   const scudo::uptr Size = Primary::getSizeByClassId(ClassId);
+  typename Primary::CacheT::CompactPtrT Blocks[TransferBatch::MaxNumCached];
+
   for (scudo::uptr I = 0; I < 10000U; I++) {
     TransferBatch *B = Allocator.popBatch(&Cache, ClassId);
     if (!B) {
       AllocationFailed = true;
       break;
     }
-    for (scudo::u32 J = 0; J < B->getCount(); J++)
+    for (scudo::u16 J = 0; J < B->getCount(); J++)
       memset(Allocator.decompactPtr(ClassId, B->get(J)), 'B', Size);
     Batches.push_back(B);
   }
   while (!Batches.empty()) {
-    Allocator.pushBatch(ClassId, Batches.back());
+    TransferBatch *B = Batches.back();
     Batches.pop_back();
+    B->copyToArray(Blocks);
+    Allocator.pushBlocks(&Cache, ClassId, Blocks, B->getCount());
+    Cache.deallocate(Primary::SizeClassMap::BatchClassId, B);
   }
   Cache.destroy(nullptr);
-  Allocator.releaseToOS();
+  Allocator.releaseToOS(scudo::ReleaseToOS::Force);
   scudo::ScopedString Str;
   Allocator.getStats(&Str);
   Str.output();
@@ -223,7 +253,7 @@
     V.pop_back();
   }
   Cache.destroy(nullptr);
-  Allocator->releaseToOS();
+  Allocator->releaseToOS(scudo::ReleaseToOS::Force);
   scudo::ScopedString Str;
   Allocator->getStats(&Str);
   Str.output();
@@ -270,7 +300,7 @@
   }
   for (auto &T : Threads)
     T.join();
-  Allocator->releaseToOS();
+  Allocator->releaseToOS(scudo::ReleaseToOS::Force);
   scudo::ScopedString Str;
   Allocator->getStats(&Str);
   Str.output();
@@ -292,5 +322,49 @@
   EXPECT_NE(P, nullptr);
   Cache.deallocate(ClassId, P);
   Cache.destroy(nullptr);
-  EXPECT_GT(Allocator->releaseToOS(), 0U);
+  EXPECT_GT(Allocator->releaseToOS(scudo::ReleaseToOS::Force), 0U);
+}
+
+SCUDO_TYPED_TEST(ScudoPrimaryTest, MemoryGroup) {
+  using Primary = TestAllocator<TypeParam, scudo::DefaultSizeClassMap>;
+  std::unique_ptr<Primary> Allocator(new Primary);
+  Allocator->init(/*ReleaseToOsInterval=*/-1);
+  typename Primary::CacheT Cache;
+  Cache.init(nullptr, Allocator.get());
+  const scudo::uptr Size = 32U;
+  const scudo::uptr ClassId = Primary::SizeClassMap::getClassIdBySize(Size);
+
+  // We allocate 4 times the group size of memory and release all of it. We
+  // expect the freed blocks to be classified into groups. Then we allocate
+  // one group size worth of memory again and expect the maximum address
+  // difference between the returned blocks to be at most 2 times the group
+  // size. Note that the blocks don't have to fall within a single group,
+  // because the group id is derived by shifting the compact pointer and,
+  // depending on the configuration, the compact pointer may not be aligned
+  // to the group size. As a result, the blocks can straddle at most two
+  // groups (see the sketch after this test).
+  const scudo::uptr GroupSizeMem = (1ULL << Primary::GroupSizeLog);
+  const scudo::uptr PeakAllocationMem = 4 * GroupSizeMem;
+  const scudo::uptr PeakNumberOfAllocations = PeakAllocationMem / Size;
+  const scudo::uptr FinalNumberOfAllocations = GroupSizeMem / Size;
+  std::vector<scudo::uptr> Blocks;
+  std::mt19937 R;
+
+  for (scudo::uptr I = 0; I < PeakNumberOfAllocations; ++I)
+    Blocks.push_back(reinterpret_cast<scudo::uptr>(Cache.allocate(ClassId)));
+
+  std::shuffle(Blocks.begin(), Blocks.end(), R);
+
+  // Release all the allocated blocks, including those held by local cache.
+  while (!Blocks.empty()) {
+    Cache.deallocate(ClassId, reinterpret_cast<void *>(Blocks.back()));
+    Blocks.pop_back();
+  }
+  Cache.drain();
+
+  for (scudo::uptr I = 0; I < FinalNumberOfAllocations; ++I)
+    Blocks.push_back(reinterpret_cast<scudo::uptr>(Cache.allocate(ClassId)));
+
+  EXPECT_LE(*std::max_element(Blocks.begin(), Blocks.end()) -
+                *std::min_element(Blocks.begin(), Blocks.end()),
+            GroupSizeMem * 2);
 }
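
To make the 2x bound above concrete, here is a purely arithmetic sketch (no allocator internals are assumed; the constants are illustrative): if the run of returned blocks starts just below a group boundary, one group size worth of memory overlaps two adjacent groups, but never three.

constexpr scudo::uptr GroupSizeMem = 1UL << 20;  // Example group size.
constexpr scudo::uptr Start = (5UL << 20) - 64;  // 64 bytes before a boundary.
constexpr scudo::uptr End = Start + GroupSizeMem;
static_assert(End / GroupSizeMem - Start / GroupSizeMem == 1,
              "a one-group-sized run spans at most two adjacent groups");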
diff --git a/standalone/tests/release_test.cpp b/standalone/tests/release_test.cpp
index 04c0289..41f0b16 100644
--- a/standalone/tests/release_test.cpp
+++ b/standalone/tests/release_test.cpp
@@ -18,19 +18,22 @@
 #include <random>
 #include <set>
 
-TEST(ScudoReleaseTest, PackedCounterArray) {
+TEST(ScudoReleaseTest, RegionPageMap) {
   for (scudo::uptr I = 0; I < SCUDO_WORDSIZE; I++) {
     // Various valid counter's max values packed into one word.
-    scudo::PackedCounterArray Counters2N(1U, 1U, 1UL << I);
-    EXPECT_EQ(sizeof(scudo::uptr), Counters2N.getBufferSize());
+    scudo::RegionPageMap PageMap2N(1U, 1U, 1UL << I);
+    ASSERT_TRUE(PageMap2N.isAllocated());
+    EXPECT_EQ(sizeof(scudo::uptr), PageMap2N.getBufferSize());
     // Check the "all bit set" values too.
-    scudo::PackedCounterArray Counters2N1_1(1U, 1U, ~0UL >> I);
-    EXPECT_EQ(sizeof(scudo::uptr), Counters2N1_1.getBufferSize());
+    scudo::RegionPageMap PageMap2N1_1(1U, 1U, ~0UL >> I);
+    ASSERT_TRUE(PageMap2N1_1.isAllocated());
+    EXPECT_EQ(sizeof(scudo::uptr), PageMap2N1_1.getBufferSize());
     // Verify the packing ratio: the counter is expected to be packed into the
     // closest power-of-2 number of bits.
-    scudo::PackedCounterArray Counters(1U, SCUDO_WORDSIZE, 1UL << I);
-    EXPECT_EQ(sizeof(scudo::uptr) * scudo::roundUpToPowerOfTwo(I + 1),
-              Counters.getBufferSize());
+    scudo::RegionPageMap PageMap(1U, SCUDO_WORDSIZE, 1UL << I);
+    ASSERT_TRUE(PageMap.isAllocated());
+    EXPECT_EQ(sizeof(scudo::uptr) * scudo::roundUpPowerOfTwo(I + 1),
+              PageMap.getBufferSize());
   }
 
   // Go through 1, 2, 4, 8, .. {32,64} bits per counter.
@@ -38,22 +41,44 @@
     // Make sure counters request one memory page for the buffer.
     const scudo::uptr NumCounters =
         (scudo::getPageSizeCached() / 8) * (SCUDO_WORDSIZE >> I);
-    scudo::PackedCounterArray Counters(1U, NumCounters,
+    scudo::RegionPageMap PageMap(1U, NumCounters,
                                        1UL << ((1UL << I) - 1));
-    Counters.inc(0U, 0U);
+    ASSERT_TRUE(PageMap.isAllocated());
+    PageMap.inc(0U, 0U);
     for (scudo::uptr C = 1; C < NumCounters - 1; C++) {
-      EXPECT_EQ(0UL, Counters.get(0U, C));
-      Counters.inc(0U, C);
-      EXPECT_EQ(1UL, Counters.get(0U, C - 1));
+      EXPECT_EQ(0UL, PageMap.get(0U, C));
+      PageMap.inc(0U, C);
+      EXPECT_EQ(1UL, PageMap.get(0U, C - 1));
     }
-    EXPECT_EQ(0UL, Counters.get(0U, NumCounters - 1));
-    Counters.inc(0U, NumCounters - 1);
+    EXPECT_EQ(0UL, PageMap.get(0U, NumCounters - 1));
+    PageMap.inc(0U, NumCounters - 1);
     if (I > 0) {
-      Counters.incRange(0u, 0U, NumCounters - 1);
+      PageMap.incRange(0u, 0U, NumCounters - 1);
       for (scudo::uptr C = 0; C < NumCounters; C++)
-        EXPECT_EQ(2UL, Counters.get(0U, C));
+        EXPECT_EQ(2UL, PageMap.get(0U, C));
     }
   }
+
+  // Similar to the above except that we are using incN().
+  for (scudo::uptr I = 0; (SCUDO_WORDSIZE >> I) != 0; I++) {
+    // Make sure counters request one memory page for the buffer.
+    const scudo::uptr NumCounters =
+        (scudo::getPageSizeCached() / 8) * (SCUDO_WORDSIZE >> I);
+    scudo::uptr MaxValue = 1UL << ((1UL << I) - 1);
+    if (MaxValue <= 1U)
+      continue;
+
+    scudo::RegionPageMap PageMap(1U, NumCounters, MaxValue);
+
+    scudo::uptr N = MaxValue / 2;
+    PageMap.incN(0U, 0, N);
+    for (scudo::uptr C = 1; C < NumCounters; C++) {
+      EXPECT_EQ(0UL, PageMap.get(0U, C));
+      PageMap.incN(0U, C, N);
+      EXPECT_EQ(N, PageMap.get(0U, C - 1));
+    }
+    EXPECT_EQ(N, PageMap.get(0U, NumCounters - 1));
+  }
 }
 
 class StringRangeRecorder {
@@ -102,7 +127,7 @@
 
   for (auto TestCase : TestCases) {
     StringRangeRecorder Recorder;
-    RangeTracker Tracker(&Recorder);
+    RangeTracker Tracker(Recorder);
     for (scudo::uptr I = 0; TestCase[I] != 0; I++)
       Tracker.processNextPage(TestCase[I] == 'x');
     Tracker.finish();
@@ -117,41 +142,45 @@
 
 class ReleasedPagesRecorder {
 public:
+  ReleasedPagesRecorder() = default;
+  explicit ReleasedPagesRecorder(scudo::uptr Base) : Base(Base) {}
   std::set<scudo::uptr> ReportedPages;
 
   void releasePageRangeToOS(scudo::uptr From, scudo::uptr To) {
     const scudo::uptr PageSize = scudo::getPageSizeCached();
     for (scudo::uptr I = From; I < To; I += PageSize)
-      ReportedPages.insert(I);
+      ReportedPages.insert(I + getBase());
   }
 
-  scudo::uptr getBase() const { return 0; }
+  scudo::uptr getBase() const { return Base; }
+  scudo::uptr Base = 0;
 };
 
 // Simplified version of a TransferBatch.
 template <class SizeClassMap> struct FreeBatch {
-  static const scudo::u32 MaxCount = SizeClassMap::MaxNumCachedHint;
+  static const scudo::u16 MaxCount = SizeClassMap::MaxNumCachedHint;
   void clear() { Count = 0; }
   void add(scudo::uptr P) {
     DCHECK_LT(Count, MaxCount);
     Batch[Count++] = P;
   }
-  scudo::u32 getCount() const { return Count; }
-  scudo::uptr get(scudo::u32 I) const {
+  scudo::u16 getCount() const { return Count; }
+  scudo::uptr get(scudo::u16 I) const {
     DCHECK_LE(I, Count);
     return Batch[I];
   }
   FreeBatch *Next;
 
 private:
-  scudo::u32 Count;
   scudo::uptr Batch[MaxCount];
+  scudo::u16 Count;
 };
 
 template <class SizeClassMap> void testReleaseFreeMemoryToOS() {
   typedef FreeBatch<SizeClassMap> Batch;
   const scudo::uptr PagesCount = 1024;
   const scudo::uptr PageSize = scudo::getPageSizeCached();
+  const scudo::uptr PageSizeLog = scudo::getLog2(PageSize);
   std::mt19937 R;
   scudo::u32 RandState = 42;
 
@@ -195,8 +224,15 @@
     auto SkipRegion = [](UNUSED scudo::uptr RegionIndex) { return false; };
     auto DecompactPtr = [](scudo::uptr P) { return P; };
     ReleasedPagesRecorder Recorder;
-    releaseFreeMemoryToOS(FreeList, MaxBlocks * BlockSize, 1U, BlockSize,
-                          &Recorder, DecompactPtr, SkipRegion);
+    scudo::PageReleaseContext Context(BlockSize, /*NumberOfRegions=*/1U,
+                                      /*ReleaseSize=*/MaxBlocks * BlockSize);
+    ASSERT_FALSE(Context.hasBlockMarked());
+    Context.markFreeBlocksInRegion(FreeList, DecompactPtr, Recorder.getBase(),
+                                   /*RegionIndex=*/0, MaxBlocks * BlockSize,
+                                   /*MayContainLastBlockInRegion=*/true);
+    ASSERT_TRUE(Context.hasBlockMarked());
+    releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
+    scudo::RegionPageMap &PageMap = Context.PageMap;
 
     // Verify that there are no released pages touched by used chunks and all
     // ranges of free chunks big enough to contain the entire memory pages had
@@ -223,17 +259,20 @@
           const bool PageReleased = Recorder.ReportedPages.find(J * PageSize) !=
                                     Recorder.ReportedPages.end();
           EXPECT_EQ(false, PageReleased);
+          EXPECT_EQ(false,
+                    PageMap.isAllCounted(0, (J * PageSize) >> PageSizeLog));
         }
 
         if (InFreeRange) {
           InFreeRange = false;
           // Verify that all entire memory pages covered by this range of free
           // chunks were released.
-          scudo::uptr P = scudo::roundUpTo(CurrentFreeRangeStart, PageSize);
+          scudo::uptr P = scudo::roundUp(CurrentFreeRangeStart, PageSize);
           while (P + PageSize <= CurrentBlock) {
             const bool PageReleased =
                 Recorder.ReportedPages.find(P) != Recorder.ReportedPages.end();
             EXPECT_EQ(true, PageReleased);
+            EXPECT_EQ(true, PageMap.isAllCounted(0, P >> PageSizeLog));
             VerifiedReleasedPages++;
             P += PageSize;
           }
@@ -244,13 +283,14 @@
     }
 
     if (InFreeRange) {
-      scudo::uptr P = scudo::roundUpTo(CurrentFreeRangeStart, PageSize);
+      scudo::uptr P = scudo::roundUp(CurrentFreeRangeStart, PageSize);
       const scudo::uptr EndPage =
-          scudo::roundUpTo(MaxBlocks * BlockSize, PageSize);
+          scudo::roundUp(MaxBlocks * BlockSize, PageSize);
       while (P + PageSize <= EndPage) {
         const bool PageReleased =
             Recorder.ReportedPages.find(P) != Recorder.ReportedPages.end();
         EXPECT_EQ(true, PageReleased);
+        EXPECT_EQ(true, PageMap.isAllCounted(0, P >> PageSizeLog));
         VerifiedReleasedPages++;
         P += PageSize;
       }
@@ -266,6 +306,243 @@
   }
 }
 
+template <class SizeClassMap> void testPageMapMarkRange() {
+  const scudo::uptr PageSize = scudo::getPageSizeCached();
+
+  for (scudo::uptr I = 1; I <= SizeClassMap::LargestClassId; I++) {
+    const scudo::uptr BlockSize = SizeClassMap::getSizeByClassId(I);
+
+    const scudo::uptr GroupNum = 2;
+    const scudo::uptr GroupSize = scudo::roundUp(BlockSize, PageSize) * 2;
+    const scudo::uptr RegionSize =
+        scudo::roundUpSlow(GroupSize * GroupNum, BlockSize);
+    const scudo::uptr RoundedRegionSize = scudo::roundUp(RegionSize, PageSize);
+
+    std::vector<scudo::uptr> Pages(RoundedRegionSize / PageSize, 0);
+    for (scudo::uptr Block = 0; Block < RoundedRegionSize; Block += BlockSize) {
+      for (scudo::uptr Page = Block / PageSize;
+           Page <= (Block + BlockSize - 1) / PageSize &&
+           Page < RoundedRegionSize / PageSize;
+           ++Page) {
+        ASSERT_LT(Page, Pages.size());
+        ++Pages[Page];
+      }
+    }
+
+    for (scudo::uptr GroupId = 0; GroupId < GroupNum; ++GroupId) {
+      const scudo::uptr GroupBeg = GroupId * GroupSize;
+      const scudo::uptr GroupEnd = GroupBeg + GroupSize;
+
+      scudo::PageReleaseContext Context(BlockSize, /*NumberOfRegions=*/1U,
+                                        /*ReleaseSize=*/RegionSize);
+      Context.markRangeAsAllCounted(GroupBeg, GroupEnd, /*Base=*/0U,
+                                    /*RegionIndex=*/0, RegionSize);
+
+      scudo::uptr FirstBlock =
+          ((GroupBeg + BlockSize - 1) / BlockSize) * BlockSize;
+
+      // No page before the first block's page is supposed to be marked.
+      if (FirstBlock / PageSize > 0) {
+        for (scudo::uptr Page = 0; Page <= FirstBlock / PageSize - 1; ++Page)
+          EXPECT_EQ(Context.PageMap.get(/*Region=*/0, Page), 0U);
+      }
+
+      // Verify the pages used by the blocks in the group. If the end of the
+      // last block is not aligned with `GroupEnd`, that block is verified
+      // later.
+      scudo::uptr Block;
+      for (Block = FirstBlock; Block + BlockSize <= GroupEnd;
+           Block += BlockSize) {
+        for (scudo::uptr Page = Block / PageSize;
+             Page <= (Block + BlockSize - 1) / PageSize; ++Page) {
+          // The first used page in the group has two cases: with and without
+          // a block straddling the group boundary.
+          if (Page == FirstBlock / PageSize) {
+            if (FirstBlock % PageSize == 0) {
+              EXPECT_TRUE(Context.PageMap.isAllCounted(/*Region=*/0U, Page));
+            } else {
+              // A block straddles `GroupBeg`; it's only supposed to increment
+              // the counter, so we expect the count to be 1 less (excluding
+              // the straddling block) than the total number of blocks on the
+              // page.
+              EXPECT_EQ(Context.PageMap.get(/*Region=*/0U, Page),
+                        Pages[Page] - 1);
+            }
+          } else {
+            EXPECT_TRUE(Context.PageMap.isAllCounted(/*Region=*/0, Page));
+          }
+        }
+      }
+
+      if (Block == GroupEnd)
+        continue;
+
+      // Examine the last block which sits across the group boundary.
+      if (Block + BlockSize == RegionSize) {
+        // This is the last block in the region, it's supposed to mark all the
+        // pages as all counted.
+        for (scudo::uptr Page = Block / PageSize;
+             Page <= (Block + BlockSize - 1) / PageSize; ++Page) {
+          EXPECT_TRUE(Context.PageMap.isAllCounted(/*Region=*/0, Page));
+        }
+      } else {
+        for (scudo::uptr Page = Block / PageSize;
+             Page <= (Block + BlockSize - 1) / PageSize; ++Page) {
+          if (Page <= (GroupEnd - 1) / PageSize)
+            EXPECT_TRUE(Context.PageMap.isAllCounted(/*Region=*/0, Page));
+          else
+            EXPECT_EQ(Context.PageMap.get(/*Region=*/0U, Page), 1U);
+        }
+      }
+
+      const scudo::uptr FirstUncountedPage =
+          scudo::roundUp(Block + BlockSize, PageSize);
+      for (scudo::uptr Page = FirstUncountedPage;
+           Page <= RoundedRegionSize / PageSize; ++Page) {
+        EXPECT_EQ(Context.PageMap.get(/*Region=*/0U, Page), 0U);
+      }
+    } // Iterate each Group
+
+    // Release the entire region. This is to ensure the last page is counted.
+    scudo::PageReleaseContext Context(BlockSize, /*NumberOfRegions=*/1U,
+                                      /*ReleaseSize=*/RegionSize);
+    Context.markRangeAsAllCounted(/*From=*/0U, /*To=*/RegionSize, /*Base=*/0,
+                                  /*RegionIndex=*/0, RegionSize);
+    for (scudo::uptr Page = 0; Page < RoundedRegionSize / PageSize; ++Page)
+      EXPECT_TRUE(Context.PageMap.isAllCounted(/*Region=*/0, Page));
+  } // Iterate each size class
+}
+
+template <class SizeClassMap> void testReleasePartialRegion() {
+  typedef FreeBatch<SizeClassMap> Batch;
+  const scudo::uptr PageSize = scudo::getPageSizeCached();
+
+  for (scudo::uptr I = 1; I <= SizeClassMap::LargestClassId; I++) {
+    // In the following, we want to ensure the region includes at least 2 pages
+    // and we will release all the pages except the first one. The handling of
+    // the last block is tricky, so we always test the case that includes the
+    // last block.
+    const scudo::uptr BlockSize = SizeClassMap::getSizeByClassId(I);
+    const scudo::uptr ReleaseBase = scudo::roundUp(BlockSize, PageSize);
+    const scudo::uptr BasePageOffset = ReleaseBase / PageSize;
+    const scudo::uptr RegionSize =
+        scudo::roundUpSlow(scudo::roundUp(BlockSize, PageSize) + ReleaseBase,
+                           BlockSize) +
+        BlockSize;
+    const scudo::uptr RoundedRegionSize = scudo::roundUp(RegionSize, PageSize);
+
+    scudo::SinglyLinkedList<Batch> FreeList;
+    FreeList.clear();
+
+    // Skip the blocks in the first page and add the remaining.
+    std::vector<scudo::uptr> Pages(RoundedRegionSize / PageSize, 0);
+    for (scudo::uptr Block = scudo::roundUpSlow(ReleaseBase, BlockSize);
+         Block + BlockSize <= RoundedRegionSize; Block += BlockSize) {
+      for (scudo::uptr Page = Block / PageSize;
+           Page <= (Block + BlockSize - 1) / PageSize; ++Page) {
+        ASSERT_LT(Page, Pages.size());
+        ++Pages[Page];
+      }
+    }
+
+    // This follows the logic of how we count the last page. It should be
+    // consistent with how markFreeBlocksInRegion() handles the last block.
+    if (RoundedRegionSize % BlockSize != 0)
+      ++Pages.back();
+
+    Batch *CurrentBatch = nullptr;
+    for (scudo::uptr Block = scudo::roundUpSlow(ReleaseBase, BlockSize);
+         Block < RegionSize; Block += BlockSize) {
+      if (CurrentBatch == nullptr ||
+          CurrentBatch->getCount() == Batch::MaxCount) {
+        CurrentBatch = new Batch;
+        CurrentBatch->clear();
+        FreeList.push_back(CurrentBatch);
+      }
+      CurrentBatch->add(Block);
+    }
+
+    auto VerifyReleaseToOs = [&](scudo::PageReleaseContext &Context) {
+      auto SkipRegion = [](UNUSED scudo::uptr RegionIndex) { return false; };
+      ReleasedPagesRecorder Recorder(ReleaseBase);
+      releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
+      const scudo::uptr FirstBlock = scudo::roundUpSlow(ReleaseBase, BlockSize);
+
+      for (scudo::uptr P = 0; P < RoundedRegionSize; P += PageSize) {
+        if (P < FirstBlock) {
+          // If FirstBlock is not aligned with a page boundary, the first
+          // touched page will not be released either.
+          EXPECT_TRUE(Recorder.ReportedPages.find(P) ==
+                      Recorder.ReportedPages.end());
+        } else {
+          EXPECT_TRUE(Recorder.ReportedPages.find(P) !=
+                      Recorder.ReportedPages.end());
+        }
+      }
+    };
+
+    // Test marking by visiting each block.
+    {
+      auto DecompactPtr = [](scudo::uptr P) { return P; };
+      scudo::PageReleaseContext Context(BlockSize, /*NumberOfRegions=*/1U,
+                                        /*ReleaseSize=*/RegionSize - PageSize,
+                                        ReleaseBase);
+      Context.markFreeBlocksInRegion(FreeList, DecompactPtr, /*Base=*/0U,
+                                     /*RegionIndex=*/0, RegionSize,
+                                     /*MayContainLastBlockInRegion=*/true);
+      for (const Batch &It : FreeList) {
+        for (scudo::u16 I = 0; I < It.getCount(); I++) {
+          scudo::uptr Block = It.get(I);
+          for (scudo::uptr Page = Block / PageSize;
+               Page <= (Block + BlockSize - 1) / PageSize; ++Page) {
+            EXPECT_EQ(Pages[Page], Context.PageMap.get(/*Region=*/0U,
+                                                       Page - BasePageOffset));
+          }
+        }
+      }
+
+      VerifyReleaseToOs(Context);
+    }
+
+    // Test range marking.
+    {
+      scudo::PageReleaseContext Context(BlockSize, /*NumberOfRegions=*/1U,
+                                        /*ReleaseSize=*/RegionSize - PageSize,
+                                        ReleaseBase);
+      Context.markRangeAsAllCounted(ReleaseBase, RegionSize, /*Base=*/0U,
+                                    /*RegionIndex=*/0, RegionSize);
+      for (scudo::uptr Page = ReleaseBase / PageSize;
+           Page < RoundedRegionSize / PageSize; ++Page) {
+        if (Context.PageMap.get(/*Region=*/0, Page - BasePageOffset) !=
+            Pages[Page]) {
+          EXPECT_TRUE(Context.PageMap.isAllCounted(/*Region=*/0,
+                                                   Page - BasePageOffset));
+        }
+      }
+
+      VerifyReleaseToOs(Context);
+    }
+
+    // Check the buffer size of PageMap.
+    {
+      scudo::PageReleaseContext Full(BlockSize, /*NumberOfRegions=*/1U,
+                                     /*ReleaseSize=*/RegionSize);
+      Full.ensurePageMapAllocated();
+      scudo::PageReleaseContext Partial(BlockSize, /*NumberOfRegions=*/1U,
+                                        /*ReleaseSize=*/RegionSize - PageSize,
+                                        ReleaseBase);
+      Partial.ensurePageMapAllocated();
+
+      EXPECT_GE(Full.PageMap.getBufferSize(), Partial.PageMap.getBufferSize());
+    }
+
+    while (!FreeList.empty()) {
+      CurrentBatch = FreeList.front();
+      FreeList.pop_front();
+      delete CurrentBatch;
+    }
+  } // Iterate each size class
+}
+
 TEST(ScudoReleaseTest, ReleaseFreeMemoryToOSDefault) {
   testReleaseFreeMemoryToOS<scudo::DefaultSizeClassMap>();
 }
@@ -277,3 +554,106 @@
 TEST(ScudoReleaseTest, ReleaseFreeMemoryToOSSvelte) {
   testReleaseFreeMemoryToOS<scudo::SvelteSizeClassMap>();
 }
+
+TEST(ScudoReleaseTest, PageMapMarkRange) {
+  testPageMapMarkRange<scudo::DefaultSizeClassMap>();
+  testPageMapMarkRange<scudo::AndroidSizeClassMap>();
+  testPageMapMarkRange<scudo::FuchsiaSizeClassMap>();
+  testPageMapMarkRange<scudo::SvelteSizeClassMap>();
+}
+
+TEST(ScudoReleaseTest, ReleasePartialRegion) {
+  testReleasePartialRegion<scudo::DefaultSizeClassMap>();
+  testReleasePartialRegion<scudo::AndroidSizeClassMap>();
+  testReleasePartialRegion<scudo::FuchsiaSizeClassMap>();
+  testReleasePartialRegion<scudo::SvelteSizeClassMap>();
+}
+
+template <class SizeClassMap> void testReleaseRangeWithSingleBlock() {
+  const scudo::uptr PageSize = scudo::getPageSizeCached();
+
+  // We want to test that a memory group which contains only a single block
+  // is handled properly. The case looks like:
+  //
+  //   From                     To
+  //     +----------------------+
+  //  +------------+------------+
+  //  |            |            |
+  //  +------------+------------+
+  //                            ^
+  //                        RegionSize
+  //
+  // Note that `From` will be page aligned.
+  //
+  // If the second-to-last block is aligned at `From`, then we expect all the
+  // pages after `From` to be marked as can-be-released. Otherwise, only the
+  // pages touched by the last block will be marked as can-be-released.
+  for (scudo::uptr I = 1; I <= SizeClassMap::LargestClassId; I++) {
+    const scudo::uptr BlockSize = SizeClassMap::getSizeByClassId(I);
+    const scudo::uptr From = scudo::roundUp(BlockSize, PageSize);
+    const scudo::uptr To =
+        From % BlockSize == 0
+            ? From + BlockSize
+            : scudo::roundDownSlow(From + BlockSize, BlockSize) + BlockSize;
+    const scudo::uptr RoundedRegionSize = scudo::roundUp(To, PageSize);
+
+    std::vector<scudo::uptr> Pages(RoundedRegionSize / PageSize, 0);
+    for (scudo::uptr Block = (To - BlockSize); Block < RoundedRegionSize;
+         Block += BlockSize) {
+      for (scudo::uptr Page = Block / PageSize;
+           Page <= (Block + BlockSize - 1) / PageSize &&
+           Page < RoundedRegionSize / PageSize;
+           ++Page) {
+        ASSERT_LT(Page, Pages.size());
+        ++Pages[Page];
+      }
+    }
+
+    scudo::PageReleaseContext Context(BlockSize, /*NumberOfRegions=*/1U,
+                                      /*ReleaseSize=*/To,
+                                      /*ReleaseBase=*/0U);
+    Context.markRangeAsAllCounted(From, To, /*Base=*/0U, /*RegionIndex=*/0,
+                                  /*RegionSize=*/To);
+
+    for (scudo::uptr Page = 0; Page < RoundedRegionSize; Page += PageSize) {
+      if (Context.PageMap.get(/*Region=*/0U, Page / PageSize) !=
+          Pages[Page / PageSize]) {
+        EXPECT_TRUE(
+            Context.PageMap.isAllCounted(/*Region=*/0U, Page / PageSize));
+      }
+    }
+  } // for each size class
+}
+
+TEST(ScudoReleaseTest, RangeReleaseRegionWithSingleBlock) {
+  testReleaseRangeWithSingleBlock<scudo::DefaultSizeClassMap>();
+  testReleaseRangeWithSingleBlock<scudo::AndroidSizeClassMap>();
+  testReleaseRangeWithSingleBlock<scudo::FuchsiaSizeClassMap>();
+  testReleaseRangeWithSingleBlock<scudo::SvelteSizeClassMap>();
+}
+
+TEST(ScudoReleaseTest, BufferPool) {
+  constexpr scudo::uptr StaticBufferCount = SCUDO_WORDSIZE - 1;
+  constexpr scudo::uptr StaticBufferSize = 512U;
+
+  // Allocate the buffer pool on the heap because it is quite large (slightly
+  // more than StaticBufferCount * StaticBufferSize * sizeof(uptr)) and it may
+  // not fit on the stack on some platforms.
+  using BufferPool = scudo::BufferPool<StaticBufferCount, StaticBufferSize>;
+  std::unique_ptr<BufferPool> Pool(new BufferPool());
+
+  std::vector<std::pair<scudo::uptr *, scudo::uptr>> Buffers;
+  for (scudo::uptr I = 0; I < StaticBufferCount; ++I) {
+    scudo::uptr *P = Pool->getBuffer(StaticBufferSize);
+    EXPECT_TRUE(Pool->isStaticBufferTestOnly(P, StaticBufferSize));
+    Buffers.emplace_back(P, StaticBufferSize);
+  }
+
+  // The static buffer is supposed to be used up.
+  scudo::uptr *P = Pool->getBuffer(StaticBufferSize);
+  EXPECT_FALSE(Pool->isStaticBufferTestOnly(P, StaticBufferSize));
+
+  Pool->releaseBuffer(P, StaticBufferSize);
+  for (auto &Buffer : Buffers)
+    Pool->releaseBuffer(Buffer.first, Buffer.second);
+}
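
Condensing the flow exercised by the release tests in this file, a minimal hedged sketch of one release pass looks as follows (BlockSize and RegionSize are placeholders; the recorder is the test helper defined above):

const scudo::uptr PageSize = scudo::getPageSizeCached();
const scudo::uptr BlockSize = 2 * PageSize;   // Placeholder block size.
const scudo::uptr RegionSize = 16 * PageSize; // Placeholder region size.
auto SkipRegion = [](UNUSED scudo::uptr RegionIndex) { return false; };
ReleasedPagesRecorder Recorder; // Collects the released page ranges.
scudo::PageReleaseContext Context(BlockSize, /*NumberOfRegions=*/1U,
                                  /*ReleaseSize=*/RegionSize);
// Mark a whole range as releasable (markFreeBlocksInRegion() is the
// alternative when walking a free list), then hand it to the OS.
Context.markRangeAsAllCounted(/*From=*/0U, /*To=*/RegionSize, /*Base=*/0U,
                              /*RegionIndex=*/0, RegionSize);
releaseFreeMemoryToOS(Context, Recorder, SkipRegion);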
diff --git a/standalone/tests/scudo_hooks_test.cpp b/standalone/tests/scudo_hooks_test.cpp
new file mode 100644
index 0000000..7184ec1
--- /dev/null
+++ b/standalone/tests/scudo_hooks_test.cpp
@@ -0,0 +1,114 @@
+//===-- scudo_hooks_test.cpp ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "tests/scudo_unit_test.h"
+
+#include "allocator_config.h"
+#include "combined.h"
+
+namespace {
+void *LastAllocatedPtr = nullptr;
+size_t LastRequestSize = 0;
+void *LastDeallocatedPtr = nullptr;
+} // namespace
+
+// Scudo defines weak symbols that can be defined by a client binary
+// to register callbacks at key points in the allocation timeline.  To
+// verify that they are invoked as expected, we provide definitions that
+// update some global state every time they are called, so that tests
+// can inspect their effects.  An unfortunate side effect of this
+// setup is that because those symbols are part of the binary, they
+// can't be selectively enabled; that means that they will get called
+// on unrelated tests in the same compilation unit. To mitigate this
+// issue, we isolate those tests in a separate compilation unit.
+extern "C" {
+__attribute__((visibility("default"))) void __scudo_allocate_hook(void *Ptr,
+                                                                  size_t Size) {
+  LastAllocatedPtr = Ptr;
+  LastRequestSize = Size;
+}
+__attribute__((visibility("default"))) void __scudo_deallocate_hook(void *Ptr) {
+  LastDeallocatedPtr = Ptr;
+}
+}
+
+// Simple check that allocation callbacks, when registered, are called:
+//   1) __scudo_allocate_hook is called when allocating.
+//   2) __scudo_deallocate_hook is called when deallocating.
+//   3) Both hooks are called when reallocating.
+//   4) Neither is called for a no-op reallocation.
+TEST(ScudoHooksTest, AllocateHooks) {
+  scudo::Allocator<scudo::DefaultConfig> Allocator;
+  constexpr scudo::uptr DefaultSize = 16U;
+  constexpr scudo::Chunk::Origin Origin = scudo::Chunk::Origin::Malloc;
+
+  // Simple allocation and deallocation.
+  {
+    LastAllocatedPtr = nullptr;
+    LastRequestSize = 0;
+
+    void *Ptr = Allocator.allocate(DefaultSize, Origin);
+
+    EXPECT_EQ(Ptr, LastAllocatedPtr);
+    EXPECT_EQ(DefaultSize, LastRequestSize);
+
+    LastDeallocatedPtr = nullptr;
+
+    Allocator.deallocate(Ptr, Origin);
+
+    EXPECT_EQ(Ptr, LastDeallocatedPtr);
+  }
+
+  // Simple no-op, same size reallocation.
+  {
+    void *Ptr = Allocator.allocate(DefaultSize, Origin);
+
+    LastAllocatedPtr = nullptr;
+    LastRequestSize = 0;
+    LastDeallocatedPtr = nullptr;
+
+    void *NewPtr = Allocator.reallocate(Ptr, DefaultSize);
+
+    EXPECT_EQ(Ptr, NewPtr);
+    EXPECT_EQ(nullptr, LastAllocatedPtr);
+    EXPECT_EQ(0U, LastRequestSize);
+    EXPECT_EQ(nullptr, LastDeallocatedPtr);
+  }
+
+  // Reallocation in increasing size classes. This ensures that at
+  // least one of the reallocations will be meaningful.
+  {
+    void *Ptr = Allocator.allocate(0, Origin);
+
+    for (scudo::uptr ClassId = 1U;
+         ClassId <= scudo::DefaultConfig::Primary::SizeClassMap::LargestClassId;
+         ++ClassId) {
+      const scudo::uptr Size =
+          scudo::DefaultConfig::Primary::SizeClassMap::getSizeByClassId(
+              ClassId);
+
+      LastAllocatedPtr = nullptr;
+      LastRequestSize = 0;
+      LastDeallocatedPtr = nullptr;
+
+      void *NewPtr = Allocator.reallocate(Ptr, Size);
+
+      if (NewPtr != Ptr) {
+        EXPECT_EQ(NewPtr, LastAllocatedPtr);
+        EXPECT_EQ(Size, LastRequestSize);
+        EXPECT_EQ(Ptr, LastDeallocatedPtr);
+      } else {
+        EXPECT_EQ(nullptr, LastAllocatedPtr);
+        EXPECT_EQ(0U, LastRequestSize);
+        EXPECT_EQ(nullptr, LastDeallocatedPtr);
+      }
+
+      Ptr = NewPtr;
+    }
+  }
+}
diff --git a/standalone/tests/secondary_test.cpp b/standalone/tests/secondary_test.cpp
index e656466..b031901 100644
--- a/standalone/tests/secondary_test.cpp
+++ b/standalone/tests/secondary_test.cpp
@@ -64,7 +64,7 @@
   P = L->allocate(Options, Size + Align, Align);
   EXPECT_NE(P, nullptr);
   void *AlignedP = reinterpret_cast<void *>(
-      scudo::roundUpTo(reinterpret_cast<scudo::uptr>(P), Align));
+      scudo::roundUp(reinterpret_cast<scudo::uptr>(P), Align));
   memset(AlignedP, 'A', Size);
   L->deallocate(Options, P);
 
@@ -122,7 +122,7 @@
 // combined allocator.
 TEST_F(MapAllocatorTest, SecondaryCombinations) {
   constexpr scudo::uptr MinAlign = FIRST_32_SECOND_64(8, 16);
-  constexpr scudo::uptr HeaderSize = scudo::roundUpTo(8, MinAlign);
+  constexpr scudo::uptr HeaderSize = scudo::roundUp(8, MinAlign);
   for (scudo::uptr SizeLog = 0; SizeLog <= 20; SizeLog++) {
     for (scudo::uptr AlignLog = FIRST_32_SECOND_64(3, 4); AlignLog <= 16;
          AlignLog++) {
@@ -131,13 +131,13 @@
         if (static_cast<scudo::sptr>(1U << SizeLog) + Delta <= 0)
           continue;
         const scudo::uptr UserSize =
-            scudo::roundUpTo((1U << SizeLog) + Delta, MinAlign);
+            scudo::roundUp((1U << SizeLog) + Delta, MinAlign);
         const scudo::uptr Size =
             HeaderSize + UserSize + (Align > MinAlign ? Align - HeaderSize : 0);
         void *P = Allocator->allocate(Options, Size, Align);
         EXPECT_NE(P, nullptr);
         void *AlignedP = reinterpret_cast<void *>(
-            scudo::roundUpTo(reinterpret_cast<scudo::uptr>(P), Align));
+            scudo::roundUp(reinterpret_cast<scudo::uptr>(P), Align));
         memset(AlignedP, 0xff, UserSize);
         Allocator->deallocate(Options, P);
       }
diff --git a/standalone/tests/size_class_map_test.cpp b/standalone/tests/size_class_map_test.cpp
index 076f36f..b11db1e 100644
--- a/standalone/tests/size_class_map_test.cpp
+++ b/standalone/tests/size_class_map_test.cpp
@@ -33,7 +33,7 @@
   static const scudo::uptr MinSizeLog = 5;
   static const scudo::uptr MidSizeLog = 5;
   static const scudo::uptr MaxSizeLog = 5;
-  static const scudo::u32 MaxNumCachedHint = 0;
+  static const scudo::u16 MaxNumCachedHint = 0;
   static const scudo::uptr MaxBytesCachedLog = 0;
   static const scudo::uptr SizeDelta = 0;
 };
@@ -48,7 +48,7 @@
   static const scudo::uptr MinSizeLog = 4;
   static const scudo::uptr MidSizeLog = 8;
   static const scudo::uptr MaxSizeLog = 63;
-  static const scudo::u32 MaxNumCachedHint = 128;
+  static const scudo::u16 MaxNumCachedHint = 128;
   static const scudo::uptr MaxBytesCachedLog = 16;
   static const scudo::uptr SizeDelta = 0;
 };
diff --git a/standalone/tests/strings_test.cpp b/standalone/tests/strings_test.cpp
index 6d7e78a..7a69ffd 100644
--- a/standalone/tests/strings_test.cpp
+++ b/standalone/tests/strings_test.cpp
@@ -43,9 +43,11 @@
 }
 
 TEST(ScudoStringsTest, ClearLarge) {
+  constexpr char appendString[] = "123";
   scudo::ScopedString Str;
+  Str.reserve(sizeof(appendString) * 10000);
   for (int i = 0; i < 10000; ++i)
-    Str.append("123");
+    Str.append(appendString);
   Str.clear();
   EXPECT_EQ(0ul, Str.length());
   EXPECT_EQ('\0', *Str.data());
@@ -76,6 +78,7 @@
   // of it with variations of append. The expectation is for nothing to crash.
   const scudo::uptr PageSize = scudo::getPageSizeCached();
   scudo::ScopedString Str;
+  Str.reserve(2 * PageSize);
   Str.clear();
   fillString(Str, 2 * PageSize);
   Str.clear();
diff --git a/standalone/tests/timing_test.cpp b/standalone/tests/timing_test.cpp
new file mode 100644
index 0000000..09a6c31
--- /dev/null
+++ b/standalone/tests/timing_test.cpp
@@ -0,0 +1,86 @@
+//===-- timing_test.cpp -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "tests/scudo_unit_test.h"
+
+#include "timing.h"
+
+#include <string>
+
+class ScudoTimingTest : public Test {
+public:
+  void testFunc1() { scudo::ScopedTimer ST(Manager, __func__); }
+
+  void testFunc2() {
+    scudo::ScopedTimer ST(Manager, __func__);
+    testFunc1();
+  }
+
+  void testChainedCalls() {
+    scudo::ScopedTimer ST(Manager, __func__);
+    testFunc2();
+  }
+
+  void testIgnoredTimer() {
+    scudo::ScopedTimer ST(Manager, __func__);
+    ST.ignore();
+  }
+
+  void printAllTimersStats() { Manager.printAll(); }
+
+  scudo::TimingManager &getTimingManager() { return Manager; }
+
+private:
+  scudo::TimingManager Manager;
+};
+
+// Because the timer statistics are dumped through `scudo::Printf`, which is
+// platform dependent, we don't have a reliable way to capture the output and
+// verify the details. For now we only verify the number of invocations on
+// Linux.
+TEST_F(ScudoTimingTest, SimpleTimer) {
+#if SCUDO_LINUX
+  testing::internal::LogToStderr();
+  testing::internal::CaptureStderr();
+#endif
+
+  testIgnoredTimer();
+  testChainedCalls();
+  printAllTimersStats();
+
+#if SCUDO_LINUX
+  std::string output = testing::internal::GetCapturedStderr();
+  EXPECT_TRUE(output.find("testIgnoredTimer (1)") == std::string::npos);
+  EXPECT_TRUE(output.find("testChainedCalls (1)") != std::string::npos);
+  EXPECT_TRUE(output.find("testFunc2 (1)") != std::string::npos);
+  EXPECT_TRUE(output.find("testFunc1 (1)") != std::string::npos);
+#endif
+}
+
+TEST_F(ScudoTimingTest, NestedTimer) {
+#if SCUDO_LINUX
+  testing::internal::LogToStderr();
+  testing::internal::CaptureStderr();
+#endif
+
+  {
+    scudo::ScopedTimer Outer(getTimingManager(), "Outer");
+    {
+      scudo::ScopedTimer Inner1(getTimingManager(), Outer, "Inner1");
+      { scudo::ScopedTimer Inner2(getTimingManager(), Inner1, "Inner2"); }
+    }
+  }
+  printAllTimersStats();
+
+#if SCUDO_LINUX
+  std::string output = testing::internal::GetCapturedStderr();
+  EXPECT_TRUE(output.find("Outer (1)") != std::string::npos);
+  EXPECT_TRUE(output.find("Inner1 (1)") != std::string::npos);
+  EXPECT_TRUE(output.find("Inner2 (1)") != std::string::npos);
+#endif
+}
diff --git a/standalone/tests/tsd_test.cpp b/standalone/tests/tsd_test.cpp
index 17387ee..a092fdd 100644
--- a/standalone/tests/tsd_test.cpp
+++ b/standalone/tests/tsd_test.cpp
@@ -25,7 +25,9 @@
 public:
   using ThisT = MockAllocator<Config>;
   using TSDRegistryT = typename Config::template TSDRegistryT<ThisT>;
-  using CacheT = struct MockCache { volatile scudo::uptr Canary; };
+  using CacheT = struct MockCache {
+    volatile scudo::uptr Canary;
+  };
   using QuarantineCacheT = struct MockQuarantine {};
 
   void init() {
@@ -80,7 +82,7 @@
   EXPECT_FALSE(Allocator->isInitialized());
 
   auto Registry = Allocator->getTSDRegistry();
-  Registry->init(Allocator.get());
+  Registry->initOnceMaybe(Allocator.get());
   EXPECT_TRUE(Allocator->isInitialized());
 }
 
@@ -100,15 +102,15 @@
   bool UnlockRequired;
   auto TSD = Registry->getTSDAndLock(&UnlockRequired);
   EXPECT_NE(TSD, nullptr);
-  EXPECT_EQ(TSD->Cache.Canary, 0U);
+  EXPECT_EQ(TSD->getCache().Canary, 0U);
   if (UnlockRequired)
     TSD->unlock();
 
   Registry->initThreadMaybe(Allocator.get(), /*MinimalInit=*/false);
   TSD = Registry->getTSDAndLock(&UnlockRequired);
   EXPECT_NE(TSD, nullptr);
-  EXPECT_EQ(TSD->Cache.Canary, 0U);
-  memset(&TSD->Cache, 0x42, sizeof(TSD->Cache));
+  EXPECT_EQ(TSD->getCache().Canary, 0U);
+  memset(&TSD->getCache(), 0x42, sizeof(TSD->getCache()));
   if (UnlockRequired)
     TSD->unlock();
 }
@@ -139,14 +141,14 @@
   // For an exclusive TSD, the cache should be empty. We cannot guarantee the
   // same for a shared TSD.
   if (!UnlockRequired)
-    EXPECT_EQ(TSD->Cache.Canary, 0U);
+    EXPECT_EQ(TSD->getCache().Canary, 0U);
   // Transform the thread id to a uptr to use it as canary.
   const scudo::uptr Canary = static_cast<scudo::uptr>(
       std::hash<std::thread::id>{}(std::this_thread::get_id()));
-  TSD->Cache.Canary = Canary;
+  TSD->getCache().Canary = Canary;
   // Loop a few times to make sure that a concurrent thread isn't modifying it.
   for (scudo::uptr I = 0; I < 4096U; I++)
-    EXPECT_EQ(TSD->Cache.Canary, Canary);
+    EXPECT_EQ(TSD->getCache().Canary, Canary);
   if (UnlockRequired)
     TSD->unlock();
 }
diff --git a/standalone/thread_annotations.h b/standalone/thread_annotations.h
new file mode 100644
index 0000000..68a1087
--- /dev/null
+++ b/standalone/thread_annotations.h
@@ -0,0 +1,70 @@
+//===-- thread_annotations.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_THREAD_ANNOTATIONS_
+#define SCUDO_THREAD_ANNOTATIONS_
+
+// Enable thread safety attributes only with clang.
+// The attributes can be safely ignored when compiling with other compilers.
+#if defined(__clang__)
+#define THREAD_ANNOTATION_ATTRIBUTE_(x) __attribute__((x))
+#else
+#define THREAD_ANNOTATION_ATTRIBUTE_(x) // no-op
+#endif
+
+#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(capability(x))
+
+#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE_(scoped_lockable)
+
+#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(guarded_by(x))
+
+#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(pt_guarded_by(x))
+
+#define ACQUIRED_BEFORE(...)                                                   \
+  THREAD_ANNOTATION_ATTRIBUTE_(acquired_before(__VA_ARGS__))
+
+#define ACQUIRED_AFTER(...)                                                    \
+  THREAD_ANNOTATION_ATTRIBUTE_(acquired_after(__VA_ARGS__))
+
+#define REQUIRES(...)                                                          \
+  THREAD_ANNOTATION_ATTRIBUTE_(requires_capability(__VA_ARGS__))
+
+#define REQUIRES_SHARED(...)                                                   \
+  THREAD_ANNOTATION_ATTRIBUTE_(requires_shared_capability(__VA_ARGS__))
+
+#define ACQUIRE(...)                                                           \
+  THREAD_ANNOTATION_ATTRIBUTE_(acquire_capability(__VA_ARGS__))
+
+#define ACQUIRE_SHARED(...)                                                    \
+  THREAD_ANNOTATION_ATTRIBUTE_(acquire_shared_capability(__VA_ARGS__))
+
+#define RELEASE(...)                                                           \
+  THREAD_ANNOTATION_ATTRIBUTE_(release_capability(__VA_ARGS__))
+
+#define RELEASE_SHARED(...)                                                    \
+  THREAD_ANNOTATION_ATTRIBUTE_(release_shared_capability(__VA_ARGS__))
+
+#define TRY_ACQUIRE(...)                                                       \
+  THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_capability(__VA_ARGS__))
+
+#define TRY_ACQUIRE_SHARED(...)                                                \
+  THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_shared_capability(__VA_ARGS__))
+
+#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE_(locks_excluded(__VA_ARGS__))
+
+#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(assert_capability(x))
+
+#define ASSERT_SHARED_CAPABILITY(x)                                            \
+  THREAD_ANNOTATION_ATTRIBUTE_(assert_shared_capability(x))
+
+#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(lock_returned(x))
+
+#define NO_THREAD_SAFETY_ANALYSIS                                              \
+  THREAD_ANNOTATION_ATTRIBUTE_(no_thread_safety_analysis)
+
+#endif // SCUDO_THREAD_ANNOTATIONS_
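
As a quick illustration of how these macros are meant to be used, here is a hedged sketch with made-up types, following the standard Clang thread-safety idiom rather than any Scudo class:

class CAPABILITY("mutex") ExampleLock {
public:
  void lock() ACQUIRE() {}
  void unlock() RELEASE() {}
};

class ExampleStats {
public:
  void add(int N) EXCLUDES(M) {
    M.lock();
    Total += N; // OK: M is held here.
    M.unlock();
  }
  int totalLocked() REQUIRES(M) { return Total; }

private:
  ExampleLock M;
  int Total GUARDED_BY(M) = 0;
};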
diff --git a/standalone/timing.cpp b/standalone/timing.cpp
new file mode 100644
index 0000000..59ae21d
--- /dev/null
+++ b/standalone/timing.cpp
@@ -0,0 +1,29 @@
+//===-- timing.cpp ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "timing.h"
+
+namespace scudo {
+
+Timer::~Timer() {
+  if (Manager)
+    Manager->report(*this);
+}
+
+ScopedTimer::ScopedTimer(TimingManager &Manager, const char *Name)
+    : Timer(Manager.getOrCreateTimer(Name)) {
+  start();
+}
+
+ScopedTimer::ScopedTimer(TimingManager &Manager, const Timer &Nest,
+                         const char *Name)
+    : Timer(Manager.nest(Nest, Name)) {
+  start();
+}
+
+} // namespace scudo
diff --git a/standalone/timing.h b/standalone/timing.h
new file mode 100644
index 0000000..84caa79
--- /dev/null
+++ b/standalone/timing.h
@@ -0,0 +1,221 @@
+//===-- timing.h ------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_TIMING_H_
+#define SCUDO_TIMING_H_
+
+#include "common.h"
+#include "mutex.h"
+#include "string_utils.h"
+#include "thread_annotations.h"
+
+#include <inttypes.h>
+#include <string.h>
+
+namespace scudo {
+
+class TimingManager;
+
+// A simple timer for evaluating the execution time of code snippets. It can
+// be used along with a TimingManager or standalone.
+class Timer {
+public:
+  // A Timer that is not bound to a TimingManager is expected to handle its
+  // logging manually. Otherwise, the TimingManager does the logging for you.
+  Timer() = default;
+  Timer(Timer &&Other)
+      : StartTime(0), AccTime(Other.AccTime), Manager(Other.Manager),
+        HandleId(Other.HandleId) {
+    Other.Manager = nullptr;
+  }
+
+  Timer(const Timer &) = delete;
+
+  ~Timer();
+
+  void start() {
+    CHECK_EQ(StartTime, 0U);
+    StartTime = getMonotonicTime();
+  }
+  void stop() {
+    AccTime += getMonotonicTime() - StartTime;
+    StartTime = 0;
+  }
+  u64 getAccumulatedTime() const { return AccTime; }
+
+  // Unset the bound TimingManager so that we don't report the data back. This
+  // is useful if we only want to track a subset of events in certain scopes.
+  void ignore() {
+    StartTime = 0;
+    AccTime = 0;
+    Manager = nullptr;
+  }
+
+protected:
+  friend class TimingManager;
+  Timer(TimingManager &Manager, u32 HandleId)
+      : Manager(&Manager), HandleId(HandleId) {}
+
+  u64 StartTime = 0;
+  u64 AccTime = 0;
+  TimingManager *Manager = nullptr;
+  u32 HandleId;
+};
+
+// A RAII-style wrapper for easy scoped execution measurement. Note that, in
+// order not to take additional space for data like `Name`, it only works
+// with a TimingManager.
+class ScopedTimer : public Timer {
+public:
+  ScopedTimer(TimingManager &Manager, const char *Name);
+  ScopedTimer(TimingManager &Manager, const Timer &Nest, const char *Name);
+  ~ScopedTimer() { stop(); }
+};
+
+// In Scudo, the execution time of a single run of a code snippet may not be
+// useful; we are more interested in the average time over several runs.
+// TimingManager lets registered timers report their data and periodically
+// reports the average execution time for each timer.
+class TimingManager {
+public:
+  TimingManager(u32 PrintingInterval = DefaultPrintingInterval)
+      : PrintingInterval(PrintingInterval) {}
+  ~TimingManager() {
+    if (NumAllocatedTimers != 0)
+      printAll();
+  }
+
+  Timer getOrCreateTimer(const char *Name) EXCLUDES(Mutex) {
+    ScopedLock L(Mutex);
+
+    CHECK_LT(strlen(Name), MaxLenOfTimerName);
+    for (u32 I = 0; I < NumAllocatedTimers; ++I) {
+      if (strncmp(Name, Timers[I].Name, MaxLenOfTimerName) == 0)
+        return Timer(*this, I);
+    }
+
+    CHECK_LT(NumAllocatedTimers, MaxNumberOfTimers);
+    strncpy(Timers[NumAllocatedTimers].Name, Name, MaxLenOfTimerName);
+    TimerRecords[NumAllocatedTimers].AccumulatedTime = 0;
+    TimerRecords[NumAllocatedTimers].Occurrence = 0;
+    return Timer(*this, NumAllocatedTimers++);
+  }
+
+  // Add a sub-Timer associated with another Timer. This is used when we want
+  // to break down the execution time within the scope of a Timer.
+  // For example,
+  //   void Foo() {
+  //     // T1 records the time spent in both the first and second tasks.
+  //     ScopedTimer T1(getTimingManager(), "Task1");
+  //     {
+  //       // T2 records the time spent in the first task.
+  //       ScopedTimer T2(getTimingManager(), T1, "Task2");
+  //       // Do first task.
+  //     }
+  //     // Do second task.
+  //   }
+  //
+  // The report will show proper indents to indicate the nested relation like,
+  //   -- Average Operation Time -- -- Name (# of Calls) --
+  //             10.0(ns)            Task1 (1)
+  //              5.0(ns)              Task2 (1)
+  Timer nest(const Timer &T, const char *Name) EXCLUDES(Mutex) {
+    CHECK_EQ(T.Manager, this);
+    Timer Nesting = getOrCreateTimer(Name);
+
+    ScopedLock L(Mutex);
+    CHECK_NE(Nesting.HandleId, T.HandleId);
+    Timers[Nesting.HandleId].Nesting = T.HandleId;
+    return Nesting;
+  }
+
+  void report(const Timer &T) EXCLUDES(Mutex) {
+    ScopedLock L(Mutex);
+
+    const u32 HandleId = T.HandleId;
+    CHECK_LT(HandleId, MaxNumberOfTimers);
+    TimerRecords[HandleId].AccumulatedTime += T.getAccumulatedTime();
+    ++TimerRecords[HandleId].Occurrence;
+    ++NumEventsReported;
+    if (NumEventsReported % PrintingInterval == 0)
+      printAllImpl();
+  }
+
+  void printAll() EXCLUDES(Mutex) {
+    ScopedLock L(Mutex);
+    printAllImpl();
+  }
+
+private:
+  void printAllImpl() REQUIRES(Mutex) {
+    static char NameHeader[] = "-- Name (# of Calls) --";
+    static char AvgHeader[] = "-- Average Operation Time --";
+    ScopedString Str;
+    Str.append("%-15s %-15s\n", AvgHeader, NameHeader);
+
+    for (u32 I = 0; I < NumAllocatedTimers; ++I) {
+      if (Timers[I].Nesting != MaxNumberOfTimers)
+        continue;
+      printImpl(Str, I);
+    }
+
+    Str.output();
+  }
+
+  void printImpl(ScopedString &Str, const u32 HandleId,
+                 const u32 ExtraIndent = 0) REQUIRES(Mutex) {
+    const u64 AccumulatedTime = TimerRecords[HandleId].AccumulatedTime;
+    const u64 Occurrence = TimerRecords[HandleId].Occurrence;
+    const u64 Integral = Occurrence == 0 ? 0 : AccumulatedTime / Occurrence;
+    // Keeping a single digit of the fraction is enough and it makes the
+    // layout easier to maintain.
+    const u64 Fraction =
+        Occurrence == 0 ? 0
+                        : ((AccumulatedTime % Occurrence) * 10) / Occurrence;
+
+    Str.append("%14" PRId64 ".%" PRId64 "(ns) %-11s", Integral, Fraction, " ");
+
+    for (u32 I = 0; I < ExtraIndent; ++I)
+      Str.append("%s", "  ");
+    Str.append("%s (%" PRId64 ")\n", Timers[HandleId].Name, Occurrence);
+
+    for (u32 I = 0; I < NumAllocatedTimers; ++I)
+      if (Timers[I].Nesting == HandleId)
+        printImpl(Str, I, ExtraIndent + 1);
+  }
+
+  // Instead of maintaining pages for timer registration, a static buffer is
+  // sufficient for most use cases in Scudo.
+  static constexpr u32 MaxNumberOfTimers = 50;
+  static constexpr u32 MaxLenOfTimerName = 50;
+  static constexpr u32 DefaultPrintingInterval = 100;
+
+  struct Record {
+    u64 AccumulatedTime = 0;
+    u64 Occurrence = 0;
+  };
+
+  struct TimerInfo {
+    char Name[MaxLenOfTimerName + 1];
+    u32 Nesting = MaxNumberOfTimers;
+  };
+
+  HybridMutex Mutex;
+  // The frequency of proactively dumping the timer statistics. For example, the
+  // default setting is to dump the statistics every 100 reported events.
+  u32 PrintingInterval GUARDED_BY(Mutex);
+  u64 NumEventsReported GUARDED_BY(Mutex) = 0;
+  u32 NumAllocatedTimers GUARDED_BY(Mutex) = 0;
+  TimerInfo Timers[MaxNumberOfTimers] GUARDED_BY(Mutex);
+  Record TimerRecords[MaxNumberOfTimers] GUARDED_BY(Mutex);
+};
+
+} // namespace scudo
+
+#endif // SCUDO_TIMING_H_
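
A brief hedged sketch of the two usage modes described above: a standalone Timer logged by hand, and a ScopedTimer that reports to a TimingManager when it goes out of scope (doWork() is a hypothetical workload):

void doWork(); // Hypothetical function being measured.

void measure() {
  // Standalone: start/stop explicitly and read the accumulated time yourself;
  // no TimingManager is bound, so nothing is reported automatically.
  scudo::Timer T;
  T.start();
  doWork();
  T.stop();
  const scudo::u64 Nanoseconds = T.getAccumulatedTime();
  (void)Nanoseconds;

  // Managed: the ScopedTimer stops itself and reports on destruction; the
  // manager prints averaged statistics periodically or via printAll().
  scudo::TimingManager Manager;
  {
    scudo::ScopedTimer ST(Manager, "doWork");
    doWork();
  }
  Manager.printAll();
}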
diff --git a/standalone/tools/compute_size_class_config.cpp b/standalone/tools/compute_size_class_config.cpp
index 8b17be0..bcaa583 100644
--- a/standalone/tools/compute_size_class_config.cpp
+++ b/standalone/tools/compute_size_class_config.cpp
@@ -140,7 +140,7 @@
   static const uptr MinSizeLog = %zu;
   static const uptr MidSizeLog = %zu;
   static const uptr MaxSizeLog = %zu;
-  static const u32 MaxNumCachedHint = 14;
+  static const u16 MaxNumCachedHint = 14;
   static const uptr MaxBytesCachedLog = 14;
 
   static constexpr u32 Classes[] = {)",
diff --git a/standalone/trusty.cpp b/standalone/trusty.cpp
index 81d6bc5..c08a4e6 100644
--- a/standalone/trusty.cpp
+++ b/standalone/trusty.cpp
@@ -37,7 +37,7 @@
   uptr Start;
   uptr End;
 
-  Start = roundUpTo(ProgramBreak, SBRK_ALIGN);
+  Start = roundUp(ProgramBreak, SBRK_ALIGN);
   // Don't actually extend the heap if MAP_NOACCESS flag is set since this is
   // the case where Scudo tries to reserve a memory region without mapping
   // physical pages.
@@ -45,7 +45,7 @@
     return reinterpret_cast<void *>(Start);
 
   // Attempt to extend the heap by Size bytes using _trusty_brk.
-  End = roundUpTo(Start + Size, SBRK_ALIGN);
+  End = roundUp(Start + Size, SBRK_ALIGN);
   ProgramBreak =
       reinterpret_cast<uptr>(_trusty_brk(reinterpret_cast<void *>(End)));
   if (ProgramBreak < End) {
@@ -76,6 +76,8 @@
 
 void HybridMutex::unlock() {}
 
+void HybridMutex::assertHeldImpl() {}
+
 u64 getMonotonicTime() {
   timespec TS;
   clock_gettime(CLOCK_MONOTONIC, &TS);
@@ -83,6 +85,17 @@
          static_cast<u64>(TS.tv_nsec);
 }
 
+u64 getMonotonicTimeFast() {
+#if defined(CLOCK_MONOTONIC_COARSE)
+  timespec TS;
+  clock_gettime(CLOCK_MONOTONIC_COARSE, &TS);
+  return static_cast<u64>(TS.tv_sec) * (1000ULL * 1000 * 1000) +
+         static_cast<u64>(TS.tv_nsec);
+#else
+  return getMonotonicTime();
+#endif
+}
+
 u32 getNumberOfCPUs() { return 0; }
 
 u32 getThreadID() { return 0; }
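
A minimal sketch of the coarse-clock fallback pattern that the getMonotonicTimeFast addition above follows. It uses only POSIX clock_gettime; the wrapper names are illustrative and not Scudo's.

// Sketch only: CLOCK_MONOTONIC_COARSE when available, CLOCK_MONOTONIC otherwise.
#include <stdint.h>
#include <time.h>

static uint64_t monotonicNanos(clockid_t Clock) {
  timespec TS;
  clock_gettime(Clock, &TS);
  return static_cast<uint64_t>(TS.tv_sec) * 1000000000ULL +
         static_cast<uint64_t>(TS.tv_nsec);
}

uint64_t monotonicNanosFast() {
#if defined(CLOCK_MONOTONIC_COARSE)
  // Lower resolution (typically the tick rate) but cheaper to read.
  return monotonicNanos(CLOCK_MONOTONIC_COARSE);
#else
  return monotonicNanos(CLOCK_MONOTONIC);
#endif
}
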
diff --git a/standalone/tsd.h b/standalone/tsd.h
index b400a3b..c5ed6dd 100644
--- a/standalone/tsd.h
+++ b/standalone/tsd.h
@@ -12,6 +12,7 @@
 #include "atomic_helpers.h"
 #include "common.h"
 #include "mutex.h"
+#include "thread_annotations.h"
 
 #include <limits.h> // for PTHREAD_DESTRUCTOR_ITERATIONS
 #include <pthread.h>
@@ -24,21 +25,17 @@
 namespace scudo {
 
 template <class Allocator> struct alignas(SCUDO_CACHE_LINE_SIZE) TSD {
-  typename Allocator::CacheT Cache;
-  typename Allocator::QuarantineCacheT QuarantineCache;
   using ThisT = TSD<Allocator>;
   u8 DestructorIterations = 0;
 
-  void init(Allocator *Instance) {
+  void init(Allocator *Instance) NO_THREAD_SAFETY_ANALYSIS {
     DCHECK_EQ(DestructorIterations, 0U);
     DCHECK(isAligned(reinterpret_cast<uptr>(this), alignof(ThisT)));
     Instance->initCache(&Cache);
     DestructorIterations = PTHREAD_DESTRUCTOR_ITERATIONS;
   }
 
-  void commitBack(Allocator *Instance) { Instance->commitBack(this); }
-
-  inline bool tryLock() {
+  inline bool tryLock() NO_THREAD_SAFETY_ANALYSIS {
     if (Mutex.tryLock()) {
       atomic_store_relaxed(&Precedence, 0);
       return true;
@@ -49,16 +46,40 @@
           static_cast<uptr>(getMonotonicTime() >> FIRST_32_SECOND_64(16, 0)));
     return false;
   }
-  inline void lock() {
+  inline void lock() NO_THREAD_SAFETY_ANALYSIS {
     atomic_store_relaxed(&Precedence, 0);
     Mutex.lock();
   }
-  inline void unlock() { Mutex.unlock(); }
+  inline void unlock() NO_THREAD_SAFETY_ANALYSIS { Mutex.unlock(); }
   inline uptr getPrecedence() { return atomic_load_relaxed(&Precedence); }
 
+  void commitBack(Allocator *Instance) ASSERT_CAPABILITY(Mutex) {
+    Instance->commitBack(this);
+  }
+
+  // Ideally, we would assert that every operation on Cache/QuarantineCache is
+  // performed with `Mutex` held. However, the current way TSDs are accessed
+  // does not cooperate well with the thread-safety analysis because of pointer
+  // aliasing, so for now we only add the assertion to the getters of
+  // Cache/QuarantineCache.
+  //
+  // TODO(chiahungduan): Ideally, we would call `Mutex.assertHeld`, but
+  // acquiring a TSD doesn't always require holding the lock. Add this
+  // assertion once the lock is always acquired.
+  typename Allocator::CacheT &getCache() ASSERT_CAPABILITY(Mutex) {
+    return Cache;
+  }
+  typename Allocator::QuarantineCacheT &getQuarantineCache()
+      ASSERT_CAPABILITY(Mutex) {
+    return QuarantineCache;
+  }
+
 private:
   HybridMutex Mutex;
   atomic_uptr Precedence = {};
+
+  typename Allocator::CacheT Cache GUARDED_BY(Mutex);
+  typename Allocator::QuarantineCacheT QuarantineCache GUARDED_BY(Mutex);
 };
 
 } // namespace scudo
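
The GUARDED_BY/REQUIRES/EXCLUDES/ASSERT_CAPABILITY annotations added across these TSD headers are Clang thread-safety-analysis attributes, which the new -Werror=thread-safety flag turns into errors. A minimal, self-contained sketch of the same pattern using the raw attribute spellings (the macros in thread_annotations.h are assumed to expand to these):

// Sketch only: the annotation pattern used in tsd.h/tsd_exclusive.h, written
// with raw Clang attributes. Compile with clang++ -Wthread-safety.
#include <mutex>

class __attribute__((capability("mutex"))) CapMutex {
public:
  void lock() __attribute__((acquire_capability())) { M.lock(); }
  void unlock() __attribute__((release_capability())) { M.unlock(); }

private:
  std::mutex M;
};

class Registry {
public:
  // EXCLUDES(Mu): callers must not already hold Mu.
  void initOnceMaybe() __attribute__((locks_excluded(Mu))) {
    Mu.lock();
    if (!Initialized)
      init(); // OK: Mu is held here.
    Mu.unlock();
  }

private:
  // REQUIRES(Mu): may only be called with Mu held.
  void init() __attribute__((requires_capability(Mu))) { Initialized = true; }

  CapMutex Mu;
  // GUARDED_BY(Mu): accesses are flagged unless Mu is held.
  bool Initialized __attribute__((guarded_by(Mu))) = false;
};
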
diff --git a/standalone/tsd_exclusive.h b/standalone/tsd_exclusive.h
index d49427b..2383674 100644
--- a/standalone/tsd_exclusive.h
+++ b/standalone/tsd_exclusive.h
@@ -11,6 +11,8 @@
 
 #include "tsd.h"
 
+#include "string_utils.h"
+
 namespace scudo {
 
 struct ThreadState {
@@ -25,7 +27,7 @@
 template <class Allocator> void teardownThread(void *Ptr);
 
 template <class Allocator> struct TSDRegistryExT {
-  void init(Allocator *Instance) {
+  void init(Allocator *Instance) REQUIRES(Mutex) {
     DCHECK(!Initialized);
     Instance->init();
     CHECK_EQ(pthread_key_create(&PThreadKey, teardownThread<Allocator>), 0);
@@ -33,14 +35,14 @@
     Initialized = true;
   }
 
-  void initOnceMaybe(Allocator *Instance) {
+  void initOnceMaybe(Allocator *Instance) EXCLUDES(Mutex) {
     ScopedLock L(Mutex);
     if (LIKELY(Initialized))
       return;
     init(Instance); // Sets Initialized.
   }
 
-  void unmapTestOnly(Allocator *Instance) {
+  void unmapTestOnly(Allocator *Instance) EXCLUDES(Mutex) {
     DCHECK(Instance);
     if (reinterpret_cast<Allocator *>(pthread_getspecific(PThreadKey))) {
       DCHECK_EQ(reinterpret_cast<Allocator *>(pthread_getspecific(PThreadKey)),
@@ -53,16 +55,32 @@
     FallbackTSD.commitBack(Instance);
     FallbackTSD = {};
     State = {};
+    ScopedLock L(Mutex);
     Initialized = false;
   }
 
+  void drainCaches(Allocator *Instance) {
+    // We don't have a way to iterate over all thread-local `ThreadTSD`s.
+    // Simply drain the current thread's `ThreadTSD` and the `FallbackTSD`.
+    Instance->drainCache(&ThreadTSD);
+    FallbackTSD.lock();
+    Instance->drainCache(&FallbackTSD);
+    FallbackTSD.unlock();
+  }
+
   ALWAYS_INLINE void initThreadMaybe(Allocator *Instance, bool MinimalInit) {
     if (LIKELY(State.InitState != ThreadState::NotInitialized))
       return;
     initThread(Instance, MinimalInit);
   }
 
-  ALWAYS_INLINE TSD<Allocator> *getTSDAndLock(bool *UnlockRequired) {
+  // TODO(chiahungduan): Consider removing the `UnlockRequired` argument by
+  // embedding the logic into TSD or by always locking the TSD. That would let
+  // us apply proper thread annotations here and add runtime assertions in the
+  // member functions of TSD, e.g., assert that the lock is held before calling
+  // TSD::commitBack().
+  ALWAYS_INLINE TSD<Allocator> *
+  getTSDAndLock(bool *UnlockRequired) NO_THREAD_SAFETY_ANALYSIS {
     if (LIKELY(State.InitState == ThreadState::Initialized &&
                !atomic_load(&Disabled, memory_order_acquire))) {
       *UnlockRequired = false;
@@ -75,13 +93,13 @@
 
   // To disable the exclusive TSD registry, we effectively lock the fallback TSD
   // and force all threads to attempt to use it instead of their local one.
-  void disable() {
+  void disable() NO_THREAD_SAFETY_ANALYSIS {
     Mutex.lock();
     FallbackTSD.lock();
     atomic_store(&Disabled, 1U, memory_order_release);
   }
 
-  void enable() {
+  void enable() NO_THREAD_SAFETY_ANALYSIS {
     atomic_store(&Disabled, 0U, memory_order_release);
     FallbackTSD.unlock();
     Mutex.unlock();
@@ -97,6 +115,13 @@
 
   bool getDisableMemInit() { return State.DisableMemInit; }
 
+  void getStats(ScopedString *Str) {
+    // We don't have a way to iterate over all thread-local `ThreadTSD`s.
+    // Printing only the current thread's `ThreadTSD` could be misleading, so
+    // we skip it entirely.
+    Str->append("Exclusive TSD doesn't support iterating over TSDs\n");
+  }
+
 private:
   // Using minimal initialization allows for global initialization while keeping
   // the thread specific structure untouched. The fallback structure will be
@@ -113,7 +138,7 @@
   }
 
   pthread_key_t PThreadKey = {};
-  bool Initialized = false;
+  bool Initialized GUARDED_BY(Mutex) = false;
   atomic_u8 Disabled = {};
   TSD<Allocator> FallbackTSD;
   HybridMutex Mutex;
@@ -128,7 +153,8 @@
 template <class Allocator>
 thread_local ThreadState TSDRegistryExT<Allocator>::State;
 
-template <class Allocator> void teardownThread(void *Ptr) {
+template <class Allocator>
+void teardownThread(void *Ptr) NO_THREAD_SAFETY_ANALYSIS {
   typedef TSDRegistryExT<Allocator> TSDRegistryT;
   Allocator *Instance = reinterpret_cast<Allocator *>(Ptr);
   // The glibc POSIX thread-local-storage deallocation routine calls user
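
The exclusive registry hangs per-thread teardown off pthread_key_create's destructor callback (see the pthread_key_create call in init above). A minimal sketch of that POSIX mechanism, with illustrative names rather than Scudo's:

// Sketch only: per-thread cleanup via a pthread key destructor, the mechanism
// TSDRegistryExT relies on to run teardownThread when a thread exits.
#include <pthread.h>
#include <cstdio>

static pthread_key_t Key;

static void onThreadExit(void *Ptr) {
  // Invoked by the pthread runtime with the key's value at thread exit.
  std::printf("cleaning up %p\n", Ptr);
}

static void *worker(void *Arg) {
  pthread_setspecific(Key, Arg); // Associate per-thread state with the key.
  return nullptr;
}

int main() {
  pthread_key_create(&Key, onThreadExit);
  int State = 0;
  pthread_t T;
  pthread_create(&T, nullptr, worker, &State);
  pthread_join(T, nullptr); // onThreadExit(&State) ran during thread exit.
  return 0;
}
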
diff --git a/standalone/tsd_shared.h b/standalone/tsd_shared.h
index 1c2a880..dcb0948 100644
--- a/standalone/tsd_shared.h
+++ b/standalone/tsd_shared.h
@@ -11,6 +11,8 @@
 
 #include "tsd.h"
 
+#include "string_utils.h"
+
 #if SCUDO_HAS_PLATFORM_TLS_SLOT
 // This is a platform-provided header that needs to be on the include path when
 // Scudo is compiled. It must declare a function with the prototype:
@@ -24,7 +26,7 @@
 
 template <class Allocator, u32 TSDsArraySize, u32 DefaultTSDCount>
 struct TSDRegistrySharedT {
-  void init(Allocator *Instance) {
+  void init(Allocator *Instance) REQUIRES(Mutex) {
     DCHECK(!Initialized);
     Instance->init();
     for (u32 I = 0; I < TSDsArraySize; I++)
@@ -35,22 +37,32 @@
     Initialized = true;
   }
 
-  void initOnceMaybe(Allocator *Instance) {
+  void initOnceMaybe(Allocator *Instance) EXCLUDES(Mutex) {
     ScopedLock L(Mutex);
     if (LIKELY(Initialized))
       return;
     init(Instance); // Sets Initialized.
   }
 
-  void unmapTestOnly(Allocator *Instance) {
+  void unmapTestOnly(Allocator *Instance) EXCLUDES(Mutex) {
     for (u32 I = 0; I < TSDsArraySize; I++) {
       TSDs[I].commitBack(Instance);
       TSDs[I] = {};
     }
     setCurrentTSD(nullptr);
+    ScopedLock L(Mutex);
     Initialized = false;
   }
 
+  void drainCaches(Allocator *Instance) {
+    ScopedLock L(MutexTSDs);
+    for (uptr I = 0; I < NumberOfTSDs; ++I) {
+      TSDs[I].lock();
+      Instance->drainCache(&TSDs[I]);
+      TSDs[I].unlock();
+    }
+  }
+
   ALWAYS_INLINE void initThreadMaybe(Allocator *Instance,
                                      UNUSED bool MinimalInit) {
     if (LIKELY(getCurrentTSD()))
@@ -58,7 +70,10 @@
     initThread(Instance);
   }
 
-  ALWAYS_INLINE TSD<Allocator> *getTSDAndLock(bool *UnlockRequired) {
+  // TSDs is an array of locks, which the thread-safety analysis does not
+  // support annotating as a capability.
+  ALWAYS_INLINE TSD<Allocator> *
+  getTSDAndLock(bool *UnlockRequired) NO_THREAD_SAFETY_ANALYSIS {
     TSD<Allocator> *TSD = getCurrentTSD();
     DCHECK(TSD);
     *UnlockRequired = true;
@@ -75,13 +90,13 @@
     return getTSDAndLockSlow(TSD);
   }
 
-  void disable() {
+  void disable() NO_THREAD_SAFETY_ANALYSIS {
     Mutex.lock();
     for (u32 I = 0; I < TSDsArraySize; I++)
       TSDs[I].lock();
   }
 
-  void enable() {
+  void enable() NO_THREAD_SAFETY_ANALYSIS {
     for (s32 I = static_cast<s32>(TSDsArraySize - 1); I >= 0; I--)
       TSDs[I].unlock();
     Mutex.unlock();
@@ -98,6 +113,19 @@
 
   bool getDisableMemInit() const { return *getTlsPtr() & 1; }
 
+  void getStats(ScopedString *Str) EXCLUDES(MutexTSDs) {
+    ScopedLock L(MutexTSDs);
+
+    Str->append("Stats: SharedTSDs: %u available; total %u\n", NumberOfTSDs,
+                TSDsArraySize);
+    for (uptr I = 0; I < NumberOfTSDs; ++I) {
+      TSDs[I].lock();
+      Str->append("  Shared TSD[%zu]:\n", I);
+      TSDs[I].getCache().getStats(Str);
+      TSDs[I].unlock();
+    }
+  }
+
 private:
   ALWAYS_INLINE uptr *getTlsPtr() const {
 #if SCUDO_HAS_PLATFORM_TLS_SLOT
@@ -119,7 +147,7 @@
     return reinterpret_cast<TSD<Allocator> *>(*getTlsPtr() & ~1ULL);
   }
 
-  bool setNumberOfTSDs(u32 N) {
+  bool setNumberOfTSDs(u32 N) EXCLUDES(MutexTSDs) {
     ScopedLock L(MutexTSDs);
     if (N < NumberOfTSDs)
       return false;
@@ -150,7 +178,7 @@
     *getTlsPtr() |= B;
   }
 
-  NOINLINE void initThread(Allocator *Instance) {
+  NOINLINE void initThread(Allocator *Instance) NO_THREAD_SAFETY_ANALYSIS {
     initOnceMaybe(Instance);
     // Initial context assignment is done in a plain round-robin fashion.
     const u32 Index = atomic_fetch_add(&CurrentIndex, 1U, memory_order_relaxed);
@@ -158,7 +186,10 @@
     Instance->callPostInitCallback();
   }
 
-  NOINLINE TSD<Allocator> *getTSDAndLockSlow(TSD<Allocator> *CurrentTSD) {
+  // TSDs is an array of locks, which the thread-safety analysis does not
+  // support annotating as a capability.
+  NOINLINE TSD<Allocator> *getTSDAndLockSlow(TSD<Allocator> *CurrentTSD)
+      EXCLUDES(MutexTSDs) {
     // Use the Precedence of the current TSD as our random seed. Since we are
     // in the slow path, it means that tryLock failed, and as a result it's
     // very likely that said Precedence is non-zero.
@@ -202,10 +233,10 @@
   }
 
   atomic_u32 CurrentIndex = {};
-  u32 NumberOfTSDs = 0;
-  u32 NumberOfCoPrimes = 0;
-  u32 CoPrimes[TSDsArraySize] = {};
-  bool Initialized = false;
+  u32 NumberOfTSDs GUARDED_BY(MutexTSDs) = 0;
+  u32 NumberOfCoPrimes GUARDED_BY(MutexTSDs) = 0;
+  u32 CoPrimes[TSDsArraySize] GUARDED_BY(MutexTSDs) = {};
+  bool Initialized GUARDED_BY(Mutex) = false;
   HybridMutex Mutex;
   HybridMutex MutexTSDs;
   TSD<Allocator> TSDs[TSDsArraySize];
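
Initial context assignment in the shared registry is plain round-robin over an atomic counter (the atomic_fetch_add on CurrentIndex in initThread). A minimal sketch of that indexing scheme; the pool size and names here are illustrative:

// Sketch only: round-robin slot assignment with a relaxed atomic counter.
#include <atomic>
#include <cstdint>

constexpr uint32_t PoolSize = 8;

static std::atomic<uint32_t> CurrentIndex{0};

uint32_t nextSlot() {
  // Each caller gets the next index; the modulo keeps it in [0, PoolSize).
  const uint32_t Raw = CurrentIndex.fetch_add(1, std::memory_order_relaxed);
  return Raw % PoolSize;
}
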
diff --git a/standalone/vector.h b/standalone/vector.h
index eae774b..9f2c200 100644
--- a/standalone/vector.h
+++ b/standalone/vector.h
@@ -27,7 +27,7 @@
   }
   void destroy() {
     if (Data != &LocalData[0])
-      unmap(Data, CapacityBytes);
+      unmap(Data, CapacityBytes, 0, &MapData);
   }
   T &operator[](uptr I) {
     DCHECK_LT(I, Size);
@@ -40,7 +40,7 @@
   void push_back(const T &Element) {
     DCHECK_LE(Size, capacity());
     if (Size == capacity()) {
-      const uptr NewCapacity = roundUpToPowerOfTwo(Size + 1);
+      const uptr NewCapacity = roundUpPowerOfTwo(Size + 1);
       reallocate(NewCapacity);
     }
     memcpy(&Data[Size++], &Element, sizeof(T));
@@ -82,9 +82,9 @@
   void reallocate(uptr NewCapacity) {
     DCHECK_GT(NewCapacity, 0);
     DCHECK_LE(Size, NewCapacity);
-    NewCapacity = roundUpTo(NewCapacity * sizeof(T), getPageSizeCached());
-    T *NewData =
-        reinterpret_cast<T *>(map(nullptr, NewCapacity, "scudo:vector"));
+    NewCapacity = roundUp(NewCapacity * sizeof(T), getPageSizeCached());
+    T *NewData = reinterpret_cast<T *>(
+        map(nullptr, NewCapacity, "scudo:vector", 0, &MapData));
     memcpy(NewData, Data, Size * sizeof(T));
     destroy();
     Data = NewData;
@@ -95,6 +95,7 @@
   T LocalData[256 / sizeof(T)] = {};
   uptr CapacityBytes = 0;
   uptr Size = 0;
+  [[no_unique_address]] MapPlatformData MapData = {};
 };
 
 template <typename T> class Vector : public VectorNoCtor<T> {
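
VectorNoCtor grows by rounding the requested element count up to a power of two (push_back) and the byte size up to a whole page (reallocate). A small sketch of that sizing arithmetic, with local stand-ins for roundUpPowerOfTwo/roundUp and an assumed 4 KiB page:

// Sketch only: the capacity arithmetic behind push_back/reallocate above.
#include <cstdint>

static uint64_t roundUpPowerOfTwoSketch(uint64_t X) {
  uint64_t P = 1;
  while (P < X)
    P <<= 1;
  return P;
}

static uint64_t roundUpSketch(uint64_t X, uint64_t Boundary) {
  return (X + Boundary - 1) / Boundary * Boundary;
}

// For a 24-byte element and a full vector of Size == 100:
//   roundUpPowerOfTwoSketch(101)  == 128 elements
//   roundUpSketch(128 * 24, 4096) == 4096 bytes mapped
uint64_t bytesToMap(uint64_t Size, uint64_t ElemSize, uint64_t PageSize) {
  const uint64_t NewCapacity = roundUpPowerOfTwoSketch(Size + 1);
  return roundUpSketch(NewCapacity * ElemSize, PageSize);
}
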
diff --git a/standalone/wrappers_c.inc b/standalone/wrappers_c.inc
index bbe3617..3e495ea 100644
--- a/standalone/wrappers_c.inc
+++ b/standalone/wrappers_c.inc
@@ -54,6 +54,8 @@
   return Info;
 }
 
+// On Android, mallinfo2 is an alias of mallinfo, so don't define both.
+#if !SCUDO_ANDROID
 INTERFACE WEAK struct __scudo_mallinfo2 SCUDO_PREFIX(mallinfo2)(void) {
   struct __scudo_mallinfo2 Info = {};
   scudo::StatCounters Stats;
@@ -70,6 +72,7 @@
   Info.fordblks = Info.fsmblks;
   return Info;
 }
+#endif
 
 INTERFACE WEAK void *SCUDO_PREFIX(malloc)(size_t size) {
   return scudo::setErrnoOnNull(SCUDO_ALLOCATOR.allocate(
@@ -91,7 +94,7 @@
       alignment = 1U;
     } else {
       if (UNLIKELY(!scudo::isPowerOfTwo(alignment)))
-        alignment = scudo::roundUpToPowerOfTwo(alignment);
+        alignment = scudo::roundUpPowerOfTwo(alignment);
     }
   } else {
     if (UNLIKELY(!scudo::isPowerOfTwo(alignment))) {
@@ -131,9 +134,9 @@
     scudo::reportPvallocOverflow(size);
   }
   // pvalloc(0) should allocate one page.
-  return scudo::setErrnoOnNull(SCUDO_ALLOCATOR.allocate(
-      size ? scudo::roundUpTo(size, PageSize) : PageSize,
-      scudo::Chunk::Origin::Memalign, PageSize));
+  return scudo::setErrnoOnNull(
+      SCUDO_ALLOCATOR.allocate(size ? scudo::roundUp(size, PageSize) : PageSize,
+                               scudo::Chunk::Origin::Memalign, PageSize));
 }
 
 INTERFACE WEAK void *SCUDO_PREFIX(realloc)(void *ptr, size_t size) {
@@ -188,7 +191,10 @@
                               static_cast<scudo::sptr>(value));
     return 1;
   } else if (param == M_PURGE) {
-    SCUDO_ALLOCATOR.releaseToOS();
+    SCUDO_ALLOCATOR.releaseToOS(scudo::ReleaseToOS::Force);
+    return 1;
+  } else if (param == M_PURGE_ALL) {
+    SCUDO_ALLOCATOR.releaseToOS(scudo::ReleaseToOS::ForceAll);
     return 1;
   } else {
     scudo::Option option;
@@ -238,7 +244,10 @@
     if (size < max_size)
       sizes[size]++;
   };
+
+  SCUDO_ALLOCATOR.disable();
   SCUDO_ALLOCATOR.iterateOverChunks(0, -1ul, callback, sizes);
+  SCUDO_ALLOCATOR.enable();
 
   fputs("<malloc version=\"scudo-1\">\n", stream);
   for (scudo::uptr i = 0; i != max_size; ++i)
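
The mallopt handling above maps M_PURGE and the new M_PURGE_ALL onto the Force and ForceAll release modes. A hedged usage sketch from the caller's side; M_PURGE/M_PURGE_ALL are Bionic-specific mallopt parameters, and the #if guards assume they are only defined on Android releases that support them:

// Sketch only: asking the allocator to release cached pages via mallopt.
#include <malloc.h>

void releaseHeapCaches() {
#if defined(M_PURGE_ALL)
  // Force-release everything, including per-size-class cached blocks.
  mallopt(M_PURGE_ALL, 0);
#elif defined(M_PURGE)
  // Older Bionic: force a regular release to the OS.
  mallopt(M_PURGE, 0);
#endif
}
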
diff --git a/standalone/wrappers_c_bionic.cpp b/standalone/wrappers_c_bionic.cpp
index 18c3bf2..1b9fe67 100644
--- a/standalone/wrappers_c_bionic.cpp
+++ b/standalone/wrappers_c_bionic.cpp
@@ -32,21 +32,6 @@
 #undef SCUDO_ALLOCATOR
 #undef SCUDO_PREFIX
 
-// Svelte MallocDispatch definitions.
-#define SCUDO_PREFIX(name) CONCATENATE(scudo_svelte_, name)
-#define SCUDO_ALLOCATOR SvelteAllocator
-
-extern "C" void SCUDO_PREFIX(malloc_postinit)();
-SCUDO_REQUIRE_CONSTANT_INITIALIZATION
-static scudo::Allocator<scudo::AndroidSvelteConfig,
-                        SCUDO_PREFIX(malloc_postinit)>
-    SCUDO_ALLOCATOR;
-
-#include "wrappers_c.inc"
-
-#undef SCUDO_ALLOCATOR
-#undef SCUDO_PREFIX
-
 // TODO(kostyak): support both allocators.
 INTERFACE void __scudo_print_stats(void) { Allocator.printStats(); }
 
diff --git a/standalone/wrappers_c_checks.h b/standalone/wrappers_c_checks.h
index 815d400..9cd48e8 100644
--- a/standalone/wrappers_c_checks.h
+++ b/standalone/wrappers_c_checks.h
@@ -64,7 +64,7 @@
 // Returns true if the size passed to pvalloc overflows when rounded to the next
 // multiple of PageSize.
 inline bool checkForPvallocOverflow(uptr Size, uptr PageSize) {
-  return roundUpTo(Size, PageSize) < Size;
+  return roundUp(Size, PageSize) < Size;
 }
 
 } // namespace scudo
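
checkForPvallocOverflow works because roundUp wraps on unsigned overflow, making the rounded result smaller than the input. A worked sketch with a local stand-in for roundUp (power-of-two boundaries assumed, as with page sizes):

// Sketch only: detecting overflow in "round size up to a page multiple".
#include <cstdint>

static uint64_t roundUpSketch(uint64_t X, uint64_t PageSize) {
  // Wraps around if X + PageSize - 1 overflows uint64_t.
  return (X + PageSize - 1) & ~(PageSize - 1);
}

bool pvallocWouldOverflow(uint64_t Size, uint64_t PageSize) {
  // On overflow the wrapped result is smaller than Size, which is the signal.
  return roundUpSketch(Size, PageSize) < Size;
}

// Example: with PageSize == 4096, Size == UINT64_MAX - 10 wraps to 0, so the
// check returns true; Size == 5000 rounds to 8192 and the check returns false.
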
diff --git a/standalone/wrappers_cpp.cpp b/standalone/wrappers_cpp.cpp
index 16f495b..374e36d 100644
--- a/standalone/wrappers_cpp.cpp
+++ b/standalone/wrappers_cpp.cpp
@@ -54,26 +54,28 @@
                             static_cast<scudo::uptr>(align));
 }
 
-INTERFACE WEAK void operator delete(void *ptr)NOEXCEPT {
+INTERFACE WEAK void operator delete(void *ptr) NOEXCEPT {
   Allocator.deallocate(ptr, scudo::Chunk::Origin::New);
 }
 INTERFACE WEAK void operator delete[](void *ptr) NOEXCEPT {
   Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray);
 }
-INTERFACE WEAK void operator delete(void *ptr, std::nothrow_t const &)NOEXCEPT {
+INTERFACE WEAK void operator delete(void *ptr,
+                                    std::nothrow_t const &) NOEXCEPT {
   Allocator.deallocate(ptr, scudo::Chunk::Origin::New);
 }
 INTERFACE WEAK void operator delete[](void *ptr,
                                       std::nothrow_t const &) NOEXCEPT {
   Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray);
 }
-INTERFACE WEAK void operator delete(void *ptr, size_t size)NOEXCEPT {
+INTERFACE WEAK void operator delete(void *ptr, size_t size) NOEXCEPT {
   Allocator.deallocate(ptr, scudo::Chunk::Origin::New, size);
 }
 INTERFACE WEAK void operator delete[](void *ptr, size_t size) NOEXCEPT {
   Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray, size);
 }
-INTERFACE WEAK void operator delete(void *ptr, std::align_val_t align)NOEXCEPT {
+INTERFACE WEAK void operator delete(void *ptr,
+                                    std::align_val_t align) NOEXCEPT {
   Allocator.deallocate(ptr, scudo::Chunk::Origin::New, 0,
                        static_cast<scudo::uptr>(align));
 }
@@ -83,7 +85,7 @@
                        static_cast<scudo::uptr>(align));
 }
 INTERFACE WEAK void operator delete(void *ptr, std::align_val_t align,
-                                    std::nothrow_t const &)NOEXCEPT {
+                                    std::nothrow_t const &) NOEXCEPT {
   Allocator.deallocate(ptr, scudo::Chunk::Origin::New, 0,
                        static_cast<scudo::uptr>(align));
 }
@@ -93,7 +95,7 @@
                        static_cast<scudo::uptr>(align));
 }
 INTERFACE WEAK void operator delete(void *ptr, size_t size,
-                                    std::align_val_t align)NOEXCEPT {
+                                    std::align_val_t align) NOEXCEPT {
   Allocator.deallocate(ptr, scudo::Chunk::Origin::New, size,
                        static_cast<scudo::uptr>(align));
 }