Use high-performance cores on Apple Silicon for worker pool

On a MacBook with an M4 Max, which has 12 high-performance and 4
efficiency cores, using only the high-performance cores is faster than
using 8 threads for `gn gen`. To reduce contention on the lock guarding
DependencyCache, this change also shards that cache.

`gn gen` for a build config with 30k targets:

```
$ hyperfine --warmup 1 "~/ghq/gn.googlesource.com/gn/out/gn_main gen out/Default" "~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/Default"
Benchmark 1: ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/Default
  Time (mean ± σ):      1.962 s ±  0.024 s    [User: 9.090 s, System: 2.124 s]
  Range (min … max):    1.927 s …  1.997 s    10 runs

Benchmark 2: ~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/Default
  Time (mean ± σ):      1.818 s ±  0.018 s    [User: 9.632 s, System: 3.990 s]
  Range (min … max):    1.794 s …  1.848 s    10 runs

Summary
  ~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/Default ran
    1.08 ± 0.02 times faster than ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/Default
```

`gn gen --check` for a build config with 30k targets:

```
$ hyperfine --warmup 1 "~/ghq/gn.googlesource.com/gn/out/gn_main gen out/Default --check" "~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/Default --check"
Benchmark 1: ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/Default --check
  Time (mean ± σ):      6.569 s ±  0.063 s    [User: 21.740 s, System: 15.753 s]
  Range (min … max):    6.500 s …  6.687 s    10 runs

Benchmark 2: ~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/Default --check
  Time (mean ± σ):      4.968 s ±  0.035 s    [User: 21.436 s, System: 24.465 s]
  Range (min … max):    4.903 s …  5.033 s    10 runs

Summary
  ~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/Default --check ran
    1.32 ± 0.02 times faster than ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/Default --check
```

`gn gen` for a build config with 40k targets:

```
$ hyperfine --warmup 1 "~/ghq/gn.googlesource.com/gn/out/gn_main gen out/no_clang_modules" "~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/no_clang_modules"
Benchmark 1: ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/no_clang_modules
  Time (mean ± σ):      2.854 s ±  0.047 s    [User: 13.619 s, System: 3.021 s]
  Range (min … max):    2.760 s …  2.905 s    10 runs

Benchmark 2: ~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/no_clang_modules
  Time (mean ± σ):      2.496 s ±  0.028 s    [User: 14.474 s, System: 5.201 s]
  Range (min … max):    2.458 s …  2.545 s    10 runs

Summary
  ~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/no_clang_modules ran
    1.14 ± 0.02 times faster than ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/no_clang_modules
```

`gn gen --check` for a build config with 40k targets:

```
$ hyperfine --warmup 1 "~/ghq/gn.googlesource.com/gn/out/gn_main gen out/no_clang_modules --check" "~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/no_clang_modules --check"
Benchmark 1: ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/no_clang_modules --check
  Time (mean ± σ):     21.681 s ±  0.178 s    [User: 133.529 s, System: 16.690 s]
  Range (min … max):   21.454 s … 21.901 s    10 runs

Benchmark 2: ~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/no_clang_modules --check
  Time (mean ± σ):     15.762 s ±  0.212 s    [User: 143.211 s, System: 26.597 s]
  Range (min … max):   15.537 s … 16.215 s    10 runs

Summary
  ~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/no_clang_modules --check ran
    1.38 ± 0.02 times faster than ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/no_clang_modules --check
```

This also improves `--check` performance on Linux:

```
$ hyperfine --warmup 1 "~/ghq/gn.googlesource.com/gn/out/gn_main gen out/Default --check" "~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/Default --check"
Benchmark 1: ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/Default --check
  Time (mean ± σ):      8.296 s ±  0.136 s    [User: 76.245 s, System: 55.341 s]
  Range (min … max):    8.106 s …  8.492 s    10 runs

Benchmark 2: ~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/Default --check
  Time (mean ± σ):      5.575 s ±  0.069 s    [User: 76.813 s, System: 100.234 s]
  Range (min … max):    5.403 s …  5.653 s    10 runs

Summary
  ~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/Default --check ran
    1.49 ± 0.03 times faster than ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/Default --check
```

Bug: 484862257
Bug: 484863025
Change-Id: I7d54762dcdfaaddb81aff6fdeabb3f7efb3f9590
Reviewed-on: https://gn-review.googlesource.com/c/gn/+/21100
Reviewed-by: David Turner <digit@google.com>
Reviewed-by: Sylvain Defresne <sdefresne@chromium.org>
Commit-Queue: Takuto Ikuta <tikuta@google.com>
diff --git a/build/gen.py b/build/gen.py
index eb07c22..6ad60af 100755
--- a/build/gen.py
+++ b/build/gen.py
@@ -912,6 +912,7 @@
         'src/gn/xcode_object_unittest.cc',
         'src/gn/xml_element_writer_unittest.cc',
         'src/util/atomic_write_unittest.cc',
+        'src/util/sys_info_unittest.cc',
         'src/util/test/gn_test.cc',
       ], 'libs': []},
   }
diff --git a/src/gn/header_checker.cc b/src/gn/header_checker.cc
index d363591..70fd300 100644
--- a/src/gn/header_checker.cc
+++ b/src/gn/header_checker.cc
@@ -177,7 +177,7 @@
       check_generated_(check_generated),
       check_system_(check_system),
       targets_count_(targets.size()),
-      lock_(),
+      errors_lock_(),
       task_count_cv_() {
   for (auto* target : targets)
     AddTargetToFileMap(target, &file_map_);
@@ -247,7 +247,7 @@
 void HeaderChecker::DoWork(const Target* target, const SourceFile& file) {
   std::vector<Err> errors;
   if (!CheckFile(target, file, &errors)) {
-    std::lock_guard<std::shared_mutex> lock(lock_);
+    std::lock_guard<std::mutex> lock(errors_lock_);
     errors_.insert(errors_.end(), errors.begin(), errors.end());
   }
 
@@ -581,10 +581,15 @@
     return false;
   }
 
+  size_t hash_for = search_for->label().hash();
+  size_t hash_from = search_from->label().hash();
+  size_t shard_index = (hash_for ^ hash_from) % kNumShards;
+  auto& shard = dependency_cache_[shard_index];
+
   {
-    std::shared_lock<std::shared_mutex> lock(lock_);
-    auto it = dependency_cache_.find(std::make_pair(search_for, search_from));
-    if (it != dependency_cache_.end()) {
+    std::shared_lock<std::shared_mutex> lock(shard.lock);
+    auto it = shard.cache.find(std::make_pair(search_for, search_from));
+    if (it != shard.cache.end()) {
       if (it->second == DependencyState::kNotADependency) {
         *is_permitted = false;
         return false;
@@ -604,8 +609,8 @@
   // Find the shortest public dependency chain.
   if (IsDependencyOf(search_for, search_from, true, chain)) {
     *is_permitted = true;
-    std::unique_lock<std::shared_mutex> lock(lock_);
-    dependency_cache_[std::make_pair(search_for, search_from)] =
+    std::unique_lock<std::shared_mutex> lock(shard.lock);
+    shard.cache[std::make_pair(search_for, search_from)] =
         DependencyState::kPermittedDependency;
     return true;
   }
@@ -613,15 +618,15 @@
   // If not, try to find any dependency chain at all.
   if (IsDependencyOf(search_for, search_from, false, chain)) {
     *is_permitted = false;
-    std::unique_lock<std::shared_mutex> lock(lock_);
-    dependency_cache_[std::make_pair(search_for, search_from)] =
+    std::unique_lock<std::shared_mutex> lock(shard.lock);
+    shard.cache[std::make_pair(search_for, search_from)] =
         DependencyState::kNonPermittedDependency;
     return true;
   }
 
   *is_permitted = false;
-  std::unique_lock<std::shared_mutex> lock(lock_);
-  dependency_cache_[std::make_pair(search_for, search_from)] =
+  std::unique_lock<std::shared_mutex> lock(shard.lock);
+  shard.cache[std::make_pair(search_for, search_from)] =
       DependencyState::kNotADependency;
   return false;
 }
diff --git a/src/gn/header_checker.h b/src/gn/header_checker.h
index 3f313fb..8150bdf 100644
--- a/src/gn/header_checker.h
+++ b/src/gn/header_checker.h
@@ -5,6 +5,7 @@
 #ifndef TOOLS_GN_HEADER_CHECKER_H_
 #define TOOLS_GN_HEADER_CHECKER_H_
 
+#include <array>
 #include <condition_variable>
 #include <functional>
 #include <map>
@@ -203,15 +204,21 @@
   using DependencyCache =
       std::map<std::pair<const Target*, const Target*>, DependencyState>;
 
+  static constexpr size_t kNumShards = 64;
+  struct DependencyCacheShard {
+    mutable std::shared_mutex lock;
+    DependencyCache cache;
+  };
+
   // Locked variables ----------------------------------------------------------
   //
   // These are mutable during runtime and require locking.
 
-  mutable std::shared_mutex lock_;
+  mutable std::mutex errors_lock_;
 
   std::vector<Err> errors_;
 
-  mutable DependencyCache dependency_cache_;
+  mutable std::array<DependencyCacheShard, kNumShards> dependency_cache_;
 
   // Separate lock for task count synchronization since std::condition_variable
   // only works with std::unique_lock<std::mutex>.
diff --git a/src/util/sys_info.cc b/src/util/sys_info.cc
index cc75c45..386ba8b 100644
--- a/src/util/sys_info.cc
+++ b/src/util/sys_info.cc
@@ -12,6 +12,11 @@
 #include <unistd.h>
 #endif
 
+#if defined(OS_MACOSX)
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#endif
+
 #if defined(OS_WIN)
 #include <windows.h>
 #include "base/win/registry.h"
@@ -135,3 +140,21 @@
 #error
 #endif
 }
+
+int NumberOfPerformanceProcessors() {
+#if defined(OS_MACOSX) && defined(ARCH_CPU_ARM64)
+  int nperflevels;
+  size_t len = sizeof(nperflevels);
+  if (sysctlbyname("hw.nperflevels", &nperflevels, &len, nullptr, 0) == 0) {
+    if (nperflevels > 0) {
+      int perflevel0_cores;
+      len = sizeof(perflevel0_cores);
+      if (sysctlbyname("hw.perflevel0.physicalcpu", &perflevel0_cores, &len,
+                       nullptr, 0) == 0) {
+        return perflevel0_cores;
+      }
+    }
+  }
+#endif
+  return NumberOfProcessors();
+}
diff --git a/src/util/sys_info.h b/src/util/sys_info.h
index 7a6924d..904e568 100644
--- a/src/util/sys_info.h
+++ b/src/util/sys_info.h
@@ -11,4 +11,9 @@
 std::string OperatingSystemArchitecture();
 int NumberOfProcessors();
 
+// Returns the number of high-performance processors on the system.
+// Currently only implemented on macOS on Apple Silicon (arm64).
+// On other platforms, or if the query fails, returns NumberOfProcessors().
+int NumberOfPerformanceProcessors();
+
 #endif  // UTIL_SYS_INFO_H_
diff --git a/src/util/sys_info_unittest.cc b/src/util/sys_info_unittest.cc
new file mode 100644
index 0000000..ab39cc8
--- /dev/null
+++ b/src/util/sys_info_unittest.cc
@@ -0,0 +1,28 @@
+// Copyright 2026 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "util/sys_info.h"
+
+#include "util/build_config.h"
+#include "util/test/test.h"
+
+TEST(SysInfoTest, NumberOfProcessors) {
+  int num_processors = NumberOfProcessors();
+  EXPECT_GT(num_processors, 0);
+}
+
+TEST(SysInfoTest, NumberOfPerformanceProcessors) {
+  int num_perf_processors = NumberOfPerformanceProcessors();
+  // Always positive: the performance-core count or NumberOfProcessors().
+  EXPECT_GT(num_perf_processors, 0);
+
+#if defined(OS_MACOSX) && defined(ARCH_CPU_ARM64)
+  // Apple Silicon has both performance and efficiency cores, so the number of
+  // performance cores should not exceed the total number of processors.
+  EXPECT_LE(num_perf_processors, NumberOfProcessors());
+#else
+  // On other platforms, it returns NumberOfProcessors().
+  EXPECT_EQ(num_perf_processors, NumberOfProcessors());
+#endif
+}
diff --git a/src/util/worker_pool.cc b/src/util/worker_pool.cc
index 9efc1d2..ac903fa 100644
--- a/src/util/worker_pool.cc
+++ b/src/util/worker_pool.cc
@@ -24,6 +24,12 @@
     return result;
   }
 
+#if defined(OS_MACOSX) && defined(ARCH_CPU_ARM64)
+  // On Apple Silicon, we want to use only the high-performance cores.
+  // These cores are not hyperthreaded.
+  return NumberOfPerformanceProcessors();
+#endif
+
   // Almost all CPUs now are hyperthreaded.
   int num_cores = NumberOfProcessors() / 2;