Use high-performance cores on Apple Silicon for worker pool
On a MacBook with an M4 Max, which has 12 high-performance + 4 efficiency
cores, using only the high-performance cores is faster than using 8 threads
for `gn gen`. To reduce contention on the lock guarding DependencyCache, this
change also shards that cache.
`gn gen` for a build config with 30k targets:
```
$ hyperfine --warmup 1 "~/ghq/gn.googlesource.com/gn/out/gn_main gen out/Default" "~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/Default"
Benchmark 1: ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/Default
Time (mean ± σ): 1.962 s ± 0.024 s [User: 9.090 s, System: 2.124 s]
Range (min … max): 1.927 s … 1.997 s 10 runs
Benchmark 2: ~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/Default
Time (mean ± σ): 1.818 s ± 0.018 s [User: 9.632 s, System: 3.990 s]
Range (min … max): 1.794 s … 1.848 s 10 runs
Summary
~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/Default ran
1.08 ± 0.02 times faster than ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/Default
```
`gn gen --check` for a build config with 30k targets:
```
$ hyperfine --warmup 1 "~/ghq/gn.googlesource.com/gn/out/gn_main gen out/Default --check" "~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/Default --check"
Benchmark 1: ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/Default --check
Time (mean ± σ): 6.569 s ± 0.063 s [User: 21.740 s, System: 15.753 s]
Range (min … max): 6.500 s … 6.687 s 10 runs
Benchmark 2: ~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/Default --check
Time (mean ± σ): 4.968 s ± 0.035 s [User: 21.436 s, System: 24.465 s]
Range (min … max): 4.903 s … 5.033 s 10 runs
Summary
~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/Default --check ran
1.32 ± 0.02 times faster than ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/Default --check
```
`gn gen` for a build config with 40k targets:
```
$ hyperfine --warmup 1 "~/ghq/gn.googlesource.com/gn/out/gn_main gen out/no_clang_modules" "~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/no_clang_modules"
Benchmark 1: ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/no_clang_modules
Time (mean ± σ): 2.854 s ± 0.047 s [User: 13.619 s, System: 3.021 s]
Range (min … max): 2.760 s … 2.905 s 10 runs
Benchmark 2: ~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/no_clang_modules
Time (mean ± σ): 2.496 s ± 0.028 s [User: 14.474 s, System: 5.201 s]
Range (min … max): 2.458 s … 2.545 s 10 runs
Summary
~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/no_clang_modules ran
1.14 ± 0.02 times faster than ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/no_clang_modules
```
`gn gen --check` for a build config with 40k targets:
```
$ hyperfine --warmup 1 "~/ghq/gn.googlesource.com/gn/out/gn_main gen out/no_clang_modules --check" "~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/no_clang_modules --check"
Benchmark 1: ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/no_clang_modules --check
Time (mean ± σ): 21.681 s ± 0.178 s [User: 133.529 s, System: 16.690 s]
Range (min … max): 21.454 s … 21.901 s 10 runs
Benchmark 2: ~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/no_clang_modules --check
Time (mean ± σ): 15.762 s ± 0.212 s [User: 143.211 s, System: 26.597 s]
Range (min … max): 15.537 s … 16.215 s 10 runs
Summary
~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/no_clang_modules --check ran
1.38 ± 0.02 times faster than ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/no_clang_modules --check
```
This also improves `--check` performance on Linux:
```
$ hyperfine --warmup 1 "~/ghq/gn.googlesource.com/gn/out/gn_main gen out/Default --check" "~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/Default --check"
Benchmark 1: ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/Default --check
Time (mean ± σ): 8.296 s ± 0.136 s [User: 76.245 s, System: 55.341 s]
Range (min … max): 8.106 s … 8.492 s 10 runs
Benchmark 2: ~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/Default --check
Time (mean ± σ): 5.575 s ± 0.069 s [User: 76.813 s, System: 100.234 s]
Range (min … max): 5.403 s … 5.653 s 10 runs
Summary
~/ghq/gn.googlesource.com/gn/out/gn_shard gen out/Default --check ran
1.49 ± 0.03 times faster than ~/ghq/gn.googlesource.com/gn/out/gn_main gen out/Default --check
```
Bug: 484862257
Bug: 484863025
Change-Id: I7d54762dcdfaaddb81aff6fdeabb3f7efb3f9590
Reviewed-on: https://gn-review.googlesource.com/c/gn/+/21100
Reviewed-by: David Turner <digit@google.com>
Reviewed-by: Sylvain Defresne <sdefresne@chromium.org>
Commit-Queue: Takuto Ikuta <tikuta@google.com>
diff --git a/build/gen.py b/build/gen.py
index eb07c22..6ad60af 100755
--- a/build/gen.py
+++ b/build/gen.py
@@ -912,6 +912,7 @@
'src/gn/xcode_object_unittest.cc',
'src/gn/xml_element_writer_unittest.cc',
'src/util/atomic_write_unittest.cc',
+ 'src/util/sys_info_unittest.cc',
'src/util/test/gn_test.cc',
], 'libs': []},
}
diff --git a/src/gn/header_checker.cc b/src/gn/header_checker.cc
index d363591..70fd300 100644
--- a/src/gn/header_checker.cc
+++ b/src/gn/header_checker.cc
@@ -177,7 +177,7 @@
check_generated_(check_generated),
check_system_(check_system),
targets_count_(targets.size()),
- lock_(),
+ errors_lock_(),
task_count_cv_() {
for (auto* target : targets)
AddTargetToFileMap(target, &file_map_);
@@ -247,7 +247,7 @@
void HeaderChecker::DoWork(const Target* target, const SourceFile& file) {
std::vector<Err> errors;
if (!CheckFile(target, file, &errors)) {
- std::lock_guard<std::shared_mutex> lock(lock_);
+ std::lock_guard<std::mutex> lock(errors_lock_);
errors_.insert(errors_.end(), errors.begin(), errors.end());
}
@@ -581,10 +581,15 @@
return false;
}
+ size_t hash_for = search_for->label().hash();
+ size_t hash_from = search_from->label().hash();
+ size_t shard_index = (hash_for ^ hash_from) % kNumShards;
+ auto& shard = dependency_cache_[shard_index];
+
{
- std::shared_lock<std::shared_mutex> lock(lock_);
- auto it = dependency_cache_.find(std::make_pair(search_for, search_from));
- if (it != dependency_cache_.end()) {
+ std::shared_lock<std::shared_mutex> lock(shard.lock);
+ auto it = shard.cache.find(std::make_pair(search_for, search_from));
+ if (it != shard.cache.end()) {
if (it->second == DependencyState::kNotADependency) {
*is_permitted = false;
return false;
@@ -604,8 +609,8 @@
// Find the shortest public dependency chain.
if (IsDependencyOf(search_for, search_from, true, chain)) {
*is_permitted = true;
- std::unique_lock<std::shared_mutex> lock(lock_);
- dependency_cache_[std::make_pair(search_for, search_from)] =
+ std::unique_lock<std::shared_mutex> lock(shard.lock);
+ shard.cache[std::make_pair(search_for, search_from)] =
DependencyState::kPermittedDependency;
return true;
}
@@ -613,15 +618,15 @@
// If not, try to find any dependency chain at all.
if (IsDependencyOf(search_for, search_from, false, chain)) {
*is_permitted = false;
- std::unique_lock<std::shared_mutex> lock(lock_);
- dependency_cache_[std::make_pair(search_for, search_from)] =
+ std::unique_lock<std::shared_mutex> lock(shard.lock);
+ shard.cache[std::make_pair(search_for, search_from)] =
DependencyState::kNonPermittedDependency;
return true;
}
*is_permitted = false;
- std::unique_lock<std::shared_mutex> lock(lock_);
- dependency_cache_[std::make_pair(search_for, search_from)] =
+ std::unique_lock<std::shared_mutex> lock(shard.lock);
+ shard.cache[std::make_pair(search_for, search_from)] =
DependencyState::kNotADependency;
return false;
}
diff --git a/src/gn/header_checker.h b/src/gn/header_checker.h
index 3f313fb..8150bdf 100644
--- a/src/gn/header_checker.h
+++ b/src/gn/header_checker.h
@@ -5,6 +5,7 @@
#ifndef TOOLS_GN_HEADER_CHECKER_H_
#define TOOLS_GN_HEADER_CHECKER_H_
+#include <array>
#include <condition_variable>
#include <functional>
#include <map>
@@ -203,15 +204,21 @@
using DependencyCache =
std::map<std::pair<const Target*, const Target*>, DependencyState>;
+ static constexpr size_t kNumShards = 64;
+ struct DependencyCacheShard {
+ mutable std::shared_mutex lock;
+ DependencyCache cache;
+ };
+
// Locked variables ----------------------------------------------------------
//
// These are mutable during runtime and require locking.
- mutable std::shared_mutex lock_;
+ mutable std::mutex errors_lock_;
std::vector<Err> errors_;
- mutable DependencyCache dependency_cache_;
+ mutable std::array<DependencyCacheShard, kNumShards> dependency_cache_;
// Separate lock for task count synchronization since std::condition_variable
// only works with std::unique_lock<std::mutex>.
diff --git a/src/util/sys_info.cc b/src/util/sys_info.cc
index cc75c45..386ba8b 100644
--- a/src/util/sys_info.cc
+++ b/src/util/sys_info.cc
@@ -12,6 +12,11 @@
#include <unistd.h>
#endif
+#if defined(OS_MACOSX)
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#endif
+
#if defined(OS_WIN)
#include <windows.h>
#include "base/win/registry.h"
@@ -135,3 +140,21 @@
#error
#endif
}
+
+int NumberOfPerformanceProcessors() {
+#if defined(OS_MACOSX) && defined(ARCH_CPU_ARM64)
+ int nperflevels;
+ size_t len = sizeof(nperflevels);
+ if (sysctlbyname("hw.nperflevels", &nperflevels, &len, nullptr, 0) == 0) {
+ if (nperflevels > 0) {
+ int perflevel0_cores;
+ len = sizeof(perflevel0_cores);
+ if (sysctlbyname("hw.perflevel0.physicalcpu", &perflevel0_cores, &len,
+ nullptr, 0) == 0) {
+ return perflevel0_cores;
+ }
+ }
+ }
+#endif
+ return NumberOfProcessors();
+}
diff --git a/src/util/sys_info.h b/src/util/sys_info.h
index 7a6924d..904e568 100644
--- a/src/util/sys_info.h
+++ b/src/util/sys_info.h
@@ -11,4 +11,9 @@
std::string OperatingSystemArchitecture();
int NumberOfProcessors();
+// Returns the number of high-performance processors on the system.
+// Currently only implemented on macOS running on Apple Silicon (arm64).
+// On other platforms, or if the query fails, returns NumberOfProcessors().
+int NumberOfPerformanceProcessors();
+
#endif // UTIL_SYS_INFO_H_
diff --git a/src/util/sys_info_unittest.cc b/src/util/sys_info_unittest.cc
new file mode 100644
index 0000000..ab39cc8
--- /dev/null
+++ b/src/util/sys_info_unittest.cc
@@ -0,0 +1,28 @@
+// Copyright 2026 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "util/sys_info.h"
+
+#include "util/build_config.h"
+#include "util/test/test.h"
+
+TEST(SysInfoTest, NumberOfProcessors) {
+ int num_processors = NumberOfProcessors();
+ EXPECT_GT(num_processors, 0);
+}
+
+TEST(SysInfoTest, NumberOfPerformanceProcessors) {
+ int num_perf_processors = NumberOfPerformanceProcessors();
+ // Must be positive: either the performance-core count or NumberOfProcessors().
+ EXPECT_GT(num_perf_processors, 0);
+
+#if defined(OS_MACOSX) && defined(ARCH_CPU_ARM64)
+ // Apple Silicon has both performance and efficiency cores, so the number of
+ // performance cores should not exceed the total number of processors.
+ EXPECT_LE(num_perf_processors, NumberOfProcessors());
+#else
+ // On other platforms, it returns NumberOfProcessors().
+ EXPECT_EQ(num_perf_processors, NumberOfProcessors());
+#endif
+}
diff --git a/src/util/worker_pool.cc b/src/util/worker_pool.cc
index 9efc1d2..ac903fa 100644
--- a/src/util/worker_pool.cc
+++ b/src/util/worker_pool.cc
@@ -24,6 +24,12 @@
return result;
}
+#if defined(OS_MACOSX) && defined(ARCH_CPU_ARM64)
+ // On Apple Silicon, we want to use only the high-performance cores.
+ // These cores are not hyperthreaded.
+ return NumberOfPerformanceProcessors();
+#endif
+
// Almost all CPUs now are hyperthreaded.
int num_cores = NumberOfProcessors() / 2;