Speed up generation of input files list. The list of all input files used by a project is very large for projects like Chromium and especially Fuchsia. This CL refactors the InputFileManager::GetAllPhysicalInputFiles() method in order to avoid un-necessary memory allocations and std::set<> operations. This is done by introducing a new VectorSetSorter class in "gn/vector_utils.h" that can be used to efficiently merge and sort sets of items with minimal memory usage. Change-Id: I2e4b22c7294e93ffaec28dda765d371b3f0cc301 Reviewed-on: https://gn-review.googlesource.com/c/gn/+/7941 Commit-Queue: David Turner <digit@google.com> Reviewed-by: Scott Graham <scottmg@chromium.org>
diff --git a/build/gen.py b/build/gen.py index 582562c..8083aac 100755 --- a/build/gen.py +++ b/build/gen.py
@@ -675,6 +675,7 @@ 'src/gn/tokenizer_unittest.cc', 'src/gn/unique_vector_unittest.cc', 'src/gn/value_unittest.cc', + 'src/gn/vector_utils_unittest.cc', 'src/gn/visibility_unittest.cc', 'src/gn/visual_studio_utils_unittest.cc', 'src/gn/visual_studio_writer_unittest.cc',
diff --git a/src/gn/input_file_manager.cc b/src/gn/input_file_manager.cc index 1c1fe01..7e0d7ee 100644 --- a/src/gn/input_file_manager.cc +++ b/src/gn/input_file_manager.cc
@@ -14,6 +14,7 @@ #include "gn/scope_per_file_provider.h" #include "gn/tokenizer.h" #include "gn/trace.h" +#include "gn/vector_utils.h" namespace { @@ -263,13 +264,13 @@ return static_cast<int>(input_files_.size()); } -void InputFileManager::GetAllPhysicalInputFileNames( - std::vector<base::FilePath>* result) const { +void InputFileManager::AddAllPhysicalInputFileNamesToVectorSetSorter( + VectorSetSorter<base::FilePath>* sorter) const { std::lock_guard<std::mutex> lock(lock_); - result->reserve(input_files_.size()); + for (const auto& file : input_files_) { if (!file.second->file.physical_name().empty()) - result->push_back(file.second->file.physical_name()); + sorter->Add(file.second->file.physical_name()); } }
diff --git a/src/gn/input_file_manager.h b/src/gn/input_file_manager.h index f5ac565..d5e5553 100644 --- a/src/gn/input_file_manager.h +++ b/src/gn/input_file_manager.h
@@ -18,6 +18,7 @@ #include "gn/input_file.h" #include "gn/parse_tree.h" #include "gn/settings.h" +#include "gn/vector_utils.h" #include "util/auto_reset_event.h" class BuildSettings; @@ -91,8 +92,16 @@ // Does not count dynamic input. int GetInputFileCount() const; - // Fills the vector with all input files. - void GetAllPhysicalInputFileNames(std::vector<base::FilePath>* result) const; + // Add all physical input files to a VectorSetSorter instance. + // This allows fast merging and sorting with other file paths sets. + // + // This is more memory efficient than returning a vector of base::FilePath + // instance, especially with projects with a very large number of input files, + // but note that the VectorSetSorter only holds pointers to the + // items recorded in this InputFileManager instance, and it is up to the + // caller to ensure these will not change until the sorter is destroyed. + void AddAllPhysicalInputFileNamesToVectorSetSorter( + VectorSetSorter<base::FilePath>* sorter) const; void set_load_file_callback(SyncLoadFileCallback load_file_callback) { load_file_callback_ = load_file_callback;
diff --git a/src/gn/json_project_writer.cc b/src/gn/json_project_writer.cc index 7c04ea1..c478379 100644 --- a/src/gn/json_project_writer.cc +++ b/src/gn/json_project_writer.cc
@@ -209,26 +209,34 @@ "default_toolchain", base::Value(default_toolchain_label.GetUserVisibleName(false))); - std::vector<base::FilePath> input_files; - g_scheduler->input_file_manager()->GetAllPhysicalInputFileNames(&input_files); - // Other files read by the build. std::vector<base::FilePath> other_files = g_scheduler->GetGenDependencies(); - // Sort the input files to order them deterministically. - // Additionally, remove duplicate filepaths that seem to creep in. - std::set<base::FilePath> fileset(input_files.begin(), input_files.end()); - fileset.insert(other_files.begin(), other_files.end()); + const InputFileManager* input_file_manager = + g_scheduler->input_file_manager(); base::ListValue inputs; - const auto &build_path = build_settings->root_path(); - for (const auto& other_file : fileset) { - std::string file; - if (MakeAbsolutePathRelativeIfPossible(FilePathToUTF8(build_path), - FilePathToUTF8(other_file), &file)) { - inputs.Append(std::make_unique<base::Value>(std::move(file))); - } + { + VectorSetSorter<base::FilePath> sorter( + input_file_manager->GetInputFileCount() + other_files.size()); + + input_file_manager->AddAllPhysicalInputFileNamesToVectorSetSorter(&sorter); + + sorter.Add(other_files.begin(), other_files.end()); + + std::string build_path = FilePathToUTF8(build_settings->root_path()); + auto item_callback = [&inputs, + &build_path](const base::FilePath& input_file) { + std::string file; + if (MakeAbsolutePathRelativeIfPossible( + build_path, FilePathToUTF8(input_file), &file)) { + inputs.Append(std::make_unique<base::Value>(std::move(file))); + } + }; + + sorter.IterateOver(item_callback); } + settings->SetKey("gen_input_files", std::move(inputs)); auto output = std::make_unique<base::DictionaryValue>();
diff --git a/src/gn/ninja_build_writer.cc b/src/gn/ninja_build_writer.cc index 12ccaa7..b5b491b 100644 --- a/src/gn/ninja_build_writer.cc +++ b/src/gn/ninja_build_writer.cc
@@ -292,30 +292,35 @@ // exist and will error if they don't. When files are listed in a depfile, // missing files are ignored. dep_out_ << "build.ninja:"; - std::vector<base::FilePath> input_files; - g_scheduler->input_file_manager()->GetAllPhysicalInputFileNames(&input_files); // Other files read by the build. std::vector<base::FilePath> other_files = g_scheduler->GetGenDependencies(); - // Sort the input files to order them deterministically. - // Additionally, remove duplicate filepaths that seem to creep in. - std::set<base::FilePath> fileset(input_files.begin(), input_files.end()); - fileset.insert(other_files.begin(), other_files.end()); + const InputFileManager* input_file_manager = + g_scheduler->input_file_manager(); + + VectorSetSorter<base::FilePath> sorter( + input_file_manager->GetInputFileCount() + other_files.size()); + + input_file_manager->AddAllPhysicalInputFileNamesToVectorSetSorter(&sorter); + sorter.Add(other_files.begin(), other_files.end()); const base::FilePath build_path = build_settings_->build_dir().Resolve(build_settings_->root_path()); EscapeOptions depfile_escape; depfile_escape.mode = ESCAPE_DEPFILE; - for (const auto& other_file : fileset) { + auto item_callback = [this, &depfile_escape, + &build_path](const base::FilePath& input_file) { const base::FilePath file = - MakeAbsoluteFilePathRelativeIfPossible(build_path, other_file); + MakeAbsoluteFilePathRelativeIfPossible(build_path, input_file); dep_out_ << " "; EscapeStringToStream(dep_out_, FilePathToUTF8(file.NormalizePathSeparatorsTo('/')), depfile_escape); - } + }; + + sorter.IterateOver(item_callback); out_ << std::endl; }
diff --git a/src/gn/vector_utils.h b/src/gn/vector_utils.h new file mode 100644 index 0000000..8366dda --- /dev/null +++ b/src/gn/vector_utils.h
@@ -0,0 +1,103 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TOOLS_GN_VECTOR_UTILS_H_ +#define TOOLS_GN_VECTOR_UTILS_H_ + +#include <algorithm> +#include <utility> +#include <vector> + +// A VectorSetSorter is a convenience class used to efficiently sort and +// de-duplicate one or more sets of items of type T, then iterate over the +// result, or get it as a simple vector. Usage is the following: +// +// For performance reasons, this implementation only stores pointers to the +// input items in order to minimize memory usage. Callers should ensure the +// items added to this sorter do not change until the instance is destroyed. +// +// 1) Create instance, passing an optional initial capacity. +// +// 2) Add items using one of the Add() methods, as many times as +// necessary. Note that this records only pointers to said items +// so their content should not change until the instance is destroyed. +// +// 3) Call IteratorOver() to iterate over all sorted and de-duplicated +// items. +// +// 4) Alternatively, call AsVector() to return a new vector that contains +// copies of the original sorted / deduplicated items. +// +template <typename T> +class VectorSetSorter { + public: + // Constructor. |initial_capacity| might be provided to minimize the number + // of allocations performed by this instance, if the maximum number of + // input items is known in advance. + VectorSetSorter(size_t initial_capacity = 0) { + ptrs_.reserve(initial_capacity); + } + + // Add one single item to the sorter. + void Add(const T& item) { + ptrs_.push_back(&item); + sorted_ = false; + } + + // Add one range of items to the sorter. + void Add(typename std::vector<T>::const_iterator begin, + typename std::vector<T>::const_iterator end) { + for (; begin != end; ++begin) + ptrs_.push_back(&(*begin)); + sorted_ = false; + } + + // Add one range of items to the sorter. + void Add(const T* start, const T* end) { + for (; start != end; ++start) + ptrs_.push_back(start); + sorted_ = false; + } + + // Iterate over all sorted items, removing duplicates from the loop. + // |item_callback| is a callable that will be invoked for each item in the + // result. + template <typename ITEM_CALLBACK> + void IterateOver(ITEM_CALLBACK item_callback) { + if (!sorted_) { + Sort(); + } + const T* prev_item = nullptr; + for (const T* item : ptrs_) { + if (!prev_item || *prev_item != *item) { + item_callback(*item); + prev_item = item; + } + } + } + + // Return the sorted and de-duplicated resulting set as a vector of items. + // Note that this copies the input items. + std::vector<T> AsVector() { + std::vector<T> result; + result.reserve(ptrs_.size()); + IterateOver([&result](const T& item) { result.push_back(item); }); + return result; + } + + private: + // Sort all items previously added to this instance. + // Must be called after adding all desired items, and before + // calling IterateOver() or AsVector(). + void Sort() { + std::sort(ptrs_.begin(), ptrs_.end(), + [](const T* a, const T* b) { return *a < *b; }); + sorted_ = true; + } + + std::vector<const T*> ptrs_; + bool sorted_ = false; +}; + +#endif // TOOLS_GN_VECTOR_UTILS_H_
diff --git a/src/gn/vector_utils_unittest.cc b/src/gn/vector_utils_unittest.cc new file mode 100644 index 0000000..c7cee01 --- /dev/null +++ b/src/gn/vector_utils_unittest.cc
@@ -0,0 +1,47 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "gn/vector_utils.h" + +#include "util/test/test.h" + +#include <string> + +TEST(VectorSetSorter, AsVectorWithStrings) { + VectorSetSorter<std::string> sorter; + + std::vector<std::string> input = { + "World!", "Hello", "bonjour", "Hello", "monde!", "World!", + }; + + sorter.Add(input.begin(), input.end()); + auto result = sorter.AsVector(); + + ASSERT_EQ(result.size(), 4u) << result.size(); + EXPECT_STREQ(result[0].c_str(), "Hello"); + EXPECT_STREQ(result[1].c_str(), "World!"); + EXPECT_STREQ(result[2].c_str(), "bonjour"); + EXPECT_STREQ(result[3].c_str(), "monde!"); +} + +TEST(VectorSetSorter, IterateOverWithStrings) { + VectorSetSorter<std::string> sorter; + + std::vector<std::string> input = { + "World!", "Hello", "bonjour", "Hello", "monde!", "World!", + }; + + sorter.Add(input.begin(), input.end()); + + std::vector<std::string> result; + + sorter.IterateOver( + [&result](const std::string& str) { result.push_back(str); }); + + ASSERT_EQ(result.size(), 4u) << result.size(); + EXPECT_STREQ(result[0].c_str(), "Hello"); + EXPECT_STREQ(result[1].c_str(), "World!"); + EXPECT_STREQ(result[2].c_str(), "bonjour"); + EXPECT_STREQ(result[3].c_str(), "monde!"); +}