Implement `string_hash` function. Bug: chromium:463302946 Change-Id: Iffb1f9071ac23e3a0fc55f5f3bec9f40e142b254 Reviewed-on: https://gn-review.googlesource.com/c/gn/+/20480 Reviewed-by: Andrew Grieve <agrieve@google.com> Commit-Queue: Łukasz Anforowicz <lukasza@chromium.org>
diff --git a/docs/reference.md b/docs/reference.md index 61a227a..1b490c7 100644 --- a/docs/reference.md +++ b/docs/reference.md
@@ -65,6 +65,7 @@ * [set_default_toolchain: Sets the default toolchain name.](#func_set_default_toolchain) * [set_defaults: Set default values for a target type.](#func_set_defaults) * [split_list: Splits a list into N different sub-lists.](#func_split_list) + * [string_hash: Calculates a stable hash of the given string.](#func_string_hash) * [string_join: Concatenates a list of strings with a separator.](#func_string_join) * [string_replace: Replaces substring in the given string.](#func_string_replace) * [string_split: Split string into a list of strings.](#func_string_split) @@ -3455,6 +3456,33 @@ Will print: [[1, 2], [3, 4], [5, 6] ``` +### <a name="func_string_hash"></a>**string_hash**: Calculates a stable hash of the given string. [Back to Top](#gn-reference) + +``` + hash = string_hash(long_string) + + `string_hash` returns a string that contains a hash of the argument. The hash + is computed by first calculating an MD5 hash of the argument, and then + returning the first 8 characters of the lowercase-ASCII, hexadecimal encoding + of the MD5 hash. + + `string_hash` is intended to be used when it is desirable to translate, + globally unique strings (such as GN labels) into short filenames that are + still globally unique. This is useful when supporting filesystems and build + systems which impose limits on the length of the supported filenames and/or on + the total path length. + + Warning: This hash should never be used for cryptographic purposes. + Unique inputs can be assumed to result in unique hashes if the inputs + are trustworthy, but malicious inputs may be able to trigger collisions. + Directories and names of GN labels are usually considered trustworthy. +``` + +#### **Examples** + +``` + string_hash("abc") --> "90015098" +``` ### <a name="func_string_join"></a>**string_join**: Concatenates a list of strings with a separator. [Back to Top](#gn-reference) ```
diff --git a/src/gn/functions.cc b/src/gn/functions.cc index 8386de5..85da001 100644 --- a/src/gn/functions.cc +++ b/src/gn/functions.cc
@@ -10,6 +10,7 @@ #include <utility> #include "base/environment.h" +#include "base/md5.h" #include "base/strings/string_util.h" #include "gn/build_settings.h" #include "gn/config.h" @@ -1134,6 +1135,79 @@ return result; } +// string_hash ----------------------------------------------------------------- + +const char kStringHash[] = "string_hash"; +const char kStringHash_HelpShort[] = + "string_hash: Calculates a stable hash of the given string."; +const char kStringHash_Help[] = + R"(string_hash: Calculates a stable hash of the given string. + + hash = string_hash(long_string) + + `string_hash` returns a string that contains a hash of the argument. The hash + is computed by first calculating an MD5 hash of the argument, and then + returning the first 8 characters of the lowercase-ASCII, hexadecimal encoding + of the MD5 hash. + + `string_hash` is intended to be used when it is desirable to translate, + globally unique strings (such as GN labels) into short filenames that are + still globally unique. This is useful when supporting filesystems and build + systems which impose limits on the length of the supported filenames and/or on + the total path length. + + Warning: This hash should never be used for cryptographic purposes. + Unique inputs can be assumed to result in unique hashes if the inputs + are trustworthy, but malicious inputs may be able to trigger collisions. + Directories and names of GN labels are usually considered trustworthy. + +Examples: + + string_hash("abc") --> "90015098" +)"; + +Value RunStringHash(Scope* scope, + const FunctionCallNode* function, + const std::vector<Value>& args, + Err* err) { + // Check usage: Number of arguments. + if (args.size() != 1) { + *err = Err(function, "Wrong number of arguments to string_hash().", + "Expecting exactly one. usage: string_hash(string)"); + return Value(); + } + + // Check usage: argument is a string. 
+ if (!args[0].VerifyTypeIs(Value::STRING, err)) { + *err = Err(function, + "argument of string_hash is not a string", + "Expecting argument to be a string."); + return Value(); + } + const std::string& arg = args[0].string_value(); + + // Arguments look good; do the hash. + + // MD5 has been chosen as the hash algorithm, because: + // + // 1. MD5 implementation has been readily available in GN repo. + // 2. It fits the requirements of the motivating scenario + // (see https://crbug.com/463302946). In particular: + // 2.1. This scenario doesn't require cryptographic-strength hashing. + // 2.2. MD5 produces slightly shorter hashes than SHA1 and this scenario + // cares about keeping the filenames short, and somewhat ergonomic. + // 2.3. MD5 is a well-known hashing algorithm and this scenario needs + // to replicate the same hash outside of GN. + std::string md5 = base::MD5String(arg); + + // Trimming to 32 bits for improved ergonomics. Probability of collisions + // should still be sufficiently low (see https://crbug.com/463302946 for more + // discussion). + std::string trimmed = md5.substr(0, 8); + + return Value(function, trimmed); +} + // string_join ----------------------------------------------------------------- const char kStringJoin[] = "string_join"; @@ -1487,6 +1561,7 @@ INSERT_FUNCTION(SetDefaults, false) INSERT_FUNCTION(SetDefaultToolchain, false) INSERT_FUNCTION(SplitList, false) + INSERT_FUNCTION(StringHash, false) INSERT_FUNCTION(StringJoin, false) INSERT_FUNCTION(StringReplace, false) INSERT_FUNCTION(StringSplit, false)
diff --git a/src/gn/functions_unittest.cc b/src/gn/functions_unittest.cc index c51c487..c51351a 100644 --- a/src/gn/functions_unittest.cc +++ b/src/gn/functions_unittest.cc
@@ -200,6 +200,50 @@ setup.print_output()); } +TEST(Functions, StringHash) { + TestWithScope setup; + + // Verify the 8-hex-digit output when string_hash() is called correctly. + { + TestParseInput input(R"gn( + print("<" + string_hash("abc") + ">") + + # Empty string + print("<" + string_hash("") + ">") + )gn"); + ASSERT_FALSE(input.has_error()); + + Err err; + input.parsed()->Execute(setup.scope(), &err); + ASSERT_FALSE(err.has_error()) << err.message(); + + EXPECT_EQ( + "<90015098>\n" + "<d41d8cd9>\n", + setup.print_output()) + << setup.print_output(); + } + + // Verify usage errors are detected: each example parses, but Execute fails. + std::vector<std::string> bad_usage_examples = { + // Wrong number of arguments (zero, two). + R"gn(string_hash())gn", + R"gn(string_hash(1,2))gn", + + // Wrong argument type (integer, list). + R"gn(string_hash(1))gn", + R"gn(string_hash(["oops"]))gn", + }; + for (const auto& bad_usage_example : bad_usage_examples) { + TestParseInput input(bad_usage_example); + ASSERT_FALSE(input.has_error()); + + Err err; + input.parsed()->Execute(setup.scope(), &err); + ASSERT_TRUE(err.has_error()) << bad_usage_example; + } +} + TEST(Functions, StringJoin) { TestWithScope setup;