Correctly handle encoding for cache hash generation (#10128)

author Brian Neradt <brian.neradt@gmail.com>

Tue, 1 Aug 2023 18:51:44 +0000 (13:51 -0500)

committer Adrian Bunk <bunk@debian.org>

Sat, 30 Sep 2023 14:35:12 +0000 (15:35 +0100)
author Brian Neradt <brian.neradt@gmail.com>
Tue, 1 Aug 2023 18:51:44 +0000 (13:51 -0500)
committer Adrian Bunk <bunk@debian.org>
Sat, 30 Sep 2023 14:35:12 +0000 (15:35 +0100)
diff --git a/proxy/hdrs/URL.cc b/proxy/hdrs/URL.cc

index 9edf1a43ccf0cfa1e9254f2814052204d697826c..e38df3379ba12bb655c7dd15be191cb00d15d42b 100644 (file)
--- a/proxy/hdrs/URL.cc
+++ b/proxy/hdrs/URL.cc
@@ -1776,6 +1776,16 @@ url_CryptoHash_get_general(const URLImpl *url, CryptoContext &ctx, CryptoHash &h
        while (t < ends[i]) {
          if ((i == 0) || (i == 6)) { // scheme and host
            unescape_str_tolower(p, e, t, ends[i], s);
+        } else if (i == 8 || i == 10 || i == 12) { // path, params, query
+          // Don't unescape the parts of the URI that are processed by the
+          // origin since it may behave differently based upon whether these are
+          // escaped or not. Therefore differently encoded strings should be
+          // cached separately via differentiated hashes.
+          int path_len = ends[i] - t;
+          int min_len  = std::min(path_len, static_cast<int>(e - p));
+          memcpy(p, t, min_len);
+          p += min_len;
+          t += min_len;
          } else {
            unescape_str(p, e, t, ends[i], s);
          }
diff --git a/proxy/hdrs/unit_tests/test_URL.cc b/proxy/hdrs/unit_tests/test_URL.cc

index b022d068e409ce27f0fbf01cb3f499a6aa80cf8a..44f621c76f7f910f838fae9fb739d3f850e94d73 100644 (file)
--- a/proxy/hdrs/unit_tests/test_URL.cc
+++ b/proxy/hdrs/unit_tests/test_URL.cc
@@ -44,3 +44,104 @@ TEST_CASE("Validate Scheme", "[proxy][validscheme]")
      }
    }
  }
+
+struct get_hash_test_case {
+  const std::string description;
+  const std::string uri_1;
+  const std::string uri_2;
+  const bool has_equal_hash;
+};
+
+constexpr bool HAS_EQUAL_HASH = true;
+
+// clang-format off
+std::vector<get_hash_test_case> get_hash_test_cases = {
+  {
+    "No encoding: equal hashes",
+    "http://one.example.com/a/path?name=value#some=value?with_question#fragment",
+    "http://one.example.com/a/path?name=value#some=value?with_question#fragment",
+    HAS_EQUAL_HASH,
+  },
+  {
+    "Scheme encoded: equal hashes",
+    "http%3C://one.example.com/a/path?name=value#some=value?with_question#fragment",
+    "http<://one.example.com/a/path?name=value#some=value?with_question#fragment",
+    HAS_EQUAL_HASH,
+  },
+  {
+    "Host encoded: equal hashes",
+    "http://one%2Eexample.com/a/path?name=value#some=value?with_question#fragment",
+    "http://one.example.com/a/path?name=value#some=value?with_question#fragment",
+    HAS_EQUAL_HASH,
+  },
+  {
+    "Path encoded: differing hashes",
+    "http://one.example.com/a%2Fpath?name=value#some=value?with_question#fragment",
+    "http://one.example.com/a/path?name=value#some=value?with_question#fragment",
+    !HAS_EQUAL_HASH,
+  },
+  {
+    "Query = encoded: differing hashes",
+    "http://one.example.com/a/path?name%3Dvalue#some=value?with_question#fragment",
+    "http://one.example.com/a/path?name=value#some=value?with_question#fragment",
+    !HAS_EQUAL_HASH,
+  },
+  {
+    "Query internal encoded: differing hashes",
+    "http://one.example.com/a/path?name=valu%5D#some=value?with_question#fragment",
+    "http://one.example.com/a/path?name=valu]#some=value?with_question#fragment",
+    !HAS_EQUAL_HASH,
+  },
+  {
+    "Fragment encoded: fragment is not part of the hash",
+    "http://one.example.com/a/path?name=value#some=value?with_question#frag%7Dent",
+    "http://one.example.com/a/path?name=value#some=value?with_question/frag}ent",
+    HAS_EQUAL_HASH,
+  },
+  {
+    "Username encoded: equal hashes",
+    "mysql://my%7Eser:mypassword@localhost/mydatabase",
+    "mysql://my~ser:mypassword@localhost/mydatabase",
+    HAS_EQUAL_HASH,
+  },
+  {
+    "Password encoded: equal hashes",
+    "mysql://myuser:mypa%24sword@localhost/mydatabase",
+    "mysql://myuser:mypa$sword@localhost/mydatabase",
+    HAS_EQUAL_HASH,
+  },
+};
+
+/** Return the hash related to a URI.
+  *
+  * @param[in] uri The URI to hash.
+  * @return The hash of the URI.
+ */
+CryptoHash
+get_hash(const std::string &uri)
+{
+  URL url;
+  HdrHeap *heap = new_HdrHeap();
+  url.create(heap);
+  url.parse(uri.c_str(), uri.length());
+  CryptoHash hash;
+  url.hash_get(&hash);
+  heap->destroy();
+  return hash;
+}
+
+TEST_CASE("UrlHashGet", "[url][hash_get]")
+{
+  for (auto const &test_case : get_hash_test_cases) {
+    std::string description = test_case.description + ": " + test_case.uri_1 + " vs " + test_case.uri_2;
+    SECTION(description) {
+      CryptoHash hash1 = get_hash(test_case.uri_1);
+      CryptoHash hash2 = get_hash(test_case.uri_2);
+      if (test_case.has_equal_hash) {
+        CHECK(hash1 == hash2);
+      } else {
+        CHECK(hash1 != hash2);
+      }
+    }
+  }
+}
author	Brian Neradt <brian.neradt@gmail.com>
	Tue, 1 Aug 2023 18:51:44 +0000 (13:51 -0500)
committer	Adrian Bunk <bunk@debian.org>
	Sat, 30 Sep 2023 14:35:12 +0000 (15:35 +0100)
proxy/hdrs/URL.cc		patch \| blob \| history
proxy/hdrs/unit_tests/test_URL.cc		patch \| blob \| history