From: Bryan Call Date: Tue, 1 Aug 2023 21:52:34 +0000 (-0700) Subject: Remove duplicate slashes at the beginning of the incoming URL (#10133) X-Git-Tag: archive/raspbian/8.1.7-0+deb10u2+rpi1^2~1 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=3282646a402960a3899a12a62a30a6fc31797e97;p=trafficserver.git Remove duplicate slashes at the beginning of the incoming URL (#10133) Gbp-Pq: Name 0003-Remove-duplicate-slashes-at-the-beginning-of-the-inc.patch --- diff --git a/proxy/hdrs/URL.cc b/proxy/hdrs/URL.cc index e38df337..9977ad6a 100644 --- a/proxy/hdrs/URL.cc +++ b/proxy/hdrs/URL.cc @@ -1473,6 +1473,10 @@ done: if (!path_end) { path_end = cur; } + // Remove all preceding slashes + while (path_start < path_end && *path_start == '/') { + ++path_start; + } url_path_set(heap, url, path_start, path_end - path_start, copy_strings); } if (params_start) { @@ -1553,6 +1557,10 @@ url_parse_http_no_path_component_breakdown(HdrHeap *heap, URLImpl *url, const ch // path is anything that's left. if (cur < end) { + // Remove all preceding slashes + while (cur < end && *cur == '/') { + cur++; + } url_path_set(heap, url, cur, end - cur, copy_strings); cur = end; } diff --git a/proxy/hdrs/unit_tests/test_URL.cc b/proxy/hdrs/unit_tests/test_URL.cc index 44f621c7..d5fc7f47 100644 --- a/proxy/hdrs/unit_tests/test_URL.cc +++ b/proxy/hdrs/unit_tests/test_URL.cc @@ -145,3 +145,392 @@ TEST_CASE("UrlHashGet", "[url][hash_get]") } } } + +struct url_parse_test_case { + const std::string input_uri; + const std::string expected_printed_url; + const bool verify_host_characters; + const std::string expected_printed_url_regex; + const bool is_valid; + const bool is_valid_regex; +}; + +constexpr bool IS_VALID = true; +constexpr bool VERIFY_HOST_CHARACTERS = true; + +// clang-format off +std::vector url_parse_test_cases = { + { + "///dir////index.html", + "/dir////index.html", + VERIFY_HOST_CHARACTERS, + "/dir////index.html", + IS_VALID, + IS_VALID + }, + { + "/index.html", + "/index.html", + VERIFY_HOST_CHARACTERS, + "/index.html", + IS_VALID, + IS_VALID + }, + { + "//index.html", + "/index.html", + VERIFY_HOST_CHARACTERS, + "/index.html", + IS_VALID, + IS_VALID + }, + { + // The following scheme-only URI is technically valid per the spec, but we + // have historically returned this as invalid and I'm not comfortable + // changing it in case something depends upon this behavior. Besides, a + // scheme-only URI is probably not helpful to us nor something likely + // Traffic Server will see. + "http://", + "", + VERIFY_HOST_CHARACTERS, + "", + !IS_VALID, + !IS_VALID + }, + { + "https:///", + "https:///", + VERIFY_HOST_CHARACTERS, + "https:///", + IS_VALID, + IS_VALID + }, + { + // RFC 3986 section-3: When authority is not present, the path cannot begin + // with two slash characters ("//"). We have historically allowed this, + // however, and will continue to do so. + "https:////", + "https:///", + VERIFY_HOST_CHARACTERS, + "https:///", + IS_VALID, + IS_VALID + }, + { + // By convention, our url_print() function adds a path of '/' at the end of + // URLs that have no path, query, or fragment after the authority. + "mailto:Test.User@example.com", + "mailto:Test.User@example.com/", + VERIFY_HOST_CHARACTERS, + "mailto:Test.User@example.com/", + IS_VALID, + IS_VALID + }, + { + "mailto:Test.User@example.com:25", + "mailto:Test.User@example.com:25/", + VERIFY_HOST_CHARACTERS, + "mailto:Test.User@example.com:25/", + IS_VALID, + IS_VALID + }, + { + "https://www.example.com", + "https://www.example.com/", + VERIFY_HOST_CHARACTERS, + "https://www.example.com/", + IS_VALID, + IS_VALID + }, + { + "https://www.example.com/", + "https://www.example.com/", + VERIFY_HOST_CHARACTERS, + "https://www.example.com/", + IS_VALID, + IS_VALID + }, + { + "https://www.example.com//", + "https://www.example.com/", + VERIFY_HOST_CHARACTERS, + "https://www.example.com/", + IS_VALID, + IS_VALID + }, + { + "https://127.0.0.1", + "https://127.0.0.1/", + VERIFY_HOST_CHARACTERS, + "https://127.0.0.1/", + IS_VALID, + IS_VALID + }, + { + "https://[::1]", + "https://[::1]/", + VERIFY_HOST_CHARACTERS, + "https://[::1]/", + IS_VALID, + IS_VALID + }, + { + "https://127.0.0.1/", + "https://127.0.0.1/", + VERIFY_HOST_CHARACTERS, + "https://127.0.0.1/", + IS_VALID, + IS_VALID + }, + { + "https://www.example.com:8888", + "https://www.example.com:8888/", + VERIFY_HOST_CHARACTERS, + "https://www.example.com:8888/", + IS_VALID, + IS_VALID + }, + { + "https://www.example.com:8888/", + "https://www.example.com:8888/", + VERIFY_HOST_CHARACTERS, + "https://www.example.com:8888/", + IS_VALID, + IS_VALID + }, + { + "https://www.example.com/a/path", + "https://www.example.com/a/path", + VERIFY_HOST_CHARACTERS, + "https://www.example.com/a/path", + IS_VALID, + IS_VALID + }, + { + "https://www.example.com//a/path", + "https://www.example.com/a/path", + VERIFY_HOST_CHARACTERS, + "https://www.example.com/a/path", + IS_VALID, + IS_VALID + }, + + // Technically a trailing '?' with an empty query string is valid, but we + // drop the '?'. The parse_regex, however, makes no distinction between + // query, fragment, and path components so it does not cut it out. + { + "https://www.example.com/a/path?", + "https://www.example.com/a/path", + VERIFY_HOST_CHARACTERS, + "https://www.example.com/a/path?", + IS_VALID, + IS_VALID}, + { + "https://www.example.com/a/path?name=value", + "https://www.example.com/a/path?name=value", + VERIFY_HOST_CHARACTERS, + "https://www.example.com/a/path?name=value", + IS_VALID, + IS_VALID + }, + { + "https://www.example.com/a/path?name=/a/path/value", + "https://www.example.com/a/path?name=/a/path/value", + VERIFY_HOST_CHARACTERS, + "https://www.example.com/a/path?name=/a/path/value", + IS_VALID, + IS_VALID + }, + { + "https://www.example.com/a/path?name=/a/path/value;some=other_value", + "https://www.example.com/a/path?name=/a/path/value;some=other_value", + VERIFY_HOST_CHARACTERS, + "https://www.example.com/a/path?name=/a/path/value;some=other_value", + IS_VALID, + IS_VALID + }, + { + "https://www.example.com/a/path?name=/a/path/value;some=other_value/", + "https://www.example.com/a/path?name=/a/path/value;some=other_value/", + VERIFY_HOST_CHARACTERS, + "https://www.example.com/a/path?name=/a/path/value;some=other_value/", + IS_VALID, + IS_VALID + }, + + // XXX - Tests didn't pass before this change, so commenting out for now. + // Again, URL::parse drops a final '?'. + // { + // "https://www.example.com?", + // "https://www.example.com", + // VERIFY_HOST_CHARACTERS, + // "https://www.example.com?/", + // IS_VALID, + // IS_VALID + // }, + // { + // "https://www.example.com?name=value", + // "https://www.example.com?name=value", + // VERIFY_HOST_CHARACTERS, + // "https://www.example.com?name=value/", + // IS_VALID, + // IS_VALID + // }, + // { + // "https://www.example.com?name=value/", + // "https://www.example.com?name=value/", + // VERIFY_HOST_CHARACTERS, + // "https://www.example.com?name=value/", + // IS_VALID, + // IS_VALID + // }, + + // URL::parse also drops the final '#'. + // { + // "https://www.example.com#", + // "https://www.example.com", + // VERIFY_HOST_CHARACTERS, + // "https://www.example.com#/", + // IS_VALID, + // IS_VALID + // }, + // { + // "https://www.example.com#some=value", + // "https://www.example.com#some=value", + // VERIFY_HOST_CHARACTERS, + // "https://www.example.com#some=value/", + // IS_VALID, + // IS_VALID + // }, + { + "https://www.example.com/a/path#", + "https://www.example.com/a/path", + VERIFY_HOST_CHARACTERS, + "https://www.example.com/a/path#", + IS_VALID, + IS_VALID + }, + { + "https://www.example.com/a/path#some=value", + "https://www.example.com/a/path#some=value", + VERIFY_HOST_CHARACTERS, + "https://www.example.com/a/path#some=value", + IS_VALID, + IS_VALID + }, + { + // Note that this final '?' is not for a query parameter but is a part of + // the fragment. + "https://www.example.com/a/path#some=value?", + "https://www.example.com/a/path#some=value?", + VERIFY_HOST_CHARACTERS, + "https://www.example.com/a/path#some=value?", + IS_VALID, + IS_VALID + }, + { + "https://www.example.com/a/path#some=value?with_question", + "https://www.example.com/a/path#some=value?with_question", + VERIFY_HOST_CHARACTERS, + "https://www.example.com/a/path#some=value?with_question", + IS_VALID, + IS_VALID + }, + { + "https://www.example.com/a/path?name=value?_with_question#some=value?with_question/", + "https://www.example.com/a/path?name=value?_with_question#some=value?with_question/", + VERIFY_HOST_CHARACTERS, + "https://www.example.com/a/path?name=value?_with_question#some=value?with_question/", + IS_VALID, + IS_VALID + }, + + // The following are some examples of strings we expect from regex_map in + // remap.config. The "From" portion, which are regular expressions, are + // often not parsible by URL::parse but are by URL::parse_regex, which is the + // purpose of its existence. + { + R"(http://(.*)?reactivate\.mail\.yahoo\.com/)", + "", + VERIFY_HOST_CHARACTERS, + R"(http://(.*)?reactivate\.mail\.yahoo\.com/)", + !IS_VALID, + IS_VALID + }, + { + // The following is an example of a "To" URL in a regex_map line. We'll + // first verify that the '$' is flagged as invalid for a host in this case. + "http://$1reactivate.real.mail.yahoo.com/", + "http://$1reactivate.real.mail.yahoo.com/", + VERIFY_HOST_CHARACTERS, + "http://$1reactivate.real.mail.yahoo.com/", + !IS_VALID, + IS_VALID + }, + { + // Same as above, but this time we pass in !VERIFY_HOST_CHARACTERS. This is + // how RemapConfig will call this parse() function. + "http://$1reactivate.real.mail.yahoo.com/", + "http://$1reactivate.real.mail.yahoo.com/", + !VERIFY_HOST_CHARACTERS, + "http://$1reactivate.real.mail.yahoo.com/", + IS_VALID, + IS_VALID + } +}; +// clang-format on + +constexpr bool URL_PARSE = true; +constexpr bool URL_PARSE_REGEX = false; + +/** Test the specified url.parse function. + * + * URL::parse and URL::parse_regex should behave the same. This function + * performs the same behavior for each. + * + * @param[in] test_case The test case specification to run. + * + * @param[in] parse_function Whether to run parse() or + * parse_regex(). + */ +void +test_parse(url_parse_test_case const &test_case, bool parse_function) +{ + URL url; + HdrHeap *heap = new_HdrHeap(); + url.create(heap); + ParseResult result = PARSE_RESULT_OK; + if (test_case.verify_host_characters) { + result = url.parse(test_case.input_uri.c_str(), test_case.input_uri.size()); + } else { + heap->destroy(); + return; + //result = url.parse_no_host_check(test_case.input_uri.c_str(), test_case.input_uri.size()); + } + bool expected_is_valid = test_case.is_valid; + + if (expected_is_valid && result != PARSE_RESULT_DONE) { + std::printf("Parse URI: \"%s\", expected it to be valid but it was parsed invalid (%d)\n", test_case.input_uri.c_str(), result); + CHECK(false); + } else if (!expected_is_valid && result != PARSE_RESULT_ERROR) { + std::printf("Parse URI: \"%s\", expected it to be invalid but it was parsed valid (%d)\n", test_case.input_uri.c_str(), result); + CHECK(false); + } + if (result == PARSE_RESULT_DONE) { + char buf[1024]; + int index = 0; + int offset = 0; + url.print(buf, sizeof(buf), &index, &offset); + std::string printed_url{buf, static_cast(index)}; + CHECK(test_case.expected_printed_url == printed_url); + CHECK(test_case.expected_printed_url.size() == printed_url.size()); + } + heap->destroy(); +} + +TEST_CASE("UrlParse", "[proxy][parseurl]") +{ + for (auto const &test_case : url_parse_test_cases) { + test_parse(test_case, URL_PARSE); + } +}