}
}
}
+
+struct url_parse_test_case {
+ const std::string input_uri;
+ const std::string expected_printed_url;
+ const bool verify_host_characters;
+ const std::string expected_printed_url_regex;
+ const bool is_valid;
+ const bool is_valid_regex;
+};
+
+constexpr bool IS_VALID = true;
+constexpr bool VERIFY_HOST_CHARACTERS = true;
+
+// clang-format off
+std::vector<url_parse_test_case> url_parse_test_cases = {
+ {
+ "///dir////index.html",
+ "/dir////index.html",
+ VERIFY_HOST_CHARACTERS,
+ "/dir////index.html",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "/index.html",
+ "/index.html",
+ VERIFY_HOST_CHARACTERS,
+ "/index.html",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "//index.html",
+ "/index.html",
+ VERIFY_HOST_CHARACTERS,
+ "/index.html",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ // The following scheme-only URI is technically valid per the spec, but we
+ // have historically returned this as invalid and I'm not comfortable
+ // changing it in case something depends upon this behavior. Besides, a
+ // scheme-only URI is probably not helpful to us nor something likely
+ // Traffic Server will see.
+ "http://",
+ "",
+ VERIFY_HOST_CHARACTERS,
+ "",
+ !IS_VALID,
+ !IS_VALID
+ },
+ {
+ "https:///",
+ "https:///",
+ VERIFY_HOST_CHARACTERS,
+ "https:///",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ // RFC 3986 section-3: When authority is not present, the path cannot begin
+ // with two slash characters ("//"). We have historically allowed this,
+ // however, and will continue to do so.
+ "https:////",
+ "https:///",
+ VERIFY_HOST_CHARACTERS,
+ "https:///",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ // By convention, our url_print() function adds a path of '/' at the end of
+ // URLs that have no path, query, or fragment after the authority.
+ "mailto:Test.User@example.com",
+ "mailto:Test.User@example.com/",
+ VERIFY_HOST_CHARACTERS,
+ "mailto:Test.User@example.com/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "mailto:Test.User@example.com:25",
+ "mailto:Test.User@example.com:25/",
+ VERIFY_HOST_CHARACTERS,
+ "mailto:Test.User@example.com:25/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com",
+ "https://www.example.com/",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com/",
+ "https://www.example.com/",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com//",
+ "https://www.example.com/",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://127.0.0.1",
+ "https://127.0.0.1/",
+ VERIFY_HOST_CHARACTERS,
+ "https://127.0.0.1/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://[::1]",
+ "https://[::1]/",
+ VERIFY_HOST_CHARACTERS,
+ "https://[::1]/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://127.0.0.1/",
+ "https://127.0.0.1/",
+ VERIFY_HOST_CHARACTERS,
+ "https://127.0.0.1/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com:8888",
+ "https://www.example.com:8888/",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com:8888/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com:8888/",
+ "https://www.example.com:8888/",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com:8888/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com/a/path",
+ "https://www.example.com/a/path",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com//a/path",
+ "https://www.example.com/a/path",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path",
+ IS_VALID,
+ IS_VALID
+ },
+
+ // Technically a trailing '?' with an empty query string is valid, but we
+ // drop the '?'. The parse_regex, however, makes no distinction between
+ // query, fragment, and path components so it does not cut it out.
+ {
+ "https://www.example.com/a/path?",
+ "https://www.example.com/a/path",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path?",
+ IS_VALID,
+ IS_VALID},
+ {
+ "https://www.example.com/a/path?name=value",
+ "https://www.example.com/a/path?name=value",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path?name=value",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com/a/path?name=/a/path/value",
+ "https://www.example.com/a/path?name=/a/path/value",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path?name=/a/path/value",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com/a/path?name=/a/path/value;some=other_value",
+ "https://www.example.com/a/path?name=/a/path/value;some=other_value",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path?name=/a/path/value;some=other_value",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com/a/path?name=/a/path/value;some=other_value/",
+ "https://www.example.com/a/path?name=/a/path/value;some=other_value/",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path?name=/a/path/value;some=other_value/",
+ IS_VALID,
+ IS_VALID
+ },
+
+ // XXX - Tests didn't pass before this change, so commenting out for now.
+ // Again, URL::parse drops a final '?'.
+ // {
+ // "https://www.example.com?",
+ // "https://www.example.com",
+ // VERIFY_HOST_CHARACTERS,
+ // "https://www.example.com?/",
+ // IS_VALID,
+ // IS_VALID
+ // },
+ // {
+ // "https://www.example.com?name=value",
+ // "https://www.example.com?name=value",
+ // VERIFY_HOST_CHARACTERS,
+ // "https://www.example.com?name=value/",
+ // IS_VALID,
+ // IS_VALID
+ // },
+ // {
+ // "https://www.example.com?name=value/",
+ // "https://www.example.com?name=value/",
+ // VERIFY_HOST_CHARACTERS,
+ // "https://www.example.com?name=value/",
+ // IS_VALID,
+ // IS_VALID
+ // },
+
+ // URL::parse also drops the final '#'.
+ // {
+ // "https://www.example.com#",
+ // "https://www.example.com",
+ // VERIFY_HOST_CHARACTERS,
+ // "https://www.example.com#/",
+ // IS_VALID,
+ // IS_VALID
+ // },
+ // {
+ // "https://www.example.com#some=value",
+ // "https://www.example.com#some=value",
+ // VERIFY_HOST_CHARACTERS,
+ // "https://www.example.com#some=value/",
+ // IS_VALID,
+ // IS_VALID
+ // },
+ {
+ "https://www.example.com/a/path#",
+ "https://www.example.com/a/path",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path#",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com/a/path#some=value",
+ "https://www.example.com/a/path#some=value",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path#some=value",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ // Note that this final '?' is not for a query parameter but is a part of
+ // the fragment.
+ "https://www.example.com/a/path#some=value?",
+ "https://www.example.com/a/path#some=value?",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path#some=value?",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com/a/path#some=value?with_question",
+ "https://www.example.com/a/path#some=value?with_question",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path#some=value?with_question",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com/a/path?name=value?_with_question#some=value?with_question/",
+ "https://www.example.com/a/path?name=value?_with_question#some=value?with_question/",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path?name=value?_with_question#some=value?with_question/",
+ IS_VALID,
+ IS_VALID
+ },
+
+ // The following are some examples of strings we expect from regex_map in
+ // remap.config. The "From" portion, which are regular expressions, are
+ // often not parsible by URL::parse but are by URL::parse_regex, which is the
+ // purpose of its existence.
+ {
+ R"(http://(.*)?reactivate\.mail\.yahoo\.com/)",
+ "",
+ VERIFY_HOST_CHARACTERS,
+ R"(http://(.*)?reactivate\.mail\.yahoo\.com/)",
+ !IS_VALID,
+ IS_VALID
+ },
+ {
+ // The following is an example of a "To" URL in a regex_map line. We'll
+ // first verify that the '$' is flagged as invalid for a host in this case.
+ "http://$1reactivate.real.mail.yahoo.com/",
+ "http://$1reactivate.real.mail.yahoo.com/",
+ VERIFY_HOST_CHARACTERS,
+ "http://$1reactivate.real.mail.yahoo.com/",
+ !IS_VALID,
+ IS_VALID
+ },
+ {
+ // Same as above, but this time we pass in !VERIFY_HOST_CHARACTERS. This is
+ // how RemapConfig will call this parse() function.
+ "http://$1reactivate.real.mail.yahoo.com/",
+ "http://$1reactivate.real.mail.yahoo.com/",
+ !VERIFY_HOST_CHARACTERS,
+ "http://$1reactivate.real.mail.yahoo.com/",
+ IS_VALID,
+ IS_VALID
+ }
+};
+// clang-format on
+
+constexpr bool URL_PARSE = true;
+constexpr bool URL_PARSE_REGEX = false;
+
+/** Test the specified url.parse function.
+ *
+ * URL::parse and URL::parse_regex should behave the same. This function
+ * performs the same behavior for each.
+ *
+ * @param[in] test_case The test case specification to run.
+ *
+ * @param[in] parse_function Whether to run parse() or
+ * parse_regex().
+ */
+void
+test_parse(url_parse_test_case const &test_case, bool parse_function)
+{
+ URL url;
+ HdrHeap *heap = new_HdrHeap();
+ url.create(heap);
+ ParseResult result = PARSE_RESULT_OK;
+ if (test_case.verify_host_characters) {
+ result = url.parse(test_case.input_uri.c_str(), test_case.input_uri.size());
+ } else {
+ heap->destroy();
+ return;
+ //result = url.parse_no_host_check(test_case.input_uri.c_str(), test_case.input_uri.size());
+ }
+ bool expected_is_valid = test_case.is_valid;
+
+ if (expected_is_valid && result != PARSE_RESULT_DONE) {
+ std::printf("Parse URI: \"%s\", expected it to be valid but it was parsed invalid (%d)\n", test_case.input_uri.c_str(), result);
+ CHECK(false);
+ } else if (!expected_is_valid && result != PARSE_RESULT_ERROR) {
+ std::printf("Parse URI: \"%s\", expected it to be invalid but it was parsed valid (%d)\n", test_case.input_uri.c_str(), result);
+ CHECK(false);
+ }
+ if (result == PARSE_RESULT_DONE) {
+ char buf[1024];
+ int index = 0;
+ int offset = 0;
+ url.print(buf, sizeof(buf), &index, &offset);
+ std::string printed_url{buf, static_cast<size_t>(index)};
+ CHECK(test_case.expected_printed_url == printed_url);
+ CHECK(test_case.expected_printed_url.size() == printed_url.size());
+ }
+ heap->destroy();
+}
+
+TEST_CASE("UrlParse", "[proxy][parseurl]")
+{
+ for (auto const &test_case : url_parse_test_cases) {
+ test_parse(test_case, URL_PARSE);
+ }
+}