diff --git a/norconex-commons-lang/pom.xml b/norconex-commons-lang/pom.xml index 691016f8..92d1c802 100644 --- a/norconex-commons-lang/pom.xml +++ b/norconex-commons-lang/pom.xml @@ -19,7 +19,7 @@ 4.0.0 com.norconex.commons norconex-commons-lang - 1.11.0 + 1.12.0 jar Norconex Commons Lang @@ -27,7 +27,7 @@ UTF-8 UTF-8 - 1.11.0 + 1.12.0 2008 diff --git a/norconex-commons-lang/src/changes/changes.xml b/norconex-commons-lang/src/changes/changes.xml index ed8f533d..ec0ec1e6 100644 --- a/norconex-commons-lang/src/changes/changes.xml +++ b/norconex-commons-lang/src/changes/changes.xml @@ -7,6 +7,19 @@ + + + New URLNormalizer#addDomainTrailingSlash() method. + + + HttpURL now preserves uppercase characters in protocol. + + + Fixed HttpURL constructing URLs with null elements when protocol + was not lowercase (e.g. HTTP). + + + New URLNormalizer#removeTrailingSlash() method. diff --git a/norconex-commons-lang/src/main/java/com/norconex/commons/lang/url/HttpURL.java b/norconex-commons-lang/src/main/java/com/norconex/commons/lang/url/HttpURL.java index 190e1fed..bc12693f 100644 --- a/norconex-commons-lang/src/main/java/com/norconex/commons/lang/url/HttpURL.java +++ b/norconex-commons-lang/src/main/java/com/norconex/commons/lang/url/HttpURL.java @@ -104,18 +104,18 @@ public HttpURL(String url, String encoding) { } else { this.encoding = encoding; } - if (url.startsWith("http")) { + if (StringUtils.startsWithIgnoreCase(url, PROTOCOL_HTTP)) { URL urlwrap; try { urlwrap = new URL(url); } catch (MalformedURLException e) { throw new URLException("Could not interpret URL: " + url, e); } - protocol = urlwrap.getProtocol(); + protocol = StringUtils.substringBefore(url, ":"); host = urlwrap.getHost(); port = urlwrap.getPort(); if (port < 0) { - if (url.startsWith(PROTOCOL_HTTPS)) { + if (StringUtils.startsWithIgnoreCase(url, PROTOCOL_HTTPS)) { port = DEFAULT_HTTPS_PORT; } else { port = DEFAULT_HTTP_PORT; @@ -384,8 +384,9 @@ public String toString() { * @since 1.8.0 */ public boolean 
isPortDefault() { - return PROTOCOL_HTTPS.equalsIgnoreCase(protocol) && port == DEFAULT_HTTPS_PORT - || "http".equalsIgnoreCase(protocol) + return PROTOCOL_HTTPS.equalsIgnoreCase(protocol) + && port == DEFAULT_HTTPS_PORT + || PROTOCOL_HTTP.equalsIgnoreCase(protocol) && port == DEFAULT_HTTP_PORT; } diff --git a/norconex-commons-lang/src/main/java/com/norconex/commons/lang/url/URLNormalizer.java b/norconex-commons-lang/src/main/java/com/norconex/commons/lang/url/URLNormalizer.java index 76666f27..901e2946 100644 --- a/norconex-commons-lang/src/main/java/com/norconex/commons/lang/url/URLNormalizer.java +++ b/norconex-commons-lang/src/main/java/com/norconex/commons/lang/url/URLNormalizer.java @@ -340,6 +340,29 @@ public URLNormalizer addDirectoryTrailingSlash() { } return this; } + + /** + *

Adds a trailing slash (/) right after the domain for URLs with no + * path, before any fragment (#) or query string (?).

+ * + *

Please Note: Adding a trailing slash to URLs could + * potentially break their semantic equivalence.

+ * http://www.example.com → + * http://www.example.com/ + * @return this instance + * @since 1.12.0 + */ + public URLNormalizer addDomainTrailingSlash() { + String urlRoot = HttpURL.getRoot(url); + String path = toURL().getPath(); + if (StringUtils.isNotBlank(path)) { + // there is a path so do nothing + return this; + } + String urlRootAndPath = urlRoot + "/"; + url = StringUtils.replaceOnce(url, urlRoot, urlRootAndPath); + return this; + } /** *

Adds a trailing slash (/) to a URL ending with a directory. A URL is diff --git a/norconex-commons-lang/src/test/java/com/norconex/commons/lang/url/HttpURLTest.java b/norconex-commons-lang/src/test/java/com/norconex/commons/lang/url/HttpURLTest.java index 2acc7c60..3020dc5c 100644 --- a/norconex-commons-lang/src/test/java/com/norconex/commons/lang/url/HttpURLTest.java +++ b/norconex-commons-lang/src/test/java/com/norconex/commons/lang/url/HttpURLTest.java @@ -48,6 +48,13 @@ public void tearDown() throws Exception { t = null; } + @Test + public void testKeepProtocolUpperCase() { + s = "HTTP://www.example.com"; + t = "HTTP://www.example.com"; + assertEquals(t, new HttpURL(s).toString()); + } + @Test public void testToAbsoluteRelativeToProtocol() { s = "//www.relative.com/e/f.html"; diff --git a/norconex-commons-lang/src/test/java/com/norconex/commons/lang/url/URLNormalizerTest.java b/norconex-commons-lang/src/test/java/com/norconex/commons/lang/url/URLNormalizerTest.java index 17cb1006..9965b5c3 100644 --- a/norconex-commons-lang/src/test/java/com/norconex/commons/lang/url/URLNormalizerTest.java +++ b/norconex-commons-lang/src/test/java/com/norconex/commons/lang/url/URLNormalizerTest.java @@ -86,6 +86,41 @@ public void testAllAtOnce() { assertEquals(t, n.toURI().toString()); } + @Test + public void testAddDomainTrailingSlash() { + s = "http://www.example.com"; + t = "http://www.example.com/"; + assertEquals(t, n(s).addDomainTrailingSlash().toString()); + + s = "http://www.example.com/"; + t = "http://www.example.com/"; + assertEquals(t, n(s).addDomainTrailingSlash().toString()); + + s = "http://www.example.com/blah"; + t = "http://www.example.com/blah"; + assertEquals(t, n(s).addDomainTrailingSlash().toString()); + + s = "http://www.example.com/blah/path"; + t = "http://www.example.com/blah/path"; + assertEquals(t, n(s).addDomainTrailingSlash().toString()); + + s = "http://www.example.com?param1=value1¶m2=value2"; + t = 
"http://www.example.com/?param1=value1¶m2=value2"; + assertEquals(t, n(s).addDomainTrailingSlash().toString()); + + s = "http://www.example.com/?param1=value1¶m2=value2"; + t = "http://www.example.com/?param1=value1¶m2=value2"; + assertEquals(t, n(s).addDomainTrailingSlash().toString()); + + s = "http://www.example.com#hash"; + t = "http://www.example.com/#hash"; + assertEquals(t, n(s).addDomainTrailingSlash().toString()); + + s = "http://www.example.com/#hash"; + t = "http://www.example.com/#hash"; + assertEquals(t, n(s).addDomainTrailingSlash().toString()); + } + @Test public void testEncodeUTF8Characters() { @@ -99,6 +134,12 @@ public void testEncodeNonURICharacters() { s = "http://www.example.com/^a [b]/c?d e="; t = "http://www.example.com/%5Ea%20%5Bb%5D/c?d+e="; assertEquals(t, n(s).encodeNonURICharacters().toString()); + + //Test for https://github.com/Norconex/collector-http/issues/294 + //Was failing when HTTP was uppercase + s = "HTTP://www.Example.com/"; + t = "HTTP://www.Example.com/"; + assertEquals(t, n(s).encodeNonURICharacters().toString()); } @Test