From 40a0cfe1218086b248a5147ff7c4cd21390a9f4a Mon Sep 17 00:00:00 2001 From: jlinn Date: Mon, 25 Sep 2017 15:40:12 -0700 Subject: [PATCH] Fix malformed url bug --- README.md | 2 +- pom.xml | 2 +- .../org/elasticsearch/index/analysis/url/URLTokenizer.java | 6 ++++-- .../index/analysis/url/URLTokenizerIntegrationTest.java | 6 ++++++ 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0018270..7e07dd1 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ This plugin enables URL tokenization and token filtering by URL part. | Elasticsearch Version | Plugin Version | |-----------------------|----------------| -| 2.3.4 | 2.3.4.3 | +| 2.3.4 | 2.3.4.4 | | 2.3.3 | 2.3.3.5 | | 2.3.2 | 2.3.2.1 | | 2.3.1 | 2.3.1.1 | diff --git a/pom.xml b/pom.xml index a0bbdf8..5437a65 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ org.elasticsearch elasticsearch-analysis-url - 2.3.4.3 + 2.3.4.4 jar Elasticsearch URL token filter plugin diff --git a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java index 5b56c43..56d26d9 100644 --- a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java +++ b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java @@ -504,8 +504,10 @@ private List tokenizeSpecial(URL url) { // protocol://host token = getPart(url, URLPart.PROTOCOL) + "://" + getPart(url, URLPart.HOST); start = getStartIndex(url, token); - end = getEndIndex(start, token); - tokens.add(new Token(token, URLPart.WHOLE, start, end)); + if (start != -1) { + end = getEndIndex(start, token); + tokens.add(new Token(token, URLPart.WHOLE, start, end)); + } return tokens; } diff --git a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerIntegrationTest.java b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerIntegrationTest.java index 7669d35..a2bf582 100644 --- a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerIntegrationTest.java +++ b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerIntegrationTest.java @@ -33,6 +33,12 @@ public void testAnalyze() { } + @Test + public void testAnalyzePartial() throws Exception { + assertTokensContain("http://", "tokenizer_url_all", ":80", "http:", "http", "80"); + } + + @Test public void testAnalyzeWhole() throws Exception { List tokens = analyzeURL("http://foo.bar.com", "tokenizer_url_all_malformed");