From b90acadca3220f8c2454e107f3e0ad8a3a34415c Mon Sep 17 00:00:00 2001 From: Tony Trinh Date: Sun, 15 Sep 2013 14:13:13 -0500 Subject: [PATCH] Check for quote-escapes in isEscapedChar() (REGEX-22) Previously, our escape-checker was only looking for slash-escapes, but java.regex.Pattern allows escaping character sequences in between \Q and \E. This caused us to incorrectly count quote-escaped parentheses and brackets, leading to invalid group values in: \Q[\E(?\d+) Issue #2 --- .../java/com/google/code/regexp/Pattern.java | 49 ++++++++++++++++++- .../com/google/code/regexp/MatcherTest.java | 47 ++++++++++++++++++ .../com/google/code/regexp/PatternTest.java | 37 ++++++++++++++ 3 files changed, 132 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/google/code/regexp/Pattern.java b/src/main/java/com/google/code/regexp/Pattern.java index 53cbef8..e8ea4c1 100644 --- a/src/main/java/com/google/code/regexp/Pattern.java +++ b/src/main/java/com/google/code/regexp/Pattern.java @@ -283,13 +283,25 @@ public String toString() { /** * Determines if the character at the specified position - * of a string is escaped with a backslash + * of a string is escaped * * @param s string to evaluate * @param pos the position of the character to evaluate * @return true if the character is escaped; otherwise false */ static private boolean isEscapedChar(String s, int pos) { + return isSlashEscapedChar(s, pos) || isQuoteEscapedChar(s, pos); + } + + /** + * Determines if the character at the specified position + * of a string is escaped with a backslash + * + * @param s string to evaluate + * @param pos the position of the character to evaluate + * @return true if the character is escaped; otherwise false + */ + static private boolean isSlashEscapedChar(String s, int pos) { // Count the backslashes preceding this position. If it's // even, there is no escape and the slashes are just literals. @@ -303,6 +315,41 @@ static private boolean isEscapedChar(String s, int pos) { return numSlashes % 2 != 0; } + /** + * Determines if the character at the specified position + * of a string is quote-escaped (between \\Q and \\E) + * + * @param s string to evaluate + * @param pos the position of the character to evaluate + * @return true if the character is quote-escaped; otherwise false + */ + static private boolean isQuoteEscapedChar(String s, int pos) { + + boolean openQuoteFound = false; + boolean closeQuoteFound = false; + + // find last non-escaped open-quote + String s2 = s.substring(0, pos); + int posOpen = pos; + while ((posOpen = s2.lastIndexOf("\\Q", posOpen - 1)) != -1) { + if (!isSlashEscapedChar(s2, posOpen)) { + openQuoteFound = true; + break; + } + } + + if (openQuoteFound) { + // search remainder of string (after open-quote) for a close-quote; + // no need to check that it's slash-escaped because it can't be + // (the escape character itself is part of the literal when quoted) + if (s2.indexOf("\\E", posOpen) != -1) { + closeQuoteFound = true; + } + } + + return openQuoteFound && !closeQuoteFound; + } + /** * Determines if a string's character is within a regex character class * diff --git a/src/test/java/com/google/code/regexp/MatcherTest.java b/src/test/java/com/google/code/regexp/MatcherTest.java index dbc15e7..b5d2da4 100644 --- a/src/test/java/com/google/code/regexp/MatcherTest.java +++ b/src/test/java/com/google/code/regexp/MatcherTest.java @@ -668,4 +668,51 @@ public void testBackrefNoMatch() { // to the first captured number assertFalse(p.matcher("xyz12345abc123456def").find()); } + + @Test + public void testParenFoundAfterQuoteEscapedBracket() { + // Open-bracket is quote-escaped so it's not a character class; + // process it as a literal. Previously, we saw the bracket as a + // character class, which messed up the group indexes in the pattern + // as reported in Issue #2. + Pattern p = Pattern.compile("(?\\Q[\\E)(?\\d+)(?-)(?\\d+)(?\\])"); + Matcher m = p.matcher("[1-0]"); + assertTrue(m.find()); + assertEquals("[", m.group("T0")); + assertEquals("1", m.group("T1")); + assertEquals("-", m.group("T2")); + assertEquals("0", m.group("T3")); + assertEquals("]", m.group("T4")); + } + + @Test + public void testRealPatternFoundAfterQuoteEscapedPattern() { + // The quote-escaped string looks like a real regex pattern, but + // it's a literal string, so ignore it. The pattern after that + // should still be found + Pattern p = Pattern.compile("\\Q(?\\d+) [ \\E(?abc\\d+) \\Q]\\E"); + Matcher m = p.matcher("(?\\d+) [ abc123 ]"); + assertTrue(m.find()); + assertEquals("abc123", m.group("name")); + } + + @Test + public void testQuoteEscapedPatternDoesNotCreateNamedGroup() { + Pattern p = Pattern.compile("\\Q(?\\d+)\\E (?abc\\d+)"); + Matcher m = p.matcher("(?\\d+) abc123"); + assertTrue(m.find()); + + thrown.expect(IndexOutOfBoundsException.class); + thrown.expectMessage("No group \"foo\""); + m.group("foo"); + } + + @Test + public void testNamedGroupFoundInEscapedQuote() { + // since quote-escape is itself escaped, it's actually a literal \Q and \E + Pattern p = Pattern.compile("(abc)\\\\Q(?\\d+)\\\\E"); + Matcher m = p.matcher("abc\\Q123\\E"); + assertTrue(m.find()); + assertEquals("123", m.group("named")); + } } diff --git a/src/test/java/com/google/code/regexp/PatternTest.java b/src/test/java/com/google/code/regexp/PatternTest.java index 398fc7d..e7a455a 100644 --- a/src/test/java/com/google/code/regexp/PatternTest.java +++ b/src/test/java/com/google/code/regexp/PatternTest.java @@ -284,6 +284,43 @@ public void testIndexOfWithInvalidNegativeInstanceIndex() { assertEquals(-1, p.indexOf("named", -100)); } + @Test + public void testIndexOfNamedGroupAfterQuoteEscapedBracket() { + // open-bracket escaped, so it's not a character class + Pattern p = Pattern.compile("(a)(b)\\Q[\\E(?c)\\]"); + assertEquals(2, p.indexOf("named")); + } + + @Test + public void testIndexOfNamedGroupAfterSlashEscapedBracket() { + // open-bracket escaped, so it's not a character class + Pattern p = Pattern.compile("(a)(b)\\[(?c)\\]"); + assertEquals(2, p.indexOf("named")); + } + + @Test + public void testIndexOfNamedGroupAfterQuoteEscapedPattern() { + // The quote-escaped string looks like a real regex pattern, but + // it's a literal string, so ignore it. The pattern after that + // should still be found + Pattern p = Pattern.compile("(?a)\\Q(?b)(?c)(d) [ \\E(?e) \\Q]\\E"); + assertEquals(1, p.indexOf("named")); + } + + @Test + public void testIndexOfNamedGroupInEscapedQuote() { + // since quote-escape is itself escaped, it's actually a literal \Q and \E + Pattern p = Pattern.compile("(a)\\\\Q(?\\d+)\\\\E"); + assertEquals(1, p.indexOf("named")); + } + + @Test + public void testInvalidCloseQuoteEscapeSequence() { + // when \E present, \Q must also be present, so the following is invalid syntax + thrown.expect(PatternSyntaxException.class); + Pattern.compile("(a)\\\\Q(?d)\\E"); + } + @Test public void testNamedPatternGetsOriginalPattern() { final String ORIG_PATT = "(a)(b)(?:c)(?x)";