Skip to content

Commit

Permalink
Check for quote-escapes in isEscapedChar() (REGEX-22)
Browse files Browse the repository at this point in the history
Previously, our escape-checker was only looking for slash-escapes, but
java.regex.Pattern allows escaping character sequences in between \Q
and \E. This caused us to incorrectly count quote-escaped parentheses
and brackets, leading to invalid group values in:

   \Q[\E(?<foo>\d+)

Issue #2
  • Loading branch information
tony19 committed Sep 15, 2013
1 parent 82bdfeb commit b90acad
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 1 deletion.
49 changes: 48 additions & 1 deletion src/main/java/com/google/code/regexp/Pattern.java
Original file line number Diff line number Diff line change
Expand Up @@ -283,13 +283,25 @@ public String toString() {

/**
* Determines if the character at the specified position
* of a string is escaped with a backslash
* of a string is escaped
*
* @param s string to evaluate
* @param pos the position of the character to evaluate
* @return true if the character is escaped; otherwise false
*/
static private boolean isEscapedChar(String s, int pos) {
return isSlashEscapedChar(s, pos) || isQuoteEscapedChar(s, pos);
}

/**
* Determines if the character at the specified position
* of a string is escaped with a backslash
*
* @param s string to evaluate
* @param pos the position of the character to evaluate
* @return true if the character is escaped; otherwise false
*/
static private boolean isSlashEscapedChar(String s, int pos) {

// Count the backslashes preceding this position. If it's
// even, there is no escape and the slashes are just literals.
Expand All @@ -303,6 +315,41 @@ static private boolean isEscapedChar(String s, int pos) {
return numSlashes % 2 != 0;
}

/**
* Determines if the character at the specified position
* of a string is quote-escaped (between \\Q and \\E)
*
* @param s string to evaluate
* @param pos the position of the character to evaluate
* @return true if the character is quote-escaped; otherwise false
*/
static private boolean isQuoteEscapedChar(String s, int pos) {

boolean openQuoteFound = false;
boolean closeQuoteFound = false;

// find last non-escaped open-quote
String s2 = s.substring(0, pos);
int posOpen = pos;
while ((posOpen = s2.lastIndexOf("\\Q", posOpen - 1)) != -1) {
if (!isSlashEscapedChar(s2, posOpen)) {
openQuoteFound = true;
break;
}
}

if (openQuoteFound) {
// search remainder of string (after open-quote) for a close-quote;
// no need to check that it's slash-escaped because it can't be
// (the escape character itself is part of the literal when quoted)
if (s2.indexOf("\\E", posOpen) != -1) {
closeQuoteFound = true;
}
}

return openQuoteFound && !closeQuoteFound;
}

/**
* Determines if a string's character is within a regex character class
*
Expand Down
47 changes: 47 additions & 0 deletions src/test/java/com/google/code/regexp/MatcherTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -668,4 +668,51 @@ public void testBackrefNoMatch() {
// to the first captured number
assertFalse(p.matcher("xyz12345abc123456def").find());
}

@Test
public void testParenFoundAfterQuoteEscapedBracket() {
// Open-bracket is quote-escaped so it's not a character class;
// process it as a literal. Previously, we saw the bracket as a
// character class, which messed up the group indexes in the pattern
// as reported in Issue #2.
Pattern p = Pattern.compile("(?<T0>\\Q[\\E)(?<T1>\\d+)(?<T2>-)(?<T3>\\d+)(?<T4>\\])");
Matcher m = p.matcher("[1-0]");
assertTrue(m.find());
assertEquals("[", m.group("T0"));
assertEquals("1", m.group("T1"));
assertEquals("-", m.group("T2"));
assertEquals("0", m.group("T3"));
assertEquals("]", m.group("T4"));
}

@Test
public void testRealPatternFoundAfterQuoteEscapedPattern() {
// The quote-escaped string looks like a real regex pattern, but
// it's a literal string, so ignore it. The pattern after that
// should still be found
Pattern p = Pattern.compile("\\Q(?<foo>\\d+) [ \\E(?<name>abc\\d+) \\Q]\\E");
Matcher m = p.matcher("(?<foo>\\d+) [ abc123 ]");
assertTrue(m.find());
assertEquals("abc123", m.group("name"));
}

@Test
public void testQuoteEscapedPatternDoesNotCreateNamedGroup() {
Pattern p = Pattern.compile("\\Q(?<foo>\\d+)\\E (?<name>abc\\d+)");
Matcher m = p.matcher("(?<foo>\\d+) abc123");
assertTrue(m.find());

thrown.expect(IndexOutOfBoundsException.class);
thrown.expectMessage("No group \"foo\"");
m.group("foo");
}

@Test
public void testNamedGroupFoundInEscapedQuote() {
// since quote-escape is itself escaped, it's actually a literal \Q and \E
Pattern p = Pattern.compile("(abc)\\\\Q(?<named>\\d+)\\\\E");
Matcher m = p.matcher("abc\\Q123\\E");
assertTrue(m.find());
assertEquals("123", m.group("named"));
}
}
37 changes: 37 additions & 0 deletions src/test/java/com/google/code/regexp/PatternTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,43 @@ public void testIndexOfWithInvalidNegativeInstanceIndex() {
assertEquals(-1, p.indexOf("named", -100));
}

@Test
public void testIndexOfNamedGroupAfterQuoteEscapedBracket() {
// open-bracket escaped, so it's not a character class
Pattern p = Pattern.compile("(a)(b)\\Q[\\E(?<named>c)\\]");
assertEquals(2, p.indexOf("named"));
}

@Test
public void testIndexOfNamedGroupAfterSlashEscapedBracket() {
// open-bracket escaped, so it's not a character class
Pattern p = Pattern.compile("(a)(b)\\[(?<named>c)\\]");
assertEquals(2, p.indexOf("named"));
}

@Test
public void testIndexOfNamedGroupAfterQuoteEscapedPattern() {
// The quote-escaped string looks like a real regex pattern, but
// it's a literal string, so ignore it. The pattern after that
// should still be found
Pattern p = Pattern.compile("(?<foo>a)\\Q(?<bar>b)(?<baz>c)(d) [ \\E(?<named>e) \\Q]\\E");
assertEquals(1, p.indexOf("named"));
}

@Test
public void testIndexOfNamedGroupInEscapedQuote() {
// since quote-escape is itself escaped, it's actually a literal \Q and \E
Pattern p = Pattern.compile("(a)\\\\Q(?<named>\\d+)\\\\E");
assertEquals(1, p.indexOf("named"));
}

@Test
public void testInvalidCloseQuoteEscapeSequence() {
// when \E present, \Q must also be present, so the following is invalid syntax
thrown.expect(PatternSyntaxException.class);
Pattern.compile("(a)\\\\Q(?<named>d)\\E");
}

@Test
public void testNamedPatternGetsOriginalPattern() {
final String ORIG_PATT = "(a)(b)(?:c)(?<named>x)";
Expand Down

0 comments on commit b90acad

Please sign in to comment.