Skip to content

Commit

Permalink
Add option to print better parse errors (#3700)
Browse files Browse the repository at this point in the history
This PR fixes #3672 by providing a best-effort rendering of the partial
state of the parser when a parse error happens. To do this, we grab the
Earley state corresponding to the span `[0, token_index_before_error]`
and inspect the partial parse trees available, as well as the production
being attempted at this point.

Consider as a simple example the following standard K definition for
arithmetic expressions:
```k
module ARITHMETIC-SYNTAX
  imports UNSIGNED-INT-SYNTAX
  imports ID-SYNTAX

  syntax Exp ::= Int
               | "-" Exp      [group(neg), strict]
               | Exp "+" Exp  [left, group(add), strict]
               | Exp "-" Exp  [left, group(sub), strict]
               | Exp "*" Exp  [left, group(mul), strict]
               | Exp "/" Exp  [left, group(div), strict]
               | "(" Exp ")"  [bracket]

  syntax priorities neg > mul div > add sub
endmodule

module ARITHMETIC
  imports ARITHMETIC-SYNTAX
endmodule
```

On trying to `kast` an invalid term (for example, `1 * 2 + a * 4` - `a`
lexes because we've imported `ID-SYNTAX`, but terms of sort `Id` can't
appear in a valid derivation of `Exp`), we see the following message
from the new component:
```console
$ kast <(echo '1 * 2 + a * 4') --no-exc-wrap
[Error] Inner Parser: Parse error: unexpected token 'a' following token '+'. Additional parsing diagnostic information:
  Attempting to apply production:
    syntax Exp ::= Exp "+" Exp
    produced partial term:
      `_+__ARITHMETIC-SYNTAX_Exp_Exp_Exp`(`_*__ARITHMETIC-SYNTAX_Exp_Exp_Exp`(#token("1","Int"),#token("2","Int")),#token("<error>","<invalid>"))

	Source(/dev/fd/11)
	Location(1,9,1,10)
	1 |
	  .	        ^
```
we make a best-effort attempt to construct what the top-level `_+_` term
might have looked like, placing an `<error>` token where we expected to
see a valid term of sort `Exp`.

The changes made in this PR are as follows:
* Update the implementation of `EarleyParser` to conditionally perform
the rendering as above.
* Add boilerplate code to thread a new option `--debug-parse` through to
the parser when appropriate.
* Add a test case based on the above arithmetic module.

I have additionally verified with @dkcumming that the implementation as
reviewed produces useful information that would have been useful when
addressing the original problem; these cases come from the MIR semantics
and are therefore too large to include here as tests.

---------

Co-authored-by: rv-jenkins <[email protected]>
  • Loading branch information
Baltoli and rv-jenkins authored Oct 20, 2023
1 parent d18ea8f commit cc3d089
Show file tree
Hide file tree
Showing 14 changed files with 156 additions and 32 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
DEF=test
EXT=test
TESTDIR=.
KAST_FLAGS=--debug-parse
PIPEFAIL=
CHECK=$(CONSIDER_ERRORS) $(REMOVE_PATHS) | diff -

include ../../../include/kframework/ktest.mak
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1 ++ 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[Error] Inner Parser: Parse error: unexpected token '+' following token '+'.
Additional parsing diagnostic information:
Attempting to apply production:
syntax Exp ::= Exp "+" Exp
produced partial term:
`_+__TEST-SYNTAX_Exp_Exp_Exp`(#token("1","Int"))

Source(a.test.kast)
Location(1,4,1,5)
1 | 1 ++ 2
. ^
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1 + 5 * a / 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[Error] Inner Parser: Parse error: unexpected token 'a' following token '*'.
Additional parsing diagnostic information:
Attempting to apply production:
syntax Exp ::= Exp "*" Exp
produced partial term:
`_*__TEST-SYNTAX_Exp_Exp_Exp`(`_+__TEST-SYNTAX_Exp_Exp_Exp`(#token("1","Int"),#token("5","Int")))

Source(b.test.kast)
Location(1,9,1,10)
1 | 1 + 5 * a / 2
. ^
21 changes: 21 additions & 0 deletions k-distribution/tests/regression-new/issue-3672-debugParse/test.k
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Copyright (c) K Team. All Rights Reserved.

module TEST-SYNTAX
imports UNSIGNED-INT-SYNTAX
imports ID-SYNTAX

syntax Exp ::= Int
| "-" Exp [group(neg)]
| Exp "+" Exp [left, group(add)]
| Exp "-" Exp [left, group(sub)]
| Exp "*" Exp [left, group(mul)]
| Exp "/" Exp [left, group(div)]
| "(" Exp ")" [bracket]

syntax priorities neg > mul div > add sub
endmodule

module TEST
imports TEST-SYNTAX
configuration <k> $PGM:Exp </k>
endmodule
2 changes: 1 addition & 1 deletion kernel/src/main/java/org/kframework/kast/KastFrontEnd.java
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ public int run() {
if (options.debugTokens)
System.out.println(kread.showTokens(parsingMod, def, FileUtil.read(stringToParse), source));
else {
K parsed = kread.prettyRead(parsingMod, sort, startSymbolLocation, def, source, FileUtil.read(stringToParse));
K parsed = kread.prettyRead(parsingMod, sort, startSymbolLocation, def, source, FileUtil.read(stringToParse), options.debugParse);

if (options.expandMacros) {
parsed = ExpandMacros.forNonSentences(unparsingMod, files.get(), def.kompileOptions, false).expand(parsed);
Expand Down
3 changes: 3 additions & 0 deletions kernel/src/main/java/org/kframework/kast/KastOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -161,4 +161,7 @@ public Class<InputModes> enumClass() {

@Parameter(names="--debug-tokens", description="Print a Markdown table of tokens matched by the scanner. Useful for debugging parsing errors.")
public boolean debugTokens = false;

@Parameter(names="--debug-parse", description="Print extended diagnostic information and partial ASTs for parse errors.")
public boolean debugParse = false;
}
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,8 @@ public Option<Module> ruleParsingModuleFor(String moduleName) {
* @return the parsed term.
*/

public K parseSingleTerm(Module module, Sort programStartSymbol, String startSymbolLocation, KExceptionManager kem, FileUtil files, String s, Source source) {
try (ParseInModule parseInModule = RuleGrammarGenerator.getCombinedGrammar(module, true, files)) {
public K parseSingleTerm(Module module, Sort programStartSymbol, String startSymbolLocation, KExceptionManager kem, FileUtil files, String s, Source source, boolean partialParseDebug) {
try (ParseInModule parseInModule = RuleGrammarGenerator.getCombinedGrammar(module, true, files, partialParseDebug)) {
Tuple2<Either<Set<KEMException>, K>, Set<KEMException>> res = parseInModule.parseString(s, programStartSymbol, startSymbolLocation, source);
kem.addAllKException(res._2().stream().map(e -> e.getKException()).collect(Collectors.toSet()));
if (res._1().isLeft()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import org.apache.commons.lang3.tuple.Pair;
import org.kframework.Collections;
import org.kframework.attributes.Att;
import org.kframework.attributes.Att.Key;
import org.kframework.attributes.Location;
import org.kframework.attributes.Source;
import org.kframework.builtin.BooleanUtils;
Expand Down Expand Up @@ -408,7 +407,7 @@ private Definition resolveNonConfigBubbles(Definition defWithConfig, boolean ser
RuleGrammarGenerator gen = new RuleGrammarGenerator(defWithCaches);
Module ruleParserModule = gen.getRuleGrammar(defWithCaches.mainModule());
ParseCache cache = loadCache(ruleParserModule);
try (ParseInModule parser = RuleGrammarGenerator.getCombinedGrammar(cache.getModule(), true, profileRules, false, true, files, options.debugTypeInference)) {
try (ParseInModule parser = RuleGrammarGenerator.getCombinedGrammar(cache.getModule(), true, profileRules, false, true, files, options.debugTypeInference, false)) {
Scanner scanner;
if (deserializeScanner) {
scanner = new Scanner(parser, globalOptions, files.resolveKompiled("scanner"));
Expand Down Expand Up @@ -436,7 +435,7 @@ private Module resolveNonConfigBubbles(Module module, Scanner scanner, RuleGramm
ParseCache cache = loadCache(ruleParserModule);
try (ParseInModule parser = needNewScanner ?
RuleGrammarGenerator.getCombinedGrammar(cache.getModule(), true, profileRules, files, options.debugTypeInference) :
RuleGrammarGenerator.getCombinedGrammar(cache.getModule(), scanner, true, profileRules, false, files, options.debugTypeInference)) {
RuleGrammarGenerator.getCombinedGrammar(cache.getModule(), scanner, true, profileRules, false, files, options.debugTypeInference, false)) {
if (needNewScanner)
parser.getScanner(globalOptions);
parser.initialize();
Expand Down Expand Up @@ -550,7 +549,7 @@ public Rule parseRule(CompiledDefinition compiledDef, String contents, Source so
errors = java.util.Collections.synchronizedSet(Sets.newHashSet());
RuleGrammarGenerator gen = new RuleGrammarGenerator(compiledDef.kompiledDefinition);
try (ParseInModule parser = RuleGrammarGenerator
.getCombinedGrammar(gen.getRuleGrammar(compiledDef.getParsedDefinition().mainModule()), true, profileRules, false, true, files, options.debugTypeInference)) {
.getCombinedGrammar(gen.getRuleGrammar(compiledDef.getParsedDefinition().mainModule()), true, profileRules, false, true, files, options.debugTypeInference, false)) {
parser.setScanner(new Scanner(parser, globalOptions, files.resolveKompiled("scanner")));
java.util.Set<K> res = parseBubble(parser, new HashMap<>(),
new Bubble(rule, contents, Att().add(Att.CONTENT_START_LINE(), 1)
Expand Down
8 changes: 4 additions & 4 deletions kernel/src/main/java/org/kframework/parser/KRead.java
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,11 @@ public String showTokens(Module mod, CompiledDefinition def, String stringToPars
return def.showTokens(mod, files, stringToParse, source);
}

public K prettyRead(Module mod, Sort sort, String startSymbolLocation, CompiledDefinition def, Source source, String stringToParse) {
return prettyRead(mod, sort, startSymbolLocation, def, source, stringToParse, this.input);
public K prettyRead(Module mod, Sort sort, String startSymbolLocation, CompiledDefinition def, Source source, String stringToParse, boolean partialParseDebug) {
return prettyRead(mod, sort, startSymbolLocation, def, source, stringToParse, this.input, partialParseDebug);
}

public K prettyRead(Module mod, Sort sort, String startSymbolLocation, CompiledDefinition def, Source source, String stringToParse, InputModes inputMode) {
public K prettyRead(Module mod, Sort sort, String startSymbolLocation, CompiledDefinition def, Source source, String stringToParse, InputModes inputMode, boolean partialParseDebug) {
switch (inputMode) {
case BINARY:
case JSON:
Expand All @@ -68,7 +68,7 @@ public K prettyRead(Module mod, Sort sort, String startSymbolLocation, CompiledD
case KORE:
return new KoreParser(mod.sortAttributesFor()).parseString(stringToParse);
case PROGRAM:
return def.parseSingleTerm(mod, sort, startSymbolLocation, kem, files, stringToParse, source);
return def.parseSingleTerm(mod, sort, startSymbolLocation, kem, files, stringToParse, source, partialParseDebug);
case RULE:
throw KEMException.internalError("Should have been handled directly by the kast front end: " + inputMode);
default:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,16 +63,17 @@ public class ParseInModule implements Serializable, AutoCloseable {
private final boolean forGlobalScanner;
private final FileUtil files;
private final String typeInferenceDebug;
private final boolean partialParseDebug;

ParseInModule(Module seedModule, boolean strict, boolean profileRules, boolean isBison, boolean forGlobalScanner, FileUtil files, String typeInferenceDebug) {
this(seedModule, null, null, null, null, strict, profileRules, isBison, forGlobalScanner, files, typeInferenceDebug);
ParseInModule(Module seedModule, boolean strict, boolean profileRules, boolean isBison, boolean forGlobalScanner, FileUtil files, String typeInferenceDebug, boolean partialParseDebug) {
this(seedModule, null, null, null, null, strict, profileRules, isBison, forGlobalScanner, files, typeInferenceDebug, partialParseDebug);
}

ParseInModule(Module seedModule, Scanner scanner, boolean strict, boolean profileRules, boolean isBison, boolean forGlobalScanner, FileUtil files, String typeInferenceDebug) {
this(seedModule, null, null, null, scanner, strict, profileRules, isBison, forGlobalScanner, files, typeInferenceDebug);
ParseInModule(Module seedModule, Scanner scanner, boolean strict, boolean profileRules, boolean isBison, boolean forGlobalScanner, FileUtil files, String typeInferenceDebug, boolean partialParseDebug) {
this(seedModule, null, null, null, scanner, strict, profileRules, isBison, forGlobalScanner, files, typeInferenceDebug, partialParseDebug);
}

private ParseInModule(Module seedModule, Module extensionModule, Module disambModule, Module parsingModule, Scanner scanner, boolean strict, boolean profileRules, boolean isBison, boolean forGlobalScanner, FileUtil files, String typeInferenceDebug) {
private ParseInModule(Module seedModule, Module extensionModule, Module disambModule, Module parsingModule, Scanner scanner, boolean strict, boolean profileRules, boolean isBison, boolean forGlobalScanner, FileUtil files, String typeInferenceDebug, boolean partialParseDebug) {
this.seedModule = seedModule;
this.extensionModule = extensionModule;
this.disambModule = disambModule;
Expand All @@ -84,6 +85,7 @@ private ParseInModule(Module seedModule, Module extensionModule, Module disambMo
this.forGlobalScanner = forGlobalScanner;
this.files = files;
this.typeInferenceDebug = typeInferenceDebug;
this.partialParseDebug = partialParseDebug;
}

/**
Expand Down Expand Up @@ -217,7 +219,7 @@ private void getParser(Scanner scanner, Sort startSymbol) {
EarleyParser p = parser;
if (p == null) {
Module m = getParsingModule();
p = new EarleyParser(m, scanner, startSymbol);
p = new EarleyParser(m, scanner, startSymbol, partialParseDebug);
parser = p;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,27 +182,31 @@ public static boolean isParserSort(Sort s) {

/* use this overload if you don't need to profile rule parse times. */
public static ParseInModule getCombinedGrammar(Module mod, boolean strict, FileUtil files) {
return getCombinedGrammar(mod, strict, false, false, false, files, null);
return getCombinedGrammar(mod, strict, false, false, false, files, null, false);
}

public static ParseInModule getCombinedGrammar(Module mod, boolean strict, FileUtil files, boolean partialParseDebug) {
return getCombinedGrammar(mod, strict, false, false, false, files, null, partialParseDebug);
}

public static ParseInModule getCombinedGrammar(Module mod, boolean strict, boolean timing, FileUtil files) {
return getCombinedGrammar(mod, strict, timing, false, false, files, null);
return getCombinedGrammar(mod, strict, timing, false, false, files, null, false);
}

public static ParseInModule getCombinedGrammar(Module mod, boolean strict, boolean timing, FileUtil files, String debugTypeInference) {
return getCombinedGrammar(mod, strict, timing, false, false, files, debugTypeInference);
return getCombinedGrammar(mod, strict, timing, false, false, files, debugTypeInference, false);
}

public static ParseInModule getCombinedGrammar(Module mod, boolean strict, boolean timing, boolean isBison, FileUtil files) {
return getCombinedGrammar(mod, strict, timing, isBison, false, files, null);
return getCombinedGrammar(mod, strict, timing, isBison, false, files, null, false);
}

public static ParseInModule getCombinedGrammar(Module mod, boolean strict, boolean timing, boolean isBison, boolean forGlobalScanner, FileUtil files) {
return getCombinedGrammar(mod, strict, timing, isBison, forGlobalScanner, files, null);
return getCombinedGrammar(mod, strict, timing, isBison, forGlobalScanner, files, null, false);
}

public static ParseInModule getCombinedGrammar(Module mod, Scanner scanner, boolean strict, boolean timing, boolean isBison, FileUtil files) {
return getCombinedGrammar(mod, scanner, strict, timing, isBison, files, null);
return getCombinedGrammar(mod, scanner, strict, timing, isBison, files, null, false);
}

// the forGlobalScanner flag tells the ParseInModule class not to exclude
Expand All @@ -221,14 +225,15 @@ public static ParseInModule getCombinedGrammar(Module mod, Scanner scanner, bool
* which connects the user concrete syntax with K syntax.
*
* @param mod module for which to create the parser.
* @param partialParseDebug
* @return parser which applies disambiguation filters by default.
*/
public static ParseInModule getCombinedGrammar(Module mod, boolean strict, boolean timing, boolean isBison, boolean forGlobalScanner, FileUtil files, String debugTypeInference) {
return new ParseInModule(mod, strict, timing, isBison, forGlobalScanner, files, debugTypeInference);
public static ParseInModule getCombinedGrammar(Module mod, boolean strict, boolean timing, boolean isBison, boolean forGlobalScanner, FileUtil files, String debugTypeInference, boolean partialParseDebug) {
return new ParseInModule(mod, strict, timing, isBison, forGlobalScanner, files, debugTypeInference, partialParseDebug);
}

public static ParseInModule getCombinedGrammar(Module mod, Scanner scanner, boolean strict, boolean timing, boolean isBison, FileUtil files, String debugTypeInference) {
return new ParseInModule(mod, scanner, strict, timing, isBison, false, files, debugTypeInference);
public static ParseInModule getCombinedGrammar(Module mod, Scanner scanner, boolean strict, boolean timing, boolean isBison, FileUtil files, String debugTypeInference, boolean partialParseDebug) {
return new ParseInModule(mod, scanner, strict, timing, isBison, false, files, debugTypeInference, partialParseDebug);
}

public static Tuple3<Module, Module, Module> getCombinedGrammarImpl(Module mod, boolean isBison, boolean forGlobalScanner) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import org.kframework.parser.Constant;
import org.kframework.parser.Term;
import org.kframework.parser.TermCons;
import org.kframework.parser.inner.disambiguation.TreeCleanerVisitor;
import org.kframework.utils.errorsystem.KEMException;
import org.pcollections.ConsPStack;
import org.pcollections.PStack;
Expand Down Expand Up @@ -601,8 +602,9 @@ public List<Integer> getColumns() {
* @param scanner The scanner used to tokenize strings over this grammar.
* @param startSymbol The start symbol to start parsing at.
*/
public EarleyParser(Module m, Scanner scanner, Sort startSymbol) {
public EarleyParser(Module m, Scanner scanner, Sort startSymbol, boolean partialParseDebug) {
this.scanner = scanner;
this.partialParseDebug = partialParseDebug;

// compute metadata about grammar
sorts = getSorts(m);
Expand Down Expand Up @@ -790,6 +792,8 @@ private BitSet[] computeFirstSet() {
private final BitSet nullable;
// the scanner to use to tokenize sentences before parsing them
private final Scanner scanner;
// whether to print detailed partial parse trees when an error occurs
private final boolean partialParseDebug;

/**
* Parse a sentence according to the grammar represented by this parser.
Expand Down Expand Up @@ -850,22 +854,74 @@ public Term parse(String input, Source source, int startLine, int startColumn) {
// a parse error after the outer loop finishes also.
if (S.get(k+1).empty() && Q.empty()) {
// no more states to process, so it must be a parse error
parseError(data, k);
parseError(data, S, k);
}
}
if (S.get(data.words.length).empty()) {
parseError(data, data.words.length-1);
parseError(data, S, data.words.length-1);
}
// finished parsing successfully, so return the final parse forest
return Ambiguity.apply(S.get(data.words.length).states.get(0).parseTree().stream().map(list -> list.get(0)).collect(Collectors.toSet()));
}

// We are only interested in displaying states that span the entire input
// when a parse error occurs; such states have a start-index of 0.
private Set<EarleyState> spanningStates(EarleySet parses) {
return parses.states.stream()
.filter(state -> state.start == 0)
.collect(Collectors.toSet());
}

// We heuristically identify the best state-set for producing diagnostics as the
// most recent such set that includes a _spanning state_; i.e. one with a start
// index of zero.
private Set<EarleyState> bestDiagnosticStates(List<EarleySet> S, int k) {
for(int i = k; i >= 0; --i) {
Set<EarleyState> candidate = spanningStates(S.get(i));
if(!candidate.isEmpty()) {
return candidate;
}
}

return new HashSet<>();
}

private String partialParseTreesDiagnostic(Set<EarleyState> spanningStates) {
if(spanningStates.isEmpty()) {
return "No top-level production could apply to this input.";
}

StringBuilder msg = new StringBuilder();

for(EarleyState state : spanningStates) {
msg.append(" Attempting to apply production:\n ").append(state.prod).append("\n");
for(PStack<Term> possibleTree : state.parseTree()) {
var cleanedChildren = possibleTree
.stream()
.map(term -> new TreeCleanerVisitor().apply(term))
.collect(Collectors.toList());

if(state.prod.prod.klabel().isDefined()) {
var term = TermCons.apply(ConsPStack.from(cleanedChildren), state.prod.prod);
msg.append(" produced partial term:\n ").append(term).append("\n");
} else {
msg.append(" produced partial term with no KLabel, and children:\n");
for(var child : cleanedChildren) {
msg.append(" ").append(child).append("\n");
}
}
}
}

return msg.toString();
}

/**
* @param data The {@link ParserMetadata} about the sentence being parsed
* @param S The set of {@link EarleyState}s for each end-index in the input
* @param k The end-index at which a parse error occurred. In other words, the index just prior to the first token that
* did not parse.
*/
private void parseError(ParserMetadata data, int k) {
private void parseError(ParserMetadata data, List<EarleySet> S, int k) {
int startLine, startColumn, endLine, endColumn;
if (data.words.length == 1) {
startLine = data.lines[0];
Expand All @@ -889,6 +945,12 @@ private void parseError(ParserMetadata data, int k) {
}
Location loc = new Location(startLine, startColumn,
endLine, endColumn);

if(partialParseDebug) {
msg += " Additional parsing diagnostic information:\n";
msg += partialParseTreesDiagnostic(bestDiagnosticStates(S, k));
}

throw KEMException.innerParserError(msg, data.source, loc);
}

Expand Down

0 comments on commit cc3d089

Please sign in to comment.