From cc3d0893cd29d0ece72a950c77e4804db39f3919 Mon Sep 17 00:00:00 2001 From: Bruce Collie Date: Fri, 20 Oct 2023 23:24:54 +0100 Subject: [PATCH] Add option to print better parse errors (#3700) This PR fixes #3672 by providing a best-effort rendering of the partial state of the parser when a parse error happens. To do this, we grab the Earley state corresponding to the span `[0, token_index_before_error]` and inspect the partial parse trees available, as well as the production being attempted at this point. Consider as a simple example the following standard K definition for arithmetic expressions: ```k module ARITHMETIC-SYNTAX imports UNSIGNED-INT-SYNTAX imports ID-SYNTAX syntax Exp ::= Int | "-" Exp [group(neg), strict] | Exp "+" Exp [left, group(add), strict] | Exp "-" Exp [left, group(sub), strict] | Exp "*" Exp [left, group(mul), strict] | Exp "/" Exp [left, group(div), strict] | "(" Exp ")" [bracket] syntax priorities neg > mul div > add sub endmodule module ARITHMETIC imports ARITHMETIC-SYNTAX endmodule ``` On trying to `kast` an invalid term (for example, `1 * 2 + a * 4` - `a` lexes because we've imported `ID-SYNTAX`, but terms of sort `Id` can't appear in a valid derivation of `Exp`), we see the following message from the new component: ```console $ kast <(echo '1 * 2 + a * 4') --no-exc-wrap [Error] Inner Parser: Parse error: unexpected token 'a' following token '+'. Additional parsing diagnostic information: Attempting to apply production: syntax Exp ::= Exp "+" Exp produced partial term: `_+__ARITHMETIC-SYNTAX_Exp_Exp_Exp`(`_*__ARITHMETIC-SYNTAX_Exp_Exp_Exp`(#token("1","Int"),#token("2","Int")),#token("","")) Source(/dev/fd/11) Location(1,9,1,10) 1 | . ^ ``` we make a best-effort attempt to construct what the top-level `_+_` term might have looked like, placing an `` token where we expected to see a valid term of sort `Exp`. The changes made in this PR are as follows: * Update the implementation of `EarleyParser` to conditionally perform the rendering as above. * Add boilerplate code to thread a new option `--debug-parse` through to the parser when appropriate. * Add a test case based on the above arithmetic module. I have additionally verified with @dkcumming that the implementation as reviewed produces useful information that would have been useful when addressing the original problem; these cases come from the MIR semantics and are therefore too large to include here as tests. --------- Co-authored-by: rv-jenkins --- .../issue-3672-debugParse/Makefile | 8 +++ .../issue-3672-debugParse/a.test.kast | 1 + .../issue-3672-debugParse/a.test.kast.out | 11 +++ .../issue-3672-debugParse/b.test.kast | 1 + .../issue-3672-debugParse/b.test.kast.out | 11 +++ .../issue-3672-debugParse/test.k | 21 ++++++ .../org/kframework/kast/KastFrontEnd.java | 2 +- .../java/org/kframework/kast/KastOptions.java | 3 + .../kompile/CompiledDefinition.java | 4 +- .../kframework/kompile/DefinitionParsing.java | 7 +- .../java/org/kframework/parser/KRead.java | 8 +-- .../parser/inner/ParseInModule.java | 14 ++-- .../parser/inner/RuleGrammarGenerator.java | 25 ++++--- .../parser/inner/kernel/EarleyParser.java | 72 +++++++++++++++++-- 14 files changed, 156 insertions(+), 32 deletions(-) create mode 100644 k-distribution/tests/regression-new/issue-3672-debugParse/Makefile create mode 100644 k-distribution/tests/regression-new/issue-3672-debugParse/a.test.kast create mode 100644 k-distribution/tests/regression-new/issue-3672-debugParse/a.test.kast.out create mode 100644 k-distribution/tests/regression-new/issue-3672-debugParse/b.test.kast create mode 100644 k-distribution/tests/regression-new/issue-3672-debugParse/b.test.kast.out create mode 100644 k-distribution/tests/regression-new/issue-3672-debugParse/test.k diff --git a/k-distribution/tests/regression-new/issue-3672-debugParse/Makefile b/k-distribution/tests/regression-new/issue-3672-debugParse/Makefile new file mode 100644 index 00000000000..80d219d92a3 --- /dev/null +++ b/k-distribution/tests/regression-new/issue-3672-debugParse/Makefile @@ -0,0 +1,8 @@ +DEF=test +EXT=test +TESTDIR=. +KAST_FLAGS=--debug-parse +PIPEFAIL= +CHECK=$(CONSIDER_ERRORS) $(REMOVE_PATHS) | diff - + +include ../../../include/kframework/ktest.mak diff --git a/k-distribution/tests/regression-new/issue-3672-debugParse/a.test.kast b/k-distribution/tests/regression-new/issue-3672-debugParse/a.test.kast new file mode 100644 index 00000000000..cd52da9e230 --- /dev/null +++ b/k-distribution/tests/regression-new/issue-3672-debugParse/a.test.kast @@ -0,0 +1 @@ +1 ++ 2 diff --git a/k-distribution/tests/regression-new/issue-3672-debugParse/a.test.kast.out b/k-distribution/tests/regression-new/issue-3672-debugParse/a.test.kast.out new file mode 100644 index 00000000000..b956ca56a31 --- /dev/null +++ b/k-distribution/tests/regression-new/issue-3672-debugParse/a.test.kast.out @@ -0,0 +1,11 @@ +[Error] Inner Parser: Parse error: unexpected token '+' following token '+'. +Additional parsing diagnostic information: + Attempting to apply production: + syntax Exp ::= Exp "+" Exp + produced partial term: + `_+__TEST-SYNTAX_Exp_Exp_Exp`(#token("1","Int")) + + Source(a.test.kast) + Location(1,4,1,5) + 1 | 1 ++ 2 + . ^ diff --git a/k-distribution/tests/regression-new/issue-3672-debugParse/b.test.kast b/k-distribution/tests/regression-new/issue-3672-debugParse/b.test.kast new file mode 100644 index 00000000000..46983123c98 --- /dev/null +++ b/k-distribution/tests/regression-new/issue-3672-debugParse/b.test.kast @@ -0,0 +1 @@ +1 + 5 * a / 2 diff --git a/k-distribution/tests/regression-new/issue-3672-debugParse/b.test.kast.out b/k-distribution/tests/regression-new/issue-3672-debugParse/b.test.kast.out new file mode 100644 index 00000000000..c8b57d92320 --- /dev/null +++ b/k-distribution/tests/regression-new/issue-3672-debugParse/b.test.kast.out @@ -0,0 +1,11 @@ +[Error] Inner Parser: Parse error: unexpected token 'a' following token '*'. +Additional parsing diagnostic information: + Attempting to apply production: + syntax Exp ::= Exp "*" Exp + produced partial term: + `_*__TEST-SYNTAX_Exp_Exp_Exp`(`_+__TEST-SYNTAX_Exp_Exp_Exp`(#token("1","Int"),#token("5","Int"))) + + Source(b.test.kast) + Location(1,9,1,10) + 1 | 1 + 5 * a / 2 + . ^ diff --git a/k-distribution/tests/regression-new/issue-3672-debugParse/test.k b/k-distribution/tests/regression-new/issue-3672-debugParse/test.k new file mode 100644 index 00000000000..ea90826056f --- /dev/null +++ b/k-distribution/tests/regression-new/issue-3672-debugParse/test.k @@ -0,0 +1,21 @@ +// Copyright (c) K Team. All Rights Reserved. + +module TEST-SYNTAX + imports UNSIGNED-INT-SYNTAX + imports ID-SYNTAX + + syntax Exp ::= Int + | "-" Exp [group(neg)] + | Exp "+" Exp [left, group(add)] + | Exp "-" Exp [left, group(sub)] + | Exp "*" Exp [left, group(mul)] + | Exp "/" Exp [left, group(div)] + | "(" Exp ")" [bracket] + + syntax priorities neg > mul div > add sub +endmodule + +module TEST + imports TEST-SYNTAX + configuration $PGM:Exp +endmodule diff --git a/kernel/src/main/java/org/kframework/kast/KastFrontEnd.java b/kernel/src/main/java/org/kframework/kast/KastFrontEnd.java index c222faa9924..0f539ee397a 100644 --- a/kernel/src/main/java/org/kframework/kast/KastFrontEnd.java +++ b/kernel/src/main/java/org/kframework/kast/KastFrontEnd.java @@ -217,7 +217,7 @@ public int run() { if (options.debugTokens) System.out.println(kread.showTokens(parsingMod, def, FileUtil.read(stringToParse), source)); else { - K parsed = kread.prettyRead(parsingMod, sort, startSymbolLocation, def, source, FileUtil.read(stringToParse)); + K parsed = kread.prettyRead(parsingMod, sort, startSymbolLocation, def, source, FileUtil.read(stringToParse), options.debugParse); if (options.expandMacros) { parsed = ExpandMacros.forNonSentences(unparsingMod, files.get(), def.kompileOptions, false).expand(parsed); diff --git a/kernel/src/main/java/org/kframework/kast/KastOptions.java b/kernel/src/main/java/org/kframework/kast/KastOptions.java index 7ef74e430c6..aa63d6d8078 100644 --- a/kernel/src/main/java/org/kframework/kast/KastOptions.java +++ b/kernel/src/main/java/org/kframework/kast/KastOptions.java @@ -161,4 +161,7 @@ public Class enumClass() { @Parameter(names="--debug-tokens", description="Print a Markdown table of tokens matched by the scanner. Useful for debugging parsing errors.") public boolean debugTokens = false; + + @Parameter(names="--debug-parse", description="Print extended diagnostic information and partial ASTs for parse errors.") + public boolean debugParse = false; } diff --git a/kernel/src/main/java/org/kframework/kompile/CompiledDefinition.java b/kernel/src/main/java/org/kframework/kompile/CompiledDefinition.java index 289619a9fa7..9ce8a08456d 100644 --- a/kernel/src/main/java/org/kframework/kompile/CompiledDefinition.java +++ b/kernel/src/main/java/org/kframework/kompile/CompiledDefinition.java @@ -196,8 +196,8 @@ public Option ruleParsingModuleFor(String moduleName) { * @return the parsed term. */ - public K parseSingleTerm(Module module, Sort programStartSymbol, String startSymbolLocation, KExceptionManager kem, FileUtil files, String s, Source source) { - try (ParseInModule parseInModule = RuleGrammarGenerator.getCombinedGrammar(module, true, files)) { + public K parseSingleTerm(Module module, Sort programStartSymbol, String startSymbolLocation, KExceptionManager kem, FileUtil files, String s, Source source, boolean partialParseDebug) { + try (ParseInModule parseInModule = RuleGrammarGenerator.getCombinedGrammar(module, true, files, partialParseDebug)) { Tuple2, K>, Set> res = parseInModule.parseString(s, programStartSymbol, startSymbolLocation, source); kem.addAllKException(res._2().stream().map(e -> e.getKException()).collect(Collectors.toSet())); if (res._1().isLeft()) { diff --git a/kernel/src/main/java/org/kframework/kompile/DefinitionParsing.java b/kernel/src/main/java/org/kframework/kompile/DefinitionParsing.java index 6adc0ec1e3a..6eabf9d788f 100644 --- a/kernel/src/main/java/org/kframework/kompile/DefinitionParsing.java +++ b/kernel/src/main/java/org/kframework/kompile/DefinitionParsing.java @@ -7,7 +7,6 @@ import org.apache.commons.lang3.tuple.Pair; import org.kframework.Collections; import org.kframework.attributes.Att; -import org.kframework.attributes.Att.Key; import org.kframework.attributes.Location; import org.kframework.attributes.Source; import org.kframework.builtin.BooleanUtils; @@ -408,7 +407,7 @@ private Definition resolveNonConfigBubbles(Definition defWithConfig, boolean ser RuleGrammarGenerator gen = new RuleGrammarGenerator(defWithCaches); Module ruleParserModule = gen.getRuleGrammar(defWithCaches.mainModule()); ParseCache cache = loadCache(ruleParserModule); - try (ParseInModule parser = RuleGrammarGenerator.getCombinedGrammar(cache.getModule(), true, profileRules, false, true, files, options.debugTypeInference)) { + try (ParseInModule parser = RuleGrammarGenerator.getCombinedGrammar(cache.getModule(), true, profileRules, false, true, files, options.debugTypeInference, false)) { Scanner scanner; if (deserializeScanner) { scanner = new Scanner(parser, globalOptions, files.resolveKompiled("scanner")); @@ -436,7 +435,7 @@ private Module resolveNonConfigBubbles(Module module, Scanner scanner, RuleGramm ParseCache cache = loadCache(ruleParserModule); try (ParseInModule parser = needNewScanner ? RuleGrammarGenerator.getCombinedGrammar(cache.getModule(), true, profileRules, files, options.debugTypeInference) : - RuleGrammarGenerator.getCombinedGrammar(cache.getModule(), scanner, true, profileRules, false, files, options.debugTypeInference)) { + RuleGrammarGenerator.getCombinedGrammar(cache.getModule(), scanner, true, profileRules, false, files, options.debugTypeInference, false)) { if (needNewScanner) parser.getScanner(globalOptions); parser.initialize(); @@ -550,7 +549,7 @@ public Rule parseRule(CompiledDefinition compiledDef, String contents, Source so errors = java.util.Collections.synchronizedSet(Sets.newHashSet()); RuleGrammarGenerator gen = new RuleGrammarGenerator(compiledDef.kompiledDefinition); try (ParseInModule parser = RuleGrammarGenerator - .getCombinedGrammar(gen.getRuleGrammar(compiledDef.getParsedDefinition().mainModule()), true, profileRules, false, true, files, options.debugTypeInference)) { + .getCombinedGrammar(gen.getRuleGrammar(compiledDef.getParsedDefinition().mainModule()), true, profileRules, false, true, files, options.debugTypeInference, false)) { parser.setScanner(new Scanner(parser, globalOptions, files.resolveKompiled("scanner"))); java.util.Set res = parseBubble(parser, new HashMap<>(), new Bubble(rule, contents, Att().add(Att.CONTENT_START_LINE(), 1) diff --git a/kernel/src/main/java/org/kframework/parser/KRead.java b/kernel/src/main/java/org/kframework/parser/KRead.java index 6ed83e89188..867851c956c 100644 --- a/kernel/src/main/java/org/kframework/parser/KRead.java +++ b/kernel/src/main/java/org/kframework/parser/KRead.java @@ -55,11 +55,11 @@ public String showTokens(Module mod, CompiledDefinition def, String stringToPars return def.showTokens(mod, files, stringToParse, source); } - public K prettyRead(Module mod, Sort sort, String startSymbolLocation, CompiledDefinition def, Source source, String stringToParse) { - return prettyRead(mod, sort, startSymbolLocation, def, source, stringToParse, this.input); + public K prettyRead(Module mod, Sort sort, String startSymbolLocation, CompiledDefinition def, Source source, String stringToParse, boolean partialParseDebug) { + return prettyRead(mod, sort, startSymbolLocation, def, source, stringToParse, this.input, partialParseDebug); } - public K prettyRead(Module mod, Sort sort, String startSymbolLocation, CompiledDefinition def, Source source, String stringToParse, InputModes inputMode) { + public K prettyRead(Module mod, Sort sort, String startSymbolLocation, CompiledDefinition def, Source source, String stringToParse, InputModes inputMode, boolean partialParseDebug) { switch (inputMode) { case BINARY: case JSON: @@ -68,7 +68,7 @@ public K prettyRead(Module mod, Sort sort, String startSymbolLocation, CompiledD case KORE: return new KoreParser(mod.sortAttributesFor()).parseString(stringToParse); case PROGRAM: - return def.parseSingleTerm(mod, sort, startSymbolLocation, kem, files, stringToParse, source); + return def.parseSingleTerm(mod, sort, startSymbolLocation, kem, files, stringToParse, source, partialParseDebug); case RULE: throw KEMException.internalError("Should have been handled directly by the kast front end: " + inputMode); default: diff --git a/kernel/src/main/java/org/kframework/parser/inner/ParseInModule.java b/kernel/src/main/java/org/kframework/parser/inner/ParseInModule.java index 4e3cbdb4066..059dfda65fe 100644 --- a/kernel/src/main/java/org/kframework/parser/inner/ParseInModule.java +++ b/kernel/src/main/java/org/kframework/parser/inner/ParseInModule.java @@ -63,16 +63,17 @@ public class ParseInModule implements Serializable, AutoCloseable { private final boolean forGlobalScanner; private final FileUtil files; private final String typeInferenceDebug; + private final boolean partialParseDebug; - ParseInModule(Module seedModule, boolean strict, boolean profileRules, boolean isBison, boolean forGlobalScanner, FileUtil files, String typeInferenceDebug) { - this(seedModule, null, null, null, null, strict, profileRules, isBison, forGlobalScanner, files, typeInferenceDebug); + ParseInModule(Module seedModule, boolean strict, boolean profileRules, boolean isBison, boolean forGlobalScanner, FileUtil files, String typeInferenceDebug, boolean partialParseDebug) { + this(seedModule, null, null, null, null, strict, profileRules, isBison, forGlobalScanner, files, typeInferenceDebug, partialParseDebug); } - ParseInModule(Module seedModule, Scanner scanner, boolean strict, boolean profileRules, boolean isBison, boolean forGlobalScanner, FileUtil files, String typeInferenceDebug) { - this(seedModule, null, null, null, scanner, strict, profileRules, isBison, forGlobalScanner, files, typeInferenceDebug); + ParseInModule(Module seedModule, Scanner scanner, boolean strict, boolean profileRules, boolean isBison, boolean forGlobalScanner, FileUtil files, String typeInferenceDebug, boolean partialParseDebug) { + this(seedModule, null, null, null, scanner, strict, profileRules, isBison, forGlobalScanner, files, typeInferenceDebug, partialParseDebug); } - private ParseInModule(Module seedModule, Module extensionModule, Module disambModule, Module parsingModule, Scanner scanner, boolean strict, boolean profileRules, boolean isBison, boolean forGlobalScanner, FileUtil files, String typeInferenceDebug) { + private ParseInModule(Module seedModule, Module extensionModule, Module disambModule, Module parsingModule, Scanner scanner, boolean strict, boolean profileRules, boolean isBison, boolean forGlobalScanner, FileUtil files, String typeInferenceDebug, boolean partialParseDebug) { this.seedModule = seedModule; this.extensionModule = extensionModule; this.disambModule = disambModule; @@ -84,6 +85,7 @@ private ParseInModule(Module seedModule, Module extensionModule, Module disambMo this.forGlobalScanner = forGlobalScanner; this.files = files; this.typeInferenceDebug = typeInferenceDebug; + this.partialParseDebug = partialParseDebug; } /** @@ -217,7 +219,7 @@ private void getParser(Scanner scanner, Sort startSymbol) { EarleyParser p = parser; if (p == null) { Module m = getParsingModule(); - p = new EarleyParser(m, scanner, startSymbol); + p = new EarleyParser(m, scanner, startSymbol, partialParseDebug); parser = p; } } diff --git a/kernel/src/main/java/org/kframework/parser/inner/RuleGrammarGenerator.java b/kernel/src/main/java/org/kframework/parser/inner/RuleGrammarGenerator.java index e2a54d0e14a..7c4e0303e47 100644 --- a/kernel/src/main/java/org/kframework/parser/inner/RuleGrammarGenerator.java +++ b/kernel/src/main/java/org/kframework/parser/inner/RuleGrammarGenerator.java @@ -182,27 +182,31 @@ public static boolean isParserSort(Sort s) { /* use this overload if you don't need to profile rule parse times. */ public static ParseInModule getCombinedGrammar(Module mod, boolean strict, FileUtil files) { - return getCombinedGrammar(mod, strict, false, false, false, files, null); + return getCombinedGrammar(mod, strict, false, false, false, files, null, false); + } + + public static ParseInModule getCombinedGrammar(Module mod, boolean strict, FileUtil files, boolean partialParseDebug) { + return getCombinedGrammar(mod, strict, false, false, false, files, null, partialParseDebug); } public static ParseInModule getCombinedGrammar(Module mod, boolean strict, boolean timing, FileUtil files) { - return getCombinedGrammar(mod, strict, timing, false, false, files, null); + return getCombinedGrammar(mod, strict, timing, false, false, files, null, false); } public static ParseInModule getCombinedGrammar(Module mod, boolean strict, boolean timing, FileUtil files, String debugTypeInference) { - return getCombinedGrammar(mod, strict, timing, false, false, files, debugTypeInference); + return getCombinedGrammar(mod, strict, timing, false, false, files, debugTypeInference, false); } public static ParseInModule getCombinedGrammar(Module mod, boolean strict, boolean timing, boolean isBison, FileUtil files) { - return getCombinedGrammar(mod, strict, timing, isBison, false, files, null); + return getCombinedGrammar(mod, strict, timing, isBison, false, files, null, false); } public static ParseInModule getCombinedGrammar(Module mod, boolean strict, boolean timing, boolean isBison, boolean forGlobalScanner, FileUtil files) { - return getCombinedGrammar(mod, strict, timing, isBison, forGlobalScanner, files, null); + return getCombinedGrammar(mod, strict, timing, isBison, forGlobalScanner, files, null, false); } public static ParseInModule getCombinedGrammar(Module mod, Scanner scanner, boolean strict, boolean timing, boolean isBison, FileUtil files) { - return getCombinedGrammar(mod, scanner, strict, timing, isBison, files, null); + return getCombinedGrammar(mod, scanner, strict, timing, isBison, files, null, false); } // the forGlobalScanner flag tells the ParseInModule class not to exclude @@ -221,14 +225,15 @@ public static ParseInModule getCombinedGrammar(Module mod, Scanner scanner, bool * which connects the user concrete syntax with K syntax. * * @param mod module for which to create the parser. + * @param partialParseDebug * @return parser which applies disambiguation filters by default. */ - public static ParseInModule getCombinedGrammar(Module mod, boolean strict, boolean timing, boolean isBison, boolean forGlobalScanner, FileUtil files, String debugTypeInference) { - return new ParseInModule(mod, strict, timing, isBison, forGlobalScanner, files, debugTypeInference); + public static ParseInModule getCombinedGrammar(Module mod, boolean strict, boolean timing, boolean isBison, boolean forGlobalScanner, FileUtil files, String debugTypeInference, boolean partialParseDebug) { + return new ParseInModule(mod, strict, timing, isBison, forGlobalScanner, files, debugTypeInference, partialParseDebug); } - public static ParseInModule getCombinedGrammar(Module mod, Scanner scanner, boolean strict, boolean timing, boolean isBison, FileUtil files, String debugTypeInference) { - return new ParseInModule(mod, scanner, strict, timing, isBison, false, files, debugTypeInference); + public static ParseInModule getCombinedGrammar(Module mod, Scanner scanner, boolean strict, boolean timing, boolean isBison, FileUtil files, String debugTypeInference, boolean partialParseDebug) { + return new ParseInModule(mod, scanner, strict, timing, isBison, false, files, debugTypeInference, partialParseDebug); } public static Tuple3 getCombinedGrammarImpl(Module mod, boolean isBison, boolean forGlobalScanner) { diff --git a/kernel/src/main/java/org/kframework/parser/inner/kernel/EarleyParser.java b/kernel/src/main/java/org/kframework/parser/inner/kernel/EarleyParser.java index 8f3f541295e..8ea406e5ee5 100644 --- a/kernel/src/main/java/org/kframework/parser/inner/kernel/EarleyParser.java +++ b/kernel/src/main/java/org/kframework/parser/inner/kernel/EarleyParser.java @@ -20,6 +20,7 @@ import org.kframework.parser.Constant; import org.kframework.parser.Term; import org.kframework.parser.TermCons; +import org.kframework.parser.inner.disambiguation.TreeCleanerVisitor; import org.kframework.utils.errorsystem.KEMException; import org.pcollections.ConsPStack; import org.pcollections.PStack; @@ -601,8 +602,9 @@ public List getColumns() { * @param scanner The scanner used to tokenize strings over this grammar. * @param startSymbol The start symbol to start parsing at. */ - public EarleyParser(Module m, Scanner scanner, Sort startSymbol) { + public EarleyParser(Module m, Scanner scanner, Sort startSymbol, boolean partialParseDebug) { this.scanner = scanner; + this.partialParseDebug = partialParseDebug; // compute metadata about grammar sorts = getSorts(m); @@ -790,6 +792,8 @@ private BitSet[] computeFirstSet() { private final BitSet nullable; // the scanner to use to tokenize sentences before parsing them private final Scanner scanner; + // whether to print detailed partial parse trees when an error occurs + private final boolean partialParseDebug; /** * Parse a sentence according to the grammar represented by this parser. @@ -850,22 +854,74 @@ public Term parse(String input, Source source, int startLine, int startColumn) { // a parse error after the outer loop finishes also. if (S.get(k+1).empty() && Q.empty()) { // no more states to process, so it must be a parse error - parseError(data, k); + parseError(data, S, k); } } if (S.get(data.words.length).empty()) { - parseError(data, data.words.length-1); + parseError(data, S, data.words.length-1); } // finished parsing successfully, so return the final parse forest return Ambiguity.apply(S.get(data.words.length).states.get(0).parseTree().stream().map(list -> list.get(0)).collect(Collectors.toSet())); } + // We are only interested in displaying states that span the entire input + // when a parse error occurs; such states have a start-index of 0. + private Set spanningStates(EarleySet parses) { + return parses.states.stream() + .filter(state -> state.start == 0) + .collect(Collectors.toSet()); + } + + // We heuristically identify the best state-set for producing diagnostics as the + // most recent such set that includes a _spanning state_; i.e. one with a start + // index of zero. + private Set bestDiagnosticStates(List S, int k) { + for(int i = k; i >= 0; --i) { + Set candidate = spanningStates(S.get(i)); + if(!candidate.isEmpty()) { + return candidate; + } + } + + return new HashSet<>(); + } + + private String partialParseTreesDiagnostic(Set spanningStates) { + if(spanningStates.isEmpty()) { + return "No top-level production could apply to this input."; + } + + StringBuilder msg = new StringBuilder(); + + for(EarleyState state : spanningStates) { + msg.append(" Attempting to apply production:\n ").append(state.prod).append("\n"); + for(PStack possibleTree : state.parseTree()) { + var cleanedChildren = possibleTree + .stream() + .map(term -> new TreeCleanerVisitor().apply(term)) + .collect(Collectors.toList()); + + if(state.prod.prod.klabel().isDefined()) { + var term = TermCons.apply(ConsPStack.from(cleanedChildren), state.prod.prod); + msg.append(" produced partial term:\n ").append(term).append("\n"); + } else { + msg.append(" produced partial term with no KLabel, and children:\n"); + for(var child : cleanedChildren) { + msg.append(" ").append(child).append("\n"); + } + } + } + } + + return msg.toString(); + } + /** * @param data The {@link ParserMetadata} about the sentence being parsed + * @param S The set of {@link EarleyState}s for each end-index in the input * @param k The end-index at which a parse error occurred. In other words, the index just prior to the first token that - * did not parse. */ - private void parseError(ParserMetadata data, int k) { + private void parseError(ParserMetadata data, List S, int k) { int startLine, startColumn, endLine, endColumn; if (data.words.length == 1) { startLine = data.lines[0]; @@ -889,6 +945,12 @@ private void parseError(ParserMetadata data, int k) { } Location loc = new Location(startLine, startColumn, endLine, endColumn); + + if(partialParseDebug) { + msg += " Additional parsing diagnostic information:\n"; + msg += partialParseTreesDiagnostic(bestDiagnosticStates(S, k)); + } + throw KEMException.innerParserError(msg, data.source, loc); }