From 359639045247572e9217b7c5fe2585982e679876 Mon Sep 17 00:00:00 2001 From: Tatu Saloranta Date: Wed, 22 Aug 2018 16:35:52 -0700 Subject: [PATCH] Fix #42 --- release-notes/VERSION | 2 + .../com/ctc/wstx/evt/WstxEventReader.java | 528 +++++++++++++++++- 2 files changed, 514 insertions(+), 16 deletions(-) diff --git a/release-notes/VERSION b/release-notes/VERSION index 043461e6..d6971755 100644 --- a/release-notes/VERSION +++ b/release-notes/VERSION @@ -6,6 +6,8 @@ Project: woodstox 5.2.0 (not yet released) +#42: Multi-document mode produces events only for first document + (reported by display-none@github) #46: NoSuchMethodError after update to Woodstox 5.1.0 (reported by Alexander V, veita@github) #47: Validation error due to white-space being handled as CData by `BaseStreamWriter` diff --git a/src/main/java/com/ctc/wstx/evt/WstxEventReader.java b/src/main/java/com/ctc/wstx/evt/WstxEventReader.java index 49671f22..cbb66df2 100644 --- a/src/main/java/com/ctc/wstx/evt/WstxEventReader.java +++ b/src/main/java/com/ctc/wstx/evt/WstxEventReader.java @@ -15,39 +15,161 @@ package com.ctc.wstx.evt; +import java.util.NoSuchElementException; + import javax.xml.stream.*; +import javax.xml.stream.events.Characters; +import javax.xml.stream.events.XMLEvent; import javax.xml.stream.util.XMLEventAllocator; +import org.codehaus.stax2.XMLEventReader2; import org.codehaus.stax2.XMLStreamReader2; -import org.codehaus.stax2.ri.Stax2EventReaderImpl; import com.ctc.wstx.cfg.ErrorConsts; import com.ctc.wstx.exc.WstxParsingException; import com.ctc.wstx.sr.StreamScanner; /** - * Woodstox version, based on generic Stax reference implementation - * baseline of {@link Stax2EventReaderImpl}. + * Woodstox version of {@link XMLEventReader2} (and {@link XMLEventReader}). + *

+ * NOTE: up to Woodstox 5.1, this was based on Stax2 Reference Implementation + * ({@link org.codehaus.stax2.ri.Stax2EventReaderImpl}), but due to various issues + * has temporarily (?) been cut-paste-modified here. Ideally it would be reconciled + * once Stax2-api version 4.2 can be relied upon as baseline, but that may take time. */ public class WstxEventReader - extends Stax2EventReaderImpl +// extends Stax2EventReaderImpl // before 5.2 + implements XMLEventReader2, XMLStreamConstants { + // // // Enumerated state ids + + protected final static int STATE_INITIAL = 1; + protected final static int STATE_END_OF_INPUT = 2; + protected final static int STATE_CONTENT = 3; + + + // // // Enumerated error case ids + + /** + * Current state was not START_ELEMENT when getElementText() was called + */ + protected final static int ERR_GETELEMTEXT_NOT_START_ELEM = 1; + + /** + * Encountered non-textual event (other than closing END_ELEMENT) + * when collecting text for getElementText() + */ + protected final static int ERR_GETELEMTEXT_NON_TEXT_EVENT = 2; + + /** + * Encountered CHARACTERS or CDATA that contains non-white space + * char(s), when trying to locate tag with nextTag() + */ + protected final static int ERR_NEXTTAG_NON_WS_TEXT = 3; + + /** + * Encountered non-skippable non-text/element event with + * nextTag() + */ + protected final static int ERR_NEXTTAG_WRONG_TYPE = 4; + + /* + /********************************************************************** + /* Configuration + /********************************************************************** + */ + + protected final XMLEventAllocator mAllocator; + + protected final XMLStreamReader2 mReader; + + /* + /********************************************************************** + /* State + /********************************************************************** + */ + + /** + * Event that has been peeked, ie. 
loaded without call to + * {@link #nextEvent}; will be returned and cleared by + * call to {@link #nextEvent} (or, returned again if peeked + * again) + */ + protected XMLEvent mPeekedEvent = null; + + /** + * High-level state indicator, with currently three values: + * whether we are initializing (need to synthesize START_DOCUMENT), + * at END_OF_INPUT (end-of-doc), or otherwise, normal operation. + * Useful in simplifying some methods, as well as to make sure + * that independent of how stream reader handles things, event reader + * can reliably detect End-Of-Document. + */ + protected int mState = STATE_INITIAL; + + /** + * This variable keeps track of the type of the 'previous' event + * when peeking for the next Event. It is needed for some functionality, + * to remember state even when underlying parser has to move to peek + * the next event. + */ + protected int mPrePeekEvent = START_DOCUMENT; + + /* + /********************************************************************** + /* Woodstox-specific + /********************************************************************** + */ + + /** + * Marker flag to allow specialized handling in "multi-document" reading + * mode. 
+ */ protected final boolean mCfgMultiDocMode; + + /* + /********************************************************************** + /* Construction + /********************************************************************** + */ public WstxEventReader(XMLEventAllocator a, XMLStreamReader2 r) { - super(a, r); + mAllocator = a; + mReader = r; mCfgMultiDocMode = (r instanceof StreamScanner) && ((StreamScanner) r).getConfig().inputParsingModeDocuments(); } - + /* - ////////////////////////////////////////////////////// - // Impl of abstract methods - ////////////////////////////////////////////////////// + /********************************************************************** + /* Abstract methods that Stax2EventReaderImpl would expose + /********************************************************************** */ @Override + public boolean isPropertySupported(String name) + { + return ((XMLStreamReader2)getStreamReader()).isPropertySupported(name); + } + + @Override + public boolean setProperty(String name, Object value) + { + return ((XMLStreamReader2)getStreamReader()).setProperty(name, value); + } + + /** + * Method called upon encountering a problem that should result + * in an exception being thrown. If non-null String is returned, + * that will be used as the message of exception thrown; if null, + * a standard message will be used instead. + * + * @param errorType Type of the problem, one of ERR_ + * constants + * @param currEvent Type of the event that triggered the problem, + * if any; -1 if not available. 
+ */ protected String getErrorDesc(int errorType, int currEvent) { // Defaults are mostly fine, except we can easily add event type desc @@ -64,29 +186,403 @@ protected String getErrorDesc(int errorType, int currEvent) return null; } + /* + /********************************************************************** + /* XMLEventReader API + /********************************************************************** + */ + @Override - public boolean isPropertySupported(String name) + public void close() throws XMLStreamException { - return ((XMLStreamReader2)getStreamReader()).isPropertySupported(name); + mReader.close(); } @Override - public boolean setProperty(String name, Object value) + public String getElementText() throws XMLStreamException { - return ((XMLStreamReader2)getStreamReader()).setProperty(name, value); + /* Simple, if no peeking occurred: can just forward this to the + * underlying parser + */ + if (mPeekedEvent == null) { + return mReader.getElementText(); + } + + XMLEvent evt = mPeekedEvent; + mPeekedEvent = null; + + /* Otherwise need to verify that we are currently over START_ELEMENT. + * Problem is we have already gone past it... + */ + if (mPrePeekEvent != START_ELEMENT) { + reportProblem(findErrorDesc(ERR_GETELEMTEXT_NOT_START_ELEM, mPrePeekEvent)); + } + // ??? do we need to update mPrePeekEvent now + + String str = null; + StringBuilder sb = null; + + // Ok, fine, then just need to loop through and get all the text... 
+ for (; true; evt = nextEvent()) { + if (evt.isEndElement()) { + break; + } + int type = evt.getEventType(); + if (type == COMMENT || type == PROCESSING_INSTRUCTION) { + // can/should just ignore them + continue; + } + if (!evt.isCharacters()) { + reportProblem(findErrorDesc(ERR_GETELEMTEXT_NON_TEXT_EVENT, type)); + } + String curr = evt.asCharacters().getData(); + if (str == null) { + str = curr; + } else { + if (sb == null) { + sb = new StringBuilder(str.length() + curr.length()); + sb.append(str); + } + sb.append(curr); + } + } + + if (sb != null) { + return sb.toString(); + } + return (str == null) ? "" : str; + } + + @Override + public Object getProperty(String name) { + return mReader.getProperty(name); + } + + @Override + public boolean hasNext() { + return (mState != STATE_END_OF_INPUT); } + @Override + public XMLEvent nextEvent() throws XMLStreamException + { + if (mState == STATE_END_OF_INPUT) { + throwEndOfInput(); + } else if (mState == STATE_INITIAL) { + mState = STATE_CONTENT; + return createStartDocumentEvent(); + } + if (mPeekedEvent != null) { + XMLEvent evt = mPeekedEvent; + mPeekedEvent = null; + if (evt.isEndDocument()) { + updateStateEndDocument(); + } + return evt; + } + return createNextEvent(true, mReader.next()); + } + + @Override + public Object next() { + try { + return nextEvent(); + } catch (XMLStreamException sex) { + throwUnchecked(sex); + return null; + } + } + + @Override + public XMLEvent nextTag() throws XMLStreamException + { + // If we have peeked something, need to process it + if (mPeekedEvent != null) { + XMLEvent evt = mPeekedEvent; + mPeekedEvent = null; + int type = evt.getEventType(); + switch (type) { + case END_DOCUMENT: + return null; + case START_DOCUMENT: + // Need to skip START_DOCUMENT to get the root elem + break; + case SPACE: + // Ignorable WS is just fine + break; + + /* !!! 07-Dec-2004, TSa: Specs are mum about Comments and PIs. + * But why would they not be skipped just like what + * the stream reader does? 
+ */ + case COMMENT: + case PROCESSING_INSTRUCTION: + break; + case CDATA: + case CHARACTERS: + if (((Characters) evt).isWhiteSpace()) { + break; + } + reportProblem(findErrorDesc(ERR_NEXTTAG_NON_WS_TEXT, type)); + break; // never gets here, but some compilers whine without... + case START_ELEMENT: + case END_ELEMENT: + return evt; + + default: + reportProblem(findErrorDesc(ERR_NEXTTAG_WRONG_TYPE, type)); + } + } else { + /* 13-Sep-2005, TSa: As pointed out by Patrick, we may need to + * initialize the state here, too; otherwise peek() won't work + * correctly. The problem is that following loop's get method + * does not use event reader's method but underlying reader's. + * As such, it won't update state: most importantly, initial + * state may not be changed to non-initial. + */ + if (mState == STATE_INITIAL) { + mState = STATE_CONTENT; + } + } + + while (true) { + int next = mReader.next(); + + switch (next) { + case END_DOCUMENT: + return null; + case SPACE: + case COMMENT: + case PROCESSING_INSTRUCTION: + continue; + case CDATA: + case CHARACTERS: + if (mReader.isWhiteSpace()) { + continue; + } + reportProblem(findErrorDesc(ERR_NEXTTAG_NON_WS_TEXT, next)); + break; // just to keep Jikes happy... + + case START_ELEMENT: + case END_ELEMENT: + return createNextEvent(false, next); + + default: + reportProblem(findErrorDesc(ERR_NEXTTAG_WRONG_TYPE, next)); + } + } + } + + @Override + public XMLEvent peek() throws XMLStreamException + { + if (mPeekedEvent == null) { + if (mState == STATE_END_OF_INPUT) { + // 06-Mar-2006, TSa: Fixed as per Arjen's suggestion: + //throwEndOfInput(); + return null; + } + if (mState == STATE_INITIAL) { + // Not sure what it should be... 
but this should do: + mPrePeekEvent = START_DOCUMENT; + mPeekedEvent = createStartDocumentEvent(); + mState = STATE_CONTENT; + } else { + mPrePeekEvent = mReader.getEventType(); + mPeekedEvent = createNextEvent(false, mReader.next()); + } + } + return mPeekedEvent; + } + + /** + * Note: only here because we implement Iterator interface. Will not + * work, don't bother calling it. + */ + @Override + public void remove() { + throw new UnsupportedOperationException("Can not remove events from XMLEventReader."); + } + + /** + * Method called when we are about to return END_DOCUMENT event. + * Usually this should change state to STATE_END_OF_INPUT, but + * may vary for some alternative read modes (like multi-document) + * + * @since 4.2 + */ + protected void updateStateEndDocument() throws XMLStreamException { + if (mCfgMultiDocMode) { + // As per [woodstox-core#42] should allow reading over multiple documents... + if (mReader.hasNext()) { + // Let's sanity-check that we get token we expect however: + int next = mReader.next(); + if (next == START_DOCUMENT) { + mPrePeekEvent = START_DOCUMENT; + mPeekedEvent = createStartDocumentEvent(); + mState = STATE_CONTENT; + return; + } + reportProblem("Unexpected token ("+ErrorConsts.tokenTypeDesc(next) + +") after END_DOCUMENT in multi-document mode, XMLStreamReader.hasNext() returning true"); + } + } + mState = STATE_END_OF_INPUT; + } /* - ////////////////////////////////////////////////////// - // Overrides - ////////////////////////////////////////////////////// + /********************************************************************** + /* XMLEventReader2 API + /********************************************************************** */ + /** + *

+ * Note: although the interface allows implementations to + * throw an {@link XMLStreamException}, the reference implementation + * doesn't currently need to. + * It's still declared, in case in future there is need to throw + * such an exception. + */ @Override + public boolean hasNextEvent() throws XMLStreamException + { + return (mState != STATE_END_OF_INPUT); + } + + /* + /********************************************************************** + /* Overridable factory methods + /********************************************************************** + */ + + protected XMLEvent createNextEvent(boolean checkEOD, int type) + throws XMLStreamException + { + try { + XMLEvent evt = mAllocator.allocate(mReader); + if (checkEOD && type == END_DOCUMENT) { + updateStateEndDocument(); + } + return evt; + } catch (RuntimeException rex) { + throw _checkUnwrap(rex); + } + } + + protected XMLStreamException _checkUnwrap(RuntimeException rex) + { + /* 29-Mar-2008, TSa: Due to some problems with Stax API + * (lack of 'throws XMLStreamException' in signature of + * XMLStreamReader.getText(), for one) it is possible + * we will get a wrapped XMLStreamException. If so, + * we should be able to unwrap it. + */ + Throwable t = rex.getCause(); + while (t != null) { + if (t instanceof XMLStreamException) { + return (XMLStreamException) t; + } + t = t.getCause(); + } + // Nope, need to re-throw as is + throw rex; + } + + /** + * Method called to create the very first event (START_DOCUMENT). 
+ */ + protected XMLEvent createStartDocumentEvent() + throws XMLStreamException + { + XMLEvent start = mAllocator.allocate(mReader); + return start; + } + + /* + /********************************************************************** + /* Overridable error reporting methods + /********************************************************************** + */ + + // note: `private` before 4.2 + protected void throwEndOfInput() + { + throw new NoSuchElementException(); + } + + protected void throwUnchecked(XMLStreamException sex) + { + // Wrapped root cause? Let's only unwrap one layer; one that + // must have been used to expose the problem (if any) + Throwable t = (sex.getNestedException() == null) ? sex : sex.getNestedException(); + // Unchecked? Can re-throw as is + if (t instanceof RuntimeException) { + throw (RuntimeException) t; + } + if (t instanceof Error) { + throw (Error) t; + } + // Otherwise, let's just wrap it + throw new RuntimeException("[was "+t.getClass()+"] "+t.getMessage(), t); + } + + protected void reportProblem(String msg) + throws XMLStreamException + { + reportProblem(msg, mReader.getLocation()); + } + protected void reportProblem(String msg, Location loc) throws XMLStreamException { + if (loc == null) { + throw new WstxParsingException(msg); + } throw new WstxParsingException(msg, loc); } + + /* + /********************************************************************** + /* Package methods for sub-classes + /********************************************************************** + */ + + protected XMLStreamReader getStreamReader() + { + return mReader; + } + + /* + /********************************************************************** + /* Other internal methods + /********************************************************************** + */ + + // note: `private` before 4.2 + /** + * Method used to locate error message description to use. 
+ * Calls sub-classes getErrorDesc() first, and only + * if no message found, uses default messages defined here. + */ + protected final String findErrorDesc(int errorType, int currEvent) + { + String msg = getErrorDesc(errorType, currEvent); + if (msg != null) { + return msg; + } + switch (errorType) { + case ERR_GETELEMTEXT_NOT_START_ELEM: + return "Current state not START_ELEMENT when calling getElementText()"; + case ERR_GETELEMTEXT_NON_TEXT_EVENT: + return "Expected a text token"; + case ERR_NEXTTAG_NON_WS_TEXT: + return "Only all-whitespace CHARACTERS/CDATA (or SPACE) allowed for nextTag()"; + case ERR_NEXTTAG_WRONG_TYPE: + return "Should only encounter START_ELEMENT/END_ELEMENT, SPACE, or all-white-space CHARACTERS"; + } + + // should never happen, but it'd be bad to throw another exception... + return "Internal error (unrecognized error type: "+errorType+")"; + } }