Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

capture all terminators and quotes in the sentence #360

Merged
10 commits merged on Dec 19, 2023
65 changes: 49 additions & 16 deletions ext/js/dom/document-util.js
Original file line number Diff line number Diff line change
Expand Up @@ -113,27 +113,45 @@ export class DocumentUtil {
const text = source.text();
const textLength = text.length;
const textEndAnchor = textLength - endLength;
let pos1 = startLength;
let pos2 = textEndAnchor;

/** Relative start position of the sentence (inclusive). */
let cursorStart = startLength;
/** Relative end position of the sentence (exclusive). */
let cursorEnd = textEndAnchor;

// Move backward
let quoteStack = [];
for (; pos1 > 0; --pos1) {
const c = text[pos1 - 1];
for (; cursorStart > 0; --cursorStart) {
// Check if the previous character should be included.
let c = text[cursorStart - 1];
if (c === '\n' && terminateAtNewlines) { break; }

if (quoteStack.length === 0) {
const terminatorInfo = terminatorMap.get(c);
let terminatorInfo = terminatorMap.get(c);
if (typeof terminatorInfo !== 'undefined') {
if (terminatorInfo[0]) { --pos1; }
// Include the previous character while it is a terminator character and is included at start.
while (terminatorInfo[0] && cursorStart > 0) {
--cursorStart;
if (cursorStart === 0) { break; }
c = text[cursorStart - 1];
terminatorInfo = terminatorMap.get(c);
if (typeof terminatorInfo === 'undefined') { break; }
}
break;
}
}

let quoteInfo = forwardQuoteMap.get(c);
if (typeof quoteInfo !== 'undefined') {
if (quoteStack.length === 0) {
if (quoteInfo[1]) { --pos1; }
// Include the previous character while it is a quote character and is included at start.
while (quoteInfo[1] && cursorStart > 0) {
--cursorStart;
if (cursorStart === 0) { break; }
c = text[cursorStart - 1];
quoteInfo = forwardQuoteMap.get(c);
if (typeof quoteInfo === 'undefined') { break; }
}
break;
} else if (quoteStack[0] === c) {
quoteStack.pop();
Expand All @@ -149,22 +167,37 @@ export class DocumentUtil {

// Move forward
quoteStack = [];
for (; pos2 < textLength; ++pos2) {
const c = text[pos2];
for (; cursorEnd < textLength; ++cursorEnd) {
// Check if the following character should be included.
let c = text[cursorEnd];
if (c === '\n' && terminateAtNewlines) { break; }

if (quoteStack.length === 0) {
const terminatorInfo = terminatorMap.get(c);
let terminatorInfo = terminatorMap.get(c);
if (typeof terminatorInfo !== 'undefined') {
if (terminatorInfo[1]) { ++pos2; }
// Include the following character while it is a terminator character and is included at end.
while (terminatorInfo[1] && cursorEnd < textLength) {
++cursorEnd;
if (cursorEnd === textLength) { break; }
c = text[cursorEnd];
terminatorInfo = terminatorMap.get(c);
if (typeof terminatorInfo === 'undefined') { break; }
}
break;
}
}

let quoteInfo = backwardQuoteMap.get(c);
if (typeof quoteInfo !== 'undefined') {
if (quoteStack.length === 0) {
if (quoteInfo[1]) { ++pos2; }
Casheeew marked this conversation as resolved.
Show resolved Hide resolved
// Include the following character while it is a quote character and is included at end.
while (quoteInfo[1] && cursorEnd < textLength) {
++cursorEnd;
if (cursorEnd === textLength) { break; }
c = text[cursorEnd];
quoteInfo = forwardQuoteMap.get(c);
if (typeof quoteInfo === 'undefined') { break; }
}
break;
} else if (quoteStack[0] === c) {
quoteStack.pop();
Expand All @@ -179,13 +212,13 @@ export class DocumentUtil {
}

// Trim whitespace
for (; pos1 < startLength && this._isWhitespace(text[pos1]); ++pos1) { /* NOP */ }
for (; pos2 > textEndAnchor && this._isWhitespace(text[pos2 - 1]); --pos2) { /* NOP */ }
for (; cursorStart < startLength && this._isWhitespace(text[cursorStart]); ++cursorStart) { /* NOP */ }
for (; cursorEnd > textEndAnchor && this._isWhitespace(text[cursorEnd - 1]); --cursorEnd) { /* NOP */ }

// Result
return {
text: text.substring(pos1, pos2),
offset: startLength - pos1
text: text.substring(cursorStart, cursorEnd),
offset: startLength - cursorStart
};
}

Expand Down
32 changes: 32 additions & 0 deletions test/data/html/test-document1.html
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,38 @@ <h1>Yomitan Tests</h1>
<span>ありがとございます。ありがとございます。</span>
</div>

<div
class="test"
data-test-type="scan"
data-element-from-point-selector="span"
data-caret-range-from-point-selector="span"
data-start-node-selector="span"
data-start-offset="4"
data-end-node-selector="span"
data-end-offset="4"
data-result-type="TextSourceRange"
data-sentence-scan-extent="100"
data-sentence="ありがとございます。!?"
>
<span>ありがとございます。!?ありがとございます。!?</span>
</div>

<div
class="test"
data-test-type="scan"
data-element-from-point-selector="span"
data-caret-range-from-point-selector="span"
data-start-node-selector="span"
data-start-offset="4"
data-end-node-selector="span"
data-end-offset="4"
data-result-type="TextSourceRange"
data-sentence-scan-extent="100"
data-sentence="ありがとございます!!!"
>
<span>ありがとございます!!!ありがとございます!!!</span>
</div>

<div
class="test"
data-test-type="scan"
Expand Down