forked from owid/owid-grapher
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchunk.tsx
84 lines (75 loc) · 2.23 KB
/
chunk.tsx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import { flatten } from "@ourworldindata/utils"
import chunk from "chunk-text"
/**
 * Splits `text` into pieces by delegating to the default export of the
 * `chunk-text` package.
 *
 * NOTE(review): the name suggests the split happens on word boundaries, but
 * that behavior lives entirely in `chunk-text` — confirm against its docs.
 *
 * @param text - The text to split.
 * @param maxChunkLength - Size limit forwarded unchanged to `chunk-text`.
 * @returns The pieces exactly as produced by `chunk-text`.
 */
export const chunkWords = (text: string, maxChunkLength: number): string[] => {
    const pieces: string[] = chunk(text, maxChunkLength)
    return pieces
}
/**
 * Splits `text` into chunks, breaking between sentences where possible.
 *
 * Steps:
 * 1. Cut the text into sentences with a heuristic regex.
 * 2. Hand any sentence longer than `maxChunkLength` to `chunkWords` so it is
 *    broken into smaller pieces.
 * 3. Trim every piece and drop the empty ones.
 * 4. Greedily re-join consecutive pieces with a single space for as long as
 *    the joined string does not exceed `maxChunkLength`.
 *
 * A joined chunk never exceeds `maxChunkLength`; an individual piece can only
 * exceed it if `chunkWords` returns an oversized piece.
 *
 * @param text - The text to split.
 * @param maxChunkLength - Maximum chunk length, measured with `String.length`.
 * @returns The chunks in their original order, or an empty array when `text`
 * yields no non-blank pieces.
 */
export const chunkSentences = (
    text: string,
    maxChunkLength: number
): string[] => {
    // See https://stackoverflow.com/a/25736082/1983739
    // Not perfect, just works in most cases
    const sentenceRegex = /(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\n)\s/g
    const sentences = flatten(
        text
            .split(sentenceRegex)
            .map((s) =>
                s.length > maxChunkLength ? chunkWords(s, maxChunkLength) : s
            )
    )
        .map((s) => s.trim())
        .filter((s) => s) as string[]

    const chunks: string[] = []
    // The chunk currently being assembled. Deliberately not called `chunk`,
    // which would shadow the `chunk` function imported from "chunk-text".
    let pending: string | undefined
    for (const sentence of sentences) {
        if (pending === undefined) {
            // First piece: nothing to join with yet.
            pending = sentence
            continue
        }
        const candidate = `${pending} ${sentence}`
        if (candidate.length > maxChunkLength) {
            // Adding this sentence would overflow: emit what we have and
            // start a new chunk with the sentence.
            chunks.push(pending)
            pending = sentence
        } else {
            pending = candidate
        }
    }
    // Flush the last chunk (absent only when there were no pieces at all).
    if (pending !== undefined) chunks.push(pending)
    return chunks
}
/**
 * Chunks a given bit of text into an array of fragments intended to be less
 * than or equal to `maxChunkLength` in size, honoring paragraph and sentence
 * boundaries where possible.
 *
 * Steps:
 * 1. Split the text into paragraphs on blank lines (`"\n\n"`).
 * 2. Hand any paragraph longer than `maxChunkLength` to `chunkSentences`.
 *    The length is measured before trimming.
 * 3. Trim every piece and drop the empty ones.
 * 4. Greedily re-join consecutive pieces with `"\n\n"` for as long as the
 *    joined string does not exceed `maxChunkLength`.
 *
 * A joined chunk never exceeds `maxChunkLength`; an individual piece can only
 * exceed it if `chunkSentences` returns an oversized piece.
 *
 * @param text - The text to split.
 * @param maxChunkLength - Maximum chunk length, measured with `String.length`.
 * @returns The chunks in their original order, or an empty array when `text`
 * yields no non-blank pieces.
 */
export const chunkParagraphs = (
    text: string,
    maxChunkLength: number
): string[] => {
    const paragraphs = flatten(
        text
            .split("\n\n")
            .map((p) =>
                p.length > maxChunkLength
                    ? chunkSentences(p, maxChunkLength)
                    : p
            )
    )
        .map((p) => p.trim())
        .filter((p) => p) as string[]

    const chunks: string[] = []
    // The chunk currently being assembled. Deliberately not called `chunk`,
    // which would shadow the `chunk` function imported from "chunk-text".
    let pending: string | undefined
    for (const paragraph of paragraphs) {
        if (pending === undefined) {
            // First piece: nothing to join with yet.
            pending = paragraph
            continue
        }
        const candidate = `${pending}\n\n${paragraph}`
        if (candidate.length > maxChunkLength) {
            // Adding this paragraph would overflow: emit what we have and
            // start a new chunk with the paragraph.
            chunks.push(pending)
            pending = paragraph
        } else {
            pending = candidate
        }
    }
    // Flush the last chunk (absent only when there were no pieces at all).
    if (pending !== undefined) chunks.push(pending)
    return chunks
}