Skip to content

Commit

Permalink
Chunk Translation Text
Browse files Browse the repository at this point in the history
  • Loading branch information
mohamede1945 committed Jan 11, 2024
1 parent ff0f364 commit c2515e9
Showing 22 changed files with 651 additions and 125 deletions.
6 changes: 5 additions & 1 deletion .swiftformat
Original file line number Diff line number Diff line change
@@ -8,7 +8,11 @@
--wrapparameters before-first # wrapArguments
--funcattributes prev-line # wrapAttributes
--typeattributes prev-line # wrapAttributes
--beforemarks typealias,struct,enum # organizeDeclarations
--beforemarks typealias,struct # organizeDeclarations

--structthreshold 70
--classthreshold 70
--enumthreshold 70

# Disabled

53 changes: 53 additions & 0 deletions .swiftpm/xcode/xcshareddata/xcschemes/QuranTextKitTests.xcscheme
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
<?xml version="1.0" encoding="UTF-8"?>
<Scheme
LastUpgradeVersion = "1430"
version = "1.7">
<BuildAction
parallelizeBuildables = "YES"
buildImplicitDependencies = "YES">
</BuildAction>
<TestAction
buildConfiguration = "Debug"
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
shouldUseLaunchSchemeArgsEnv = "YES"
shouldAutocreateTestPlan = "YES">
<Testables>
<TestableReference
skipped = "NO">
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "QuranTextKitTests"
BuildableName = "QuranTextKitTests"
BlueprintName = "QuranTextKitTests"
ReferencedContainer = "container:">
</BuildableReference>
</TestableReference>
</Testables>
</TestAction>
<LaunchAction
buildConfiguration = "Debug"
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
launchStyle = "0"
useCustomWorkingDirectory = "NO"
ignoresPersistentStateOnLaunch = "NO"
debugDocumentVersioning = "YES"
debugServiceExtension = "internal"
allowLocationSimulation = "YES">
</LaunchAction>
<ProfileAction
buildConfiguration = "Release"
shouldUseLaunchSchemeArgsEnv = "YES"
savedToolIdentifier = ""
useCustomWorkingDirectory = "NO"
debugDocumentVersioning = "YES">
</ProfileAction>
<AnalyzeAction
buildConfiguration = "Debug">
</AnalyzeAction>
<ArchiveAction
buildConfiguration = "Release"
revealArchiveInOrganizer = "YES">
</ArchiveAction>
</Scheme>
12 changes: 7 additions & 5 deletions Core/SystemDependenciesFake/FileSystemFake.swift
Original file line number Diff line number Diff line change
@@ -22,11 +22,6 @@ public final class FileSystemFake: FileSystem, Sendable {
var resourceValuesByURL: [URL: ResourceValuesFake] = [:]
}

/// Error cases declared for the fake file system.
enum FileSystemError: Error {
/// NOTE(review): presumably raised when no resource values were registered for a URL — confirm against the throwing site.
case noResourceValues
/// Error carrying a `String` payload; presumably a human-readable description — confirm against callers.
case general(String)
}

// MARK: Lifecycle

public init() {}
@@ -117,6 +112,13 @@ public final class FileSystemFake: FileSystem, Sendable {
files.insert(path)
}

// MARK: Internal

/// Error cases declared for the fake file system.
enum FileSystemError: Error {
/// NOTE(review): presumably raised when no resource values were registered for a URL — confirm against the throwing site.
case noResourceValues
/// Error carrying a `String` payload; presumably a human-readable description — confirm against callers.
case general(String)
}

// MARK: Private

private let state = ManagedCriticalState(State())
119 changes: 119 additions & 0 deletions Core/Utilities/Sources/Extensions/String+Chunking.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
//
// String+Chunking.swift
//
//
// Created by Mohamed Afifi on 2023-12-31.
//

import Foundation

extension String {
    /// Splits the string into substrings, one per range returned by `chunkRanges(maxChunkSize:)`.
    ///
    /// - Parameter maxChunkSize: Maximum number of characters a chunk may reach when smaller
    ///   blocks are merged together. A single word longer than this is still returned as one chunk.
    /// - Returns: The chunks as substrings of `self`, in order of appearance.
    public func chunk(maxChunkSize: Int) -> [Substring] {
        let ranges = chunkRanges(maxChunkSize: maxChunkSize)
        return ranges.map { range in self[range] }
    }

    /// Computes the ranges of the chunks the string is divided into.
    ///
    /// Chunking starts at paragraph granularity over the whole string; `chunkText` falls back to
    /// sentences and then words for any block that is longer than `maxChunkSize`.
    ///
    /// - Parameter maxChunkSize: Maximum number of characters a chunk may reach when smaller
    ///   blocks are merged together.
    /// - Returns: The chunk ranges, in order of appearance.
    public func chunkRanges(maxChunkSize: Int) -> [Range<String.Index>] {
        let wholeText = startIndex ..< endIndex
        var collected = [Range<String.Index>]()
        chunkText(self, range: wholeText, maxChunkSize: maxChunkSize, strategy: .paragraph, chunks: &collected)
        return collected
    }
}

/// Divides `range` of `text` into chunks and appends their ranges to `chunks`.
///
/// The range is first split into blocks using `strategy`. Consecutive blocks are merged into a
/// single chunk for as long as the merged length stays within `maxChunkSize` characters. A block
/// that is longer than `maxChunkSize` on its own is re-chunked with the next finer strategy, or
/// appended unchanged when no finer strategy exists.
///
/// - Parameters:
///   - text: The string being chunked.
///   - range: The part of `text` to process.
///   - maxChunkSize: Maximum number of characters for a merged chunk.
///   - strategy: Granularity used to split `range` into blocks.
///   - chunks: Output array that receives the chunk ranges in order.
private func chunkText(_ text: String, range: Range<String.Index>, maxChunkSize: Int, strategy: ChunkingStrategy, chunks: inout [Range<String.Index>]) {
    // Where the chunk currently being built starts, and how many blocks it holds so far.
    var pendingStart = range.lowerBound
    var pendingBlockCount = 0

    // Emits the pending chunk (if there is one) ending at `end`, then restarts accumulation at `resume`.
    func flushPending(upTo end: String.Index, resumeAt resume: String.Index) {
        let hasPending = pendingBlockCount > 0 && pendingStart < range.upperBound
        if hasPending {
            chunks.append(pendingStart ..< end)
            pendingBlockCount = 0
        }
        pendingStart = resume
    }

    for block in text.split(in: range, on: strategy.enumerationOptions) {
        let blockLength = text.distance(from: block.lowerBound, to: block.upperBound)

        guard blockLength <= maxChunkSize else {
            // The block cannot fit in any chunk: close what has been gathered before it.
            flushPending(upTo: block.lowerBound, resumeAt: block.upperBound)

            if let finerStrategy = strategy.nextStrategy() {
                // Break the oversized block down with a finer granularity.
                chunkText(text, range: block, maxChunkSize: maxChunkSize, strategy: finerStrategy, chunks: &chunks)
            } else {
                // Nothing finer is available; the oversized block becomes its own chunk.
                chunks.append(block)
            }
            continue
        }

        // Length of the pending chunk if this block were appended to it.
        let lengthWithBlock = text.distance(from: pendingStart, to: block.upperBound)

        if lengthWithBlock <= maxChunkSize {
            // Still within the limit: keep gathering blocks.
            pendingBlockCount += 1
        } else {
            // Close the pending chunk and begin a new one with this block.
            flushPending(upTo: block.lowerBound, resumeAt: block.lowerBound)
            pendingBlockCount = 1
        }
    }

    // Emit whatever is left once all blocks have been visited.
    if pendingStart < range.upperBound {
        flushPending(upTo: range.upperBound, resumeAt: range.upperBound)
    }
}

private extension String {
    /// Divides `range` into contiguous subranges, one per substring enumerated with `on`.
    ///
    /// Each subrange starts where its enumerated substring starts and runs up to the start of the
    /// next one, so text between substrings stays attached to the preceding subrange. The first
    /// subrange is anchored at `range.lowerBound` and the last one ends at `range.upperBound`.
    ///
    /// - Parameters:
    ///   - range: The part of the string to split.
    ///   - on: The enumeration granularity (combined with `.substringNotRequired`).
    /// - Returns: The subranges in order, or `[range]` when nothing is enumerated.
    func split(in range: Range<Index>, on: EnumerationOptions) -> [Range<Index>] {
        var pieces = [Range<Index>]()

        enumerateSubstrings(in: range, options: [on, .substringNotRequired]) { _, found, _, _ in
            if let previous = pieces.popLast() {
                // Stretch the previous piece so it ends exactly where the new one begins.
                pieces.append(previous.lowerBound ..< found.lowerBound)
                pieces.append(found)
            } else {
                // First piece: include any text that precedes the first enumerated substring.
                pieces.append(range.lowerBound ..< found.upperBound)
            }
        }

        // Nothing was enumerated: the whole range is the only piece.
        guard let tail = pieces.popLast() else {
            return [range]
        }

        // Fold any trailing text into the last piece.
        pieces.append(tail.lowerBound ..< range.upperBound)
        return pieces
    }
}

/// Granularity levels used when splitting text into blocks, from coarsest to finest.
private enum ChunkingStrategy {
    case paragraph
    case sentence
    case word

    // MARK: Internal

    /// The substring enumeration option that corresponds to this granularity.
    var enumerationOptions: String.EnumerationOptions {
        let options: String.EnumerationOptions
        switch self {
        case .paragraph:
            options = .byParagraphs
        case .sentence:
            options = .bySentences
        case .word:
            options = .byWords
        }
        return options
    }

    /// Returns the next finer granularity, or `nil` when `self` is already `.word`.
    func nextStrategy() -> ChunkingStrategy? {
        let finer: ChunkingStrategy?
        switch self {
        case .paragraph:
            finer = .sentence
        case .sentence:
            finer = .word
        case .word:
            finer = nil
        }
        return finer
    }
}
50 changes: 50 additions & 0 deletions Core/Utilities/Sources/Extensions/String+Extension.swift
Original file line number Diff line number Diff line change
@@ -56,6 +56,12 @@ extension String {
}

extension String {
/// Finds every match of `regex` in the string.
///
/// - Parameter regex: The regular expression to search for across the whole string.
/// - Returns: The ranges of the matches, in the order the regular expression reports them.
///   Matches whose `NSRange` cannot be converted to a `Range<String.Index>` are omitted.
public func ranges(of regex: NSRegularExpression) -> [Range<String.Index>] {
    let wholeString = NSRange(startIndex..., in: self)
    return regex
        .matches(in: self, range: wholeString)
        .compactMap { match in Range(match.range, in: self) }
}

public func replacingOccurrences(matchingPattern pattern: String, replacementProvider: (String) -> String?) -> String {
let expression = try! NSRegularExpression(pattern: pattern, options: []) // swiftlint:disable:this force_try
let matches = expression.matches(in: self, options: [], range: NSRange(startIndex ..< endIndex, in: self))
@@ -66,4 +72,48 @@ extension String {
current.replaceSubrange(range, with: replacement)
}
}

/// Replaces every match of `regex` using the text supplied by `replace`.
///
/// The matches are located with `ranges(of:)` and then handed to
/// `replacing(sortedRanges:body:)`.
///
/// - Parameters:
///   - regex: The regular expression whose matches are replaced.
///   - replace: Called with the matched text and the index of the match; returns the replacement.
/// - Returns: The new string together with the ranges of the replacements inside it.
public func replaceMatches(
    of regex: NSRegularExpression,
    replace: (Substring, Int) -> String
) -> (String, [Range<String.Index>]) {
    replacing(sortedRanges: ranges(of: regex), body: replace)
}
}

extension String {
    /// Replaces the text at each of `sortedRanges` with the string produced by `body`.
    ///
    /// Ranges are processed from last to first, so `body` is invoked in descending index order.
    ///
    /// - Parameters:
    ///   - sortedRanges: Ranges of `self` to replace. NOTE(review): the name and the right-to-left
    ///     processing suggest they must be ascending and non-overlapping — confirm with callers.
    ///   - body: Called with the text being replaced and the position of its range in
    ///     `sortedRanges`; returns the replacement text.
    /// - Returns: The new string and, for each input range in order, the range the replacement
    ///   occupies in the new string (computed from character counts).
    public func replacing(
        sortedRanges: [Range<String.Index>],
        body: (Substring, Int) -> String
    ) -> (String, [Range<String.Index>]) {
        var result = self
        var edits: [(start: Int, length: Int, offset: Int)] = []

        // Work right-to-left so the text in front of each range is still untouched when it is replaced.
        for (position, sourceRange) in sortedRanges.enumerated().reversed() {
            let original = self[sourceRange]
            let substitute = body(original, position)
            result.replaceSubrange(sourceRange, with: substitute)

            // Record where the replacement starts (in characters) and how much it changed the length.
            let startOffset = result.distance(from: result.startIndex, to: sourceRange.lowerBound)
            edits.append((
                start: startOffset,
                length: substitute.count,
                offset: original.count - substitute.count
            ))
        }

        // Walk the edits left-to-right, shifting each start by the length changes that precede it.
        var shift = 0
        var replacedRanges: [Range<String.Index>] = []
        for edit in edits.reversed() {
            let lower = result.index(result.startIndex, offsetBy: edit.start - shift)
            let upper = result.index(lower, offsetBy: edit.length)
            shift += edit.offset
            replacedRanges.append(lower ..< upper)
        }
        return (result, replacedRanges)
    }
}
Loading

0 comments on commit c2515e9

Please sign in to comment.