Skip to content

Commit 60ca8e9

Browse files
committed
Make Unigram a class instead of a struct to reduce heap usage.
1 parent a96199a commit 60ca8e9

File tree

6 files changed

+88
-32
lines changed

6 files changed

+88
-32
lines changed

Sources/Megrez/1_Compositor.swift

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -199,9 +199,10 @@ extension Megrez {
199199
/// - direction: 指定方向(相對於文字輸入方向而言)。
200200
/// - isMarker: 是否為標記游標。
201201
// Reports whether the selected position sits at the requested edge:
// `.front` means the position equals `length`, `.rear` means it equals 0.
public func isCursorAtEdge(direction: TypingDirection, isMarker: Bool = false) -> Bool {
202+
// Added side: reads `marker` when `isMarker` is true, otherwise `cursor`.
let pos = isMarker ? marker : cursor
202203
switch direction {
203-
// Removed side: always compared `cursor`, so `isMarker` had no effect.
case .front: cursor == length
204-
case .rear: cursor == 0
204+
// Added side: compares the position chosen above.
case .front: return pos == length
205+
case .rear: return pos == 0
205206
}
206207
}
207208

@@ -226,7 +227,7 @@ extension Megrez {
226227
return false
227228
}
228229
pos += delta
229-
if isCursorCuttingChar(isMarker: true) {
230+
if isCursorCuttingChar(isMarker: isMarker) {
230231
return jumpCursorBySegment(to: direction, isMarker: isMarker)
231232
}
232233
return true
@@ -353,12 +354,19 @@ extension Megrez {
353354
)
354355
-> [Megrez.Unigram] {
355356
if let cached = cache[keyArray] {
356-
// 如果將來 Gram 變成 Class 的話,不要與之前的結果共用記憶體位置。
357-
return cached
358-
}
359-
return langModel.unigramsFor(keyArray: keyArray).sorted {
360-
$0.score > $1.score
357+
return cached.map(\.copy)
361358
}
359+
let canonical = langModel
360+
.unigramsFor(keyArray: keyArray)
361+
.map { source -> Megrez.Unigram in
362+
if source.keyArray == keyArray {
363+
return source.copy
364+
}
365+
return source.copy(withKeyArray: keyArray)
366+
}
367+
.sorted { $0.score > $1.score }
368+
cache[keyArray] = canonical
369+
return canonical.map(\.copy)
362370
}
363371
}
364372
}

Sources/Megrez/2_PathFinder.swift

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@ extension Megrez {
6868
while currentPos > 0 {
6969
guard let node = parent[currentPos] else { break }
7070
let insertable = Megrez.GramInPath(
71-
keyArray: node.keyArray,
7271
gram: node.currentUnigram,
7372
isOverridden: node.isOverridden
7473
)

Sources/Megrez/5_Node.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ extension Megrez {
4343
self.overridingScore = node.overridingScore
4444
self.keyArray = node.keyArray
4545
self.segLength = node.segLength
46-
self.unigrams = node.unigrams
46+
self.unigrams = node.unigrams.map(\.copy)
4747
self.currentOverrideType = node.currentOverrideType
4848
self.currentUnigramIndex = node.currentUnigramIndex
4949
}

Sources/Megrez/7_Unigram.swift

Lines changed: 49 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,45 +5,82 @@
55
// MARK: - Megrez.Unigram
66

77
extension Megrez {
8-
/// 語言模型的基礎資料單位結構
9-
public struct Unigram: Equatable, CustomStringConvertible, Hashable, Codable {
8+
/// 語言模型的基礎資料單位類型
9+
// Immutable unigram record holding a key array, a value string, and a score.
// Equality and hashing are value-based over those three fields (added side).
public final class Unigram: Codable, CustomStringConvertible, Equatable, Hashable {
1010
// MARK: Lifecycle
1111

12-
/// Creates a language-model unigram. A unigram consists of a value and a score.
12+
/// Creates a language-model unigram. A unigram consists of a key array, a value, and a score.
1313
/// - Parameters:
14+
/// - keyArray: The corresponding key array.
1415
/// - value: The lexical content.
1516
/// - score: The statistical weight (a double-precision float).
16-
public init(value: String = "", score: Double = 0) {
17+
public init(keyArray: [String] = [], value: String = "", score: Double = 0) {
18+
self.keyArray = keyArray
1719
self.value = value
1820
self.score = score
1921
}
2022

23+
/// Decodes a unigram from a keyed container using `CodingKeys`.
/// - Throws: Any decoding error, including a missing `keyArray`, `value`, or `score` key.
// NOTE(review): all three keys are required here. The removed struct only stored
// `value` and `score`, so payloads encoded by that version would lack `keyArray`
// and fail to decode. Confirm whether any such data is persisted anywhere.
public required init(from decoder: any Decoder) throws {
24+
let container = try decoder.container(keyedBy: CodingKeys.self)
25+
self.keyArray = try container.decode([String].self, forKey: .keyArray)
26+
self.value = try container.decode(String.self, forKey: .value)
27+
self.score = try container.decode(Double.self, forKey: .score)
28+
}
29+
2130
// MARK: Public
2231

32+
/// The corresponding key array.
33+
public let keyArray: [String]
2334
/// The lexical content; may be a single character or a phrase.
24-
public var value: String
35+
public let value: String
2536
/// The statistical weight.
26-
public var score: Double
37+
public let score: Double
38+
39+
/// Segment length (the number of elements in the key array).
40+
public var segLength: Int { keyArray.count }
41+
42+
/// Whether the number of keys differs from the number of characters in `value`.
43+
public var isReadingMismatched: Bool { keyArray.count != value.count }
2744

2845
/// Renders this unigram as a string.
// The output is "(value,score)"; `keyArray` is not part of it.
2946
public var description: String {
3047
"(" + value.description + "," + String(score) + ")"
3148
}
3249

33-
// Removed side: equality was decided by comparing hash values.
public static func == (lhs: Self, rhs: Self) -> Bool {
34-
lhs.hashValue == rhs.hashValue
50+
/// A shallow copy of this unigram (keeping the same key array).
51+
public var copy: Unigram { copy(withKeyArray: nil) }
52+
53+
// Added side: equality compares `keyArray`, `value`, and `score` field by field.
public static func == (lhs: Unigram, rhs: Unigram) -> Bool {
54+
lhs.keyArray == rhs.keyArray && lhs.value == rhs.value && lhs.score == rhs.score
3555
}
3656

37-
// Removed side: ordered by `value` first, then by `score`. The added side has no `<`.
public static func < (lhs: Self, rhs: Self) -> Bool {
38-
lhs.value < rhs.value || (lhs.value == rhs.value && lhs.score < rhs.score)
57+
/// Creates a new copy of this unigram.
58+
/// - Parameter keyArrayOverride: If given, the copy uses this key array instead.
59+
/// - Returns: The unigram copy.
60+
public func copy(withKeyArray keyArrayOverride: [String]? = nil) -> Unigram {
61+
.init(keyArray: keyArrayOverride ?? keyArray, value: value, score: score)
3962
}
4063

41-
/// Serves as the default hash function.
42-
/// - Parameter hasher: The hasher for the current object.
4364
// Feeds the same three fields that `==` compares on the added side.
public func hash(into hasher: inout Hasher) {
65+
hasher.combine(keyArray)
4466
hasher.combine(value)
4567
hasher.combine(score)
4668
}
69+
70+
/// Encodes `keyArray`, `value`, and `score` into a keyed container using `CodingKeys`.
/// - Throws: Any error raised by the encoder.
public func encode(to encoder: any Encoder) throws {
71+
var container = encoder.container(keyedBy: CodingKeys.self)
72+
try container.encode(keyArray, forKey: .keyArray)
73+
try container.encode(value, forKey: .value)
74+
try container.encode(score, forKey: .score)
75+
}
76+
77+
// MARK: Private
78+
79+
// Keys shared by `init(from:)` and `encode(to:)`; raw values equal the case names.
private enum CodingKeys: String, CodingKey {
80+
case keyArray
81+
case value
82+
case score
83+
}
4784
}
4885
}
4986

Sources/Megrez/8_GramInPath.swift

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,23 +12,21 @@ extension Megrez {
1212
public struct GramInPath: Codable, Hashable {
1313
// MARK: Lifecycle
1414

15-
// Removed side: took a separate `keyArray` and stored it alongside the gram.
public init(keyArray: [String], gram: Unigram, isOverridden: Bool) {
16-
self.keyArray = keyArray
15+
// Added side: stores only the gram and the override flag.
public init(gram: Unigram, isOverridden: Bool) {
1716
self.gram = gram
1817
self.isOverridden = isOverridden
19-
// Removed side: mismatch flag was computed once here from the passed-in key array.
self.isReadingMismatched = keyArray.count != gram.value.count
2018
}
2119

2220
// MARK: Public
2321

2422
// Stored state: the wrapped unigram and whether it was overridden.
public let gram: Unigram
2523
public let isOverridden: Bool
26-
// Removed side: `keyArray` and `isReadingMismatched` were stored properties.
public let keyArray: [String]
27-
public let isReadingMismatched: Bool
2824

25+
// Added side: the accessors below all forward to `gram`.
public var keyArray: [String] { gram.keyArray }
2926
public var value: String { gram.value }
3027
public var score: Double { gram.score }
31-
// Removed side: counted the struct's own stored `keyArray`.
public var segLength: Int { keyArray.count }
28+
public var segLength: Int { gram.segLength }
29+
public var isReadingMismatched: Bool { gram.isReadingMismatched }
3230

3331
/// 該節點當前狀態所展示的鍵值配對。
3432
public var asCandidatePair: KeyValuePaired {
@@ -165,7 +163,7 @@ extension Array where Element == Megrez.GramInPath {
165163
}
166164
guard let perceptedGIP else { return nil }
167165
var arrGIPs = self
168-
while arrGIPs.last?.gram != perceptedGIP.gram { arrGIPs.removeLast() }
166+
while arrGIPs.last?.gram !== perceptedGIP.gram { arrGIPs.removeLast() }
169167
var isHead = true
170168
var outputCells = [String]()
171169
loopProc: while !arrGIPs.isEmpty, let frontendPair = arrGIPs.last {

Tests/MegrezTests/SimpleLM.swift

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,17 +54,31 @@ class SimpleLM: LangModelProtocol {
5454
let col0 = String(linestream[0])
5555
let col1 = String(linestream[1])
5656
let col2 = Double(linestream[2]) ?? 0.0
57-
let u = Megrez.Unigram(value: swapKeyValue ? col0 : col1, score: col2)
58-
mutDatabase[swapKeyValue ? col1 : col0, default: []].append(u)
57+
let key = swapKeyValue ? col1 : col0
58+
let value = swapKeyValue ? col0 : col1
59+
let keyArray = separatorComponents(from: key)
60+
let u = Megrez.Unigram(keyArray: keyArray, value: value, score: col2)
61+
mutDatabase[key, default: []].append(u)
5962
}
6063
}
64+
65+
// MARK: Private
66+
67+
/// Splits a database key string into its key-array components.
/// - Parameter key: The key string to split.
/// - Returns: One element per character when `separator` is empty; otherwise the
///   pieces between occurrences of `separator`, with empty pieces dropped.
// NOTE(review): `separator` is declared outside this hunk; presumably a String
// property of SimpleLM. Confirm against the full file.
private func separatorComponents(from key: String) -> [String] {
68+
if separator.isEmpty {
69+
// No separator: each Character of the key becomes its own component.
return key.map(\.description)
70+
}
71+
return key
72+
.components(separatedBy: separator)
73+
// Drop empty pieces, e.g. from leading, trailing, or doubled separators.
.filter { !$0.isEmpty }
74+
}
6175
}
6276

6377
// MARK: - MockLM
6478

6579
class MockLM: LangModelProtocol {
6680
// Returns exactly one unigram whose value is the concatenated keys and whose score is -1.
func unigramsFor(keyArray: [String]) -> [Megrez.Unigram] {
67-
// Removed side: built the unigram without a key array.
[Megrez.Unigram(value: keyArray.joined(), score: -1)]
81+
// Added side: passes the queried key array through to the unigram.
[Megrez.Unigram(keyArray: keyArray, value: keyArray.joined(), score: -1)]
6882
}
6983

7084
func hasUnigramsFor(keyArray: [String]) -> Bool {

0 commit comments

Comments
 (0)