Skip to content

Commit 6221541

Browse files
feat(thicktoken): slice tokens before decoding
1 parent f35d80b commit 6221541

File tree

4 files changed

+138
-6
lines changed

4 files changed

+138
-6
lines changed

thicktoken/src/smart-slice.test.ts

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import { SmartSlice } from './smart-slice'
2+
import { test, expect } from 'vitest'
3+
4+
// Every scenario builds a SmartSlice over the same total length and compares
// the indices it yields against the expected list.
const TOTAL = 10

const scenarios: { name: string; ranges: [number, number][]; expected: number[] }[] = [
  {
    name: 'SmartSlice can slice the entire range',
    ranges: [[0, 10]],
    expected: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
  },
  {
    name: 'SmartSlice can slice multiple ranges',
    ranges: [
      [0, 3],
      [5, 7],
      [8, 10]
    ],
    expected: [0, 1, 2, 5, 6, 8, 9]
  },
  {
    name: 'SmartSlice handles empty slices',
    ranges: [],
    expected: []
  },
  {
    name: 'SmartSlice handles overlapping slices',
    ranges: [
      [0, 5],
      [3, 8]
    ],
    expected: [0, 1, 2, 3, 4, 5, 6, 7]
  },
  {
    name: 'SmartSlice handles out-of-bounds slices',
    ranges: [
      [-5, 3],
      [8, 12]
    ],
    expected: [0, 1, 2, 8, 9]
  },
  {
    name: 'SmartSlice handles single-point slices',
    ranges: [
      [4, 5],
      [7, 8]
    ],
    expected: [4, 7]
  },
  {
    name: 'SmartSlice sorts and merges slices correctly',
    ranges: [
      [5, 7],
      [9, 10],
      [0, 3],
      [2, 6]
    ],
    expected: [0, 1, 2, 3, 4, 5, 6, 9]
  }
]

// Register one test per scenario, in declaration order.
for (const { name, ranges, expected } of scenarios) {
  test(name, () => {
    const slice = new SmartSlice(ranges, TOTAL)
    expect(Array.from(slice)).toEqual(expected)
  })
}

thicktoken/src/smart-slice.ts

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/** Half-open index range `[start, end)`: `start` is included, `end` is excluded. */
export type Slice = [number, number]

/**
 * Iterable over the distinct indices selected by a list of half-open ranges.
 *
 * On construction each range is truncated to integer bounds, clamped to
 * `[0, total]`, dropped if empty, sorted by start, and merged with any range it
 * overlaps or touches. Iterating the instance then yields every selected index
 * exactly once, in ascending order.
 */
export class SmartSlice {
  private readonly _sortedSlices: Slice[]

  /**
   * @param slices Ranges to select, in any order; they may overlap, extend past
   *   the valid range, or be empty. The array and its tuples are not mutated.
   * @param total Number of addressable items; valid indices are `0..total - 1`.
   *   Expected to be a non-negative integer.
   */
  public constructor(slices: Slice[], total: number) {
    // Truncate toward zero (as Array.prototype.slice does) so fractional bounds
    // can never produce non-integer indices, then clamp into [0, total].
    // NaN stays NaN and is rejected by the `<` comparison below.
    const clamp = (bound: number): number => Math.max(0, Math.min(total, Math.trunc(bound)))

    const trimmed: Slice[] = []
    for (const [start, end] of slices) {
      const clampedStart = clamp(start)
      const clampedEnd = clamp(end)
      if (clampedStart < clampedEnd) {
        // Fresh tuple: merging below mutates tuples in place and must not
        // touch the caller's input.
        trimmed.push([clampedStart, clampedEnd])
      }
    }

    // `trimmed` is a local copy, so sorting it in place is safe.
    trimmed.sort((a, b) => a[0] - b[0])

    const merged: Slice[] = []
    for (const [start, end] of trimmed) {
      const last = merged[merged.length - 1]
      if (last !== undefined && start <= last[1]) {
        // Overlapping or contiguous with the previous range: extend it.
        last[1] = Math.max(last[1], end)
      } else {
        // Gap before this range: start a new one.
        merged.push([start, end])
      }
    }

    this._sortedSlices = merged
  }

  /** Yields each selected index once, in ascending order. */
  public *[Symbol.iterator](): Generator<number> {
    for (const [start, end] of this._sortedSlices) {
      for (let i = start; i < end; i++) {
        yield i
      }
    }
  }
}

thicktoken/src/tokenizer.ts

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import cl100k_base from 'tiktoken/encoders/cl100k_base.json'
22
import { Tiktoken, init } from 'tiktoken/lite/init'
33
import { deepClone, mapValues, uniq } from './utils'
4+
import { Slice, SmartSlice } from './smart-slice'
45

56
let tokenizer: TextTokenizer | null = null
67
let lock: Promise<void> | false = false
78

89
const CHUNK_SIZE = 100_000
10+
const DEFAULT_SLICES: Slice[] = [[0, Number.POSITIVE_INFINITY]]
911

1012
export class TextTokenizer {
1113
private warnOnSlowCalls = true
@@ -41,7 +43,7 @@ export class TextTokenizer {
4143
}
4244

4345
const next = text.slice(i + CHUNK_SIZE - MARGIN, i + CHUNK_SIZE)
44-
const unsafe = this.split(next).slice(-1)[0].length
46+
const unsafe = this.split(next).slice(-1)[0]!.length
4547

4648
yield text.slice(i, i + CHUNK_SIZE - unsafe)
4749

@@ -137,17 +139,27 @@ export class TextTokenizer {
137139
return newObject
138140
}
139141

140-
public split(text: string): string[] {
142+
public split(text: string, slices: Slice[] = DEFAULT_SLICES): string[] {
141143
const decoder = new TextDecoder()
144+
const output = this.tokenizer.encode(text ?? '')
145+
142146
const str: string[] = []
143-
this.tokenizer.encode(text ?? '').forEach((x) => {
144-
// copying to a new array because of memory allocation in WASM
145-
str.push(decoder.decode(this.tokenizer.decode(new Uint32Array([x]))))
146-
})
147+
148+
const smartSlice = new SmartSlice(slices, output.length)
149+
for (const idx of smartSlice) {
150+
const encodedToken = output[idx]!
151+
str.push(this._decodeToken(decoder, encodedToken))
152+
}
147153

148154
return str
149155
}
150156

157+
/**
 * Converts one encoded token into its text.
 *
 * @param decoder Decoder applied to the tokenizer's output for this token.
 * @param encodedToken The token value to decode.
 * @returns The decoded text for that single token.
 */
private _decodeToken(decoder: TextDecoder, encodedToken: number): string {
  // The token is copied into a new one-element array because of memory
  // allocation in WASM.
  const singleToken = Uint32Array.of(encodedToken)
  const decoded = this.tokenizer.decode(singleToken)
  return decoder.decode(decoded)
}
162+
151163
/**
152164
* Counts the number of tokens, up to a fixed ceiling, after which we return
153165
* The reason to have a ceiling is to avoid performance issues with very large texts

thicktoken/tsconfig.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
"emitDecoratorMetadata": true,
2121
"useUnknownInCatchVariables": false,
2222
"forceConsistentCasingInFileNames": true,
23+
"noUncheckedIndexedAccess": true,
2324
"outDir": "./dist",
2425
"rootDir": "./src",
2526
"types": ["node"],

0 commit comments

Comments
 (0)