Skip to content

Commit 37ca534

Browse files
authored
mdast chunk (#1736)
* add naive chunking * chunking * refactor: enhance mdast stringify function to handle array input and improve chunking tests * refactor: update mdastStringify to accept RootContent and improve type safety in tests * docs: add example for chunking the mdast tree into sections * test: add chunking tests for large markdown documents and improve coverage * refactor: improve chunking logic in mdast function to handle nodes more efficiently * refactor: update mdast function to accept Root type and adjust chunking logic in tests
1 parent 66041a8 commit 37ca534

File tree

7 files changed

+1410
-33
lines changed

7 files changed

+1410
-33
lines changed

.vscode/settings.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,7 @@
274274
"qwen",
275275
"RAAA",
276276
"redteam",
277+
"remarkalerts",
277278
"remarkdetails",
278279
"resd",
279280
"resj",

docs/src/content/docs/reference/runtime/plugin-mdast.mdx

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,17 @@ const updated = visit(root, `code`, (node) => {
5353
const markdown = await stringify(updated);
5454
```
5555

56+
- chunk the tree into sections
57+
58+
```typescript
59+
const { parse, chunk } = await mdast();
const root = parse(
  "# Section 1\n\nContent 1\n\n## Subsection 1.1\n\nContent 1.1\n\n# Section 2\n\nContent 2",
);
// `chunk` takes the token budget per chunk (`maxTokens`) as its required second argument.
const sections = chunk(root, 20);
// Each entry of `sections` is an array of consecutive top-level nodes, in document order.
console.log(sections);
65+
```
66+
5667
In order to get type completion, you will need to install the `@types/mdast` package as a development dependency.
5768

5869
## Debugging trees

packages/core/src/anthropic.ts

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// Copyright (c) Microsoft Corporation.
22
// Licensed under the MIT License.
33

4-
import { ChatCompletionHandler, LanguageModel, ListModelsFunction } from "./chat.js";
4+
import type { ChatCompletionHandler, LanguageModel, ListModelsFunction } from "./chat.js";
55
import {
66
ANTHROPIC_MAX_TOKEN,
77
MODEL_PROVIDER_ANTHROPIC,
@@ -13,7 +13,7 @@ import { approximateTokens } from "./tokens.js";
1313
import { resolveTokenEncoder } from "./encoders.js";
1414
import type Anthropic from "@anthropic-ai/sdk";
1515
import type AnthropicBedrock from "@anthropic-ai/bedrock-sdk";
16-
import {
16+
import type {
1717
ChatCompletionResponse,
1818
ChatCompletionToolCall,
1919
ChatCompletionUsage,
@@ -33,14 +33,15 @@ import {
3333
import { logError } from "./util.js";
3434
import { resolveUndiciProxyAgent } from "./proxy.js";
3535
import type { ProxyAgent } from "undici";
36-
import { MarkdownTrace } from "./trace.js";
37-
import { createFetch, FetchType } from "./fetch.js";
36+
import type { MarkdownTrace } from "./trace.js";
37+
import { createFetch } from "./fetch.js";
38+
import type { FetchType } from "./fetch.js";
3839
import { JSONLLMTryParse } from "./json5.js";
39-
import { LanguageModelConfiguration } from "./server/messages.js";
40+
import type { LanguageModelConfiguration } from "./server/messages.js";
4041
import { deleteUndefinedValues } from "./cleaners.js";
4142
import debug from "debug";
4243
import { providerFeatures } from "./features.js";
43-
import { LanguageModelInfo } from "./types.js";
44+
import type { LanguageModelInfo } from "./types.js";
4445
const dbg = debug("genaiscript:anthropic");
4546
const dbgMessages = debug("genaiscript:anthropic:msg");
4647

packages/plugin-mdast/src/remarkdetails.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,12 @@ import { remark } from "remark";
88
import { genaiscriptDebug } from "@genaiscript/core";
99
const dbg = genaiscriptDebug("mdast:html:details");
1010

11+
// Module augmentation of "mdast": adds a `details` key mapped to `DetailsElement`
// to the `RootContentMap` interface.
// NOTE(review): per the @types/mdast registration convention this should make
// `DetailsElement` a member of the `RootContent` union — confirm against @types/mdast.
declare module "mdast" {
12+
interface RootContentMap {
13+
details: DetailsElement;
14+
}
15+
}
16+
1117
export interface RemarkDetailsOptions {}
1218

1319
export interface DetailsElement extends Parent {

packages/plugin-mdast/src/unified.ts

Lines changed: 95 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
11
// Copyright (c) Microsoft Corporation.
22
// Licensed under the MIT License.
33

4-
import type { Root } from "mdast";
4+
import type { Root, RootContent } from "mdast";
55
import type { WorkspaceFile } from "@genaiscript/core";
66
import { checkRuntime, filenameOrFileToContent, genaiscriptDebug } from "@genaiscript/core";
77
import type { Processor } from "unified";
88
import remarkGitHubAlerts from "./remarkalerts.js";
99
import type { GitHubAlertMarker } from "./remarkalerts.js";
10-
import remarkDetails, { DetailsElement, SummaryElement } from "./remarkdetails.js";
10+
import remarkDetails from "./remarkdetails.js";
11+
import type { DetailsElement, SummaryElement } from "./remarkdetails.js";
12+
import { approximateTokens } from "@genaiscript/core";
1113
const dbg = genaiscriptDebug("mdast");
1214

1315
export interface MdAstOptions {
@@ -64,14 +66,15 @@ export async function mdast(options?: MdAstOptions) {
6466
return processed as Root;
6567
};
6668

67-
const mdastStringify = (root: Root, options?: {}): string => {
69+
const mdastStringify = (root: Root | RootContent[], stringifyOptions?: object): string => {
6870
if (!root) return "";
6971

7072
dbg(`stringify`);
7173
const processor = unified();
7274
usePlugins(processor, "stringify");
75+
// @ts-expect-error - TypeScript doesn't recognize the handlers option
7376
processor.use(stringify, {
74-
...(options || {}),
77+
...(stringifyOptions || {}),
7578
handlers: {
7679
githubAlertMarker(node: GitHubAlertMarker) {
7780
return node.value;
@@ -83,15 +86,101 @@ export async function mdast(options?: MdAstOptions) {
8386
return `<summary>${node.children.map((child) => processor.stringify(child)).join("")}</summary>`;
8487
},
8588
},
86-
} as any);
89+
});
8790

88-
const result = processor.stringify(root);
91+
const n = Array.isArray(root) ? ({ type: "root", children: root } satisfies Root) : root;
92+
const result = processor.stringify(n);
8993
return String(result);
9094
};
9195

96+
/**
 * Groups the top-level nodes of a markdown tree into consecutive chunks whose
 * summed token measurement stays within `maxTokens` whenever possible.
 *
 * Nodes are never reordered or split. Each node is measured on its own by
 * stringifying it and passing the text to `tokenize`; a chunk's size is the
 * sum of the measurements of its nodes. A node that exceeds `maxTokens` by
 * itself is emitted as a single-node chunk.
 *
 * Headings get no special treatment: a heading that does not fit in the
 * current chunk starts a new one, exactly like any other node.
 *
 * @param nodes - A `Root` (its `children` are chunked) or an array of top-level nodes.
 * @param maxTokens - Token budget per chunk.
 * @param chunkOptions - Optional settings; `tokenize` replaces the default
 *   `approximateTokens` counter.
 * @returns The chunks in document order; an empty array when there are no nodes.
 * @throws Error when `nodes` is not an array and its `type` is not `"root"`.
 */
const mdChunk = (
  nodes: Root | RootContent[],
  maxTokens: number,
  chunkOptions?: {
    tokenize?: (text: string) => number;
  },
): RootContent[][] => {
  const tokenize = chunkOptions?.tokenize ?? approximateTokens;
  if (!nodes) return [];

  // Resolve the list of top-level nodes without reassigning the parameter.
  let children: RootContent[];
  if (Array.isArray(nodes)) {
    children = nodes;
  } else {
    if (nodes.type !== "root") throw new Error("Expected nodes to be an array or a Root type");
    children = nodes.children || [];
  }
  if (children.length === 0) return [];

  const chunks: RootContent[][] = [];
  let currentChunk: RootContent[] = [];
  let currentTokenCount = 0;

  // Closes the chunk being built and starts a fresh, empty one.
  const flush = (): void => {
    chunks.push(currentChunk);
    currentChunk = [];
    currentTokenCount = 0;
  };

  // Process nodes in order, never reordering them.
  for (const node of children) {
    const nodeTokens = tokenize(mdastStringify([node]));

    // The node does not fit next to what is already collected: close the chunk first.
    if (currentChunk.length > 0 && currentTokenCount + nodeTokens > maxTokens) {
      flush();
    }

    currentChunk.push(node);
    currentTokenCount += nodeTokens;

    // A node larger than the whole budget cannot share a chunk; emit it alone.
    if (nodeTokens > maxTokens && currentChunk.length === 1) {
      flush();
    }
  }

  // Emit whatever is left over.
  if (currentChunk.length > 0) {
    chunks.push(currentChunk);
  }

  return chunks;
};
179+
92180
return Object.freeze({
93181
parse: mdastParse,
94182
stringify: mdastStringify,
183+
chunk: mdChunk,
95184
visit,
96185
visitParents,
97186
inspect,

0 commit comments

Comments
 (0)