11// Copyright (c) Microsoft Corporation.
22// Licensed under the MIT License.
33
4- import type { Root } from "mdast" ;
4+ import type { Root , RootContent } from "mdast" ;
55import type { WorkspaceFile } from "@genaiscript/core" ;
66import { checkRuntime , filenameOrFileToContent , genaiscriptDebug } from "@genaiscript/core" ;
77import type { Processor } from "unified" ;
88import remarkGitHubAlerts from "./remarkalerts.js" ;
99import type { GitHubAlertMarker } from "./remarkalerts.js" ;
10- import remarkDetails , { DetailsElement , SummaryElement } from "./remarkdetails.js" ;
10+ import remarkDetails from "./remarkdetails.js" ;
11+ import type { DetailsElement , SummaryElement } from "./remarkdetails.js" ;
12+ import { approximateTokens } from "@genaiscript/core" ;
1113const dbg = genaiscriptDebug ( "mdast" ) ;
1214
1315export interface MdAstOptions {
@@ -64,14 +66,15 @@ export async function mdast(options?: MdAstOptions) {
6466 return processed as Root ;
6567 } ;
6668
67- const mdastStringify = ( root : Root , options ?: { } ) : string => {
69+ const mdastStringify = ( root : Root | RootContent [ ] , stringifyOptions ?: object ) : string => {
6870 if ( ! root ) return "" ;
6971
7072 dbg ( `stringify` ) ;
7173 const processor = unified ( ) ;
7274 usePlugins ( processor , "stringify" ) ;
75+ // @ts -expect-error - TypeScript doesn't recognize the handlers option
7376 processor . use ( stringify , {
74- ...( options || { } ) ,
77+ ...( stringifyOptions || { } ) ,
7578 handlers : {
7679 githubAlertMarker ( node : GitHubAlertMarker ) {
7780 return node . value ;
@@ -83,15 +86,101 @@ export async function mdast(options?: MdAstOptions) {
8386 return `<summary>${ node . children . map ( ( child ) => processor . stringify ( child ) ) . join ( "" ) } </summary>` ;
8487 } ,
8588 } ,
86- } as any ) ;
89+ } ) ;
8790
88- const result = processor . stringify ( root ) ;
91+ const n = Array . isArray ( root ) ? ( { type : "root" , children : root } satisfies Root ) : root ;
92+ const result = processor . stringify ( n ) ;
8993 return String ( result ) ;
9094 } ;
9195
/**
 * Splits markdown nodes into consecutive chunks whose estimated token count
 * stays within `maxTokens`.
 *
 * Nodes are never reordered or split. Each top-level node is stringified on
 * its own and measured with `tokenize`; nodes are appended to the current
 * chunk until the next one would push the running total past `maxTokens`, at
 * which point the current chunk is closed and a new one is started. A heading
 * that overflows the budget starts a new chunk exactly like any other node.
 * A single node that is larger than `maxTokens` is emitted alone in its own
 * chunk, so chunks can exceed the budget in that case.
 *
 * @param nodes - A `root` node (its `children` are chunked) or an array of nodes.
 * @param maxTokens - Token budget per chunk.
 * @param chunkOptions - Optional settings; `tokenize` maps text to a token
 *   count and defaults to `approximateTokens`.
 * @returns The chunks in document order; an empty array when there is nothing to chunk.
 * @throws Error when `nodes` is neither an array nor a node of type `root`.
 */
const mdChunk = (
  nodes: Root | RootContent[],
  maxTokens: number,
  chunkOptions?: {
    tokenize?: (text: string) => number;
  },
): RootContent[][] => {
  const { tokenize = approximateTokens } = chunkOptions ?? {};
  if (!nodes) return [];

  // Normalize the input to a flat list of top-level nodes without reassigning the parameter.
  let content: RootContent[];
  if (Array.isArray(nodes)) {
    content = nodes;
  } else {
    if (nodes.type !== "root") throw new Error("Expected nodes to be an array or a Root type");
    content = nodes.children || [];
  }
  if (content.length === 0) return [];

  const chunks: RootContent[][] = [];
  let currentChunk: RootContent[] = [];
  let currentTokenCount = 0;

  // Closes the chunk being built and starts an empty one.
  const flush = (): void => {
    chunks.push(currentChunk);
    currentChunk = [];
    currentTokenCount = 0;
  };

  // Process nodes in order, never reordering them.
  for (const node of content) {
    // Each node is measured in isolation, so every node is stringified exactly once.
    const nodeTokens = tokenize(mdastStringify([node]));

    // The node does not fit next to what is already collected: close the current chunk first.
    if (currentChunk.length > 0 && currentTokenCount + nodeTokens > maxTokens) {
      flush();
    }

    currentChunk.push(node);
    currentTokenCount += nodeTokens;

    // An oversized node sits alone in its chunk; close it right away so nothing is appended to it.
    if (nodeTokens > maxTokens && currentChunk.length === 1) {
      flush();
    }
  }

  // Emit whatever is left over.
  if (currentChunk.length > 0) {
    chunks.push(currentChunk);
  }

  return chunks;
};
179+
92180 return Object . freeze ( {
93181 parse : mdastParse ,
94182 stringify : mdastStringify ,
183+ chunk : mdChunk ,
95184 visit,
96185 visitParents,
97186 inspect,
0 commit comments