-
-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathtransform-docs.awk
More file actions
executable file
·294 lines (256 loc) · 7.59 KB
/
transform-docs.awk
File metadata and controls
executable file
·294 lines (256 loc) · 7.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
#!/usr/bin/awk -f
# Transform script for converting Bazel docs to Mintlify MDX format
# Usage: awk -f transform-docs.awk input.md > output.mdx
# Set up the state flags consulted by the rules below before any input is read.
BEGIN {
    # Title handling: blank lines are skipped while before_first_h1 is 1, and
    # the first "# " heading becomes front matter while first_h1_found is 0.
    before_first_h1 = 1
    first_h1_found = 0

    # Non-zero while the current line sits inside a ``` fenced code block.
    in_code_block = 0

    # Front-matter flags; initialised here but not read by any rule in this script.
    in_frontmatter = 0
    frontmatter_printed = 0
}
# Drop the "Project:" / "Book:" metadata lines found at the top of the docs.
$0 ~ /^Project: \/_project\.yaml$/ || $0 ~ /^Book: \/_book\.yaml$/ {
    next
}

# Drop template directives:
#   - a line that is exactly '{% include "_buttons.html" %}'
#   - any line containing '{% dynamic setvar'
#   - any line that starts with '{%'
$0 ~ /^{% include "_buttons\.html" %}$/ ||
$0 ~ /{% dynamic setvar/ ||
$0 ~ /^{%/ {
    next
}
# A line that begins with ``` opens or closes a fenced code block.
# The fence line itself is emitted untouched.
/^```/ {
    in_code_block = (in_code_block ? 0 : 1)
    print $0
    next
}

# While inside a fence, pass every line through verbatim so that none of the
# rewriting rules further down can alter code samples.
in_code_block != 0 {
    print $0
    next
}
# Convert HTML comments to MDX comments.
#
# mdx_open_comment(line)
#   line    - a line that contains "<!--"
#   returns - the line with every "<!--" rewritten to "{/*"; when a "-->"
#             follows an opener on the same line, every "-->" is rewritten
#             to "*/}" as well.
#
# The opener is recognised with or without a following space, so a bare
# "<!--" line and "<!--text-->" are both converted. Previously only "<!-- "
# (with a trailing space) was recognised, which left the opener untouched
# while the closing rule still rewrote "-->", producing an unbalanced "*/}".
function mdx_open_comment(line) {
    if (line ~ /<!--.*-->/) {
        # Comment opens and closes on this line.
        gsub(/-->/, "*/}", line)
    }
    gsub(/<!--/, "{/*", line)
    return line
}

# Lines containing a comment opener are emitted immediately and skip the
# remaining rules; the closer of a multi-line comment is handled by the
# rule that follows.
/<!--/ {
    print mdx_open_comment($0)
    next
}
# Closing line of a multi-line comment: when "-->" is the last thing on the
# line (trailing spaces allowed), rewrite every "-->" on that line to "*/}",
# emit the line and skip the remaining rules.
$0 ~ /--> *$/ {
    gsub(/-->/, "*/}")
    print $0
    next
}
# A line that starts with a bare <pre> becomes a Markdown code fence.
# A </pre> at the very end of the same line is turned into a fence too.
# Both patterns are anchored, so each can match at most once per line.
$0 ~ /^<pre>/ {
    sub(/^<pre>/, "```", $0)
    sub(/<\/pre>$/, "```", $0)
    print $0
    next
}
# Handle <pre ...> tags (with or without attributes) that are not closed on
# the same line.
$0 ~ /<pre[^>]*>/ {
    # "<pre ...>content```": the fence is already present, so just drop the
    # tag, emit the line and stop processing it.
    if ($0 ~ /<pre[^>]*>[^<]*```$/) {
        gsub(/<pre[^>]*>/, "")
        print $0
        next
    }

    # "<pre ...>content" with no further tag up to the end of the line:
    # replace the tag with an opening fence and let the later rules run.
    if ($0 ~ /<pre[^>]*>[^<]*$/)
        gsub(/<pre[^>]*>/, "```")
}
# Turn a </pre> that ends the line into a closing code fence.
# The pattern is anchored at the end of the line, so one substitution suffices.
{
    sub(/<\/pre>$/, "```")
}
# Remove anchor parts from headings (e.g., "## Title {:#anchor}").
#
# strip_heading_anchor(line)
#   line    - a heading line ending in a "{:#...}" anchor
#   returns - the line without the anchor and without trailing blanks
#
# Uses [ \t] rather than \s, which is not part of POSIX awk regular expressions.
function strip_heading_anchor(line) {
    sub(/[ \t]*\{:#[^}]*\}$/, "", line)
    sub(/[ \t]+$/, "", line)
    return line
}

# The heading is rewritten in place and deliberately NOT printed here, so it
# continues through the remaining rules. Printing and skipping at this point
# kept a first "# Title {:#anchor}" heading away from the front-matter rule:
# no front matter was emitted and before_first_h1 stayed 1, so every later
# blank line was dropped.
/^#+ .* \{:#[^}]*\}$/ {
    $0 = strip_heading_anchor($0)
}
# Strip Kramdown attribute lists such as {:flag--deleted_packages} or
# {: .external} wherever they occur on the line.
$0 ~ /\{:/ {
    gsub(/\{:[^}]*\}/, "")
}
# Remove Jekyll "{# ... #}" comments, which break MDX.
# The removal is repeated until nothing matches, because deleting one
# comment can join the surrounding text into a new "{# ... #}" sequence.
{
    while ($0 ~ /\{#[^}]*#\}/)
        gsub(/\{#[^}]*#\}/, "")
}
# Unwrap template expressions that merely quote a <var> or </var> tag,
# e.g. {{ "<var>" }} or {{ '</var>' }}. The doubled curly braces with a
# quoted string break the MDX (acorn) parser. Every pattern below starts
# with "{{", so lines without it are skipped.
$0 ~ /\{\{/ {
    # Double-quoted form.
    gsub(/\{\{ *"<var>" *\}\}/, "<var>")
    gsub(/\{\{ *"<\/var>" *\}\}/, "</var>")

    # Single-quoted form.
    gsub(/\{\{ *'<var>' *\}\}/, "<var>")
    gsub(/\{\{ *'<\/var>' *\}\}/, "</var>")
}
# Decode &lt; and &gt; inside double-quoted attribute values.
#
# decode_attr_entities(line)
#   line    - any input line
#   returns - the line with "&lt;" / "&gt;" replaced by "<" / ">" inside
#             every ="..." attribute value; entities elsewhere on the line
#             (ordinary text) are left encoded.
#
# Each pass decodes all entities of one matched attribute value, so the
# loop always makes progress. The previous loop body replaced "<" with "<"
# and ">" with ">", which changed nothing and spun forever on any line
# containing such an attribute.
function decode_attr_entities(line,    head, attr, tail) {
    while (match(line, /="[^"]*&[lg]t;[^"]*"/)) {
        head = substr(line, 1, RSTART - 1)
        attr = substr(line, RSTART, RLENGTH)
        tail = substr(line, RSTART + RLENGTH)
        gsub(/&lt;/, "<", attr)
        gsub(/&gt;/, ">", attr)
        line = head attr tail
    }
    return line
}

# Only lines that contain one of the two entities need the rewrite.
/&[lg]t;/ {
    $0 = decode_attr_entities($0)
}
# Repair broken table headers.
{
    # "<thead></th>" and "<thead></thead>" both collapse to a bare "<thead>".
    gsub(/<thead><\/th>/, "<thead>")
    gsub(/<thead><\/thead>/, "<thead>")

    # A line consisting solely of "</th>" (optionally surrounded by blanks)
    # is dropped. This single test also covers the exact "</th>" case.
    if ($0 ~ /^[ \t]*<\/th>[ \t]*$/)
        next
}
# Quote bare align values: align=right / align=left / align=center become
# align="right" / align="left" / align="center". Already-quoted values do
# not match these patterns and are left alone.
$0 ~ /align=/ {
    gsub(/align=right/, "align=\"right\"")
    gsub(/align=left/, "align=\"left\"")
    gsub(/align=center/, "align=\"center\"")
}
# Make <col> tags self-closing.
{
    # Attribute-less form.
    gsub(/<col>/, "<col />")

    # "<col attr...>" without a slash anywhere in the tag: swap the final
    # ">" for " />". A rewritten tag contains "/" and therefore no longer
    # matches, which ends the loop.
    while (match($0, /<col [^>\/]*>/)) {
        col_end = RSTART + RLENGTH - 1      # index of the closing ">"
        $0 = substr($0, 1, col_end - 1) " />" substr($0, col_end + 1)
    }
}
# Make the remaining void HTML tags self-closing.
{
    # <br> -> <br />
    gsub(/<br>/, "<br />")

    # <img ...> whose last character before ">" is not "/": swap the final
    # ">" for " />". The rewritten tag ends in "/>" and stops matching.
    while (match($0, /<img[^>]*[^\/]>/)) {
        img_end = RSTART + RLENGTH - 1      # index of the closing ">"
        $0 = substr($0, 1, img_end - 1) " />" substr($0, img_end + 1)
    }

    # <hr> -> <hr />
    gsub(/<hr>/, "<hr />")
}
# Close a <p> that is left open at the end of the line. All of the
# following must hold:
#   - the line contains an opening <p ...> tag,
#   - it contains no </p> at all,
#   - it does not end with a closing div/table/ul/ol/blockquote tag,
#   - the <p ...> tag is followed by at least one character of text and no
#     further tag up to the end of the line.
$0 ~ /<p[^>]*>/ && $0 !~ /<\/p>/ &&
$0 !~ /<\/(div|table|ul|ol|blockquote)>$/ &&
$0 ~ /<p[^>]*>[^<]+$/ {
    $0 = $0 "</p>"
}
# Clean up <code> elements.
{
    # When a <code>...</code> pair on this line contains an escaped
    # underscore (as in noimplicit\_deps), unescape every "\_" on the line.
    if ($0 ~ /<code>.*\\_.*<\/code>/)
        gsub(/\\_/, "_")

    # An opening <code ...> with no </code> anywhere on the line, followed
    # only by text up to the end of the line, gets its closing tag appended.
    if ($0 ~ /<code[^>]*>/ && $0 !~ /<\/code>/ && $0 ~ /<code[^>]*>[^<]*$/)
        $0 = $0 "</code>"
}
# Close a <td> that is still open at the end of the line (common in tables):
# the tag is followed only by text, and the line does not already end
# with </td>.
$0 ~ /<td[^>]*>[^<]*$/ && $0 !~ /<\/td>$/ {
    $0 = $0 "</td>"
}

# Same for <th>. The first test requires the first tag on the line to be
# "<th" followed by something other than "e", so that <thead> is not
# mistaken for a header cell.
$0 ~ /^[^<]*<th[^e]/ && $0 ~ /<th[^>]*>[^<]*$/ && $0 !~ /<\/th>$/ {
    $0 = $0 "</th>"
}
# Fix malformed <a> tags: make sure every href value is quoted.
#
# quote_bare_hrefs(line)
#   line    - any input line
#   returns - the line with every unquoted href value inside an <a ...> tag
#             wrapped in double quotes, e.g. <a href=foo> -> <a href="foo">.
#             Other attributes may precede href.
#
# awk's sub()/gsub() have no backreferences, so the earlier replacement text
# "\\1" produced a literal \1 and the real URL was lost (it was then patched
# to "#"). The value is now located with match() and re-assembled with
# substr(), which keeps the URL. The old '<a href="\1"' -> '#' patch is
# no longer needed and has been removed.
function quote_bare_hrefs(line,    head, tag, tail) {
    while (match(line, /<a [^>]*href=[^"' >][^ >]*/)) {
        head = substr(line, 1, RSTART - 1)
        tag  = substr(line, RSTART, RLENGTH)
        tail = substr(line, RSTART + RLENGTH)

        # The outer match ends with the unquoted value; find the "href="
        # that introduces it. "href=" is 5 characters long, so the value
        # starts at RSTART + 5.
        match(tag, /href=[^"' >][^ >]*$/)
        tag = substr(tag, 1, RSTART + 4) "\"" substr(tag, RSTART + 5) "\""

        # The quoted href no longer matches, so each pass makes progress.
        line = head tag tail
    }
    return line
}

# Only lines that contain an unquoted href inside an <a> tag are rewritten.
/<a [^>]*href=[^"' >]/ {
    $0 = quote_bare_hrefs($0)
}
# Escape the curly braces of dictionary/object literals such as
# {'cpu': 'ppc'} that appear in running text, because unescaped braces
# break JSX parsing. The rule fires only when the line has a brace followed
# by a quote, a quote followed by a brace, and no backtick-delimited span
# that encloses a {...} pair.
$0 ~ /\{['"]/ && $0 ~ /['"]\}/ && $0 !~ /`[^`]*\{[^`]*\}[^`]*`/ {
    # Single-quoted keys/values.
    gsub(/\{'/, "\\{'")
    gsub(/'\}/, "'\\}")

    # Double-quoted keys/values.
    gsub(/\{"/, "\\{\"")
    gsub(/"\}/, "\"\\}")
}
# Escape the angle brackets of C/C++ includes such as "#include <foo/bar.h>"
# so they are not parsed as tags. On a line containing "#include <", the
# opening bracket is escaped, and so is the ">" of every ".h>" on that line.
$0 ~ /#include </ {
    gsub(/#include </, "#include \\<")
    gsub(/\.h>/, ".h\\>")
}
# Until the first H1 has been seen, blank (or whitespace-only) lines are
# dropped so that the front matter ends up at the top of the output.
before_first_h1 == 1 && $0 ~ /^[ \t]*$/ {
    next
}

# The first "# " heading is not printed as a heading; it becomes the title
# in a YAML front-matter block instead.
first_h1_found == 0 && $0 ~ /^# / {
    # Drop the leading "# " and trim surrounding blanks.
    title = substr($0, 3)
    sub(/^[ \t]+/, "", title)
    sub(/[ \t]+$/, "", title)

    # The title is emitted inside single quotes, so embedded single quotes
    # are doubled, which is how YAML escapes them.
    gsub(/'/, "''", title)

    print "---"
    print "title: '" title "'"
    print "---"
    print ""

    before_first_h1 = 0
    first_h1_found = 1
    next
}
# Emit every line that reached this point, including any in-place edits made
# by the rules above.
{
    print $0
}