@@ -18,103 +18,127 @@ def process_pandoc_json(doc):
1818 current_section = 1 # Start from 1 since we'll add offset to get 0
1919 section_counter_found = False
2020
# process_element: single-pass rewrite of one pandoc JSON node (a dict or a list).
# It shares state with the enclosing function through `nonlocal`
# (section_offset, current_section, section_counter_found) and through the
# `section_labels` mapping, which it fills while visiting headers and reads
# while visiting links.
# NOTE(review): indentation was lost in this view, so the nesting described in
# the comments below is inferred from the statements themselves -- confirm
# against the repository before relying on it.
21- def process_element (elem ):
22- nonlocal section_offset , current_section , section_counter_found
23-
24- if isinstance (elem , dict ):
25- # Handle RawBlock LaTeX commands
26- if elem .get ('t' ) == 'RawBlock' and elem .get ('c' , [None , None ])[0 ] == 'latex' :
27- latex_content = elem ['c' ][1 ]
28-
29- # Check for \setcounter{section}{-1}
30- if r'\setcounter{section}{-1}' in latex_content :
31- section_offset = - 1
32- section_counter_found = True
33- # Remove this command from output
# The block is swapped for a {'t': 'Null'} placeholder rather than deleted.
34- return {'t' : 'Null' }
35-
36- # Handle RawInline LaTeX commands
37- elif elem .get ('t' ) == 'RawInline' and elem .get ('c' , [None , None ])[0 ] == 'latex' :
38- latex_content = elem ['c' ][1 ]
39-
40- # Check for \setcounter{section}{-1}
41- if r'\setcounter{section}{-1}' in latex_content :
42- section_offset = - 1
43- section_counter_found = True
44- # Remove this command from output
# The inline is swapped for an empty Str rather than deleted.
45- return {'t' : 'Str' , 'c' : '' }
46-
47- # Handle Headers (sections)
48- elif elem .get ('t' ) == 'Header' :
# Header payload is read as [level, attr, inline content].
49- level = elem ['c' ][0 ]
50- attr = elem ['c' ][1 ]
51- content = elem ['c' ][2 ]
52-
53- if level == 1 : # Top-level section
54- # Calculate the adjusted section number (starting from 0 if offset is -1)
# The offset in force when the header is visited is used, so a setcounter
# command seen later in the traversal does not renumber this header.
55- adjusted_section = current_section + section_offset
56- current_section += 1
57-
58- # Store label mapping if this section has a label
# attr[0] is used as the header identifier; empty identifiers are skipped.
59- section_id = attr [0 ] if attr [0 ] else None
60- if section_id :
61- section_labels [section_id ] = adjusted_section
62-
63- # Clean up section title - remove § symbols and numbers
64- if content :
65- # Extract the existing text content
# Only Str and Space items contribute; any other inline kind is left out of
# the rebuilt text.
66- existing_text = ""
67- for item in content :
68- if isinstance (item , dict ) and item .get ('t' ) == 'Str' :
69- existing_text += item ['c' ]
70- elif isinstance (item , dict ) and item .get ('t' ) == 'Space' :
71- existing_text += " "
72-
73- # Remove any existing § symbol and number from the beginning
# The pattern strips '§', a run of digits and trailing whitespace only; a
# dotted number such as '§3.2' would keep its '.2'.
74- if existing_text .startswith ('§' ):
75- import re
76- existing_text = re .sub (r'^§\d+\s*' , '' , existing_text )
77-
78- # Create new content with just the clean title (no § symbol)
# NOTE(review): presumably this replacement runs for every non-empty title,
# not only those starting with '§' -- nesting cannot be confirmed here.
79- new_content = [{'t' : 'Str' , 'c' : existing_text }]
80- elem ['c' ][2 ] = new_content
81-
82- return elem
83-
84- # Handle Links (cross-references)
85- elif elem .get ('t' ) == 'Link' :
# attr, content and title are assigned but not read again in this branch.
86- attr = elem ['c' ][0 ]
87- content = elem ['c' ][1 ]
88- target = elem ['c' ][2 ]
89- url = target [0 ]
90- title = target [1 ]
91-
92- # Check if this is an internal reference (starts with #)
93- if url .startswith ('#' ):
94- ref_id = url [1 :] # Remove the #
# section_labels is filled during this same traversal, so a link visited
# before its target header finds no entry and keeps its original text.
95- if ref_id in section_labels :
96- # Update the link text to show correct section number
97- adjusted_num = section_labels [ref_id ]
98- # Replace the content with the correct section number
99- elem ['c' ][1 ] = [{'t' : 'Str' , 'c' : f'§{ adjusted_num } ' }]
100-
101- return elem
102-
103- # Recursively process other elements
# NOTE(review): this repeats the isinstance(elem, dict) test from the top of
# the function; it presumably acts as the catch-all for the tag chain above.
104- elif isinstance (elem , dict ):
105- for key , value in elem .items ():
106- if isinstance (value , (list , dict )):
107- elem [key ] = process_element (value )
108-
109- elif isinstance (elem , list ):
# Lists are rebuilt element by element, in order.
110- return [process_element (item ) for item in elem ]
111-
112- return elem
21+
22+ # Two-pass approach: first gather headers and labels, then transform
23+ def gather_labels (blocks ):
24+ """Walk blocks and gather label -> hierarchical-number strings.
25+ We maintain counters per header level. For example, for a subsection
26+ we produce '13.6'. The section counter (level 1) gets the section_offset applied.
27+ """
28+ labels = {}
29+ # counters indexed by level-1: counters[0]=section, [1]=subsection, ...
30+ counters = [0 , 0 , 0 ]
31+
32+ def walk (bs ):
33+ for b in bs :
34+ if isinstance (b , dict ) and b .get ('t' ) == 'RawBlock' and isinstance (b .get ('c' ), list ) and b ['c' ][0 ] == 'latex' :
35+ text = b ['c' ][1 ]
36+ if r'\\setcounter{section}{-1}' in text or r'\setcounter{section}{-1}' in text :
37+ # set the offset so sections are effectively zero-based
38+ nonlocal_set_offset ()
39+ if isinstance (b , dict ) and b .get ('t' ) == 'Header' :
40+ level = b ['c' ][0 ]
41+ attr = b ['c' ][1 ]
42+ # ensure counters list long enough
43+ if level - 1 >= len (counters ):
44+ counters .extend ([0 ] * (level - len (counters )))
45+ # increment this level and zero deeper levels
46+ counters [level - 1 ] += 1
47+ for i in range (level , len (counters )):
48+ counters [i ] = 0
49+ # compute hierarchical number string
50+ # apply section_offset only to top-level counter
51+ parts = []
52+ for i in range (level ):
53+ val = counters [i ]
54+ if i == 0 :
55+ val = val + section_offset
56+ parts .append (str (val ))
57+ numstr = '.' .join (parts )
58+ if attr and attr [0 ]:
59+ labels [attr [0 ]] = numstr
60+ # recurse into possible nested lists
61+ if isinstance (b , dict ):
62+ for key in ('c' , 'content' , 'blocks' ):
63+ if key in b and isinstance (b [key ], list ):
64+ walk (b [key ])
65+
66+ # helper to set section_offset from inner scope
67+ def nonlocal_set_offset ():
68+ nonlocal section_offset
69+ section_offset = - 1
70+
71+ walk (blocks )
72+ return labels
73+
def transform_blocks(blocks, labels=None):
    """Clean header titles and rewrite internal link texts in one traversal.

    Second pass of the two-pass rewrite. Nodes are modified in place and the
    containing lists are rebuilt.

    * Header: a leading '§<number>' prefix (dotted numbers allowed) and the
      whitespace after it are removed from the title. All remaining inline
      elements are kept as they are, so emphasis, code or math in a title
      survives the cleanup.
    * Link: when the target is '#<id>' and <id> has an entry in ``labels``,
      the link text is replaced by that number string (without '§'). Other
      links are returned untouched.

    Args:
        blocks: List of pandoc JSON nodes (dicts tagged with ``'t'``).
        labels: Optional mapping of header id to number string. When omitted,
            the enclosing ``section_labels`` mapping is used.

    Returns:
        A new list holding the transformed nodes.
    """
    # Imported here because the file's top-level imports are not visible from
    # this block and the pattern below must not depend on them.
    import re

    if labels is None:
        labels = section_labels

    number_prefix = re.compile(r'^§\d+(?:\.\d+)*\s*')

    def strip_number_prefix(inlines):
        """Return the title inlines without a leading '§<number>' prefix."""
        # Only the leading run of Str/Space items can form the prefix.
        lead = []
        for item in inlines:
            kind = item.get('t') if isinstance(item, dict) else None
            if kind == 'Str':
                lead.append(item['c'])
            elif kind == 'Space':
                lead.append(' ')
            else:
                break

        match = number_prefix.match(''.join(lead))
        if match is None:
            return inlines

        remaining = match.end()
        for index, piece in enumerate(lead):
            if remaining == 0:
                return inlines[index:]
            if len(piece) > remaining:
                # The prefix ends inside this Str (e.g. '§3Title'): keep its tail.
                return [{'t': 'Str', 'c': piece[remaining:]}] + inlines[index + 1:]
            remaining -= len(piece)
        return inlines[len(lead):]

    def visit(node):
        """Transform a node and everything nested below it."""
        if isinstance(node, list):
            return [visit(child) for child in node]
        if not isinstance(node, dict):
            return node

        kind = node.get('t')
        if kind == 'Link':
            url = node['c'][2][0]
            if isinstance(url, str) and url.startswith('#'):
                ref_id = url[1:]
                if ref_id in labels:
                    node['c'][1] = [{'t': 'Str', 'c': labels[ref_id]}]
            return node
        if kind == 'Header':
            node['c'][2] = strip_number_prefix(node['c'][2])

        # Only values of existing keys are reassigned, which is safe while
        # iterating over the dict.
        for key, value in node.items():
            if isinstance(value, (list, dict)):
                node[key] = visit(value)
        return node

    return visit(blocks)
113133
114- # Process the document
134+ # Two-pass processing: gather labels first, then transform blocks
115135 if 'blocks' in doc :
116- doc ['blocks' ] = process_element (doc ['blocks' ])
117-
136+ # gather labels (fills section_labels)
137+ gathered = gather_labels (doc ['blocks' ])
138+ section_labels .update (gathered )
139+ # transform blocks using the gathered labels
140+ current_section = 1
141+ doc ['blocks' ] = transform_blocks (doc ['blocks' ])
118142 return doc
119143
120144def main ():
0 commit comments