this repo has no description
"""dump ALL intermediate values from spaCy's NER pipeline for debugging.

compares these against the spacez zig reimplementation to find divergence.

usage:
    uv run --python 3.12 --with spacy \
        --with 'en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl' \
        python scripts/dump_intermediates.py
"""
10
11import numpy as np
12
# print floats with 10 digits on wide lines, without forcing fixed-point
# notation, so dumped vectors can be compared against another implementation
np.set_printoptions(linewidth=200, suppress=False, precision=10)

# the single input sentence pushed through every stage of the pipeline
SENTENCE = "NASA launched the Artemis mission."
16
17
def _banner(title):
    """print a section title framed by two 80-column rules."""
    rule = "=" * 80
    print(rule)
    print(title)
    print(rule)


def _print_token_vectors(doc, vectors):
    """print one labelled line per token of `doc` with its row of `vectors`."""
    for tok_idx, tok in enumerate(doc):
        print(f" token[{tok_idx}] {tok.text!r}: {vectors[tok_idx]}")


def _hash_embed_vector(table, hashes):
    """return (buckets, vector) for one key of a hash embedding table.

    `hashes` holds the hash values produced for a single key. each one is taken
    modulo the number of table rows and the selected rows are summed, which is
    how thinc's HashEmbed builds its output (it does not use a single row).
    """
    buckets = hashes % table.shape[0]
    return buckets, table[buckets].sum(axis=0)


def _context_token_ids(state):
    """return the three feature token ids for a parser state (nF == 3 only).

    mirrors the nF=3 branch of spaCy's set_context_tokens:
      ids[0] = B(0), or -1 when the buffer is exhausted
      ids[1] = E(0) when an entity is open, else -1
      ids[2] = B(0) - 1 when both of the above are valid, else -1
    """
    b0 = state.B(0)
    first = b0 if b0 >= 0 else -1
    ent_start = state.E(0) if state.entity_is_open() else -1
    before = first - 1 if first != -1 and ent_start != -1 else -1
    return np.array([first, ent_start, before], dtype=np.int32)


def main():
    """run spaCy's NER on SENTENCE and print every intermediate value.

    sections printed, in order: (a) token texts, (b) token attribute hashes,
    (c) hash-embedding lookups, (d) the embed output recomputed by hand next to
    the model's own, (e) the output of each CNN residual block, (f) the linear
    projection, and (g) a manual walk through the NER transition system whose
    entities are finally compared with those of nlp().

    raises RuntimeError when the model does not have the expected structure
    (missing embed reduction layers, or a feature count other than 3).
    """
    import spacy
    from spacy.attrs import NORM, PREFIX, SUFFIX, SHAPE
    from thinc.backends.numpy_ops import NumpyOps

    nlp = spacy.load("en_core_web_sm")
    ner = nlp.get_pipe("ner")
    model = ner.model
    ops = NumpyOps()

    # the NER model has its own tok2vec ref (not a listener — a standalone model)
    # with 4 hash embeds (NORM, PREFIX, SUFFIX, SHAPE), not the 6 from the pipe tok2vec
    tok2vec = model.get_ref("tok2vec")
    lower = model.get_ref("lower")
    upper = model.get_ref("upper")

    # walk the NER tok2vec to find components: the 4 hash embeds, then the
    # first maxout seen after them and the first layernorm seen after that
    hash_embeds = []
    reduce_maxout = None
    reduce_ln = None
    for node in tok2vec.walk():
        if node.name == "hashembed":
            hash_embeds.append(node)
        if node.name == "maxout" and reduce_maxout is None and len(hash_embeds) == 4:
            reduce_maxout = node
        if node.name == "layernorm" and reduce_ln is None and reduce_maxout is not None:
            reduce_ln = node
            break
    assert len(hash_embeds) == 4, f"expected 4 hash embeds, got {len(hash_embeds)}"
    if reduce_maxout is None or reduce_ln is None:
        # fail here with a clear message instead of an AttributeError later on
        raise RuntimeError("could not locate the embed reduction maxout/layernorm in the NER tok2vec")

    # find CNN residual blocks — filter to exactly the single-block residuals,
    # not the chain-of-residuals node
    residual_blocks = [
        node
        for node in tok2vec.walk()
        if node.name == "residual(expand_window>>maxout>>layernorm>>dropout)"
    ]
    assert len(residual_blocks) == 4, f"expected 4 residual blocks, got {len(residual_blocks)}"

    # ── (a) token texts ──
    _banner("(a) TOKEN TEXTS (from tokenizer)")
    doc = nlp.make_doc(SENTENCE)
    tokens = [t.text for t in doc]
    print(f"sentence: {SENTENCE!r}")
    print(f"tokens ({len(tokens)}): {tokens}")
    print()

    # ── (b) token attributes: NORM, PREFIX, SUFFIX, SHAPE hashes ──
    _banner("(b) TOKEN ATTRIBUTES (NORM, PREFIX, SUFFIX, SHAPE hashes)")
    attr_array = doc.to_array([NORM, PREFIX, SUFFIX, SHAPE])
    for i, tok in enumerate(doc):
        print(f" token[{i}] = {tok.text!r}")
        print(f" NORM = 0x{attr_array[i][0]:016x} ({tok.norm_!r})")
        print(f" PREFIX = 0x{attr_array[i][1]:016x} ({tok.prefix_!r})")
        print(f" SUFFIX = 0x{attr_array[i][2]:016x} ({tok.suffix_!r})")
        print(f" SHAPE = 0x{attr_array[i][3]:016x} ({tok.shape_!r})")
    print()

    # ── (c) hash embedding table lookups ──
    _banner("(c) HASH EMBEDDING TABLE LOOKUPS (sum of the hashed rows from each embed table)")
    attr_names = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]

    all_embed_rows = []  # [embed_idx][tok_idx] = summed embedding vector
    for embed_idx, he in enumerate(hash_embeds):
        E = he.get_param("E")
        seed = he.attrs["seed"]
        print(f"\n hash_embed[{embed_idx}] ({attr_names[embed_idx]}): E.shape={E.shape}, seed={seed}")

        embed_rows = []
        for tok_idx, tok in enumerate(doc):
            attr_val = attr_array[tok_idx][embed_idx]
            key_arr = np.array([attr_val], dtype=np.uint64)
            hash_result = ops.hash(key_arr, seed)  # shape (1, 4) of uint32
            # HashEmbed sums the rows selected by ALL hashes of the key, not
            # just the first one
            buckets, row = _hash_embed_vector(E, hash_result[0])
            embed_rows.append(row)
            print(f" token[{tok_idx}] {tok.text!r}: attr=0x{attr_val:016x}, hash4={hash_result[0].tolist()}, buckets={buckets.tolist()}")
            print(f" summed rows = {row}")
        all_embed_rows.append(embed_rows)
    print()

    # ── (d) MultiHashEmbed output (after concat → maxout reduce → layernorm) ──
    _banner("(d) MULTIHASHEMBED OUTPUT (concat → maxout → layernorm → 96-dim)")

    # concatenate the embed vectors of each token; the width is read from the
    # tables themselves (4 × 96 = 384 for en_core_web_sm)
    n_tokens = len(doc)
    concat_width = sum(he.get_param("E").shape[1] for he in hash_embeds)
    concat_matrix = np.zeros((n_tokens, concat_width), dtype=np.float32)
    for tok_idx in range(n_tokens):
        concat_matrix[tok_idx] = np.concatenate([rows[tok_idx] for rows in all_embed_rows])

    print(f"\n concatenated embeddings: shape={concat_matrix.shape}")
    for tok_idx, tok in enumerate(doc):
        print(f" token[{tok_idx}] {tok.text!r}: first 16 = {concat_matrix[tok_idx, :16]}")

    # apply reduction maxout
    W_maxout = reduce_maxout.get_param("W")
    b_maxout = reduce_maxout.get_param("b")
    nO = b_maxout.shape[0]
    nP = b_maxout.shape[1]
    nI = W_maxout.shape[-1]
    print(f"\n reduce maxout: W={W_maxout.shape}, b={b_maxout.shape} (nO={nO}, nP={nP}, nI={nI})")

    # one affine map producing nO*nP values per token, then max over the pieces
    W_flat = W_maxout.reshape(nO * nP, nI)
    b_flat = b_maxout.reshape(nO * nP)
    Y_pre_max = concat_matrix @ W_flat.T + b_flat
    Y_pieces = Y_pre_max.reshape(-1, nO, nP)
    Y_maxout = Y_pieces.max(axis=-1)

    print(f"\n after maxout: shape={Y_maxout.shape}")
    _print_token_vectors(doc, Y_maxout)

    # apply layernorm
    # NOTE(review): the epsilon used by thinc's LayerNorm may differ from 1e-12
    # (possibly 1e-8) — confirm against the installed thinc version
    G_ln = reduce_ln.get_param("G")
    b_ln = reduce_ln.get_param("b")
    mean = Y_maxout.mean(axis=-1, keepdims=True)
    var = Y_maxout.var(axis=-1, keepdims=True)
    std = np.sqrt(var + 1e-12)
    Y_ln = G_ln * (Y_maxout - mean) / std + b_ln

    print(f"\n after layernorm (manual): shape={Y_ln.shape}")
    _print_token_vectors(doc, Y_ln)

    # now get ground truth by running the embed portion of the model
    # the NER tok2vec structure is:
    #   layers[0] = tok2vec_chain (embed+cnn)
    #   layers[0].layers[0] = embed (extract_features >> list2ragged >> with_array(concat_embeds) >> with_array(maxout>>ln>>drop) >> ragged2list)
    #   layers[0].layers[1] = with_array(4 × residual)
    #   layers[1] = list2array
    #   layers[2] = linear
    tok2vec_chain = tok2vec.layers[0]
    embed_chain = tok2vec_chain.layers[0]
    encode_with_array = tok2vec_chain.layers[1]
    list2array_layer = tok2vec.layers[1]
    linear_layer = tok2vec.layers[2]

    # run the embed chain to get ground-truth embed output
    doc_fresh = nlp.make_doc(SENTENCE)
    embed_output = embed_chain.predict([doc_fresh])  # list of arrays
    print(f"\n ground-truth embed output (from model): shape={embed_output[0].shape}")
    _print_token_vectors(doc_fresh, embed_output[0])

    # verify match, as sections (e) and (f) do for their stages
    diff = np.abs(Y_ln - embed_output[0]).max()
    print(f" max diff between manual embed and model embed: {diff}")
    print()

    # ── (e) after each CNN block ──
    _banner("(e) AFTER EACH CNN BLOCK (4 residual blocks)")

    # get the inner chain of residual blocks
    # encode_with_array wraps a chain of 4 residual blocks
    encode_inner = encode_with_array.layers[0]  # the chain of 4 residuals
    residual_layers = encode_inner.layers

    # run residual blocks one at a time on the embed output
    current = embed_output[0].copy()
    print(f"\n input to CNN (embed output): shape={current.shape}")
    for block_idx, res_block in enumerate(residual_layers):
        current = res_block.predict(current)
        print(f"\n after CNN block {block_idx}: shape={current.shape}")
        _print_token_vectors(doc_fresh, current)

    # verify against full encode
    encoded_output = encode_with_array.predict(embed_output)
    print(f"\n ground-truth encode output: shape={encoded_output[0].shape}")
    _print_token_vectors(doc_fresh, encoded_output[0])

    # verify match
    diff = np.abs(current - encoded_output[0]).max()
    print(f" max diff between manual CNN and model encode: {diff}")
    print()

    # ── (f) after linear projection ──
    _banner("(f) AFTER LINEAR PROJECTION (96 → 64-dim)")

    # apply list2array then linear
    as_array = list2array_layer.predict(encoded_output)
    print(f" list2array output: shape={as_array.shape}")

    projected = linear_layer.predict(as_array)
    print(f" linear projection output: shape={projected.shape}")
    _print_token_vectors(doc_fresh, projected)

    # also verify via full tok2vec predict
    doc_verify = nlp.make_doc(SENTENCE)
    full_output = tok2vec.predict([doc_verify])
    print(f"\n ground-truth full tok2vec output: shape={full_output.shape}")
    _print_token_vectors(doc_verify, full_output)
    diff = np.abs(projected - full_output).max()
    print(f" max diff between manual and model.predict: {diff}")

    # also print linear weights for reference
    W_lin = linear_layer.get_param("W")
    b_lin = linear_layer.get_param("b")
    print(f"\n linear W: {W_lin.shape}, b: {b_lin.shape}")
    print()

    # ── (g) parser steps ──
    _banner("(g) PARSER (NER) TRANSITION STEPS")

    # get weights
    lower_W = lower.get_param("W")  # (3, 64, 2, 64) = (nF, nO, nP, nI)
    lower_b = lower.get_param("b")  # (64, 2) = (nO, nP)
    lower_pad = lower.get_param("pad")  # (1, 3, 64, 2) = (1, nF, nO, nP)
    upper_W = upper.get_param("W")  # (74, 64) = (n_actions, nO)
    upper_b = upper.get_param("b")  # (74,) = (n_actions,)

    nF, nO_l, nP_l, nI_l = lower_W.shape
    print(f"\n lower: W={lower_W.shape} (nF={nF}, nO={nO_l}, nP={nP_l}, nI={nI_l})")
    print(f" lower: b={lower_b.shape}, pad={lower_pad.shape}")
    print(f" upper: W={upper_W.shape}, b={upper_b.shape}")
    if nF != 3:
        # the feature-id logic and labels used for stepping only cover 3 features
        raise RuntimeError(f"transition stepping assumes nF == 3, got {nF}")

    # get action names
    moves = ner.moves
    n_actions = moves.n_moves
    action_names = [moves.get_class_name(i) for i in range(n_actions)]
    print(f"\n actions ({n_actions}): {action_names}")

    # precompute lower features for all tokens
    # PrecomputableAffine: Y[t, f, o, p] = sum_i(X[t, i] * W[f, o, p, i])
    # bias b[o,p] is added AFTER summing features, not per-token
    # X = tokvecs, shape (nT, nI_l=64)
    tokvecs = full_output  # shape (nT, 64)
    precomputed = np.einsum('ti,fopi->tfop', tokvecs, lower_W)
    print(f"\n precomputed shape: {precomputed.shape}")

    print("\n lower pad vector:")
    print(f" pad shape: {lower_pad.shape}")
    for f in range(nF):
        print(f" pad[0, {f}]: {lower_pad[0, f]}")
    print()

    print(" precomputed features per token:")
    for tok_idx in range(n_tokens):
        print(f" token[{tok_idx}] {doc[tok_idx].text!r}:")
        for f in range(nF):
            print(f" feat[{f}]: {precomputed[tok_idx, f]}")
    print()

    # step through the NER transition system
    print(" stepping through NER transitions...")
    print()

    # CRITICAL: for nF=3, spaCy's set_context_tokens uses:
    #   ids[0] = B(0) — current buffer token
    #   ids[1] = E(0) — first word of open entity, or -1
    #   ids[2] = B(0) - 1 — word before buffer (end of entity), or -1
    # NOT [S(0), B(0), B(1)] as one might assume.

    from spacy.pipeline._parser_internals.stateclass import StateClass

    doc_step = nlp.make_doc(SENTENCE)
    # we need the tok2vec output on this doc — run tok2vec.predict
    step_tokvecs = tok2vec.predict([doc_step])
    # precompute for this doc (NO bias — bias is added after summing features)
    step_precomputed = np.einsum('ti,fopi->tfop', step_tokvecs, lower_W)

    state = StateClass(doc_step)
    step = 0
    feat_labels = ["B(0)", "E(0)", "B(0)-1"]

    while not state.is_final():
        feat_ids = _context_token_ids(state)

        print(f" --- step {step} ---")
        print(f" feature token indices: {feat_labels[0]}={feat_ids[0]}, {feat_labels[1]}={feat_ids[1]}, {feat_labels[2]}={feat_ids[2]}")
        for fi in range(nF):
            tid = feat_ids[fi]
            if 0 <= tid < n_tokens:
                print(f" feat[{fi}] ({feat_labels[fi]}) → token[{tid}] = {doc_step[tid].text!r}")
            else:
                print(f" feat[{fi}] ({feat_labels[fi]}) → PAD (index {tid})")

        # sum precomputed features (or pad) for each feature slot
        hidden_input = np.zeros((nO_l, nP_l), dtype=np.float32)
        for fi in range(nF):
            tid = feat_ids[fi]
            if 0 <= tid < n_tokens:
                contrib = step_precomputed[tid, fi]
                hidden_input += contrib
                print(f" precomp[{tid},{fi}] = {contrib}")
            else:
                contrib = lower_pad[0, fi]
                hidden_input += contrib
                print(f" pad[{fi}] = {contrib}")

        # add bias (applied after summing, before maxout)
        hidden_input += lower_b
        print(f" summed + bias (nO×nP = {nO_l}×{nP_l}): {hidden_input}")

        # maxout over pieces
        hidden = hidden_input.max(axis=-1)
        which = hidden_input.argmax(axis=-1)
        print(f" after maxout: {hidden}")
        print(f" maxout winners: {which}")

        # apply upper: scores = hidden @ W.T + b
        scores = hidden @ upper_W.T + upper_b
        print(f" raw scores ({n_actions}): {scores}")

        # valid actions and best — is_valid takes a move name string
        valid = [i for i in range(n_actions) if moves.is_valid(state, action_names[i])]
        valid_set = set(valid)

        # best valid action; the first one wins on ties, -1 when none is valid
        best_idx = max(valid, key=lambda i: scores[i], default=-1)

        print(f" valid actions: {[action_names[i] for i in valid]}")
        print(" top-5 by score:")
        top5 = np.argsort(scores)[::-1][:5]
        for rank, idx in enumerate(top5):
            v = "(valid)" if idx in valid_set else "(INVALID)"
            print(f" [{rank}] {action_names[idx]}: {scores[idx]:.10f} {v}")

        if best_idx >= 0:
            print(f" chosen: {action_names[best_idx]} (score={scores[best_idx]:.10f})")
            moves.apply_transition(state, action_names[best_idx])
        else:
            print(" no valid actions, breaking")
            break

        print()
        step += 1
        if step > 30:
            print(" (safety limit: stopping after 30 steps)")
            break

    # apply state annotations to the doc so entities are visible
    moves.set_annotations(state, doc_step)

    # show final entities
    print(" final entities (from manual stepping):")
    for ent in doc_step.ents:
        print(f" {ent.text!r} → {ent.label_} [{ent.start_char}:{ent.end_char}]")

    # compare with nlp() result
    print()
    doc_auto = nlp(SENTENCE)
    print(" final entities (from nlp()):")
    for ent in doc_auto.ents:
        print(f" {ent.text!r} → {ent.label_} [{ent.start_char}:{ent.end_char}]")

    match = [(e.text, e.label_) for e in doc_step.ents] == [(e.text, e.label_) for e in doc_auto.ents]
    print(f" manual vs nlp() match: {match}")

    print("\n" + "=" * 80)
    print("DONE")
    print("=" * 80)
425
426
# run the dump only when executed as a script, not when imported
if __name__ == "__main__":
    main()