scripts/dump_intermediates.py at main

zzstoatzz.io / spacez
fork atom
this repo has no description
fork atom
spacez / scripts / dump_intermediates.py
at main 428 lines 17 kB view raw
wrap content
zzstoatzz.io fix: CNN padding, parser features, and UTF-8 offsets — 73/73 match 12d ago
a555bdea
  1"""dump ALL intermediate values from spaCy's NER pipeline for debugging.
  2
  3compares these against the spacez zig reimplementation to find divergence.
  4
  5usage:
  6  uv run --python 3.12 --with spacy \
  7    --with 'en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl' \
  8    python scripts/dump_intermediates.py
  9"""
 10
 11import numpy as np
 12
# Global numpy print configuration for every array this script dumps:
# 10 digits of precision, scientific notation allowed (suppress=False), and
# 200-character lines so a vector row wraps less when diffed against the
# zig reimplementation's output.
np.set_printoptions(precision=10, suppress=False, linewidth=200)

# The single input sentence whose intermediate values are dumped by main().
SENTENCE = "NASA launched the Artemis mission."
 16
 17
 18def main():
 19    import spacy
 20    from spacy.attrs import NORM, PREFIX, SUFFIX, SHAPE
 21    from thinc.backends.numpy_ops import NumpyOps
 22
 23    nlp = spacy.load("en_core_web_sm")
 24    ner = nlp.get_pipe("ner")
 25    model = ner.model
 26    ops = NumpyOps()
 27
 28    # the NER model has its own tok2vec ref (not a listener — a standalone model)
 29    # with 4 hash embeds (NORM, PREFIX, SUFFIX, SHAPE), not the 6 from the pipe tok2vec
 30    tok2vec = model.get_ref("tok2vec")
 31    lower = model.get_ref("lower")
 32    upper = model.get_ref("upper")
 33
 34    # walk the NER tok2vec to find components
 35    hash_embeds = []
 36    reduce_maxout = None
 37    reduce_ln = None
 38    for node in tok2vec.walk():
 39        if node.name == "hashembed":
 40            hash_embeds.append(node)
 41        if node.name == "maxout" and reduce_maxout is None and len(hash_embeds) == 4:
 42            reduce_maxout = node
 43        if node.name == "layernorm" and reduce_ln is None and reduce_maxout is not None:
 44            reduce_ln = node
 45            break
 46    assert len(hash_embeds) == 4, f"expected 4 hash embeds, got {len(hash_embeds)}"
 47
 48    # find CNN residual blocks — filter to exactly the single-block residuals,
 49    # not the chain-of-residuals node
 50    residual_blocks = []
 51    for node in tok2vec.walk():
 52        if node.name == "residual(expand_window>>maxout>>layernorm>>dropout)":
 53            residual_blocks.append(node)
 54    assert len(residual_blocks) == 4, f"expected 4 residual blocks, got {len(residual_blocks)}"
 55
 56    # find linear projection
 57    linear_proj = None
 58    for node in tok2vec.walk():
 59        if node.name == "linear":
 60            linear_proj = node
 61
 62    # ── (a) token texts ──
 63    print("=" * 80)
 64    print("(a) TOKEN TEXTS (from tokenizer)")
 65    print("=" * 80)
 66    doc = nlp.make_doc(SENTENCE)
 67    tokens = [t.text for t in doc]
 68    print(f"sentence: {SENTENCE!r}")
 69    print(f"tokens ({len(tokens)}): {tokens}")
 70    print()
 71
 72    # ── (b) token attributes: NORM, PREFIX, SUFFIX, SHAPE hashes ──
 73    print("=" * 80)
 74    print("(b) TOKEN ATTRIBUTES (NORM, PREFIX, SUFFIX, SHAPE hashes)")
 75    print("=" * 80)
 76    attr_array = doc.to_array([NORM, PREFIX, SUFFIX, SHAPE])
 77    for i, tok in enumerate(doc):
 78        print(f"  token[{i}] = {tok.text!r}")
 79        print(f"    NORM   = 0x{attr_array[i][0]:016x}  ({tok.norm_!r})")
 80        print(f"    PREFIX = 0x{attr_array[i][1]:016x}  ({tok.prefix_!r})")
 81        print(f"    SUFFIX = 0x{attr_array[i][2]:016x}  ({tok.suffix_!r})")
 82        print(f"    SHAPE  = 0x{attr_array[i][3]:016x}  ({tok.shape_!r})")
 83    print()
 84
 85    # ── (c) hash embedding table lookups ──
 86    print("=" * 80)
 87    print("(c) HASH EMBEDDING TABLE LOOKUPS (raw rows from each embed table)")
 88    print("=" * 80)
 89    attr_names = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
 90
 91    all_embed_rows = []  # [embed_idx][tok_idx] = row vector
 92    for embed_idx, he in enumerate(hash_embeds):
 93        E = he.get_param("E")
 94        seed = he.attrs["seed"]
 95        nV = E.shape[0]
 96        nO = E.shape[1]
 97        print(f"\n  hash_embed[{embed_idx}] ({attr_names[embed_idx]}): E.shape={E.shape}, seed={seed}")
 98
 99        embed_rows = []
100        for tok_idx, tok in enumerate(doc):
101            attr_val = attr_array[tok_idx][embed_idx]
102            key_arr = np.array([attr_val], dtype=np.uint64)
103            hash_result = ops.hash(key_arr, seed)  # shape (1, 4) of uint32
104            bucket = hash_result[0][0] % nV
105            row = E[bucket]
106            embed_rows.append(row)
107            print(f"    token[{tok_idx}] {tok.text!r}: attr=0x{attr_val:016x}, hash4={hash_result[0].tolist()}, bucket={bucket}")
108            print(f"      row = {row}")
109        all_embed_rows.append(embed_rows)
110    print()
111
112    # ── (d) MultiHashEmbed output (after concat → maxout reduce → layernorm) ──
113    print("=" * 80)
114    print("(d) MULTIHASHEMBED OUTPUT (concat → maxout → layernorm → 96-dim)")
115    print("=" * 80)
116
117    # concatenate the 4 embed rows for each token: 4 × 96 = 384
118    n_tokens = len(doc)
119    concat_matrix = np.zeros((n_tokens, 4 * 96), dtype=np.float32)
120    for tok_idx in range(n_tokens):
121        parts = [all_embed_rows[e][tok_idx] for e in range(4)]
122        concat_matrix[tok_idx] = np.concatenate(parts)
123
124    print(f"\n  concatenated embeddings: shape={concat_matrix.shape}")
125    for tok_idx, tok in enumerate(doc):
126        print(f"    token[{tok_idx}] {tok.text!r}: first 16 = {concat_matrix[tok_idx, :16]}")
127
128    # apply reduction maxout
129    W_maxout = reduce_maxout.get_param("W")
130    b_maxout = reduce_maxout.get_param("b")
131    nO = b_maxout.shape[0]
132    nP = b_maxout.shape[1]
133    nI = W_maxout.shape[-1]
134    print(f"\n  reduce maxout: W={W_maxout.shape}, b={b_maxout.shape} (nO={nO}, nP={nP}, nI={nI})")
135
136    W_flat = W_maxout.reshape(nO * nP, nI)
137    b_flat = b_maxout.reshape(nO * nP)
138    Y_pre_max = concat_matrix @ W_flat.T + b_flat
139    Y_pieces = Y_pre_max.reshape(-1, nO, nP)
140    Y_maxout = Y_pieces.max(axis=-1)
141
142    print(f"\n  after maxout: shape={Y_maxout.shape}")
143    for tok_idx, tok in enumerate(doc):
144        print(f"    token[{tok_idx}] {tok.text!r}: {Y_maxout[tok_idx]}")
145
146    # apply layernorm
147    G_ln = reduce_ln.get_param("G")
148    b_ln = reduce_ln.get_param("b")
149    mean = Y_maxout.mean(axis=-1, keepdims=True)
150    var = Y_maxout.var(axis=-1, keepdims=True)
151    std = np.sqrt(var + 1e-12)
152    Y_ln = G_ln * (Y_maxout - mean) / std + b_ln
153
154    print(f"\n  after layernorm (manual): shape={Y_ln.shape}")
155    for tok_idx, tok in enumerate(doc):
156        print(f"    token[{tok_idx}] {tok.text!r}: {Y_ln[tok_idx]}")
157
158    # now get ground truth by running the embed portion of the model
159    # the NER tok2vec structure is:
160    #   layers[0] = tok2vec_chain (embed+cnn)
161    #     layers[0]._layers[0] = embed (extract_features >> list2ragged >> with_array(concat_embeds) >> with_array(maxout>>ln>>drop) >> ragged2list)
162    #     layers[0]._layers[1] = with_array(4 × residual)
163    #   layers[1] = list2array
164    #   layers[2] = linear
165    tok2vec_chain = tok2vec._layers[0]
166    embed_chain = tok2vec_chain._layers[0]
167    encode_with_array = tok2vec_chain._layers[1]
168    list2array_layer = tok2vec._layers[1]
169    linear_layer = tok2vec._layers[2]
170
171    # run the embed chain to get ground-truth embed output
172    doc_fresh = nlp.make_doc(SENTENCE)
173    embed_output = embed_chain.predict([doc_fresh])  # list of arrays
174    print(f"\n  ground-truth embed output (from model): shape={embed_output[0].shape}")
175    for tok_idx, tok in enumerate(doc_fresh):
176        print(f"    token[{tok_idx}] {tok.text!r}: {embed_output[0][tok_idx]}")
177    print()
178
179    # ── (e) after each CNN block ──
180    print("=" * 80)
181    print("(e) AFTER EACH CNN BLOCK (4 residual blocks)")
182    print("=" * 80)
183
184    # get the inner chain of residual blocks
185    # encode_with_array wraps a chain of 4 residual blocks
186    encode_inner = encode_with_array._layers[0]  # the chain of 4 residuals
187    residual_layers = encode_inner._layers
188
189    # run residual blocks one at a time on the embed output
190    current = embed_output[0].copy()
191    print(f"\n  input to CNN (embed output): shape={current.shape}")
192    for block_idx, res_block in enumerate(residual_layers):
193        current = res_block.predict(current)
194        print(f"\n  after CNN block {block_idx}: shape={current.shape}")
195        for tok_idx, tok in enumerate(doc_fresh):
196            print(f"    token[{tok_idx}] {tok.text!r}: {current[tok_idx]}")
197
198    # verify against full encode
199    encoded_output = encode_with_array.predict(embed_output)
200    print(f"\n  ground-truth encode output: shape={encoded_output[0].shape}")
201    for tok_idx, tok in enumerate(doc_fresh):
202        print(f"    token[{tok_idx}] {tok.text!r}: {encoded_output[0][tok_idx]}")
203
204    # verify match
205    diff = np.abs(current - encoded_output[0]).max()
206    print(f"  max diff between manual CNN and model encode: {diff}")
207    print()
208
209    # ── (f) after linear projection ──
210    print("=" * 80)
211    print("(f) AFTER LINEAR PROJECTION (96 → 64-dim)")
212    print("=" * 80)
213
214    # apply list2array then linear
215    as_array = list2array_layer.predict(encoded_output)
216    print(f"  list2array output: shape={as_array.shape}")
217
218    projected = linear_layer.predict(as_array)
219    print(f"  linear projection output: shape={projected.shape}")
220    for tok_idx, tok in enumerate(doc_fresh):
221        print(f"    token[{tok_idx}] {tok.text!r}: {projected[tok_idx]}")
222
223    # also verify via full tok2vec predict
224    doc_verify = nlp.make_doc(SENTENCE)
225    full_output = tok2vec.predict([doc_verify])
226    print(f"\n  ground-truth full tok2vec output: shape={full_output.shape}")
227    for tok_idx, tok in enumerate(doc_verify):
228        print(f"    token[{tok_idx}] {tok.text!r}: {full_output[tok_idx]}")
229    diff = np.abs(projected - full_output).max()
230    print(f"  max diff between manual and model.predict: {diff}")
231
232    # also print linear weights for reference
233    W_lin = linear_layer.get_param("W")
234    b_lin = linear_layer.get_param("b")
235    print(f"\n  linear W: {W_lin.shape}, b: {b_lin.shape}")
236    print()
237
238    # ── (g) parser steps ──
239    print("=" * 80)
240    print("(g) PARSER (NER) TRANSITION STEPS")
241    print("=" * 80)
242
243    # get weights
244    lower_W = lower.get_param("W")  # (3, 64, 2, 64) = (nF, nO, nP, nI)
245    lower_b = lower.get_param("b")  # (64, 2) = (nO, nP)
246    lower_pad = lower.get_param("pad")  # (1, 3, 64, 2) = (1, nF, nO, nP)
247    upper_W = upper.get_param("W")  # (74, 64) = (n_actions, nO)
248    upper_b = upper.get_param("b")  # (74,) = (n_actions,)
249
250    nF, nO_l, nP_l, nI_l = lower_W.shape
251    print(f"\n  lower: W={lower_W.shape} (nF={nF}, nO={nO_l}, nP={nP_l}, nI={nI_l})")
252    print(f"  lower: b={lower_b.shape}, pad={lower_pad.shape}")
253    print(f"  upper: W={upper_W.shape}, b={upper_b.shape}")
254
255    # get action names
256    moves = ner.moves
257    n_actions = moves.n_moves
258    action_names = [moves.get_class_name(i) for i in range(n_actions)]
259    print(f"\n  actions ({n_actions}): {action_names}")
260
261    # precompute lower features for all tokens
262    # PrecomputableAffine: Y[t, f, o, p] = sum_i(X[t, i] * W[f, o, p, i])
263    # bias b[o,p] is added AFTER summing features, not per-token
264    # X = tokvecs, shape (nT, nI_l=64)
265    tokvecs = full_output  # shape (nT, 64)
266    precomputed = np.einsum('ti,fopi->tfop', tokvecs, lower_W)
267    print(f"\n  precomputed shape: {precomputed.shape}")
268
269    print(f"\n  lower pad vector:")
270    print(f"    pad shape: {lower_pad.shape}")
271    for f in range(nF):
272        print(f"    pad[0, {f}]: {lower_pad[0, f]}")
273    print()
274
275    print(f"  precomputed features per token:")
276    for tok_idx in range(n_tokens):
277        print(f"    token[{tok_idx}] {doc[tok_idx].text!r}:")
278        for f in range(nF):
279            print(f"      feat[{f}]: {precomputed[tok_idx, f]}")
280    print()
281
282    # step through the NER transition system
283    print("  stepping through NER transitions...")
284    print()
285
286    # CRITICAL: for nF=3, spaCy's set_context_tokens uses:
287    #   ids[0] = B(0)         — current buffer token
288    #   ids[1] = E(0)         — first word of open entity, or -1
289    #   ids[2] = B(0) - 1     — word before buffer (end of entity), or -1
290    # NOT [S(0), B(0), B(1)] as one might assume.
291
292    from spacy.pipeline._parser_internals.stateclass import StateClass
293
294    doc_step = nlp.make_doc(SENTENCE)
295    # we need the tok2vec output on this doc — run tok2vec.predict
296    step_tokvecs = tok2vec.predict([doc_step])
297    # precompute for this doc (NO bias — bias is added after summing features)
298    step_precomputed = np.einsum('ti,fopi->tfop', step_tokvecs, lower_W)
299
300    # use spaCy's actual get_token_ids via ParserStepModel to get correct features
301    # we construct a lightweight wrapper that gives us token_ids
302    state = StateClass(doc_step)
303    step = 0
304
305    # helper: extract token ids using spaCy's actual C code
306    def get_feat_ids(state):
307        """get feature token ids using spaCy's set_context_tokens."""
308        ids = np.zeros((1, nF), dtype=np.int32)
309        ids.fill(-1)
310        # use the state's C-level set_context_tokens via the StateClass wrapper
311        # StateClass wraps StateC; we can call get_token_ids on a step model
312        # but it's simpler to just use the known nF=3 logic:
313        #   ids[0] = B(0), ids[1] = E(0) if entity open else -1,
314        #   ids[2] = B(0)-1 if both ids[0] and ids[1] are valid else -1
315        b0 = state.B(0)
316        if b0 >= 0:
317            ids[0, 0] = b0
318        else:
319            ids[0, 0] = -1
320        if state.entity_is_open():
321            ids[0, 1] = state.E(0)
322        else:
323            ids[0, 1] = -1
324        if ids[0, 0] == -1 or ids[0, 1] == -1:
325            ids[0, 2] = -1
326        else:
327            ids[0, 2] = ids[0, 0] - 1
328        return ids[0]
329
330    while not state.is_final():
331        feat_ids = get_feat_ids(state)
332
333        print(f"  --- step {step} ---")
334        feat_labels = ["B(0)", "E(0)", "B(0)-1"]
335        print(f"    feature token indices: {feat_labels[0]}={feat_ids[0]}, {feat_labels[1]}={feat_ids[1]}, {feat_labels[2]}={feat_ids[2]}")
336        for fi in range(nF):
337            tid = feat_ids[fi]
338            if 0 <= tid < n_tokens:
339                print(f"      feat[{fi}] ({feat_labels[fi]}) → token[{tid}] = {doc_step[tid].text!r}")
340            else:
341                print(f"      feat[{fi}] ({feat_labels[fi]}) → PAD (index {tid})")
342
343        # sum precomputed features (or pad) for each feature slot
344        hidden_input = np.zeros((nO_l, nP_l), dtype=np.float32)
345        for fi in range(nF):
346            tid = feat_ids[fi]
347            if 0 <= tid < n_tokens:
348                contrib = step_precomputed[tid, fi]
349                hidden_input += contrib
350                print(f"    precomp[{tid},{fi}] = {contrib}")
351            else:
352                contrib = lower_pad[0, fi]
353                hidden_input += contrib
354                print(f"    pad[{fi}] = {contrib}")
355
356        # add bias (applied after summing, before maxout)
357        hidden_input += lower_b
358        print(f"    summed + bias (nO×nP = {nO_l}×{nP_l}): {hidden_input}")
359
360        # maxout over pieces
361        hidden = hidden_input.max(axis=-1)
362        which = hidden_input.argmax(axis=-1)
363        print(f"    after maxout: {hidden}")
364        print(f"    maxout winners: {which}")
365
366        # apply upper: scores = hidden @ W.T + b
367        scores = hidden @ upper_W.T + upper_b
368        print(f"    raw scores ({n_actions}): {scores}")
369
370        # valid actions and best — is_valid takes a move name string
371        valid = []
372        for i in range(n_actions):
373            if moves.is_valid(state, action_names[i]):
374                valid.append(i)
375
376        # find best valid
377        best_idx = -1
378        best_score = -float('inf')
379        for i in valid:
380            if scores[i] > best_score:
381                best_score = scores[i]
382                best_idx = i
383
384        print(f"    valid actions: {[action_names[i] for i in valid]}")
385        print(f"    top-5 by score:")
386        top5 = np.argsort(scores)[::-1][:5]
387        for rank, idx in enumerate(top5):
388            v = "(valid)" if idx in valid else "(INVALID)"
389            print(f"      [{rank}] {action_names[idx]}: {scores[idx]:.10f} {v}")
390
391        if best_idx >= 0:
392            print(f"    chosen: {action_names[best_idx]} (score={best_score:.10f})")
393            moves.apply_transition(state, action_names[best_idx])
394        else:
395            print(f"    no valid actions, breaking")
396            break
397
398        print()
399        step += 1
400        if step > 30:
401            print("  (safety limit: stopping after 30 steps)")
402            break
403
404    # apply state annotations to the doc so entities are visible
405    moves.set_annotations(state, doc_step)
406
407    # show final entities
408    print(f"  final entities (from manual stepping):")
409    for ent in doc_step.ents:
410        print(f"    {ent.text!r} → {ent.label_} [{ent.start_char}:{ent.end_char}]")
411
412    # compare with nlp() result
413    print()
414    doc_auto = nlp(SENTENCE)
415    print(f"  final entities (from nlp()):")
416    for ent in doc_auto.ents:
417        print(f"    {ent.text!r} → {ent.label_} [{ent.start_char}:{ent.end_char}]")
418
419    match = [(e.text, e.label_) for e in doc_step.ents] == [(e.text, e.label_) for e in doc_auto.ents]
420    print(f"  manual vs nlp() match: {match}")
421
422    print("\n" + "=" * 80)
423    print("DONE")
424    print("=" * 80)
425
426
# Run the dump only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()