1package yaml
2
3import (
4 "bytes"
5 "encoding/base64"
6 "errors"
7 "fmt"
8 "io"
9 "regexp"
10 "slices"
11 "strconv"
12 "strings"
13 "sync"
14
15 "go.yaml.in/yaml/v3"
16
17 "cuelang.org/go/cue/ast"
18 "cuelang.org/go/cue/literal"
19 "cuelang.org/go/cue/token"
20)
21
22// TODO(mvdan): we should sanity check that the decoder always produces valid CUE,
23// as it is possible to construct a cue/ast syntax tree with invalid literals
24// or with expressions that will always error, such as `float & 123`.
25//
26// One option would be to do this as part of the tests; a more general approach
27// may be fuzzing, which would find more bugs and work for any decoder,
28// although it may be slow as we need to involve the evaluator.
29
// Decoder is a temporary interface compatible with both the old and new yaml decoders.
type Decoder interface {
	// Decode consumes a YAML value and returns it as a CUE syntax tree node.
	//
	// The decoder returned by NewDecoder reports the end of the input
	// with a nil node and an io.EOF error.
	Decode() (ast.Expr, error)
}
35
// decoder wraps a [yaml.Decoder] to extract CUE syntax tree nodes.
type decoder struct {
	// yamlDecoder is the underlying YAML decoder which produces the
	// yaml.Node values that are converted into CUE syntax tree nodes.
	yamlDecoder yaml.Decoder

	// yamlNonEmpty is true once yamlDecoder tells us the input YAML wasn't empty.
	// Useful so that we can extract "null" when the input is empty.
	yamlNonEmpty bool

	// decodeErr is returned by any further calls to Decode when not nil.
	decodeErr error

	// tokFile is the file used to create cue/token positions
	// and to name the input in error messages.
	tokFile *token.File

	// tokLines is indexed by zero-based line number to obtain the offset
	// that a YAML node's column is added to when computing a position.
	// NewDecoder fills it from tokFile.Lines plus one extra entry
	// holding the length of the input.
	tokLines []int

	// pendingHeadComments collects the head (preceding) comments
	// from the YAML nodes we are extracting.
	// We can't add comments to a CUE syntax tree node until we've created it,
	// but we need to extract these comments first since they have earlier positions.
	pendingHeadComments []*ast.Comment

	// extractingAliases ensures we don't loop forever when expanding YAML anchors.
	extractingAliases map[*yaml.Node]bool

	// lastPos is the last YAML node position that we decoded,
	// used for working out relative positions such as token.NewSection.
	// This position can only increase, moving forward in the file.
	lastPos token.Position

	// forceNewline ensures that the next position will be on a new line.
	forceNewline bool
}
67
68// TODO(mvdan): this can be io.Reader really, except that token.Pos is offset-based,
69// so the only way to really have true Offset+Line+Col numbers is to know
70// the size of the entire YAML node upfront.
71// With json we can use RawMessage to know the size of the input
72// before we extract into ast.Expr, but unfortunately, yaml.Node has no size.
73
74// NewDecoder creates a decoder for YAML values to extract CUE syntax tree nodes.
75//
76// The filename is used for position information in CUE syntax tree nodes
77// as well as any errors encountered while decoding YAML.
78func NewDecoder(filename string, b []byte) *decoder {
79 // Note that yaml.v3 can insert a null node just past the end of the input
80 // in some edge cases, so we pretend that there's an extra newline
81 // so that we don't panic when handling such a position.
82 tokFile := token.NewFile(filename, 0, len(b)+1)
83 tokFile.SetLinesForContent(b)
84 return &decoder{
85 tokFile: tokFile,
86 tokLines: append(tokFile.Lines(), len(b)),
87 yamlDecoder: *yaml.NewDecoder(bytes.NewReader(b)),
88 }
89}
90
91// Decode consumes a YAML value and returns it in CUE syntax tree node.
92//
93// A nil node with an io.EOF error is returned once no more YAML values
94// are available for decoding.
95func (d *decoder) Decode() (ast.Expr, error) {
96 if err := d.decodeErr; err != nil {
97 return nil, err
98 }
99 var yn yaml.Node
100 if err := d.yamlDecoder.Decode(&yn); err != nil {
101 if err == io.EOF {
102 // Any further Decode calls must return EOF to avoid an endless loop.
103 d.decodeErr = io.EOF
104
105 // If the input is empty, we produce `*null | _` followed by EOF.
106 // Note that when the input contains "---", we get an empty document
107 // with a null scalar value inside instead.
108 if !d.yamlNonEmpty {
109 // Attach positions which at least point to the filename.
110 pos := d.tokFile.Pos(0, token.NoRelPos)
111 return &ast.BinaryExpr{
112 Op: token.OR,
113 OpPos: pos,
114 X: &ast.UnaryExpr{
115 Op: token.MUL,
116 OpPos: pos,
117 X: &ast.BasicLit{
118 Kind: token.NULL,
119 ValuePos: pos,
120 Value: "null",
121 },
122 },
123 Y: &ast.Ident{
124 Name: "_",
125 NamePos: pos,
126 },
127 }, nil
128 }
129 // If the input wasn't empty, we already decoded some CUE syntax nodes,
130 // so here we should just return io.EOF to stop.
131 return nil, io.EOF
132 }
133 // Unfortunately, yaml.v3's syntax errors are opaque strings,
134 // and they only include line numbers in some but not all cases.
135 // TODO(mvdan): improve upstream's errors so they are structured
136 // and always contain some position information.
137 e := err.Error()
138 if s, ok := strings.CutPrefix(e, "yaml: line "); ok {
139 // From "yaml: line 3: some issue" to "foo.yaml:3: some issue".
140 e = d.tokFile.Name() + ":" + s
141 } else if s, ok := strings.CutPrefix(e, "yaml:"); ok {
142 // From "yaml: some issue" to "foo.yaml: some issue".
143 e = d.tokFile.Name() + ":" + s
144 } else {
145 return nil, err
146 }
147 err = errors.New(e)
148 // Any further Decode calls repeat this error.
149 d.decodeErr = err
150 return nil, err
151 }
152 d.yamlNonEmpty = true
153 return d.extract(&yn)
154}
155
156// Unmarshal parses a single YAML value to a CUE expression.
157func Unmarshal(filename string, data []byte) (ast.Expr, error) {
158 d := NewDecoder(filename, data)
159 n, err := d.Decode()
160 if err != nil {
161 if err == io.EOF {
162 return nil, nil // empty input
163 }
164 return nil, err
165 }
166 // TODO(mvdan): decoding the entire next value is unnecessary;
167 // consider either a "More" or "Done" method to tell if we are at EOF,
168 // or splitting the Decode method into two variants.
169 // This should use proper error values with positions as well.
170 if n2, err := d.Decode(); err == nil {
171 return nil, fmt.Errorf("%s: expected a single YAML document", n2.Pos())
172 } else if err != io.EOF {
173 return nil, fmt.Errorf("expected a single YAML document: %v", err)
174 }
175 return n, nil
176}
177
178func (d *decoder) extract(yn *yaml.Node) (ast.Expr, error) {
179 d.addHeadCommentsToPending(yn)
180 var expr ast.Expr
181 var err error
182 switch yn.Kind {
183 case yaml.DocumentNode:
184 expr, err = d.document(yn)
185 case yaml.SequenceNode:
186 expr, err = d.sequence(yn)
187 case yaml.MappingNode:
188 expr, err = d.mapping(yn)
189 case yaml.ScalarNode:
190 expr, err = d.scalar(yn)
191 case yaml.AliasNode:
192 expr, err = d.alias(yn)
193 default:
194 return nil, d.posErrorf(yn, "unknown yaml node kind: %d", yn.Kind)
195 }
196 if err != nil {
197 return nil, err
198 }
199 d.addCommentsToNode(expr, yn, 1)
200 return expr, nil
201}
202
203// comments parses a newline-delimited list of YAML "#" comments
204// and turns them into a list of cue/ast comments.
205func (d *decoder) comments(src string) []*ast.Comment {
206 if src == "" {
207 return nil
208 }
209 var comments []*ast.Comment
210 for line := range strings.SplitSeq(src, "\n") {
211 if line == "" {
212 continue // yaml.v3 comments have a trailing newline at times
213 }
214 comments = append(comments, &ast.Comment{
215 // Trim the leading "#".
216 // Note that yaml.v3 does not give us comment positions.
217 Text: "//" + line[1:],
218 })
219 }
220 return comments
221}
222
223// addHeadCommentsToPending parses a node's head comments and adds them to a pending list,
224// to be used later by addComments once a cue/ast node is constructed.
225func (d *decoder) addHeadCommentsToPending(yn *yaml.Node) {
226 comments := d.comments(yn.HeadComment)
227 // TODO(mvdan): once yaml.v3 records comment positions,
228 // we can better ensure that sections separated by empty lines are kept that way.
229 // For now, all we can do is approximate by counting lines,
230 // and assuming that head comments are not separated from their node.
231 // This will be wrong in some cases, moving empty lines, but is better than nothing.
232 if len(d.pendingHeadComments) == 0 && len(comments) > 0 {
233 c := comments[0]
234 if d.lastPos.IsValid() && (yn.Line-len(comments))-d.lastPos.Line >= 2 {
235 c.Slash = c.Slash.WithRel(token.NewSection)
236 }
237 }
238 d.pendingHeadComments = append(d.pendingHeadComments, comments...)
239}
240
241// addCommentsToNode adds any pending head comments, plus a YAML node's line
242// and foot comments, to a cue/ast node.
243func (d *decoder) addCommentsToNode(n ast.Node, yn *yaml.Node, linePos int8) {
244 // cue/ast and cue/format are not able to attach a comment to a node
245 // when the comment immediately follows the node.
246 // For some nodes like fields, the best we can do is move the comments up.
247 // For the root-level struct, we do want to leave comments
248 // at the end of the document to be left at the very end.
249 //
250 // TODO(mvdan): can we do better? for example, support attaching trailing comments to a cue/ast.Node?
251 footComments := d.comments(yn.FootComment)
252 if _, ok := n.(*ast.StructLit); !ok {
253 d.pendingHeadComments = append(d.pendingHeadComments, footComments...)
254 footComments = nil
255 }
256 if comments := d.pendingHeadComments; len(comments) > 0 {
257 ast.AddComment(n, &ast.CommentGroup{
258 Doc: true,
259 Position: 0,
260 List: comments,
261 })
262 }
263 if comments := d.comments(yn.LineComment); len(comments) > 0 {
264 ast.AddComment(n, &ast.CommentGroup{
265 Line: true,
266 Position: linePos,
267 List: comments,
268 })
269 }
270 if comments := footComments; len(comments) > 0 {
271 ast.AddComment(n, &ast.CommentGroup{
272 // After 100 tokens, so that the comment goes after the entire node.
273 // TODO(mvdan): this is hacky, can the cue/ast API support trailing comments better?
274 Position: 100,
275 List: comments,
276 })
277 }
278 d.pendingHeadComments = nil
279}
280
281func (d *decoder) posErrorf(yn *yaml.Node, format string, args ...any) error {
282 // TODO(mvdan): use columns as well; for now they are left out to avoid test churn
283 // return fmt.Errorf(d.pos(n).String()+" "+format, args...)
284 return fmt.Errorf(d.tokFile.Name()+":"+strconv.Itoa(yn.Line)+": "+format, args...)
285}
286
287// pos converts a YAML node position to a cue/ast position.
288// Note that this method uses and updates the last position in lastPos,
289// so it should be called on YAML nodes in increasing position order.
290func (d *decoder) pos(yn *yaml.Node) token.Pos {
291 // Calculate the position's offset via the line and column numbers.
292 offset := d.tokLines[yn.Line-1] + (yn.Column - 1)
293 pos := d.tokFile.Pos(offset, token.NoRelPos)
294
295 if d.forceNewline {
296 d.forceNewline = false
297 pos = pos.WithRel(token.Newline)
298 } else if d.lastPos.IsValid() {
299 switch {
300 case yn.Line-d.lastPos.Line >= 2:
301 pos = pos.WithRel(token.NewSection)
302 case yn.Line-d.lastPos.Line == 1:
303 pos = pos.WithRel(token.Newline)
304 case yn.Column-d.lastPos.Column > 0:
305 pos = pos.WithRel(token.Blank)
306 default:
307 pos = pos.WithRel(token.NoSpace)
308 }
309 // If for any reason the node's position is before the last position,
310 // give up and return an empty position. Akin to: yn.Pos().Before(d.lastPos)
311 //
312 // TODO(mvdan): Brought over from the old decoder; when does this happen?
313 // Can we get rid of those edge cases and this bit of logic?
314 if yn.Line < d.lastPos.Line || (yn.Line == d.lastPos.Line && yn.Column < d.lastPos.Column) {
315 return token.NoPos
316 }
317 }
318 d.lastPos = token.Position{Line: yn.Line, Column: yn.Column}
319 return pos
320}
321
322func (d *decoder) document(yn *yaml.Node) (ast.Expr, error) {
323 if n := len(yn.Content); n != 1 {
324 return nil, d.posErrorf(yn, "yaml document nodes are meant to have one content node but have %d", n)
325 }
326 return d.extract(yn.Content[0])
327}
328
329func (d *decoder) sequence(yn *yaml.Node) (ast.Expr, error) {
330 list := &ast.ListLit{
331 Lbrack: d.pos(yn).WithRel(token.Blank),
332 }
333 multiline := false
334 if len(yn.Content) > 0 {
335 multiline = yn.Line < yn.Content[len(yn.Content)-1].Line
336 }
337
338 // If a list is empty, or ends with a struct, the closing `]` is on the same line.
339 closeSameLine := true
340 for _, c := range yn.Content {
341 d.forceNewline = multiline
342 elem, err := d.extract(c)
343 if err != nil {
344 return nil, err
345 }
346 list.Elts = append(list.Elts, elem)
347 // A list of structs begins with `[{`, so let it end with `}]`.
348 _, closeSameLine = elem.(*ast.StructLit)
349 }
350 if multiline && !closeSameLine {
351 list.Rbrack = list.Rbrack.WithRel(token.Newline)
352 }
353 return list, nil
354}
355
356func (d *decoder) mapping(yn *yaml.Node) (ast.Expr, error) {
357 strct := &ast.StructLit{}
358 multiline := false
359 if len(yn.Content) > 0 {
360 multiline = yn.Line < yn.Content[len(yn.Content)-1].Line
361 }
362
363 if err := d.insertMap(yn, strct, multiline, false); err != nil {
364 return nil, err
365 }
366 // TODO(mvdan): moving these positions above insertMap breaks a few tests, why?
367 strct.Lbrace = d.pos(yn).WithRel(token.Blank)
368 if multiline {
369 strct.Rbrace = strct.Lbrace.WithRel(token.Newline)
370 } else {
371 strct.Rbrace = strct.Lbrace
372 }
373 return strct, nil
374}
375
376func (d *decoder) insertMap(yn *yaml.Node, m *ast.StructLit, multiline, mergeValues bool) error {
377 l := len(yn.Content)
378outer:
379 for i := 0; i < l; i += 2 {
380 if multiline {
381 d.forceNewline = true
382 }
383 yk, yv := yn.Content[i], yn.Content[i+1]
384 d.addHeadCommentsToPending(yk)
385 if isMerge(yk) {
386 mergeValues = true
387 if err := d.merge(yv, m, multiline); err != nil {
388 return err
389 }
390 continue
391 }
392
393 field := &ast.Field{}
394 label, err := d.label(yk)
395 if err != nil {
396 return err
397 }
398 d.addCommentsToNode(field, yk, 2)
399 field.Label = label
400
401 if mergeValues {
402 key := labelStr(label)
403 for _, decl := range m.Elts {
404 f := decl.(*ast.Field)
405 name, _, err := ast.LabelName(f.Label)
406 if err == nil && name == key {
407 f.Value, err = d.extract(yv)
408 if err != nil {
409 return err
410 }
411 continue outer
412 }
413 }
414 }
415
416 value, err := d.extract(yv)
417 if err != nil {
418 return err
419 }
420 field.Value = value
421
422 m.Elts = append(m.Elts, field)
423 }
424 return nil
425}
426
427func (d *decoder) merge(yn *yaml.Node, m *ast.StructLit, multiline bool) error {
428 switch yn.Kind {
429 case yaml.MappingNode:
430 return d.insertMap(yn, m, multiline, true)
431 case yaml.AliasNode:
432 return d.insertMap(yn.Alias, m, multiline, true)
433 case yaml.SequenceNode:
434 // Step backwards as earlier nodes take precedence.
435 for _, c := range slices.Backward(yn.Content) {
436 if err := d.merge(c, m, multiline); err != nil {
437 return err
438 }
439 }
440 return nil
441 default:
442 return d.posErrorf(yn, "map merge requires map or sequence of maps as the value")
443 }
444}
445
446func (d *decoder) label(yn *yaml.Node) (ast.Label, error) {
447 pos := d.pos(yn)
448
449 var expr ast.Expr
450 var err error
451 var value string
452 switch yn.Kind {
453 case yaml.ScalarNode:
454 expr, err = d.scalar(yn)
455 value = yn.Value
456 case yaml.AliasNode:
457 if yn.Alias.Kind != yaml.ScalarNode {
458 return nil, d.posErrorf(yn, "invalid map key: %v", yn.Alias.ShortTag())
459 }
460 expr, err = d.alias(yn)
461 value = yn.Alias.Value
462 default:
463 return nil, d.posErrorf(yn, "invalid map key: %v", yn.ShortTag())
464 }
465 if err != nil {
466 return nil, err
467 }
468
469 switch expr := expr.(type) {
470 case *ast.BasicLit:
471 if expr.Kind != token.STRING {
472 // With incoming YAML like `Null: 1`, the key scalar is normalized to "null".
473 value = expr.Value
474 }
475 label := ast.NewStringLabel(value)
476 ast.SetPos(label, pos)
477 return label, nil
478 default:
479 return nil, d.posErrorf(yn, "invalid label "+value)
480 }
481}
482
// Short-form YAML tags, in the form which yaml.Node.ShortTag results
// are compared against when decoding scalars and merge keys.
const (
	// TODO(mvdan): The strings below are from yaml.v3; should we be relying on upstream somehow?
	nullTag      = "!!null"
	boolTag      = "!!bool"
	strTag       = "!!str"
	intTag       = "!!int"
	floatTag     = "!!float"
	timestampTag = "!!timestamp"
	seqTag       = "!!seq"
	mapTag       = "!!map"
	binaryTag    = "!!binary"
	mergeTag     = "!!merge"
)
496
// rxAnyOctalYaml11 uses the implicit tag resolution regular expression for base-8 integers
// from YAML's 1.1 spec, but including the 8 and 9 digits which aren't valid for octal integers.
//
// The expression is compiled on first use and reused afterwards.
var rxAnyOctalYaml11 = sync.OnceValue(func() *regexp.Regexp {
	// An optional sign, a leading zero, then one or more digits or underscores.
	const pattern = `^[-+]?0[0-9_]+$`
	return regexp.MustCompile(pattern)
})
502
503func (d *decoder) scalar(yn *yaml.Node) (ast.Expr, error) {
504 tag := yn.ShortTag()
505 // If the YAML scalar has no explicit tag, yaml.v3 infers a float tag,
506 // and the value looks like a YAML 1.1 octal literal,
507 // that means the input value was like `01289` and not a valid octal integer.
508 // The safest thing to do, and what most YAML decoders do, is to interpret as a string.
509 if yn.Style&yaml.TaggedStyle == 0 && tag == floatTag && rxAnyOctalYaml11().MatchString(yn.Value) {
510 tag = strTag
511 }
512 switch tag {
513 // TODO: use parse literal or parse expression instead.
514 case timestampTag:
515 return &ast.BasicLit{
516 ValuePos: d.pos(yn),
517 Kind: token.STRING,
518 Value: literal.String.Quote(yn.Value),
519 }, nil
520 case strTag:
521 return &ast.BasicLit{
522 ValuePos: d.pos(yn),
523 Kind: token.STRING,
524 Value: literal.String.WithOptionalTabIndent(1).Quote(yn.Value),
525 }, nil
526
527 case binaryTag:
528 data, err := base64.StdEncoding.DecodeString(yn.Value)
529 if err != nil {
530 return nil, d.posErrorf(yn, "!!binary value contains invalid base64 data")
531 }
532 return &ast.BasicLit{
533 ValuePos: d.pos(yn),
534 Kind: token.STRING,
535 Value: literal.Bytes.Quote(string(data)),
536 }, nil
537
538 case boolTag:
539 t := false
540 switch yn.Value {
541 // TODO(mvdan): The strings below are from yaml.v3; should we be relying on upstream somehow?
542 case "true", "True", "TRUE":
543 t = true
544 }
545 lit := ast.NewBool(t)
546 lit.ValuePos = d.pos(yn)
547 return lit, nil
548
549 case intTag:
550 // Convert YAML octal to CUE octal. If YAML accepted an invalid
551 // integer, just convert it as well to ensure CUE will fail.
552 value := yn.Value
553 if len(value) > 1 && value[0] == '0' && value[1] <= '9' {
554 value = "0o" + value[1:]
555 }
556 var info literal.NumInfo
557 // We make the assumption that any valid YAML integer literal will be a valid
558 // CUE integer literal as well, with the only exception of octal numbers above.
559 // Note that `!!int 123.456` is not allowed.
560 if err := literal.ParseNum(value, &info); err != nil {
561 return nil, d.posErrorf(yn, "cannot decode %q as %s: %v", value, tag, err)
562 } else if !info.IsInt() {
563 return nil, d.posErrorf(yn, "cannot decode %q as %s: not a literal number", value, tag)
564 }
565 return d.makeNum(yn, value, token.INT), nil
566
567 case floatTag:
568 value := yn.Value
569 // TODO(mvdan): The strings below are from yaml.v3; should we be relying on upstream somehow?
570 switch value {
571 case ".inf", ".Inf", ".INF", "+.inf", "+.Inf", "+.INF":
572 value = "+Inf"
573 case "-.inf", "-.Inf", "-.INF":
574 value = "-Inf"
575 case ".nan", ".NaN", ".NAN":
576 value = "NaN"
577 default:
578 var info literal.NumInfo
579 // We make the assumption that any valid YAML float literal will be a valid
580 // CUE float literal as well, with the only exception of Inf/NaN above.
581 // Note that `!!float 123` is allowed.
582 if err := literal.ParseNum(value, &info); err != nil {
583 return nil, d.posErrorf(yn, "cannot decode %q as %s: %v", value, tag, err)
584 }
585 // If the decoded YAML scalar was explicitly or implicitly a float,
586 // and the scalar literal looks like an integer,
587 // unify it with "number" to record the fact that it was represented as a float.
588 // Don't unify with float, as `float & 123` is invalid, and there's no need
589 // to forbid representing the number as an integer either.
590 if yn.Tag != "" {
591 if p := strings.IndexAny(value, ".eEiInN"); p == -1 {
592 // TODO: number(v) when we have conversions
593 // TODO(mvdan): don't shove the unification inside a BasicLit.Value string
594 //
595 // TODO(mvdan): would it be better to do turn `!!float 123` into `123.0`
596 // rather than `number & 123`? Note that `float & 123` is an error.
597 value = fmt.Sprintf("number & %s", value)
598 }
599 }
600 }
601 return d.makeNum(yn, value, token.FLOAT), nil
602
603 case nullTag:
604 return &ast.BasicLit{
605 ValuePos: d.pos(yn).WithRel(token.Blank),
606 Kind: token.NULL,
607 Value: "null",
608 }, nil
609 default:
610 return nil, d.posErrorf(yn, "cannot unmarshal tag %q", tag)
611 }
612}
613
614func (d *decoder) makeNum(yn *yaml.Node, val string, kind token.Token) (expr ast.Expr) {
615 val, negative := strings.CutPrefix(val, "-")
616 expr = &ast.BasicLit{
617 ValuePos: d.pos(yn),
618 Kind: kind,
619 Value: val,
620 }
621 if negative {
622 expr = &ast.UnaryExpr{
623 OpPos: d.pos(yn),
624 Op: token.SUB,
625 X: expr,
626 }
627 }
628 return expr
629}
630
631func (d *decoder) alias(yn *yaml.Node) (ast.Expr, error) {
632 if d.extractingAliases[yn] {
633 // TODO this could actually be allowed in some circumstances.
634 return nil, d.posErrorf(yn, "anchor %q value contains itself", yn.Value)
635 }
636 if d.extractingAliases == nil {
637 d.extractingAliases = make(map[*yaml.Node]bool)
638 }
639 d.extractingAliases[yn] = true
640 var node ast.Expr
641 node, err := d.extract(yn.Alias)
642 delete(d.extractingAliases, yn)
643 return node, err
644}
645
646func labelStr(l ast.Label) string {
647 switch l := l.(type) {
648 case *ast.Ident:
649 return l.Name
650 case *ast.BasicLit:
651 s, _ := literal.Unquote(l.Value)
652 return s
653 }
654 return ""
655}
656
657func isMerge(yn *yaml.Node) bool {
658 // TODO(mvdan): The boolean logic below is from yaml.v3; should we be relying on upstream somehow?
659 return yn.Kind == yaml.ScalarNode && yn.Value == "<<" && (yn.Tag == "" || yn.Tag == "!" || yn.ShortTag() == mergeTag)
660}