this repo has no description
at master 660 lines 21 kB view raw
1package yaml 2 3import ( 4 "bytes" 5 "encoding/base64" 6 "errors" 7 "fmt" 8 "io" 9 "regexp" 10 "slices" 11 "strconv" 12 "strings" 13 "sync" 14 15 "go.yaml.in/yaml/v3" 16 17 "cuelang.org/go/cue/ast" 18 "cuelang.org/go/cue/literal" 19 "cuelang.org/go/cue/token" 20) 21 22// TODO(mvdan): we should sanity check that the decoder always produces valid CUE, 23// as it is possible to construct a cue/ast syntax tree with invalid literals 24// or with expressions that will always error, such as `float & 123`. 25// 26// One option would be to do this as part of the tests; a more general approach 27// may be fuzzing, which would find more bugs and work for any decoder, 28// although it may be slow as we need to involve the evaluator. 29 30// Decoder is a temporary interface compatible with both the old and new yaml decoders. 31type Decoder interface { 32 // Decode consumes a YAML value and returns it in CUE syntax tree node. 33 Decode() (ast.Expr, error) 34} 35 36// decoder wraps a [yaml.Decoder] to extract CUE syntax tree nodes. 37type decoder struct { 38 yamlDecoder yaml.Decoder 39 40 // yamlNonEmpty is true once yamlDecoder tells us the input YAML wasn't empty. 41 // Useful so that we can extract "null" when the input is empty. 42 yamlNonEmpty bool 43 44 // decodeErr is returned by any further calls to Decode when not nil. 45 decodeErr error 46 47 tokFile *token.File 48 tokLines []int 49 50 // pendingHeadComments collects the head (preceding) comments 51 // from the YAML nodes we are extracting. 52 // We can't add comments to a CUE syntax tree node until we've created it, 53 // but we need to extract these comments first since they have earlier positions. 54 pendingHeadComments []*ast.Comment 55 56 // extractingAliases ensures we don't loop forever when expanding YAML anchors. 57 extractingAliases map[*yaml.Node]bool 58 59 // lastPos is the last YAML node position that we decoded, 60 // used for working out relative positions such as token.NewSection. 61 // This position can only increase, moving forward in the file. 62 lastPos token.Position 63 64 // forceNewline ensures that the next position will be on a new line. 65 forceNewline bool 66} 67 68// TODO(mvdan): this can be io.Reader really, except that token.Pos is offset-based, 69// so the only way to really have true Offset+Line+Col numbers is to know 70// the size of the entire YAML node upfront. 71// With json we can use RawMessage to know the size of the input 72// before we extract into ast.Expr, but unfortunately, yaml.Node has no size. 73 74// NewDecoder creates a decoder for YAML values to extract CUE syntax tree nodes. 75// 76// The filename is used for position information in CUE syntax tree nodes 77// as well as any errors encountered while decoding YAML. 78func NewDecoder(filename string, b []byte) *decoder { 79 // Note that yaml.v3 can insert a null node just past the end of the input 80 // in some edge cases, so we pretend that there's an extra newline 81 // so that we don't panic when handling such a position. 82 tokFile := token.NewFile(filename, 0, len(b)+1) 83 tokFile.SetLinesForContent(b) 84 return &decoder{ 85 tokFile: tokFile, 86 tokLines: append(tokFile.Lines(), len(b)), 87 yamlDecoder: *yaml.NewDecoder(bytes.NewReader(b)), 88 } 89} 90 91// Decode consumes a YAML value and returns it in CUE syntax tree node. 92// 93// A nil node with an io.EOF error is returned once no more YAML values 94// are available for decoding. 95func (d *decoder) Decode() (ast.Expr, error) { 96 if err := d.decodeErr; err != nil { 97 return nil, err 98 } 99 var yn yaml.Node 100 if err := d.yamlDecoder.Decode(&yn); err != nil { 101 if err == io.EOF { 102 // Any further Decode calls must return EOF to avoid an endless loop. 103 d.decodeErr = io.EOF 104 105 // If the input is empty, we produce `*null | _` followed by EOF. 106 // Note that when the input contains "---", we get an empty document 107 // with a null scalar value inside instead. 108 if !d.yamlNonEmpty { 109 // Attach positions which at least point to the filename. 110 pos := d.tokFile.Pos(0, token.NoRelPos) 111 return &ast.BinaryExpr{ 112 Op: token.OR, 113 OpPos: pos, 114 X: &ast.UnaryExpr{ 115 Op: token.MUL, 116 OpPos: pos, 117 X: &ast.BasicLit{ 118 Kind: token.NULL, 119 ValuePos: pos, 120 Value: "null", 121 }, 122 }, 123 Y: &ast.Ident{ 124 Name: "_", 125 NamePos: pos, 126 }, 127 }, nil 128 } 129 // If the input wasn't empty, we already decoded some CUE syntax nodes, 130 // so here we should just return io.EOF to stop. 131 return nil, io.EOF 132 } 133 // Unfortunately, yaml.v3's syntax errors are opaque strings, 134 // and they only include line numbers in some but not all cases. 135 // TODO(mvdan): improve upstream's errors so they are structured 136 // and always contain some position information. 137 e := err.Error() 138 if s, ok := strings.CutPrefix(e, "yaml: line "); ok { 139 // From "yaml: line 3: some issue" to "foo.yaml:3: some issue". 140 e = d.tokFile.Name() + ":" + s 141 } else if s, ok := strings.CutPrefix(e, "yaml:"); ok { 142 // From "yaml: some issue" to "foo.yaml: some issue". 143 e = d.tokFile.Name() + ":" + s 144 } else { 145 return nil, err 146 } 147 err = errors.New(e) 148 // Any further Decode calls repeat this error. 149 d.decodeErr = err 150 return nil, err 151 } 152 d.yamlNonEmpty = true 153 return d.extract(&yn) 154} 155 156// Unmarshal parses a single YAML value to a CUE expression. 157func Unmarshal(filename string, data []byte) (ast.Expr, error) { 158 d := NewDecoder(filename, data) 159 n, err := d.Decode() 160 if err != nil { 161 if err == io.EOF { 162 return nil, nil // empty input 163 } 164 return nil, err 165 } 166 // TODO(mvdan): decoding the entire next value is unnecessary; 167 // consider either a "More" or "Done" method to tell if we are at EOF, 168 // or splitting the Decode method into two variants. 169 // This should use proper error values with positions as well. 170 if n2, err := d.Decode(); err == nil { 171 return nil, fmt.Errorf("%s: expected a single YAML document", n2.Pos()) 172 } else if err != io.EOF { 173 return nil, fmt.Errorf("expected a single YAML document: %v", err) 174 } 175 return n, nil 176} 177 178func (d *decoder) extract(yn *yaml.Node) (ast.Expr, error) { 179 d.addHeadCommentsToPending(yn) 180 var expr ast.Expr 181 var err error 182 switch yn.Kind { 183 case yaml.DocumentNode: 184 expr, err = d.document(yn) 185 case yaml.SequenceNode: 186 expr, err = d.sequence(yn) 187 case yaml.MappingNode: 188 expr, err = d.mapping(yn) 189 case yaml.ScalarNode: 190 expr, err = d.scalar(yn) 191 case yaml.AliasNode: 192 expr, err = d.alias(yn) 193 default: 194 return nil, d.posErrorf(yn, "unknown yaml node kind: %d", yn.Kind) 195 } 196 if err != nil { 197 return nil, err 198 } 199 d.addCommentsToNode(expr, yn, 1) 200 return expr, nil 201} 202 203// comments parses a newline-delimited list of YAML "#" comments 204// and turns them into a list of cue/ast comments. 205func (d *decoder) comments(src string) []*ast.Comment { 206 if src == "" { 207 return nil 208 } 209 var comments []*ast.Comment 210 for line := range strings.SplitSeq(src, "\n") { 211 if line == "" { 212 continue // yaml.v3 comments have a trailing newline at times 213 } 214 comments = append(comments, &ast.Comment{ 215 // Trim the leading "#". 216 // Note that yaml.v3 does not give us comment positions. 217 Text: "//" + line[1:], 218 }) 219 } 220 return comments 221} 222 223// addHeadCommentsToPending parses a node's head comments and adds them to a pending list, 224// to be used later by addComments once a cue/ast node is constructed. 225func (d *decoder) addHeadCommentsToPending(yn *yaml.Node) { 226 comments := d.comments(yn.HeadComment) 227 // TODO(mvdan): once yaml.v3 records comment positions, 228 // we can better ensure that sections separated by empty lines are kept that way. 229 // For now, all we can do is approximate by counting lines, 230 // and assuming that head comments are not separated from their node. 231 // This will be wrong in some cases, moving empty lines, but is better than nothing. 232 if len(d.pendingHeadComments) == 0 && len(comments) > 0 { 233 c := comments[0] 234 if d.lastPos.IsValid() && (yn.Line-len(comments))-d.lastPos.Line >= 2 { 235 c.Slash = c.Slash.WithRel(token.NewSection) 236 } 237 } 238 d.pendingHeadComments = append(d.pendingHeadComments, comments...) 239} 240 241// addCommentsToNode adds any pending head comments, plus a YAML node's line 242// and foot comments, to a cue/ast node. 243func (d *decoder) addCommentsToNode(n ast.Node, yn *yaml.Node, linePos int8) { 244 // cue/ast and cue/format are not able to attach a comment to a node 245 // when the comment immediately follows the node. 246 // For some nodes like fields, the best we can do is move the comments up. 247 // For the root-level struct, we do want to leave comments 248 // at the end of the document to be left at the very end. 249 // 250 // TODO(mvdan): can we do better? for example, support attaching trailing comments to a cue/ast.Node? 251 footComments := d.comments(yn.FootComment) 252 if _, ok := n.(*ast.StructLit); !ok { 253 d.pendingHeadComments = append(d.pendingHeadComments, footComments...) 254 footComments = nil 255 } 256 if comments := d.pendingHeadComments; len(comments) > 0 { 257 ast.AddComment(n, &ast.CommentGroup{ 258 Doc: true, 259 Position: 0, 260 List: comments, 261 }) 262 } 263 if comments := d.comments(yn.LineComment); len(comments) > 0 { 264 ast.AddComment(n, &ast.CommentGroup{ 265 Line: true, 266 Position: linePos, 267 List: comments, 268 }) 269 } 270 if comments := footComments; len(comments) > 0 { 271 ast.AddComment(n, &ast.CommentGroup{ 272 // After 100 tokens, so that the comment goes after the entire node. 273 // TODO(mvdan): this is hacky, can the cue/ast API support trailing comments better? 274 Position: 100, 275 List: comments, 276 }) 277 } 278 d.pendingHeadComments = nil 279} 280 281func (d *decoder) posErrorf(yn *yaml.Node, format string, args ...any) error { 282 // TODO(mvdan): use columns as well; for now they are left out to avoid test churn 283 // return fmt.Errorf(d.pos(n).String()+" "+format, args...) 284 return fmt.Errorf(d.tokFile.Name()+":"+strconv.Itoa(yn.Line)+": "+format, args...) 285} 286 287// pos converts a YAML node position to a cue/ast position. 288// Note that this method uses and updates the last position in lastPos, 289// so it should be called on YAML nodes in increasing position order. 290func (d *decoder) pos(yn *yaml.Node) token.Pos { 291 // Calculate the position's offset via the line and column numbers. 292 offset := d.tokLines[yn.Line-1] + (yn.Column - 1) 293 pos := d.tokFile.Pos(offset, token.NoRelPos) 294 295 if d.forceNewline { 296 d.forceNewline = false 297 pos = pos.WithRel(token.Newline) 298 } else if d.lastPos.IsValid() { 299 switch { 300 case yn.Line-d.lastPos.Line >= 2: 301 pos = pos.WithRel(token.NewSection) 302 case yn.Line-d.lastPos.Line == 1: 303 pos = pos.WithRel(token.Newline) 304 case yn.Column-d.lastPos.Column > 0: 305 pos = pos.WithRel(token.Blank) 306 default: 307 pos = pos.WithRel(token.NoSpace) 308 } 309 // If for any reason the node's position is before the last position, 310 // give up and return an empty position. Akin to: yn.Pos().Before(d.lastPos) 311 // 312 // TODO(mvdan): Brought over from the old decoder; when does this happen? 313 // Can we get rid of those edge cases and this bit of logic? 314 if yn.Line < d.lastPos.Line || (yn.Line == d.lastPos.Line && yn.Column < d.lastPos.Column) { 315 return token.NoPos 316 } 317 } 318 d.lastPos = token.Position{Line: yn.Line, Column: yn.Column} 319 return pos 320} 321 322func (d *decoder) document(yn *yaml.Node) (ast.Expr, error) { 323 if n := len(yn.Content); n != 1 { 324 return nil, d.posErrorf(yn, "yaml document nodes are meant to have one content node but have %d", n) 325 } 326 return d.extract(yn.Content[0]) 327} 328 329func (d *decoder) sequence(yn *yaml.Node) (ast.Expr, error) { 330 list := &ast.ListLit{ 331 Lbrack: d.pos(yn).WithRel(token.Blank), 332 } 333 multiline := false 334 if len(yn.Content) > 0 { 335 multiline = yn.Line < yn.Content[len(yn.Content)-1].Line 336 } 337 338 // If a list is empty, or ends with a struct, the closing `]` is on the same line. 339 closeSameLine := true 340 for _, c := range yn.Content { 341 d.forceNewline = multiline 342 elem, err := d.extract(c) 343 if err != nil { 344 return nil, err 345 } 346 list.Elts = append(list.Elts, elem) 347 // A list of structs begins with `[{`, so let it end with `}]`. 348 _, closeSameLine = elem.(*ast.StructLit) 349 } 350 if multiline && !closeSameLine { 351 list.Rbrack = list.Rbrack.WithRel(token.Newline) 352 } 353 return list, nil 354} 355 356func (d *decoder) mapping(yn *yaml.Node) (ast.Expr, error) { 357 strct := &ast.StructLit{} 358 multiline := false 359 if len(yn.Content) > 0 { 360 multiline = yn.Line < yn.Content[len(yn.Content)-1].Line 361 } 362 363 if err := d.insertMap(yn, strct, multiline, false); err != nil { 364 return nil, err 365 } 366 // TODO(mvdan): moving these positions above insertMap breaks a few tests, why? 367 strct.Lbrace = d.pos(yn).WithRel(token.Blank) 368 if multiline { 369 strct.Rbrace = strct.Lbrace.WithRel(token.Newline) 370 } else { 371 strct.Rbrace = strct.Lbrace 372 } 373 return strct, nil 374} 375 376func (d *decoder) insertMap(yn *yaml.Node, m *ast.StructLit, multiline, mergeValues bool) error { 377 l := len(yn.Content) 378outer: 379 for i := 0; i < l; i += 2 { 380 if multiline { 381 d.forceNewline = true 382 } 383 yk, yv := yn.Content[i], yn.Content[i+1] 384 d.addHeadCommentsToPending(yk) 385 if isMerge(yk) { 386 mergeValues = true 387 if err := d.merge(yv, m, multiline); err != nil { 388 return err 389 } 390 continue 391 } 392 393 field := &ast.Field{} 394 label, err := d.label(yk) 395 if err != nil { 396 return err 397 } 398 d.addCommentsToNode(field, yk, 2) 399 field.Label = label 400 401 if mergeValues { 402 key := labelStr(label) 403 for _, decl := range m.Elts { 404 f := decl.(*ast.Field) 405 name, _, err := ast.LabelName(f.Label) 406 if err == nil && name == key { 407 f.Value, err = d.extract(yv) 408 if err != nil { 409 return err 410 } 411 continue outer 412 } 413 } 414 } 415 416 value, err := d.extract(yv) 417 if err != nil { 418 return err 419 } 420 field.Value = value 421 422 m.Elts = append(m.Elts, field) 423 } 424 return nil 425} 426 427func (d *decoder) merge(yn *yaml.Node, m *ast.StructLit, multiline bool) error { 428 switch yn.Kind { 429 case yaml.MappingNode: 430 return d.insertMap(yn, m, multiline, true) 431 case yaml.AliasNode: 432 return d.insertMap(yn.Alias, m, multiline, true) 433 case yaml.SequenceNode: 434 // Step backwards as earlier nodes take precedence. 435 for _, c := range slices.Backward(yn.Content) { 436 if err := d.merge(c, m, multiline); err != nil { 437 return err 438 } 439 } 440 return nil 441 default: 442 return d.posErrorf(yn, "map merge requires map or sequence of maps as the value") 443 } 444} 445 446func (d *decoder) label(yn *yaml.Node) (ast.Label, error) { 447 pos := d.pos(yn) 448 449 var expr ast.Expr 450 var err error 451 var value string 452 switch yn.Kind { 453 case yaml.ScalarNode: 454 expr, err = d.scalar(yn) 455 value = yn.Value 456 case yaml.AliasNode: 457 if yn.Alias.Kind != yaml.ScalarNode { 458 return nil, d.posErrorf(yn, "invalid map key: %v", yn.Alias.ShortTag()) 459 } 460 expr, err = d.alias(yn) 461 value = yn.Alias.Value 462 default: 463 return nil, d.posErrorf(yn, "invalid map key: %v", yn.ShortTag()) 464 } 465 if err != nil { 466 return nil, err 467 } 468 469 switch expr := expr.(type) { 470 case *ast.BasicLit: 471 if expr.Kind != token.STRING { 472 // With incoming YAML like `Null: 1`, the key scalar is normalized to "null". 473 value = expr.Value 474 } 475 label := ast.NewStringLabel(value) 476 ast.SetPos(label, pos) 477 return label, nil 478 default: 479 return nil, d.posErrorf(yn, "invalid label "+value) 480 } 481} 482 483const ( 484 // TODO(mvdan): The strings below are from yaml.v3; should we be relying on upstream somehow? 485 nullTag = "!!null" 486 boolTag = "!!bool" 487 strTag = "!!str" 488 intTag = "!!int" 489 floatTag = "!!float" 490 timestampTag = "!!timestamp" 491 seqTag = "!!seq" 492 mapTag = "!!map" 493 binaryTag = "!!binary" 494 mergeTag = "!!merge" 495) 496 497// rxAnyOctalYaml11 uses the implicit tag resolution regular expression for base-8 integers 498// from YAML's 1.1 spec, but including the 8 and 9 digits which aren't valid for octal integers. 499var rxAnyOctalYaml11 = sync.OnceValue(func() *regexp.Regexp { 500 return regexp.MustCompile(`^[-+]?0[0-9_]+$`) 501}) 502 503func (d *decoder) scalar(yn *yaml.Node) (ast.Expr, error) { 504 tag := yn.ShortTag() 505 // If the YAML scalar has no explicit tag, yaml.v3 infers a float tag, 506 // and the value looks like a YAML 1.1 octal literal, 507 // that means the input value was like `01289` and not a valid octal integer. 508 // The safest thing to do, and what most YAML decoders do, is to interpret as a string. 509 if yn.Style&yaml.TaggedStyle == 0 && tag == floatTag && rxAnyOctalYaml11().MatchString(yn.Value) { 510 tag = strTag 511 } 512 switch tag { 513 // TODO: use parse literal or parse expression instead. 514 case timestampTag: 515 return &ast.BasicLit{ 516 ValuePos: d.pos(yn), 517 Kind: token.STRING, 518 Value: literal.String.Quote(yn.Value), 519 }, nil 520 case strTag: 521 return &ast.BasicLit{ 522 ValuePos: d.pos(yn), 523 Kind: token.STRING, 524 Value: literal.String.WithOptionalTabIndent(1).Quote(yn.Value), 525 }, nil 526 527 case binaryTag: 528 data, err := base64.StdEncoding.DecodeString(yn.Value) 529 if err != nil { 530 return nil, d.posErrorf(yn, "!!binary value contains invalid base64 data") 531 } 532 return &ast.BasicLit{ 533 ValuePos: d.pos(yn), 534 Kind: token.STRING, 535 Value: literal.Bytes.Quote(string(data)), 536 }, nil 537 538 case boolTag: 539 t := false 540 switch yn.Value { 541 // TODO(mvdan): The strings below are from yaml.v3; should we be relying on upstream somehow? 542 case "true", "True", "TRUE": 543 t = true 544 } 545 lit := ast.NewBool(t) 546 lit.ValuePos = d.pos(yn) 547 return lit, nil 548 549 case intTag: 550 // Convert YAML octal to CUE octal. If YAML accepted an invalid 551 // integer, just convert it as well to ensure CUE will fail. 552 value := yn.Value 553 if len(value) > 1 && value[0] == '0' && value[1] <= '9' { 554 value = "0o" + value[1:] 555 } 556 var info literal.NumInfo 557 // We make the assumption that any valid YAML integer literal will be a valid 558 // CUE integer literal as well, with the only exception of octal numbers above. 559 // Note that `!!int 123.456` is not allowed. 560 if err := literal.ParseNum(value, &info); err != nil { 561 return nil, d.posErrorf(yn, "cannot decode %q as %s: %v", value, tag, err) 562 } else if !info.IsInt() { 563 return nil, d.posErrorf(yn, "cannot decode %q as %s: not a literal number", value, tag) 564 } 565 return d.makeNum(yn, value, token.INT), nil 566 567 case floatTag: 568 value := yn.Value 569 // TODO(mvdan): The strings below are from yaml.v3; should we be relying on upstream somehow? 570 switch value { 571 case ".inf", ".Inf", ".INF", "+.inf", "+.Inf", "+.INF": 572 value = "+Inf" 573 case "-.inf", "-.Inf", "-.INF": 574 value = "-Inf" 575 case ".nan", ".NaN", ".NAN": 576 value = "NaN" 577 default: 578 var info literal.NumInfo 579 // We make the assumption that any valid YAML float literal will be a valid 580 // CUE float literal as well, with the only exception of Inf/NaN above. 581 // Note that `!!float 123` is allowed. 582 if err := literal.ParseNum(value, &info); err != nil { 583 return nil, d.posErrorf(yn, "cannot decode %q as %s: %v", value, tag, err) 584 } 585 // If the decoded YAML scalar was explicitly or implicitly a float, 586 // and the scalar literal looks like an integer, 587 // unify it with "number" to record the fact that it was represented as a float. 588 // Don't unify with float, as `float & 123` is invalid, and there's no need 589 // to forbid representing the number as an integer either. 590 if yn.Tag != "" { 591 if p := strings.IndexAny(value, ".eEiInN"); p == -1 { 592 // TODO: number(v) when we have conversions 593 // TODO(mvdan): don't shove the unification inside a BasicLit.Value string 594 // 595 // TODO(mvdan): would it be better to do turn `!!float 123` into `123.0` 596 // rather than `number & 123`? Note that `float & 123` is an error. 597 value = fmt.Sprintf("number & %s", value) 598 } 599 } 600 } 601 return d.makeNum(yn, value, token.FLOAT), nil 602 603 case nullTag: 604 return &ast.BasicLit{ 605 ValuePos: d.pos(yn).WithRel(token.Blank), 606 Kind: token.NULL, 607 Value: "null", 608 }, nil 609 default: 610 return nil, d.posErrorf(yn, "cannot unmarshal tag %q", tag) 611 } 612} 613 614func (d *decoder) makeNum(yn *yaml.Node, val string, kind token.Token) (expr ast.Expr) { 615 val, negative := strings.CutPrefix(val, "-") 616 expr = &ast.BasicLit{ 617 ValuePos: d.pos(yn), 618 Kind: kind, 619 Value: val, 620 } 621 if negative { 622 expr = &ast.UnaryExpr{ 623 OpPos: d.pos(yn), 624 Op: token.SUB, 625 X: expr, 626 } 627 } 628 return expr 629} 630 631func (d *decoder) alias(yn *yaml.Node) (ast.Expr, error) { 632 if d.extractingAliases[yn] { 633 // TODO this could actually be allowed in some circumstances. 634 return nil, d.posErrorf(yn, "anchor %q value contains itself", yn.Value) 635 } 636 if d.extractingAliases == nil { 637 d.extractingAliases = make(map[*yaml.Node]bool) 638 } 639 d.extractingAliases[yn] = true 640 var node ast.Expr 641 node, err := d.extract(yn.Alias) 642 delete(d.extractingAliases, yn) 643 return node, err 644} 645 646func labelStr(l ast.Label) string { 647 switch l := l.(type) { 648 case *ast.Ident: 649 return l.Name 650 case *ast.BasicLit: 651 s, _ := literal.Unquote(l.Value) 652 return s 653 } 654 return "" 655} 656 657func isMerge(yn *yaml.Node) bool { 658 // TODO(mvdan): The boolean logic below is from yaml.v3; should we be relying on upstream somehow? 659 return yn.Kind == yaml.ScalarNode && yn.Value == "<<" && (yn.Tag == "" || yn.Tag == "!" || yn.ShortTag() == mergeTag) 660}