1// Copyright 2024 The CUE Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package toml converts TOML to and from CUE.
16//
17// WARNING: THIS PACKAGE IS EXPERIMENTAL.
18// ITS API MAY CHANGE AT ANY TIME.
19package toml
20
21import (
22 "fmt"
23 "io"
24 "maps"
25 "slices"
26 "strconv"
27 "strings"
28 "time"
29
30 toml "github.com/pelletier/go-toml/v2/unstable"
31
32 "cuelang.org/go/cue/ast"
33 "cuelang.org/go/cue/errors"
34 "cuelang.org/go/cue/literal"
35 "cuelang.org/go/cue/token"
36)
37
38// TODO(mvdan): schema and decode options
39
40// NewDecoder creates a decoder from a stream of TOML input.
41func NewDecoder(filename string, r io.Reader) *Decoder {
42 // Note that we don't consume the reader here,
43 // as there's no need, and we can't return an error either.
44 return &Decoder{r: r, filename: filename, seenTableKeys: make(map[string]bool)}
45}
46
47// Decoder implements the decoding state.
48//
49// Note that TOML files and streams never decode multiple CUE nodes;
50// subsequent calls to [Decoder.Decode] may return [io.EOF].
51type Decoder struct {
52 r io.Reader
53
54 filename string
55
56 decoded bool // whether [Decoder.Decoded] has been called already
57 parser toml.Parser
58
59 // seenTableKeys tracks which rooted keys we have already decoded as tables,
60 // as duplicate table keys in TOML are not allowed.
61 seenTableKeys map[rootedKey]bool
62
63 // topFile is the top-level CUE file we are decoding into.
64 // TODO(mvdan): make an *ast.File once the decoder returns ast.Node rather than ast.Expr.
65 topFile *ast.StructLit
66
67 // tokenFile is used to create positions which can be used for error values and syntax tree nodes.
68 tokenFile *token.File
69
70 // openTableArrays keeps track of all the declared table arrays so that
71 // later headers can append a new table array element, or add a field
72 // to the last element in a table array.
73 //
74 // TODO(mvdan): an unsorted slice means we do two linear searches per header key.
75 // For N distinct `[[keys]]`, this means a decoding runtime of O(2*N*N).
76 // Consider either sorting this array so we can do a binary search for O(N*log2(N)),
77 // or perhaps a tree, although for a nesting level D, that could cause O(N*D),
78 // and a tree would use more slices and so more allocations.
79 //
80 // Note that a map is not a good option either, because even though it makes
81 // exact lookups cheap, prefix matches are still linear and relatively slow.
82 // A sorted slice allows both mechanisms to use a form of binary search.
83 openTableArrays []openTableArray
84
85 // currentTableKey is the rooted key for the current table where the following
86 // TOML `key = value` lines will be inserted.
87 currentTableKey rootedKey
88
89 // currentTable is the CUE struct literal for currentTableKey.
90 // It is nil before the first [header] or [[header]],
91 // in which case any key-values are inserted in topFile.
92 currentTable *ast.StructLit
93}
94
95// rootedKey is a dot-separated path from the root of the TOML document.
96// The string elements in between the dots may be quoted to avoid ambiguity.
97// For the time being, this is just an alias for the sake of documentation.
98//
99// A path into an array element is like "arr.3",
100// which looks very similar to a table's "tbl.key",
101// particularly since a table key can be any string.
102// However, we just need these keys to detect duplicates,
103// and a path cannot be both an array and table, so it's OK.
104type rootedKey = string
105
106// openTableArray records information about a declared table array.
107type openTableArray struct {
108 rkey rootedKey
109 level int // the level of nesting, 1 or higher, e.g. 2 for key="foo.bar"
110 list *ast.ListLit
111 lastTable *ast.StructLit
112}
113
114// TODO(mvdan): support decoding comments
115
116// Decode parses the input stream as TOML and converts it to a CUE [*ast.File].
117// Because TOML files only contain a single top-level expression,
118// subsequent calls to this method may return [io.EOF].
119func (d *Decoder) Decode() (ast.Expr, error) {
120 if d.decoded {
121 return nil, io.EOF
122 }
123 d.decoded = true
124 // TODO(mvdan): unfortunately go-toml does not support streaming as of v2.2.2.
125 data, err := io.ReadAll(d.r)
126 if err != nil {
127 return nil, err
128 }
129 d.tokenFile = token.NewFile(d.filename, 0, len(data))
130 d.tokenFile.SetLinesForContent(data)
131 d.parser.Reset(data)
132 // Note that if the input is empty the result will be the same
133 // as for an empty table: an empty struct.
134 // The TOML spec and other decoders also work this way.
135 d.topFile = &ast.StructLit{}
136 for d.parser.NextExpression() {
137 if err := d.nextRootNode(d.parser.Expression()); err != nil {
138 return nil, err
139 }
140 }
141 if err := d.parser.Error(); err != nil {
142 if err, ok := err.(*toml.ParserError); ok {
143 shape := d.parser.Shape(d.parser.Range(err.Highlight))
144 return nil, d.posErrf(shape.Start, "%s", err.Message)
145 }
146 return nil, err
147 }
148 return d.topFile, nil
149}
150
151func (d *Decoder) shape(tnode *toml.Node) toml.Shape {
152 if tnode.Raw.Length == 0 {
153 // Otherwise the Shape method call below happily returns a position like 1:1,
154 // which is worse than no position information as it confuses the user.
155 panic("Decoder.nodePos was given an empty toml.Node as position")
156 }
157 return d.parser.Shape(tnode.Raw)
158}
159
160func (d *Decoder) nodeErrf(tnode *toml.Node, format string, args ...any) error {
161 return d.posErrf(d.shape(tnode).Start, format, args...)
162}
163
164func (d *Decoder) posErrf(pos toml.Position, format string, args ...any) error {
165 return errors.Newf(d.tokenFile.Pos(pos.Offset, token.NoRelPos), format, args...)
166}
167
168// nextRootNode is called for every top-level expression from the TOML parser.
169//
170// This method does not return a syntax tree node directly,
171// because some kinds of top-level expressions like comments and table headers
172// require recording some state in the decoder to produce a node at a later time.
173func (d *Decoder) nextRootNode(tnode *toml.Node) error {
174 switch tnode.Kind {
175 // Key-Values in TOML are in the form of:
176 //
177 // foo.title = "Foo"
178 // foo.bar.baz = "value"
179 //
180 // We decode them as "inline" structs in CUE, which keeps the original shape:
181 //
182 // foo: title: "Foo"
183 // foo: bar: baz: "value"
184 //
185 // An alternative would be to join struct literals, which avoids some repetition,
186 // but also introduces extra lines and may break some comment positions:
187 //
188 // foo: {
189 // title: "Foo"
190 // bar: baz: "value"
191 // }
192 case toml.KeyValue:
193 // Top-level fields begin a new line.
194 field, err := d.decodeField(d.currentTableKey, tnode, token.Newline)
195 if err != nil {
196 return err
197 }
198 if d.currentTable != nil {
199 d.currentTable.Elts = append(d.currentTable.Elts, field)
200 } else {
201 d.topFile.Elts = append(d.topFile.Elts, field)
202 }
203
204 case toml.Table:
205 // Tables always begin a new line.
206 key, keyElems := d.decodeKey("", tnode.Key())
207
208 // All table keys must be unique, including for the top-level table.
209 if d.seenTableKeys[key] {
210 return d.nodeErrf(tnode.Child(), "duplicate key: %s", key)
211 }
212 d.seenTableKeys[key] = true
213
214 // We want a multi-line struct with curly braces,
215 // just like TOML's tables are on multiple lines.
216 d.currentTable = &ast.StructLit{
217 // No positions, as TOML doesn't have table delimiters.
218 Lbrace: token.NoPos.WithRel(token.Blank),
219 Rbrace: token.NoPos.WithRel(token.Newline),
220 }
221 array := d.findArrayPrefix(key)
222 if array != nil { // [last_array.new_table]
223 if array.rkey == key {
224 return d.nodeErrf(tnode.Child(), "cannot redeclare table array %q as a table", key)
225 }
226 subKeyElems := keyElems[array.level:]
227 topField, leafField := d.inlineFields(subKeyElems, token.Newline)
228 array.lastTable.Elts = append(array.lastTable.Elts, topField)
229 leafField.Value = d.currentTable
230 } else { // [new_table]
231 topField, leafField := d.inlineFields(keyElems, token.Newline)
232 d.topFile.Elts = append(d.topFile.Elts, topField)
233 leafField.Value = d.currentTable
234 }
235 d.currentTableKey = key
236
237 case toml.ArrayTable:
238 // Table array elements always begin a new line.
239 key, keyElems := d.decodeKey("", tnode.Key())
240 if d.seenTableKeys[key] {
241 return d.nodeErrf(tnode.Child(), "cannot redeclare key %q as a table array", key)
242 }
243 // Each struct inside a table array sits on separate lines.
244 d.currentTable = &ast.StructLit{
245 // No positions, as TOML doesn't have table delimiters.
246 Lbrace: token.NoPos.WithRel(token.Newline),
247 Rbrace: token.NoPos.WithRel(token.Newline),
248 }
249 if array := d.findArrayPrefix(key); array != nil && array.level == len(keyElems) {
250 // [[last_array]] - appending to an existing array.
251 d.currentTableKey = key + "." + strconv.Itoa(len(array.list.Elts))
252 array.lastTable = d.currentTable
253 array.list.Elts = append(array.list.Elts, d.currentTable)
254 } else {
255 // Creating a new array via either [[new_array]] or [[last_array.new_array]].
256 // We want a multi-line list with square braces,
257 // since TOML's table arrays are on multiple lines.
258 list := &ast.ListLit{
259 // No positions, as TOML doesn't have array table delimiters.
260 Lbrack: token.NoPos.WithRel(token.Blank),
261 Rbrack: token.NoPos.WithRel(token.Newline),
262 }
263 if array == nil {
264 // [[new_array]] - at the top level
265 topField, leafField := d.inlineFields(keyElems, token.Newline)
266 d.topFile.Elts = append(d.topFile.Elts, topField)
267 leafField.Value = list
268 } else {
269 // [[last_array.new_array]] - on the last array element
270 subKeyElems := keyElems[array.level:]
271 topField, leafField := d.inlineFields(subKeyElems, token.Newline)
272 array.lastTable.Elts = append(array.lastTable.Elts, topField)
273 leafField.Value = list
274 }
275
276 d.currentTableKey = key + ".0"
277 list.Elts = append(list.Elts, d.currentTable)
278 d.openTableArrays = append(d.openTableArrays, openTableArray{
279 rkey: key,
280 level: len(keyElems),
281 list: list,
282 lastTable: d.currentTable,
283 })
284 }
285
286 default:
287 return fmt.Errorf("encoding/toml.Decoder.nextRootNode: unknown %s %#v", tnode.Kind, tnode)
288 }
289 return nil
290}
291
292// decodeField decodes a single table key and its value as a struct field.
293func (d *Decoder) decodeField(rkey rootedKey, tnode *toml.Node, relPos token.RelPos) (*ast.Field, error) {
294 rkey, keyElems := d.decodeKey(rkey, tnode.Key())
295 if d.findArray(rkey) != nil {
296 return nil, d.nodeErrf(tnode.Child().Next(), "cannot redeclare table array %q as a table", rkey)
297 }
298 topField, leafField := d.inlineFields(keyElems, relPos)
299 // All table keys must be unique, including inner table ones.
300 if d.seenTableKeys[rkey] {
301 return nil, d.nodeErrf(tnode.Child().Next(), "duplicate key: %s", rkey)
302 }
303 d.seenTableKeys[rkey] = true
304 value, err := d.decodeExpr(rkey, tnode.Value())
305 if err != nil {
306 return nil, err
307 }
308 leafField.Value = value
309 return topField, nil
310}
311
312// findArray returns an existing table array if one exists at exactly the given key.
313func (d *Decoder) findArray(rkey rootedKey) *openTableArray {
314 for i, arr := range d.openTableArrays {
315 if arr.rkey == rkey {
316 return &d.openTableArrays[i]
317 }
318 }
319 return nil
320}
321
322// findArray returns an existing table array if one exists at exactly the given key
323// or as a prefix to the given key.
324func (d *Decoder) findArrayPrefix(rkey rootedKey) *openTableArray {
325 // TODO(mvdan): see the performance TODO on [Decoder.openTableArrays].
326
327 // Prefer an exact match over a relative prefix match.
328 if arr := d.findArray(rkey); arr != nil {
329 // TODO: the fact that we need to delete from both structures below
330 // strongly hints towards merging the two structures in some way.
331 // We already have a TODO about making openTableArrays a more efficient structure.
332
333 // When we find an exact match, we must forget about its subkeys
334 // because we're starting an entirely new array element.
335 d.openTableArrays = slices.DeleteFunc(d.openTableArrays, func(arr openTableArray) bool {
336 return strings.HasPrefix(arr.rkey, rkey+".")
337 })
338 // We also need to forget about seen table keys.
339 maps.DeleteFunc(d.seenTableKeys, func(seenRkey rootedKey, _ bool) bool {
340 return strings.HasPrefix(seenRkey, rkey+".")
341 })
342 return arr
343 }
344 // The longest relative key match wins.
345 maxLevel := 0
346 var maxLevelArr *openTableArray
347 for i, arr := range d.openTableArrays {
348 if strings.HasPrefix(rkey, arr.rkey+".") && arr.level > maxLevel {
349 maxLevel = arr.level
350 maxLevelArr = &d.openTableArrays[i]
351 }
352 }
353 if maxLevel > 0 {
354 return maxLevelArr
355 }
356 return nil
357}
358
359// tomlKey represents a name with a position which forms part of a TOML dotted key,
360// such as "foo" from "[foo.bar.baz]".
361type tomlKey struct {
362 name string
363 shape toml.Shape
364}
365
366// decodeKey extracts a rootedKey from a TOML node key iterator,
367// appending to the given parent key and returning the unquoted string elements.
368func (d *Decoder) decodeKey(rkey rootedKey, iter toml.Iterator) (rootedKey, []tomlKey) {
369 var elems []tomlKey
370 for iter.Next() {
371 node := iter.Node()
372 name := string(node.Data)
373 // TODO(mvdan): use an append-like API once we have benchmarks
374 if len(rkey) > 0 {
375 rkey += "."
376 }
377 rkey += quoteLabelIfNeeded(name)
378 elems = append(elems, tomlKey{name, d.shape(node)})
379 }
380 return rkey, elems
381}
382
383// inlineFields constructs a single-line chain of CUE fields joined with structs,
384// so that an input like:
385//
386// ["foo", "bar.baz", "zzz"]
387//
388// results in the CUE fields:
389//
390// foo: "bar.baz": zzz: <nil>
391//
392// The "top" field, in this case "foo", can then be added as an element to a struct.
393// The "leaf" field, in this case "zzz", leaves its value as nil to be filled out.
394func (d *Decoder) inlineFields(tkeys []tomlKey, relPos token.RelPos) (top, leaf *ast.Field) {
395 curField := &ast.Field{
396 Label: d.label(tkeys[0], relPos),
397 }
398
399 topField := curField
400 for _, tkey := range tkeys[1:] {
401 nextField := &ast.Field{
402 Label: d.label(tkey, token.Blank), // on the same line
403 }
404 curField.Value = &ast.StructLit{Elts: []ast.Decl{nextField}}
405 curField = nextField
406 }
407 return topField, curField
408}
409
410// quoteLabelIfNeeded quotes a label name only if it needs quoting.
411func quoteLabelIfNeeded(name string) string {
412 if ast.StringLabelNeedsQuoting(name) {
413 return literal.Label.Quote(name)
414 }
415 return name
416}
417
418// label creates an ast.Label that represents a key with exactly the literal string name.
419// This means a quoted string literal for the key "_", as TOML never means "top",
420// as well as for any keys beginning with an underscore, as we don't want to hide any fields.
421// cue/format knows how to quote any other identifiers correctly.
422func (d *Decoder) label(tkey tomlKey, relPos token.RelPos) ast.Label {
423 pos := d.tokenFile.Pos(tkey.shape.Start.Offset, relPos)
424 label := ast.NewStringLabel(tkey.name)
425 ast.SetPos(label, pos)
426 return label
427}
428
429// decodeExpr decodes a single TOML value expression, found on the right side
430// of a `key = value` line.
431func (d *Decoder) decodeExpr(rkey rootedKey, tnode *toml.Node) (ast.Expr, error) {
432 // TODO(mvdan): we currently assume that TOML basic literals (string, int, float)
433 // are also valid CUE literals; we should double check this, perhaps via fuzzing.
434 data := string(tnode.Data)
435 var expr ast.Expr
436 switch tnode.Kind {
437 case toml.String:
438 expr = ast.NewString(data)
439 case toml.Integer:
440 expr = ast.NewLit(token.INT, data)
441 case toml.Float:
442 expr = ast.NewLit(token.FLOAT, data)
443 case toml.Bool:
444 expr = ast.NewBool(data == "true")
445 case toml.Array:
446 list := &ast.ListLit{}
447 elems := tnode.Children()
448 for elems.Next() {
449 key := rkey + "." + strconv.Itoa(len(list.Elts))
450 elem, err := d.decodeExpr(key, elems.Node())
451 if err != nil {
452 return nil, err
453 }
454 list.Elts = append(list.Elts, elem)
455 }
456 expr = list
457 case toml.InlineTable:
458 strct := &ast.StructLit{
459 // We want a single-line struct, just like TOML's inline tables are on a single line.
460 Lbrace: token.NoPos.WithRel(token.Blank),
461 Rbrace: token.NoPos.WithRel(token.Blank),
462 }
463 elems := tnode.Children()
464 for elems.Next() {
465 // Inline table fields are on the same line.
466 field, err := d.decodeField(rkey, elems.Node(), token.Blank)
467 if err != nil {
468 return nil, err
469 }
470 strct.Elts = append(strct.Elts, field)
471 }
472 expr = strct
473 case toml.LocalDate, toml.LocalTime, toml.LocalDateTime, toml.DateTime:
474 // CUE does not have native date nor time literal kinds,
475 // so we decode these as strings exactly as they came in
476 // and we validate them with time.Format using the corresponding format string.
477 // Not only does this ensure that the resulting CUE can be used with our time package,
478 // but it also means that we can roundtrip a TOML timestamp without confusing it for a string.
479 var format ast.Expr
480 switch tnode.Kind {
481 case toml.LocalDate:
482 // TODO(mvdan): rename time.RFC3339Date to time.DateOnly to mirror Go
483 format = ast.NewSel(&ast.Ident{
484 Name: "time",
485 Node: ast.NewImport(nil, "time"),
486 }, "RFC3339Date")
487 case toml.LocalTime:
488 // TODO(mvdan): add TimeOnly to CUE's time package to mirror Go
489 format = ast.NewString(time.TimeOnly)
490 case toml.LocalDateTime:
491 // RFC3339 minus the timezone; this seems like a format peculiar to TOML.
492 format = ast.NewString("2006-01-02T15:04:05")
493 default: // DateTime
494 format = ast.NewSel(&ast.Ident{
495 Name: "time",
496 Node: ast.NewImport(nil, "time"),
497 }, "RFC3339")
498 }
499 expr = ast.NewBinExpr(token.AND, ast.NewString(data), ast.NewCall(
500 ast.NewSel(&ast.Ident{
501 Name: "time",
502 Node: ast.NewImport(nil, "time"),
503 }, "Format"), format),
504 )
505 default:
506 return nil, fmt.Errorf("encoding/toml.Decoder.decodeExpr: unknown %s %#v", tnode.Kind, tnode)
507 }
508 // TODO(mvdan): some go-toml nodes such as Kind=toml.Bool do not seem to have a Raw Range
509 // which would let us grab their position information; fix this upstream.
510 if tnode.Raw.Length > 0 {
511 ast.SetPos(expr, d.tokenFile.Pos(d.shape(tnode).Start.Offset, token.NoRelPos))
512 }
513 return expr, nil
514}