1// Copyright 2025 The CUE Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package koala converts XML to and from CUE, as described in the proposal for the [koala] encoding.
16// This encoding is inspired by the [BadgerFish] convention for translating XML to JSON.
// It differs from BadgerFish to better fit CUE syntax (as "$" and "@" are special characters)
// and for improved readability, as described in the koala proposal.
19//
20// XML elements are modeled as CUE structs, their attributes are modeled as struct fields
21// prefixed with "$", and their inner text content is modeled as a field named "$$".
22//
23// WARNING: THIS PACKAGE IS EXPERIMENTAL.
24// ITS API MAY CHANGE AT ANY TIME.
25//
26// [koala]: https://cuelang.org/discussion/3776
27// [BadgerFish]: http://www.sklar.com/badgerfish/
28package koala
29
30import (
31 "bytes"
32 "encoding/xml"
33 "fmt"
34 "io"
35 "strings"
36 "unicode"
37
38 "cuelang.org/go/cue/ast"
39 "cuelang.org/go/cue/token"
40)
41
// Decoder implements the decoding state for converting one XML document
// into a CUE syntax tree. It is created with [NewDecoder]; the first call to
// Decode reads the whole input, and later calls return io.EOF.
type Decoder struct {
	// reader is the source of the XML input. Decode reads it in full.
	reader io.Reader
	// fileName is the name given to the token file created by Decode.
	fileName string
	// tokenFile converts byte offsets in the XML input into token positions
	// for the generated CUE nodes. It is created by Decode.
	tokenFile *token.File

	// decoderRan is set by the first call to Decode.
	decoderRan bool

	// current XML element being processed.
	currXmlElement *xmlElement

	// The top-level CUE struct.
	// It is created when the root XML element is opened and is nil before that.
	astRoot *ast.StructLit
	// CUE model of ancestors of current XML element being processed.
	// Used as a stack: pushed on each nested start tag, popped on each end tag.
	ancestors []currFieldInfo
	// CUE model of current XML element being processed.
	currField currFieldInfo
	// CUE model of current XML element's inner content ($$ attribute).
	// It holds the most recent text chunk seen and is nil when no text is pending.
	currInnerText *ast.Field
}
62
// currFieldInfo encapsulates details of the CUE field for the current XML element being processed.
// Values of this type are also saved on the decoder's ancestors stack while
// a nested element is being processed.
type currFieldInfo struct {
	// CUE model of current XML element.
	// A nil field means that no XML element has been opened yet.
	field *ast.Field
	// Running map of the current field's children, keyed by their prefixed
	// element name. It is used to detect repeated element names so that
	// their values can be collected into a list.
	currFieldChildren map[string]*ast.Field
}
70
// xmlElement models an XML Element hierarchy.
// It is used for tracking namespace prefixes.
type xmlElement struct {
	// xmlName is the name taken from the element's start tag.
	xmlName xml.Name
	// attr holds the attributes of the element's start tag, which include
	// any namespace declarations made on this element.
	attr []xml.Attr
	// parent is the enclosing element, or nil for the root element.
	parent *xmlElement
	// children lists the sub-elements seen so far, in document order.
	children []*xmlElement
	// textContentIsWhiteSpace reports whether the most recent text chunk
	// seen directly inside this element consisted only of white space.
	// It is false until some text has been seen.
	textContentIsWhiteSpace bool
}
80
const (
	// contentAttribute is the name of the CUE field that models the inner
	// text content of an XML element.
	contentAttribute string = "$$"

	// attributeSymbol is the prefix put in front of the name of each CUE
	// field that models an attribute of an XML element.
	attributeSymbol string = "$"
)
86
87// NewDecoder creates a decoder from a stream of XML input.
88func NewDecoder(fileName string, r io.Reader) *Decoder {
89 return &Decoder{reader: r, fileName: fileName}
90}
91
92// Decode parses the input stream as XML and converts it to a CUE [ast.Expr].
93// The input stream is taken from the [Decoder] and consumed.
94func (dec *Decoder) Decode() (ast.Expr, error) {
95 if dec.decoderRan {
96 return nil, io.EOF
97 }
98 dec.decoderRan = true
99 // TODO(mvdan): note that we read the whole input just for the sake of [token.NewFile];
100 // either revamp that API so that it doesn't need the length upfront,
101 // or lean into it and have internal/encoding pass the input size here.
102 xmlText, err := io.ReadAll(dec.reader)
103 if err != nil {
104 return nil, err
105 }
106 reader := bytes.NewReader(xmlText)
107 xmlDec := xml.NewDecoder(reader)
108
109 // Create a token file to track the position of the XML content in the CUE file.
110 dec.tokenFile = token.NewFile(dec.fileName, 0, len(xmlText))
111 dec.tokenFile.SetLinesForContent(xmlText)
112
113 for {
114 startOffset := xmlDec.InputOffset()
115 t, err := xmlDec.Token()
116 if err == io.EOF {
117 break
118 }
119 if err != nil {
120 return nil, err
121 }
122 switch xmlToken := t.(type) {
123 case xml.StartElement:
124 err = dec.decodeStartElement(xmlToken, startOffset)
125 case xml.CharData:
126 err = dec.decoderInnerText(xmlToken, startOffset)
127 case xml.EndElement:
128 err = dec.decodeEndElement()
129 }
130 if err != nil {
131 return nil, err
132 }
133 // If the XML document has ended, break out of the loop.
134 if dec.astRoot != nil && dec.currXmlElement == nil {
135 break
136 }
137 }
138 return dec.astRoot, nil
139}
140
141func (dec *Decoder) decoderInnerText(xmlToken xml.CharData, contentOffset int64) error {
142 // If this is text content within an XML element.
143 textContent := string(xml.CharData(xmlToken))
144 if dec.currField.field == nil {
145 if isWhiteSpace(textContent) {
146 return nil
147 }
148 return fmt.Errorf("text content outside of an XML element is not supported")
149 }
150 pos := dec.tokenFile.Pos(int(contentOffset), token.NoRelPos)
151 txtLabel := ast.NewStringLabel(contentAttribute)
152 ast.SetPos(txtLabel, pos)
153 val := toBasicLit(textContent)
154 ast.SetPos(val, pos)
155 textContentNode := &ast.Field{
156 Label: txtLabel,
157 Value: val,
158 TokenPos: pos,
159 }
160 dec.currInnerText = textContentNode
161 dec.currXmlElement.textContentIsWhiteSpace = isWhiteSpace(textContent)
162 return nil
163}
164
165func (dec *Decoder) decodeEndElement() error {
166 // If there is text content within the element, add it to the element's value.
167 if dec.currXmlElement != nil && dec.currInnerText != nil {
168 // Only support text content within an element that has no sub-elements.
169 if len(dec.currXmlElement.children) == 0 {
170 dec.appendToCurrFieldStruct(dec.currInnerText)
171 dec.currInnerText = nil
172 } else if len(dec.currXmlElement.children) > 0 && !dec.currXmlElement.textContentIsWhiteSpace {
173 // If there is text content within an element that has sub-elements, return an error.
174 return mixedContentError()
175 }
176 }
177 // For the xmlElement hierarchy: step back up the XML hierarchy.
178 if dec.currXmlElement != nil {
179 dec.currXmlElement = dec.currXmlElement.parent
180 }
181 // For the CUE ast: end current element, and step back up the XML hierarchy.
182 if len(dec.ancestors) > 0 {
183 dec.currField = dec.ancestors[len(dec.ancestors)-1]
184 dec.ancestors = dec.ancestors[:len(dec.ancestors)-1]
185 }
186 return nil
187}
188
189func (dec *Decoder) decodeStartElement(xmlToken xml.StartElement, startOffset int64) error {
190 // Covers the root node.
191 if dec.currField.field == nil {
192 dec.currXmlElement = &xmlElement{xmlName: xmlToken.Name, attr: xmlToken.Attr}
193 cueElement := dec.cueFieldFromXmlElement(xmlToken, dec.currXmlElement, startOffset)
194 dec.currField.assignNewCurrField(cueElement)
195 dec.astRoot = ast.NewStruct(dec.currField.field)
196 ast.SetPos(dec.astRoot, dec.tokenFile.Pos(0, token.NoRelPos))
197 return nil
198 }
199 // If this is not the root node, check if there is text content within the element.
200 if dec.currInnerText != nil && !dec.currXmlElement.textContentIsWhiteSpace {
201 return mixedContentError()
202 }
203 // Clear any whitespace text content.
204 dec.currInnerText = nil
205 // For xmlElement hierarchy: step down the XML hierarchy.
206 parentXmlNode := dec.currXmlElement
207 dec.currXmlElement = &xmlElement{xmlName: xmlToken.Name, attr: xmlToken.Attr, parent: parentXmlNode}
208 parentXmlNode.children = append(parentXmlNode.children, dec.currXmlElement)
209 // For the CUE ast: step down the CUE hierarchy.
210 dec.ancestors = append(dec.ancestors, dec.currField)
211 newElement := dec.cueFieldFromXmlElement(xmlToken, dec.currXmlElement, startOffset)
212 // Check if this new XML element has a name that's been seen before at the current level.
213 prefixedXmlElementName := prefixedElementName(xmlToken, dec.currXmlElement)
214 sameNameElements := dec.currField.currFieldChildren[prefixedXmlElementName]
215 if sameNameElements != nil {
216 list, ok := sameNameElements.Value.(*ast.ListLit)
217 // If the field's value is not a ListLit, create a new ListLit and append the existing field.
218 if !ok {
219 list = &ast.ListLit{Elts: []ast.Expr{sameNameElements.Value}}
220 sameNameElements.Value = list
221 }
222 // Append the new element to the ListLit, which we now know exists.
223 list.Elts = append(list.Elts, newElement.Value)
224 dec.currField.assignNewCurrField(newElement)
225 return nil
226 }
227 dec.currField.currFieldChildren[prefixedXmlElementName] = newElement
228 dec.appendToCurrFieldStruct(newElement)
229 dec.currField.assignNewCurrField(newElement)
230 return nil
231}
232
233func (dec *Decoder) appendToCurrFieldStruct(field *ast.Field) {
234 dec.currField.field.Value.(*ast.StructLit).Elts = append(dec.currField.field.Value.(*ast.StructLit).Elts, field)
235}
236
// mixedContentError returns the error reported when an XML element contains
// both sub-elements and text that is not just white space.
func mixedContentError() error {
	const msg = "text content within an XML element that has sub-elements is not supported"
	return fmt.Errorf(msg)
}
240
// isWhiteSpace reports whether every rune in s is white space as defined by
// [unicode.IsSpace]. It reports true for the empty string.
func isWhiteSpace(s string) bool {
	allSpace := true
	for _, ch := range s {
		if allSpace = unicode.IsSpace(ch); !allSpace {
			break
		}
	}
	return allSpace
}
249
250// cueFieldFromXmlElement creates a new [ast.Field] to model the given xml element information
251// in [xml.StartElement] and [xmlElement]. The startOffset represents the offset
252// for the beginning of the start tag of the given XML element.
253func (dec *Decoder) cueFieldFromXmlElement(elem xml.StartElement, xmlNode *xmlElement, startOffset int64) *ast.Field {
254 elementName := prefixedElementName(elem, xmlNode)
255 resLabel := ast.NewStringLabel(elementName)
256 pos := dec.tokenFile.Pos(int(startOffset), token.NoRelPos)
257 ast.SetPos(resLabel, pos)
258 resultValue := &ast.StructLit{}
259 result := &ast.Field{
260 Label: resLabel,
261 Value: resultValue,
262 TokenPos: pos,
263 }
264 // Extract attributes as children.
265 for _, a := range elem.Attr {
266 attrName := prefixedAttrName(a, elem, xmlNode)
267 label := ast.NewStringLabel(attributeSymbol + attrName)
268 value := toBasicLit(a.Value)
269 ast.SetPos(label, pos)
270 ast.SetPos(value, pos)
271 attrExpr := &ast.Field{
272 Label: label,
273 Value: value,
274 TokenPos: pos,
275 }
276 resultValue.Elts = append(resultValue.Elts, attrExpr)
277 }
278 return result
279}
280
281// prefixedElementName returns the full name of an element,
282// including its namespace prefix if it has one; but without namespace prefix if it is "xmlns".
283func prefixedElementName(elem xml.StartElement, xmlNode *xmlElement) string {
284 elementName := elem.Name.Local
285 if elem.Name.Space != "" {
286 prefixNS := nsPrefix(elem.Name.Space, elem.Attr, xmlNode)
287 if prefixNS != "xmlns" {
288 elementName = prefixNS + ":" + elem.Name.Local
289 }
290 }
291 return elementName
292}
293
294// prefixedAttrName returns the full name of an attribute, including its namespace prefix if it has one.
295func prefixedAttrName(a xml.Attr, elem xml.StartElement, xmlNode *xmlElement) string {
296 attrName := a.Name.Local
297 if a.Name.Space != "" {
298 prefix := nsPrefix(a.Name.Space, elem.Attr, xmlNode)
299 attrName = prefix + ":" + a.Name.Local
300 }
301 return attrName
302}
303
304func toBasicLit(s string) *ast.BasicLit {
305 s = strings.ReplaceAll(s, "\r", "")
306 return ast.NewString(s)
307}
308
309// nsPrefix finds the prefix label for a given namespace by looking at the current node's
310// attributes and then walking up the hierarchy of XML nodes.
311func nsPrefix(nameSpace string, attributes []xml.Attr, xmlNode *xmlElement) string {
312 // When the prefix is xmlns, then the namespace is xmlns according to the golang XML parser.
313 if nameSpace == "xmlns" {
314 return "xmlns"
315 }
316 for _, attr := range attributes {
317 if attr.Value == nameSpace {
318 return attr.Name.Local
319 }
320 }
321 if xmlNode.parent != nil {
322 return nsPrefix(nameSpace, xmlNode.parent.attr, xmlNode.parent)
323 }
324 panic("could not find prefix for namespace " + nameSpace)
325}
326
327func (cf *currFieldInfo) assignNewCurrField(field *ast.Field) {
328 cf.field = field
329 cf.currFieldChildren = make(map[string]*ast.Field)
330}