encoding/xml/koala/decode.go at master · mvdan.cc/cue

mvdan.cc / cue
this repo has no description
cue / encoding / xml / koala / decode.go
at master 330 lines 11 kB view raw
  1// Copyright 2025 The CUE Authors
  2//
  3// Licensed under the Apache License, Version 2.0 (the "License");
  4// you may not use this file except in compliance with the License.
  5// You may obtain a copy of the License at
  6//
  7//	http://www.apache.org/licenses/LICENSE-2.0
  8//
  9// Unless required by applicable law or agreed to in writing, software
 10// distributed under the License is distributed on an "AS IS" BASIS,
 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12// See the License for the specific language governing permissions and
 13// limitations under the License.
 14
 15// Package koala converts XML to and from CUE, as described in the proposal for the [koala] encoding.
 16// This encoding is inspired by the [BadgerFish] convention for translating XML to JSON.
 17// It differs from this to better fit CUE syntax, (as "$" and "@" are special characters),
 18// and for improved readability, as described in the koala proposal.
 19//
 20// XML elements are modeled as CUE structs, their attributes are modeled as struct fields
 21// prefixed with "$", and their inner text content is modeled as a field named "$$".
 22//
 23// WARNING: THIS PACKAGE IS EXPERIMENTAL.
 24// ITS API MAY CHANGE AT ANY TIME.
 25//
 26// [koala]: https://cuelang.org/discussion/3776
 27// [BadgerFish]: http://www.sklar.com/badgerfish/
 28package koala
 29
 30import (
 31	"bytes"
 32	"encoding/xml"
 33	"fmt"
 34	"io"
 35	"strings"
 36	"unicode"
 37
 38	"cuelang.org/go/cue/ast"
 39	"cuelang.org/go/cue/token"
 40)
 41
 42// Decoder implements the decoding state.
 43type Decoder struct {
 44	reader    io.Reader
 45	fileName  string
 46	tokenFile *token.File
 47
 48	decoderRan bool
 49
 50	// current XML element being processed.
 51	currXmlElement *xmlElement
 52
 53	// The top-level CUE struct.
 54	astRoot *ast.StructLit
 55	// CUE model of ancestors of current XML element being processed.
 56	ancestors []currFieldInfo
 57	// CUE model of current XML element being processed.
 58	currField currFieldInfo
 59	// CUE model of current XML element's inner content ($$ attribute).
 60	currInnerText *ast.Field
 61}
 62
 63// currFieldInfo encapsulates details of the CUE field for the current XML element being processed.
 64type currFieldInfo struct {
 65	// CUE model of current XML element.
 66	field *ast.Field
 67	// Running map of the current field's children.
 68	currFieldChildren map[string]*ast.Field
 69}
 70
 71// xmlElement models an XML Element hierarchy.
 72// It is used for tracking namespace prefixes.
 73type xmlElement struct {
 74	xmlName                 xml.Name
 75	attr                    []xml.Attr
 76	parent                  *xmlElement
 77	children                []*xmlElement
 78	textContentIsWhiteSpace bool
 79}
 80
 81// The prefix used to model the inner text content within an XML element.
 82const contentAttribute string = "$$"
 83
 84// The prefix used to model each attribute of an XML element.
 85const attributeSymbol string = "$"
 86
 87// NewDecoder creates a decoder from a stream of XML input.
 88func NewDecoder(fileName string, r io.Reader) *Decoder {
 89	return &Decoder{reader: r, fileName: fileName}
 90}
 91
 92// Decode parses the input stream as XML and converts it to a CUE [ast.Expr].
 93// The input stream is taken from the [Decoder] and consumed.
 94func (dec *Decoder) Decode() (ast.Expr, error) {
 95	if dec.decoderRan {
 96		return nil, io.EOF
 97	}
 98	dec.decoderRan = true
 99	// TODO(mvdan): note that we read the whole input just for the sake of [token.NewFile];
100	// either revamp that API so that it doesn't need the length upfront,
101	// or lean into it and have internal/encoding pass the input size here.
102	xmlText, err := io.ReadAll(dec.reader)
103	if err != nil {
104		return nil, err
105	}
106	reader := bytes.NewReader(xmlText)
107	xmlDec := xml.NewDecoder(reader)
108
109	// Create a token file to track the position of the XML content in the CUE file.
110	dec.tokenFile = token.NewFile(dec.fileName, 0, len(xmlText))
111	dec.tokenFile.SetLinesForContent(xmlText)
112
113	for {
114		startOffset := xmlDec.InputOffset()
115		t, err := xmlDec.Token()
116		if err == io.EOF {
117			break
118		}
119		if err != nil {
120			return nil, err
121		}
122		switch xmlToken := t.(type) {
123		case xml.StartElement:
124			err = dec.decodeStartElement(xmlToken, startOffset)
125		case xml.CharData:
126			err = dec.decoderInnerText(xmlToken, startOffset)
127		case xml.EndElement:
128			err = dec.decodeEndElement()
129		}
130		if err != nil {
131			return nil, err
132		}
133		// If the XML document has ended, break out of the loop.
134		if dec.astRoot != nil && dec.currXmlElement == nil {
135			break
136		}
137	}
138	return dec.astRoot, nil
139}
140
141func (dec *Decoder) decoderInnerText(xmlToken xml.CharData, contentOffset int64) error {
142	// If this is text content within an XML element.
143	textContent := string(xml.CharData(xmlToken))
144	if dec.currField.field == nil {
145		if isWhiteSpace(textContent) {
146			return nil
147		}
148		return fmt.Errorf("text content outside of an XML element is not supported")
149	}
150	pos := dec.tokenFile.Pos(int(contentOffset), token.NoRelPos)
151	txtLabel := ast.NewStringLabel(contentAttribute)
152	ast.SetPos(txtLabel, pos)
153	val := toBasicLit(textContent)
154	ast.SetPos(val, pos)
155	textContentNode := &ast.Field{
156		Label:    txtLabel,
157		Value:    val,
158		TokenPos: pos,
159	}
160	dec.currInnerText = textContentNode
161	dec.currXmlElement.textContentIsWhiteSpace = isWhiteSpace(textContent)
162	return nil
163}
164
165func (dec *Decoder) decodeEndElement() error {
166	// If there is text content within the element, add it to the element's value.
167	if dec.currXmlElement != nil && dec.currInnerText != nil {
168		// Only support text content within an element that has no sub-elements.
169		if len(dec.currXmlElement.children) == 0 {
170			dec.appendToCurrFieldStruct(dec.currInnerText)
171			dec.currInnerText = nil
172		} else if len(dec.currXmlElement.children) > 0 && !dec.currXmlElement.textContentIsWhiteSpace {
173			// If there is text content within an element that has sub-elements, return an error.
174			return mixedContentError()
175		}
176	}
177	// For the xmlElement hierarchy: step back up the XML hierarchy.
178	if dec.currXmlElement != nil {
179		dec.currXmlElement = dec.currXmlElement.parent
180	}
181	// For the CUE ast: end current element, and step back up the XML hierarchy.
182	if len(dec.ancestors) > 0 {
183		dec.currField = dec.ancestors[len(dec.ancestors)-1]
184		dec.ancestors = dec.ancestors[:len(dec.ancestors)-1]
185	}
186	return nil
187}
188
189func (dec *Decoder) decodeStartElement(xmlToken xml.StartElement, startOffset int64) error {
190	// Covers the root node.
191	if dec.currField.field == nil {
192		dec.currXmlElement = &xmlElement{xmlName: xmlToken.Name, attr: xmlToken.Attr}
193		cueElement := dec.cueFieldFromXmlElement(xmlToken, dec.currXmlElement, startOffset)
194		dec.currField.assignNewCurrField(cueElement)
195		dec.astRoot = ast.NewStruct(dec.currField.field)
196		ast.SetPos(dec.astRoot, dec.tokenFile.Pos(0, token.NoRelPos))
197		return nil
198	}
199	// If this is not the root node, check if there is text content within the element.
200	if dec.currInnerText != nil && !dec.currXmlElement.textContentIsWhiteSpace {
201		return mixedContentError()
202	}
203	// Clear any whitespace text content.
204	dec.currInnerText = nil
205	// For xmlElement hierarchy: step down the XML hierarchy.
206	parentXmlNode := dec.currXmlElement
207	dec.currXmlElement = &xmlElement{xmlName: xmlToken.Name, attr: xmlToken.Attr, parent: parentXmlNode}
208	parentXmlNode.children = append(parentXmlNode.children, dec.currXmlElement)
209	// For the CUE ast: step down the CUE hierarchy.
210	dec.ancestors = append(dec.ancestors, dec.currField)
211	newElement := dec.cueFieldFromXmlElement(xmlToken, dec.currXmlElement, startOffset)
212	// Check if this new XML element has a name that's been seen before at the current level.
213	prefixedXmlElementName := prefixedElementName(xmlToken, dec.currXmlElement)
214	sameNameElements := dec.currField.currFieldChildren[prefixedXmlElementName]
215	if sameNameElements != nil {
216		list, ok := sameNameElements.Value.(*ast.ListLit)
217		// If the field's value is not a ListLit, create a new ListLit and append the existing field.
218		if !ok {
219			list = &ast.ListLit{Elts: []ast.Expr{sameNameElements.Value}}
220			sameNameElements.Value = list
221		}
222		// Append the new element to the ListLit, which we now know exists.
223		list.Elts = append(list.Elts, newElement.Value)
224		dec.currField.assignNewCurrField(newElement)
225		return nil
226	}
227	dec.currField.currFieldChildren[prefixedXmlElementName] = newElement
228	dec.appendToCurrFieldStruct(newElement)
229	dec.currField.assignNewCurrField(newElement)
230	return nil
231}
232
233func (dec *Decoder) appendToCurrFieldStruct(field *ast.Field) {
234	dec.currField.field.Value.(*ast.StructLit).Elts = append(dec.currField.field.Value.(*ast.StructLit).Elts, field)
235}
236
// mixedContentError returns the error reported for an XML element
// that contains both text content and sub-elements.
func mixedContentError() error {
	const msg = "text content within an XML element that has sub-elements is not supported"
	return fmt.Errorf(msg)
}
240
// isWhiteSpace reports whether s consists solely of white space runes,
// as defined by [unicode.IsSpace]. The empty string counts as white space.
func isWhiteSpace(s string) bool {
	for _, r := range s {
		if unicode.IsSpace(r) {
			continue
		}
		// The first rune that is not white space settles the answer.
		return false
	}
	return true
}
249
250// cueFieldFromXmlElement creates a new [ast.Field] to model the given xml element information
251// in [xml.StartElement] and [xmlElement]. The startOffset represents the offset
252// for the beginning of the start tag of the given XML element.
253func (dec *Decoder) cueFieldFromXmlElement(elem xml.StartElement, xmlNode *xmlElement, startOffset int64) *ast.Field {
254	elementName := prefixedElementName(elem, xmlNode)
255	resLabel := ast.NewStringLabel(elementName)
256	pos := dec.tokenFile.Pos(int(startOffset), token.NoRelPos)
257	ast.SetPos(resLabel, pos)
258	resultValue := &ast.StructLit{}
259	result := &ast.Field{
260		Label:    resLabel,
261		Value:    resultValue,
262		TokenPos: pos,
263	}
264	// Extract attributes as children.
265	for _, a := range elem.Attr {
266		attrName := prefixedAttrName(a, elem, xmlNode)
267		label := ast.NewStringLabel(attributeSymbol + attrName)
268		value := toBasicLit(a.Value)
269		ast.SetPos(label, pos)
270		ast.SetPos(value, pos)
271		attrExpr := &ast.Field{
272			Label:    label,
273			Value:    value,
274			TokenPos: pos,
275		}
276		resultValue.Elts = append(resultValue.Elts, attrExpr)
277	}
278	return result
279}
280
281// prefixedElementName returns the full name of an element,
282// including its namespace prefix if it has one; but without namespace prefix if it is "xmlns".
283func prefixedElementName(elem xml.StartElement, xmlNode *xmlElement) string {
284	elementName := elem.Name.Local
285	if elem.Name.Space != "" {
286		prefixNS := nsPrefix(elem.Name.Space, elem.Attr, xmlNode)
287		if prefixNS != "xmlns" {
288			elementName = prefixNS + ":" + elem.Name.Local
289		}
290	}
291	return elementName
292}
293
294// prefixedAttrName returns the full name of an attribute, including its namespace prefix if it has one.
295func prefixedAttrName(a xml.Attr, elem xml.StartElement, xmlNode *xmlElement) string {
296	attrName := a.Name.Local
297	if a.Name.Space != "" {
298		prefix := nsPrefix(a.Name.Space, elem.Attr, xmlNode)
299		attrName = prefix + ":" + a.Name.Local
300	}
301	return attrName
302}
303
304func toBasicLit(s string) *ast.BasicLit {
305	s = strings.ReplaceAll(s, "\r", "")
306	return ast.NewString(s)
307}
308
309// nsPrefix finds the prefix label for a given namespace by looking at the current node's
310// attributes and then walking up the hierarchy of XML nodes.
311func nsPrefix(nameSpace string, attributes []xml.Attr, xmlNode *xmlElement) string {
312	// When the prefix is xmlns, then the namespace is xmlns according to the golang XML parser.
313	if nameSpace == "xmlns" {
314		return "xmlns"
315	}
316	for _, attr := range attributes {
317		if attr.Value == nameSpace {
318			return attr.Name.Local
319		}
320	}
321	if xmlNode.parent != nil {
322		return nsPrefix(nameSpace, xmlNode.parent.attr, xmlNode.parent)
323	}
324	panic("could not find prefix for namespace " + nameSpace)
325}
326
327func (cf *currFieldInfo) assignNewCurrField(field *ast.Field) {
328	cf.field = field
329	cf.currFieldChildren = make(map[string]*ast.Field)
330}