this repo has no description
at master 330 lines 11 kB view raw
1// Copyright 2025 The CUE Authors 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// Package koala converts XML to and from CUE, as described in the proposal for the [koala] encoding. 16// This encoding is inspired by the [BadgerFish] convention for translating XML to JSON. 17// It differs from this to better fit CUE syntax, (as "$" and "@" are special characters), 18// and for improved readability, as described in the koala proposal. 19// 20// XML elements are modeled as CUE structs, their attributes are modeled as struct fields 21// prefixed with "$", and their inner text content is modeled as a field named "$$". 22// 23// WARNING: THIS PACKAGE IS EXPERIMENTAL. 24// ITS API MAY CHANGE AT ANY TIME. 25// 26// [koala]: https://cuelang.org/discussion/3776 27// [BadgerFish]: http://www.sklar.com/badgerfish/ 28package koala 29 30import ( 31 "bytes" 32 "encoding/xml" 33 "fmt" 34 "io" 35 "strings" 36 "unicode" 37 38 "cuelang.org/go/cue/ast" 39 "cuelang.org/go/cue/token" 40) 41 42// Decoder implements the decoding state. 43type Decoder struct { 44 reader io.Reader 45 fileName string 46 tokenFile *token.File 47 48 decoderRan bool 49 50 // current XML element being processed. 51 currXmlElement *xmlElement 52 53 // The top-level CUE struct. 54 astRoot *ast.StructLit 55 // CUE model of ancestors of current XML element being processed. 56 ancestors []currFieldInfo 57 // CUE model of current XML element being processed. 58 currField currFieldInfo 59 // CUE model of current XML element's inner content ($$ attribute). 60 currInnerText *ast.Field 61} 62 63// currFieldInfo encapsulates details of the CUE field for the current XML element being processed. 64type currFieldInfo struct { 65 // CUE model of current XML element. 66 field *ast.Field 67 // Running map of the current field's children. 68 currFieldChildren map[string]*ast.Field 69} 70 71// xmlElement models an XML Element hierarchy. 72// It is used for tracking namespace prefixes. 73type xmlElement struct { 74 xmlName xml.Name 75 attr []xml.Attr 76 parent *xmlElement 77 children []*xmlElement 78 textContentIsWhiteSpace bool 79} 80 81// The prefix used to model the inner text content within an XML element. 82const contentAttribute string = "$$" 83 84// The prefix used to model each attribute of an XML element. 85const attributeSymbol string = "$" 86 87// NewDecoder creates a decoder from a stream of XML input. 88func NewDecoder(fileName string, r io.Reader) *Decoder { 89 return &Decoder{reader: r, fileName: fileName} 90} 91 92// Decode parses the input stream as XML and converts it to a CUE [ast.Expr]. 93// The input stream is taken from the [Decoder] and consumed. 94func (dec *Decoder) Decode() (ast.Expr, error) { 95 if dec.decoderRan { 96 return nil, io.EOF 97 } 98 dec.decoderRan = true 99 // TODO(mvdan): note that we read the whole input just for the sake of [token.NewFile]; 100 // either revamp that API so that it doesn't need the length upfront, 101 // or lean into it and have internal/encoding pass the input size here. 102 xmlText, err := io.ReadAll(dec.reader) 103 if err != nil { 104 return nil, err 105 } 106 reader := bytes.NewReader(xmlText) 107 xmlDec := xml.NewDecoder(reader) 108 109 // Create a token file to track the position of the XML content in the CUE file. 110 dec.tokenFile = token.NewFile(dec.fileName, 0, len(xmlText)) 111 dec.tokenFile.SetLinesForContent(xmlText) 112 113 for { 114 startOffset := xmlDec.InputOffset() 115 t, err := xmlDec.Token() 116 if err == io.EOF { 117 break 118 } 119 if err != nil { 120 return nil, err 121 } 122 switch xmlToken := t.(type) { 123 case xml.StartElement: 124 err = dec.decodeStartElement(xmlToken, startOffset) 125 case xml.CharData: 126 err = dec.decoderInnerText(xmlToken, startOffset) 127 case xml.EndElement: 128 err = dec.decodeEndElement() 129 } 130 if err != nil { 131 return nil, err 132 } 133 // If the XML document has ended, break out of the loop. 134 if dec.astRoot != nil && dec.currXmlElement == nil { 135 break 136 } 137 } 138 return dec.astRoot, nil 139} 140 141func (dec *Decoder) decoderInnerText(xmlToken xml.CharData, contentOffset int64) error { 142 // If this is text content within an XML element. 143 textContent := string(xml.CharData(xmlToken)) 144 if dec.currField.field == nil { 145 if isWhiteSpace(textContent) { 146 return nil 147 } 148 return fmt.Errorf("text content outside of an XML element is not supported") 149 } 150 pos := dec.tokenFile.Pos(int(contentOffset), token.NoRelPos) 151 txtLabel := ast.NewStringLabel(contentAttribute) 152 ast.SetPos(txtLabel, pos) 153 val := toBasicLit(textContent) 154 ast.SetPos(val, pos) 155 textContentNode := &ast.Field{ 156 Label: txtLabel, 157 Value: val, 158 TokenPos: pos, 159 } 160 dec.currInnerText = textContentNode 161 dec.currXmlElement.textContentIsWhiteSpace = isWhiteSpace(textContent) 162 return nil 163} 164 165func (dec *Decoder) decodeEndElement() error { 166 // If there is text content within the element, add it to the element's value. 167 if dec.currXmlElement != nil && dec.currInnerText != nil { 168 // Only support text content within an element that has no sub-elements. 169 if len(dec.currXmlElement.children) == 0 { 170 dec.appendToCurrFieldStruct(dec.currInnerText) 171 dec.currInnerText = nil 172 } else if len(dec.currXmlElement.children) > 0 && !dec.currXmlElement.textContentIsWhiteSpace { 173 // If there is text content within an element that has sub-elements, return an error. 174 return mixedContentError() 175 } 176 } 177 // For the xmlElement hierarchy: step back up the XML hierarchy. 178 if dec.currXmlElement != nil { 179 dec.currXmlElement = dec.currXmlElement.parent 180 } 181 // For the CUE ast: end current element, and step back up the XML hierarchy. 182 if len(dec.ancestors) > 0 { 183 dec.currField = dec.ancestors[len(dec.ancestors)-1] 184 dec.ancestors = dec.ancestors[:len(dec.ancestors)-1] 185 } 186 return nil 187} 188 189func (dec *Decoder) decodeStartElement(xmlToken xml.StartElement, startOffset int64) error { 190 // Covers the root node. 191 if dec.currField.field == nil { 192 dec.currXmlElement = &xmlElement{xmlName: xmlToken.Name, attr: xmlToken.Attr} 193 cueElement := dec.cueFieldFromXmlElement(xmlToken, dec.currXmlElement, startOffset) 194 dec.currField.assignNewCurrField(cueElement) 195 dec.astRoot = ast.NewStruct(dec.currField.field) 196 ast.SetPos(dec.astRoot, dec.tokenFile.Pos(0, token.NoRelPos)) 197 return nil 198 } 199 // If this is not the root node, check if there is text content within the element. 200 if dec.currInnerText != nil && !dec.currXmlElement.textContentIsWhiteSpace { 201 return mixedContentError() 202 } 203 // Clear any whitespace text content. 204 dec.currInnerText = nil 205 // For xmlElement hierarchy: step down the XML hierarchy. 206 parentXmlNode := dec.currXmlElement 207 dec.currXmlElement = &xmlElement{xmlName: xmlToken.Name, attr: xmlToken.Attr, parent: parentXmlNode} 208 parentXmlNode.children = append(parentXmlNode.children, dec.currXmlElement) 209 // For the CUE ast: step down the CUE hierarchy. 210 dec.ancestors = append(dec.ancestors, dec.currField) 211 newElement := dec.cueFieldFromXmlElement(xmlToken, dec.currXmlElement, startOffset) 212 // Check if this new XML element has a name that's been seen before at the current level. 213 prefixedXmlElementName := prefixedElementName(xmlToken, dec.currXmlElement) 214 sameNameElements := dec.currField.currFieldChildren[prefixedXmlElementName] 215 if sameNameElements != nil { 216 list, ok := sameNameElements.Value.(*ast.ListLit) 217 // If the field's value is not a ListLit, create a new ListLit and append the existing field. 218 if !ok { 219 list = &ast.ListLit{Elts: []ast.Expr{sameNameElements.Value}} 220 sameNameElements.Value = list 221 } 222 // Append the new element to the ListLit, which we now know exists. 223 list.Elts = append(list.Elts, newElement.Value) 224 dec.currField.assignNewCurrField(newElement) 225 return nil 226 } 227 dec.currField.currFieldChildren[prefixedXmlElementName] = newElement 228 dec.appendToCurrFieldStruct(newElement) 229 dec.currField.assignNewCurrField(newElement) 230 return nil 231} 232 233func (dec *Decoder) appendToCurrFieldStruct(field *ast.Field) { 234 dec.currField.field.Value.(*ast.StructLit).Elts = append(dec.currField.field.Value.(*ast.StructLit).Elts, field) 235} 236 237func mixedContentError() error { 238 return fmt.Errorf("text content within an XML element that has sub-elements is not supported") 239} 240 241func isWhiteSpace(s string) bool { 242 for _, r := range s { 243 if !unicode.IsSpace(r) { 244 return false 245 } 246 } 247 return true 248} 249 250// cueFieldFromXmlElement creates a new [ast.Field] to model the given xml element information 251// in [xml.StartElement] and [xmlElement]. The startOffset represents the offset 252// for the beginning of the start tag of the given XML element. 253func (dec *Decoder) cueFieldFromXmlElement(elem xml.StartElement, xmlNode *xmlElement, startOffset int64) *ast.Field { 254 elementName := prefixedElementName(elem, xmlNode) 255 resLabel := ast.NewStringLabel(elementName) 256 pos := dec.tokenFile.Pos(int(startOffset), token.NoRelPos) 257 ast.SetPos(resLabel, pos) 258 resultValue := &ast.StructLit{} 259 result := &ast.Field{ 260 Label: resLabel, 261 Value: resultValue, 262 TokenPos: pos, 263 } 264 // Extract attributes as children. 265 for _, a := range elem.Attr { 266 attrName := prefixedAttrName(a, elem, xmlNode) 267 label := ast.NewStringLabel(attributeSymbol + attrName) 268 value := toBasicLit(a.Value) 269 ast.SetPos(label, pos) 270 ast.SetPos(value, pos) 271 attrExpr := &ast.Field{ 272 Label: label, 273 Value: value, 274 TokenPos: pos, 275 } 276 resultValue.Elts = append(resultValue.Elts, attrExpr) 277 } 278 return result 279} 280 281// prefixedElementName returns the full name of an element, 282// including its namespace prefix if it has one; but without namespace prefix if it is "xmlns". 283func prefixedElementName(elem xml.StartElement, xmlNode *xmlElement) string { 284 elementName := elem.Name.Local 285 if elem.Name.Space != "" { 286 prefixNS := nsPrefix(elem.Name.Space, elem.Attr, xmlNode) 287 if prefixNS != "xmlns" { 288 elementName = prefixNS + ":" + elem.Name.Local 289 } 290 } 291 return elementName 292} 293 294// prefixedAttrName returns the full name of an attribute, including its namespace prefix if it has one. 295func prefixedAttrName(a xml.Attr, elem xml.StartElement, xmlNode *xmlElement) string { 296 attrName := a.Name.Local 297 if a.Name.Space != "" { 298 prefix := nsPrefix(a.Name.Space, elem.Attr, xmlNode) 299 attrName = prefix + ":" + a.Name.Local 300 } 301 return attrName 302} 303 304func toBasicLit(s string) *ast.BasicLit { 305 s = strings.ReplaceAll(s, "\r", "") 306 return ast.NewString(s) 307} 308 309// nsPrefix finds the prefix label for a given namespace by looking at the current node's 310// attributes and then walking up the hierarchy of XML nodes. 311func nsPrefix(nameSpace string, attributes []xml.Attr, xmlNode *xmlElement) string { 312 // When the prefix is xmlns, then the namespace is xmlns according to the golang XML parser. 313 if nameSpace == "xmlns" { 314 return "xmlns" 315 } 316 for _, attr := range attributes { 317 if attr.Value == nameSpace { 318 return attr.Name.Local 319 } 320 } 321 if xmlNode.parent != nil { 322 return nsPrefix(nameSpace, xmlNode.parent.attr, xmlNode.parent) 323 } 324 panic("could not find prefix for namespace " + nameSpace) 325} 326 327func (cf *currFieldInfo) assignNewCurrField(field *ast.Field) { 328 cf.field = field 329 cf.currFieldChildren = make(map[string]*ast.Field) 330}