From 8d8de1dd94b1ca83b15768b120dcdee7299eda3d Mon Sep 17 00:00:00 2001 From: Brett Vickers Date: Sun, 7 Jul 2024 09:38:56 -0700 Subject: [PATCH] Attributes deduplicated more efficiently When reading an XML document, this package uses a more time-efficient technique to detect and remove attributes with duplicated names (within each element). --- etree.go | 41 +++++++++++++++++++++++++++-------------- etree_test.go | 25 +++++++++++-------------- 2 files changed, 38 insertions(+), 28 deletions(-) diff --git a/etree.go b/etree.go index 815b6c4..0e28674 100644 --- a/etree.go +++ b/etree.go @@ -889,6 +889,7 @@ func (e *Element) readFrom(ri io.Reader, settings ReadSettings) (n int64, err er r = newXmlSimpleReader(ri) } + attrCheck := make(map[xml.Name]int) dec := newDecoder(r, settings) var stack stack[*Element] @@ -921,8 +922,19 @@ func (e *Element) readFrom(ri io.Reader, settings ReadSettings) (n int64, err er switch t := t.(type) { case xml.StartElement: e := newElement(t.Name.Space, t.Name.Local, top) - for _, a := range t.Attr { - e.createAttr(a.Name.Space, a.Name.Local, a.Value, e, settings.PreserveDuplicateAttrs) + if settings.PreserveDuplicateAttrs || len(t.Attr) < 2 { + for _, a := range t.Attr { + e.addAttr(a.Name.Space, a.Name.Local, a.Value) + } + } else { + for _, a := range t.Attr { + if i, contains := attrCheck[a.Name]; contains { + e.Attr[i].Value = a.Value + } else { + attrCheck[a.Name] = e.addAttr(a.Name.Space, a.Name.Local, a.Value) + } + } + clear(attrCheck) } stack.push(e) case xml.EndElement: @@ -1365,28 +1377,29 @@ func (e *Element) addChild(t Token) { // prefix followed by a colon. func (e *Element) CreateAttr(key, value string) *Attr { space, skey := spaceDecompose(key) - return e.createAttr(space, skey, value, e, false) -} -// createAttr is a helper function that creates attributes. 
-func (e *Element) createAttr(space, key, value string, parent *Element, preserveDups bool) *Attr { - if !preserveDups { - for i, a := range e.Attr { - if space == a.Space && key == a.Key { - e.Attr[i].Value = value - return &e.Attr[i] - } + for i, a := range e.Attr { + if space == a.Space && skey == a.Key { + e.Attr[i].Value = value + return &e.Attr[i] } } + i := e.addAttr(space, skey, value) + return &e.Attr[i] +} + +// addAttr is a helper function that adds an attribute to an element. Returns +// the index of the added attribute. +func (e *Element) addAttr(space, key, value string) int { a := Attr{ Space: space, Key: key, Value: value, - element: parent, + element: e, } e.Attr = append(e.Attr, a) - return &e.Attr[len(e.Attr)-1] + return len(e.Attr) - 1 } // RemoveAttr removes the first attribute of this element whose key matches diff --git a/etree_test.go b/etree_test.go index 8ea9994..dcaa1cf 100644 --- a/etree_test.go +++ b/etree_test.go @@ -1467,7 +1467,7 @@ func TestReindexChildren(t *testing.T) { } func TestPreserveDuplicateAttrs(t *testing.T) { - s := `<element attr="test" attr="test2"/>` + s := `<element x="value1" y="value2" x="value3" x="value4" y="value5"/>` checkAttrCount := func(e *Element, n int) { if len(e.Attr) != n { @@ -1490,23 +1490,20 @@ func TestPreserveDuplicateAttrs(t *testing.T) { t.Run("enabled", func(t *testing.T) { doc := newDocumentFromString2(t, s, ReadSettings{PreserveDuplicateAttrs: true}) e := doc.FindElement("element") - checkAttrCount(e, 2) - checkAttr(e, 0, "attr", "test") - checkAttr(e, 1, "attr", "test2") + checkAttrCount(e, 5) + checkAttr(e, 0, "x", "value1") + checkAttr(e, 1, "y", "value2") + checkAttr(e, 2, "x", "value3") + checkAttr(e, 3, "x", "value4") + checkAttr(e, 4, "y", "value5") }) t.Run("disabled", func(t *testing.T) { - doc := newDocumentFromString2(t, s, ReadSettings{PreserveDuplicateAttrs: false}) + doc := newDocumentFromString2(t, s, ReadSettings{}) e := doc.FindElement("element") - checkAttrCount(e, 1) - checkAttr(e, 0, "attr", "test2") - }) - - t.Run("default", func(t *testing.T) { - doc := 
newDocumentFromString(t, s) - e := doc.FindElement("element") - checkAttrCount(e, 1) - checkAttr(e, 0, "attr", "test2") + checkAttrCount(e, 2) + checkAttr(e, 0, "x", "value4") + checkAttr(e, 1, "y", "value5") }) }