From 667a2920f2f6b8c42c726c032381c26df2a5bf96 Mon Sep 17 00:00:00 2001 From: Yusuke Inuzuka Date: Wed, 28 Aug 2019 20:27:33 +0900 Subject: [PATCH] Change attribute parsing strategy --- _benchmark/cmark/.gitignore | 1 + _benchmark/go/benchmark_test.go | 2 +- ast/ast.go | 31 +++--- parser/attribute.go | 77 +++++++++----- parser/atx_heading.go | 68 +++++++------ renderer/html/html.go | 3 +- util/util.go | 171 -------------------------------- 7 files changed, 109 insertions(+), 244 deletions(-) diff --git a/_benchmark/cmark/.gitignore b/_benchmark/cmark/.gitignore index eb9a71c..0c9a6c6 100644 --- a/_benchmark/cmark/.gitignore +++ b/_benchmark/cmark/.gitignore @@ -1 +1,2 @@ cmark-master +cmark_benchmark diff --git a/_benchmark/go/benchmark_test.go b/_benchmark/go/benchmark_test.go index 51b76a2..fcf3db2 100644 --- a/_benchmark/go/benchmark_test.go +++ b/_benchmark/go/benchmark_test.go @@ -11,7 +11,7 @@ import ( "gitlab.com/golang-commonmark/markdown" bf1 "github.com/russross/blackfriday" - bf2 "github.com/russross/blackfriday/v2" + bf2 "gopkg.in/russross/blackfriday.v2" ) func BenchmarkMarkdown(b *testing.B) { diff --git a/ast/ast.go b/ast/ast.go index 8645d31..025c599 100644 --- a/ast/ast.go +++ b/ast/ast.go @@ -42,7 +42,7 @@ func NewNodeKind(name string) NodeKind { // An Attribute is an attribute of the Node type Attribute struct { Name []byte - Value []byte + Value interface{} } var attrNameIDS = []byte("#") @@ -143,17 +143,20 @@ type Node interface { IsRaw() bool // SetAttribute sets the given value to the attributes. - SetAttribute(name, value []byte) + SetAttribute(name []byte, value interface{}) + + // SetAttributeString sets the given value to the attributes. + SetAttributeString(name string, value interface{}) // Attribute returns a (attribute value, true) if an attribute // associated with the given name is found, otherwise // (nil, false) - Attribute(name []byte) ([]byte, bool) + Attribute(name []byte) (interface{}, bool) // AttributeString returns a (attribute value, true) if an attribute // associated with the given name is found, otherwise // (nil, false) - AttributeString(name string) ([]byte, bool) + AttributeString(name string) (interface{}, bool) // Attributes returns a list of attributes. // This may be a nil if there are no attributes. @@ -327,7 +330,7 @@ func (n *BaseNode) Text(source []byte) []byte { } // SetAttribute implements Node.SetAttribute. -func (n *BaseNode) SetAttribute(name, value []byte) { +func (n *BaseNode) SetAttribute(name []byte, value interface{}) { if n.attributes == nil { n.attributes = make([]Attribute, 0, 10) } else { @@ -339,20 +342,16 @@ func (n *BaseNode) SetAttribute(name, value []byte) { } } } - if len(name) == 1 { - if name[0] == '#' { - n.attributes = append(n.attributes, Attribute{attrNameID, value}) - return - } else if name[0] == '.' { - n.attributes = append(n.attributes, Attribute{attrNameClass, value}) - return - } - } n.attributes = append(n.attributes, Attribute{name, value}) } +// SetAttributeString implements Node.SetAttributeString +func (n *BaseNode) SetAttributeString(name string, value interface{}) { + n.SetAttribute(util.StringToReadOnlyBytes(name), value) +} + // Attribute implements Node.Attribute. -func (n *BaseNode) Attribute(name []byte) ([]byte, bool) { +func (n *BaseNode) Attribute(name []byte) (interface{}, bool) { if n.attributes == nil { return nil, false } @@ -365,7 +364,7 @@ func (n *BaseNode) Attribute(name []byte) ([]byte, bool) { } // AttributeString implements Node.AttributeString. -func (n *BaseNode) AttributeString(s string) ([]byte, bool) { +func (n *BaseNode) AttributeString(s string) (interface{}, bool) { return n.Attribute(util.StringToReadOnlyBytes(s)) } diff --git a/parser/attribute.go b/parser/attribute.go index 607366e..53141b7 100644 --- a/parser/attribute.go +++ b/parser/attribute.go @@ -2,21 +2,47 @@ package parser import ( "bytes" - "fmt" "github.com/yuin/goldmark/text" "github.com/yuin/goldmark/util" "strconv" ) -type attribute struct { - Name string +var attrNameID = []byte("id") +var attrNameClass = []byte("class") + +// An Attribute is an attribute of the markdown elements +type Attribute struct { + Name []byte Value interface{} } +// An Attributes is a collection of attributes. +type Attributes []Attribute + +// Find returns a (value, true) if an attribute correspond with given name is found, otherwise (nil, false). +func (as Attributes) Find(name []byte) (interface{}, bool) { + for _, a := range as { + if bytes.Equal(a.Name, name) { + return a.Value, true + } + } + return nil, false +} + +func (as Attributes) findUpdate(name []byte, cb func(v interface{}) interface{}) bool { + for i, a := range as { + if bytes.Equal(a.Name, name) { + as[i].Value = cb(a.Value) + return true + } + } + return false +} + // ParseAttributes parses attributes into a map. -// ParseAttributes returns a parsed map and true if could parse +// ParseAttributes returns a parsed attributes and true if could parse // attributes, otherwise nil and false. -func ParseAttributes(reader text.Reader) (map[string]interface{}, bool) { +func ParseAttributes(reader text.Reader) (Attributes, bool) { savedLine, savedPosition := reader.Position() reader.SkipSpaces() if reader.Peek() != '{' { @@ -24,28 +50,29 @@ func ParseAttributes(reader text.Reader) (map[string]interface{}, bool) { return nil, false } reader.Advance(1) - m := map[string]interface{}{} + attrs := Attributes{} for { if reader.Peek() == '}' { reader.Advance(1) - return m, true + return attrs, true } attr, ok := parseAttribute(reader) if !ok { reader.SetPosition(savedLine, savedPosition) return nil, false } - if attr.Name == "class" { - if v, ok := m["class"]; ok { - if _, ok2 := v.([][]byte); !ok2 { - m["class"] = [][]byte{v.([]byte)} + if bytes.Equal(attr.Name, attrNameClass) { + if !attrs.findUpdate(attrNameClass, func(v interface{}) interface{} { + var ret interface{} + if ret, ok = v.([][]byte); !ok { + ret = [][]byte{v.([]byte)} } - m["class"] = append(m["class"].([][]byte), util.StringToReadOnlyBytes(fmt.Sprintf("%v", attr.Value))) - } else { - m["class"] = util.StringToReadOnlyBytes(fmt.Sprintf("%v", attr.Value)) + return append(ret.([][]byte), attr.Value.([]byte)) + }) { + attrs = append(attrs, attr) } } else { - m[attr.Name] = attr.Value + attrs = append(attrs, attr) } reader.SkipSpaces() if reader.Peek() == ',' { @@ -55,7 +82,7 @@ func ParseAttributes(reader text.Reader) (map[string]interface{}, bool) { } } -func parseAttribute(reader text.Reader) (attribute, bool) { +func parseAttribute(reader text.Reader) (Attribute, bool) { reader.SkipSpaces() c := reader.Peek() if c == '#' || c == '.' { @@ -64,18 +91,18 @@ func parseAttribute(reader text.Reader) (attribute, bool) { i := 0 for ; i < len(line) && !util.IsSpace(line[i]) && (!util.IsPunct(line[i]) || line[i] == '_' || line[i] == '-'); i++ { } - name := "class" + name := attrNameClass if c == '#' { - name = "id" + name = attrNameID } reader.Advance(i) - return attribute{Name: name, Value: line[0:i]}, true + return Attribute{Name: name, Value: line[0:i]}, true } line, _ := reader.PeekLine() c = line[0] if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == ':') { - return attribute{}, false + return Attribute{}, false } i := 0 for ; i < len(line); i++ { @@ -86,20 +113,20 @@ func parseAttribute(reader text.Reader) (attribute, bool) { break } } - name := string(line[:i]) + name := line[:i] reader.Advance(i) reader.SkipSpaces() c = reader.Peek() if c != '=' { - return attribute{}, false + return Attribute{}, false } reader.Advance(1) reader.SkipSpaces() value, ok := parseAttributeValue(reader) if !ok { - return attribute{}, false + return Attribute{}, false } - return attribute{Name: name, Value: value}, true + return Attribute{Name: name, Value: value}, true } @@ -110,7 +137,7 @@ func parseAttributeValue(reader text.Reader) (interface{}, bool) { ok := false switch c { case text.EOF: - return attribute{}, false + return Attribute{}, false case '{': value, ok = ParseAttributes(reader) case '[': diff --git a/parser/atx_heading.go b/parser/atx_heading.go index 669fe6e..c90a285 100644 --- a/parser/atx_heading.go +++ b/parser/atx_heading.go @@ -99,8 +99,8 @@ func (b *atxHeadingParser) Open(parent ast.Node, reader text.Reader, pc Context) parsed := false if b.Attribute { // handles special case like ### heading ### {#id} start-- - closureOpen := -1 closureClose := -1 + closureOpen := -1 for i := start; i < stop; { c := line[i] if util.IsEscapedPunctuation(line, i) { @@ -117,28 +117,14 @@ func (b *atxHeadingParser) Open(parent ast.Node, reader text.Reader, pc Context) } } if closureClose > 0 { - i := closureClose - for ; i < stop && util.IsSpace(line[i]); i++ { - } - if i < stop-1 || line[i] == '{' { - as := i + 1 - for as < stop { - ai, skip := util.FindAttributeIndex(line[as:], true) - if ai[0] < 0 { - break - } - node.SetAttribute(line[as+ai[0]:as+ai[1]], - util.UnescapePunctuations(line[as+ai[2]:as+ai[3]])) - as += ai[3] + skip - } - for ; as < stop && util.IsSpace(line[as]); as++ { - } - if line[as] == '}' && (as > stop-2 || util.IsBlank(line[as:])) { - parsed = true - node.Lines().Append(text.NewSegment(segment.Start+start+1, segment.Start+closureOpen)) - } else { - node.RemoveAttributes() + reader.Advance(closureClose) + attrs, ok := ParseAttributes(reader) + parsed = ok + if parsed { + for _, attr := range attrs { + node.SetAttribute(attr.Name, attr.Value) } + node.Lines().Append(text.NewSegment(segment.Start+start+1, segment.Start+closureOpen)) } } } @@ -194,7 +180,6 @@ func (b *atxHeadingParser) CanAcceptIndentedLine() bool { } var attrAutoHeadingIDPrefix = []byte("heading") -var attrNameID = []byte("#") func generateAutoHeadingID(node *ast.Heading, reader text.Reader, pc Context) { lastIndex := node.Lines().Len() - 1 @@ -208,14 +193,37 @@ func parseLastLineAttributes(node ast.Node, reader text.Reader, pc Context) { lastIndex := node.Lines().Len() - 1 lastLine := node.Lines().At(lastIndex) line := lastLine.Value(reader.Source()) - indicies := util.FindAttributeIndiciesReverse(line, true) - if indicies != nil { - for _, index := range indicies { - node.SetAttribute(line[index[0]:index[1]], - util.UnescapePunctuations(line[index[2]:index[3]])) + lr := text.NewReader(line) + var attrs Attributes + var ok bool + var start text.Segment + var sl int + var end text.Segment + for { + c := lr.Peek() + if c == text.EOF { + break + } + if c == '\\' { + lr.Advance(1) + if lr.Peek() == '{' { + lr.Advance(1) + } + continue + } + if c == '{' { + sl, start = lr.Position() + attrs, ok = ParseAttributes(lr) + _, end = lr.Position() + lr.SetPosition(sl, start) + } + lr.Advance(1) + } + if ok && util.IsBlank(line[end.Stop:]) { + for _, attr := range attrs { + node.SetAttribute(attr.Name, attr.Value) } - lastLine.Stop = lastLine.Start + indicies[0][0] - 1 - lastLine.TrimRightSpace(reader.Source()) + lastLine.Stop = lastLine.Start + start.Start node.Lines().Set(lastIndex, lastLine) } } diff --git a/renderer/html/html.go b/renderer/html/html.go index bed6a85..3019511 100644 --- a/renderer/html/html.go +++ b/renderer/html/html.go @@ -505,11 +505,12 @@ func (r *Renderer) renderString(w util.BufWriter, source []byte, node ast.Node, // RenderAttributes renders given node's attributes. func (r *Renderer) RenderAttributes(w util.BufWriter, node ast.Node) { + for _, attr := range node.Attributes() { _, _ = w.WriteString(" ") _, _ = w.Write(attr.Name) _, _ = w.WriteString(`="`) - _, _ = w.Write(util.EscapeHTML(attr.Value)) + _, _ = w.Write(util.EscapeHTML(attr.Value.([]byte))) _ = w.WriteByte('"') } } diff --git a/util/util.go b/util/util.go index bcaa51b..64a2b90 100644 --- a/util/util.go +++ b/util/util.go @@ -631,177 +631,6 @@ func URLEscape(v []byte, resolveReference bool) []byte { return cob.Bytes() } -// FindAttributeIndiciesReverse searches attribute indicies from tail of the given -// bytes and returns indicies. -func FindAttributeIndiciesReverse(b []byte, canEscapeQuotes bool) [][4]int { - i := 0 -retry: - var result [][4]int - as := -1 - for i < len(b) { - if IsEscapedPunctuation(b, i) { - i += 2 - continue - } - if b[i] == '{' { - i++ - as = i - break - } - i++ - } - if as < 0 { - return nil - } - for as < len(b) { - ai, skip := FindAttributeIndex(b[as:], canEscapeQuotes) - if ai[0] < 0 { - break - } - i = as + ai[3] - if result == nil { - result = [][4]int{} - } - result = append(result, [4]int{as + ai[0], as + ai[1], as + ai[2], as + ai[3]}) - as += ai[3] + skip - } - if b[as] == '}' && (as > len(b)-2 || IsBlank(b[as:])) { - return result - } - goto retry -} - -// FindAttributeIndex searches -// - #id -// - .class -// - attr=value -// in given bytes. -// FindHTMLAttributeIndex returns an int array that elements are -// [name_start, name_stop, value_start, value_stop]. -// value_start and value_stop does not include " or '. -// If no attributes found, it returns ([4]int{-1, -1, -1, -1}, 0). -func FindAttributeIndex(b []byte, canEscapeQuotes bool) ([4]int, int) { - result := [4]int{-1, -1, -1, -1} - i := 0 - l := len(b) - for ; i < l && IsSpace(b[i]); i++ { - } - if i >= l { - return result, 0 - } - c := b[i] - if c == '#' || c == '.' { - result[0] = i - i++ - result[1] = i - result[2] = i - for ; i < l && !IsSpace(b[i]) && (!IsPunct(b[i]) || b[i] == '_' || b[i] == '-'); i++ { - } - result[3] = i - return result, 0 - } - return FindHTMLAttributeIndex(b, canEscapeQuotes) -} - -// FindHTMLAttributeIndex searches HTML attributes in given bytes. -// FindHTMLAttributeIndex returns an int array that elements are -// [name_start, name_stop, value_start, value_stop]. -// value_start and value_stop does not include " or '. -// If no attributes found, it returns [4]int{-1, -1, -1, -1}. -func FindHTMLAttributeIndex(b []byte, canEscapeQuotes bool) ([4]int, int) { - result := [4]int{-1, -1, -1, -1} - i := 0 - l := len(b) - for ; i < l && IsSpace(b[i]); i++ { - } - if i >= l { - return result, 0 - } - c := b[i] - if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || - c == '_' || c == ':') { - return result, 0 - } - result[0] = i - for ; i < l; i++ { - c := b[i] - if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || - (c >= '0' && c <= '9') || - c == '_' || c == ':' || c == '.' || c == '-') { - break - } - } - result[1] = i - for ; i < l && IsSpace(b[i]); i++ { - } - if i >= l { - return [4]int{-1, -1, -1, -1}, 0 - } - if b[i] != '=' { - return [4]int{-1, -1, -1, -1}, 0 - } - i++ - for ; i < l && IsSpace(b[i]); i++ { - } - if i >= l { - return [4]int{-1, -1, -1, -1}, 0 - } - skip := 0 - if b[i] == '"' { - i++ - result[2] = i - if canEscapeQuotes { - pos := FindClosure(b[i:], '"', '"', false, false) - if pos < 0 { - return [4]int{-1, -1, -1, -1}, 0 - } - result[3] = pos + i - } else { - for ; i < l && b[i] != '"'; i++ { - } - result[3] = i - if result[2] == result[3] || i == l && b[l-1] != '"' { - return [4]int{-1, -1, -1, -1}, 0 - } - } - skip = 1 - } else if b[i] == '\'' { - i++ - result[2] = i - if canEscapeQuotes { - pos := FindClosure(b[i:], '\'', '\'', false, false) - if pos < 0 { - return [4]int{-1, -1, -1, -1}, 0 - } - result[3] = pos + i - } else { - for ; i < l && b[i] != '\''; i++ { - } - result[3] = i - if result[2] == result[3] || i == l && b[l-1] != '\'' { - return [4]int{-1, -1, -1, -1}, 0 - } - } - skip = 1 - } else { - result[2] = i - for ; i < l; i++ { - c = b[i] - if c == '\\' || c == '"' || c == '\'' || - c == '=' || c == '<' || c == '>' || c == '`' || - c == '{' || c == '}' || - (c >= 0 && c <= 0x20) { - break - } - } - result[3] = i - if result[2] == result[3] { - return [4]int{-1, -1, -1, -1}, 0 - } - } - return result, skip -} - // FindURLIndex returns a stop index value if the given bytes seem an URL. // This function is equivalent to [A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]* . func FindURLIndex(b []byte) int {