publisher: Some performance tweaks for the HTML elements collector - hugo - [fork] hugo port for 9front
 (HTM) git clone git@git.drkhsh.at/hugo.git
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) Submodules
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit ef34dd8f0e94e52ba6f1d5d607e4ac3ae98a7abb
 (DIR) parent bc80022e033a5462d1a9ce541f40a050994011cc
 (HTM) Author: Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
       Date:   Tue, 20 Apr 2021 16:50:03 +0200
       
       publisher: Some performance tweaks for the HTML elements collector
       
       Diffstat:
         M publisher/htmlElementsCollector.go  |      80 +++++++++++++++++++-------------
         M publisher/htmlElementsCollector_test.go |      70 +------------------------------
       
       2 files changed, 49 insertions(+), 101 deletions(-)
       ---
 (DIR) diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go
       @@ -108,13 +108,13 @@ func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlEleme
                }
        }
        
       -// Write splits the incoming stream into single html element and writes these into elementSet
       +// Write splits the incoming stream into single html element.
        func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
                n = len(p)
                i := 0
        
                for i < len(p) {
       -                // if is not collecting, cycle through byte stream until start bracket "<" is found
       +                // If we are not collecting, cycle through byte stream until start bracket "<" is found.
                        if !w.isCollecting {
                                for ; i < len(p); i++ {
                                        b := p[i]
       @@ -126,9 +126,9 @@ func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
                        }
        
                        if w.isCollecting {
       -                        // if is collecting, cycle through byte stream until end bracket ">" is found
       -                        // disregard any ">" if within a quote
       -                        // write bytes until found to buffer
       +                        // If we are collecting, cycle through byte stream until end bracket ">" is found,
       +                        // disregard any ">" if within a quote,
       +                        // write bytes until found to buffer.
                                for ; i < len(p); i++ {
                                        b := p[i]
                                        w.toggleIfQuote(b)
       @@ -141,54 +141,69 @@ func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
                                }
                        }
        
       -                // if no end bracket ">" is found while collecting, but the stream ended
       +                // If no end bracket ">" is found while collecting, but the stream ended
                        // this could mean we received chunks of a stream from e.g. the minify functionality
       -                // next if loop will be skipped
       +                // next if loop will be skipped.
        
       -                // at this point we have collected an element line between angle brackets "<" and ">"
       +                // At this point we have collected an element line between angle brackets "<" and ">".
                        if !w.isCollecting {
       -                        s := w.buff.String()
       -                        w.buff.Reset()
       -
       -                        // filter out unwanted tags
       -                        // empty string, just in case
       -                        // if within preformatted code blocks <pre>, <textarea>, <script>, <style>
       -                        // comments and doctype tags
       -                        // end tags
       -                        switch {
       -                        case s == "": // empty string
       +                        if w.buff.Len() == 0 {
                                        continue
       -                        case w.inPreTag != "": // within preformatted code block
       +                        }
       +
       +                        if w.inPreTag != "" { // within preformatted code block
       +                                s := w.buff.String()
       +                                w.buff.Reset()
                                        if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
                                                w.inPreTag = ""
                                        }
                                        continue
       -                        case strings.HasPrefix(s, "<!"): // comment or doctype tag
       -                                continue
       -                        case strings.HasPrefix(s, "</"): // end tag
       -                                continue
                                }
        
       -                        // check if we have processed this element before.
       +                        // First check if we have processed this element before.
                                w.collector.mu.RLock()
       -                        seen := w.collector.elementSet[s]
       +
       +                        // Work with the bytes slice as long as it's practical,
       +                        // to save memory allocations.
       +                        b := w.buff.Bytes()
       +
       +                        // See https://github.com/dominikh/go-tools/issues/723
       +                        //lint:ignore S1030 This construct avoids memory allocation for the string.
       +                        seen := w.collector.elementSet[string(b)]
                                w.collector.mu.RUnlock()
                                if seen {
       +                                w.buff.Reset()
                                        continue
                                }
        
       -                        // check if a preformatted code block started
       +                        // Filter out unwanted tags
       +                        // if within preformatted code blocks <pre>, <textarea>, <script>, <style>
       +                        // comments and doctype tags
       +                        // end tags.
       +                        switch {
       +                        case bytes.HasPrefix(b, []byte("<!")): // comment or doctype tag
       +                                w.buff.Reset()
       +                                continue
       +                        case bytes.HasPrefix(b, []byte("</")): // end tag
       +                                w.buff.Reset()
       +                                continue
       +                        }
       +
       +                        s := w.buff.String()
       +                        w.buff.Reset()
       +
       +                        // Check if a preformatted code block started.
                                if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) {
                                        w.inPreTag = tagName
                                }
        
       -                        // parse each collected element
       +                        // Parse each collected element.
                                el, err := parseHTMLElement(s)
                                if err != nil {
                                        return n, err
                                }
        
       -                        // write this tag to the element set
       +                        // Write this tag to the element set.
                                w.collector.mu.Lock()
                                w.collector.elementSet[s] = true
                                w.collector.elements = append(w.collector.elements, el)
       @@ -265,17 +280,18 @@ var (
                htmlJsonFixer = strings.NewReplacer(", ", "\n")
                jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
                classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
       -)
        
       -func parseHTMLElement(elStr string) (el htmlElement, err error) {
       -        var tagBuffer string = ""
       -        exceptionList := map[string]bool{
       +        exceptionList = map[string]bool{
                        "thead": true,
                        "tbody": true,
                        "tfoot": true,
                        "td":    true,
                        "tr":    true,
                }
       +)
       +
       +func parseHTMLElement(elStr string) (el htmlElement, err error) {
       +        var tagBuffer string = ""
        
                tagName, ok := parseStartTag(elStr)
                if !ok {
 (DIR) diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go
       @@ -14,7 +14,6 @@
        package publisher
        
        import (
       -        "bytes"
                "fmt"
                "strings"
                "testing"
       @@ -129,33 +128,8 @@ func TestClassCollector(t *testing.T) {
                }
        }
        
       -func BenchmarkClassCollectorWriter(b *testing.B) {
       +func BenchmarkElementsCollectorWriter(b *testing.B) {
                const benchHTML = `
       -<html>
       -<body id="i1" class="a b c d">
       -<a class="c d e"></a>
       -<br>
       -<a class="c d e"></a>
       -<a class="c d e"></a>
       -<br>
       -<a id="i2" class="c d e f"></a>
       -<a id="i3" class="c d e"></a>
       -<a class="c d e"></a>
       -<br>
       -<a class="c d e"></a>
       -<a class="c d e"></a>
       -<a class="c d e"></a>
       -<a class="c d e"></a>
       -</body>
       -</html>
       -`
       -        for i := 0; i < b.N; i++ {
       -                w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
       -                fmt.Fprint(w, benchHTML)
       -        }
       -}
       -
       -const benchHTML = `
        <!DOCTYPE html>
        <html>
        <head>
       @@ -207,51 +181,9 @@ const benchHTML = `
        </body>
        </html>
        `
       -
       -func BenchmarkElementsCollectorWriter(b *testing.B) {
       -        b.ReportAllocs()
                for i := 0; i < b.N; i++ {
                        w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
                        fmt.Fprint(w, benchHTML)
       -        }
       -}
       -
       -func BenchmarkElementsCollectorWriterMinified(b *testing.B) {
       -        b.ReportAllocs()
       -        v := viper.New()
       -        m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
       -        var buf bytes.Buffer
       -        m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML))
       -        b.ResetTimer()
       -
       -        for i := 0; i < b.N; i++ {
       -                w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
       -                fmt.Fprint(w, buf.String())
       -        }
       -}
       -
       -func BenchmarkElementsCollectorWriterWithMinifyStream(b *testing.B) {
       -        b.ReportAllocs()
       -        v := viper.New()
       -        m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
       -        b.ResetTimer()
       -
       -        for i := 0; i < b.N; i++ {
       -                w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
       -                m.Minify(media.HTMLType, w, strings.NewReader(benchHTML))
       -        }
       -}
       -
       -func BenchmarkElementsCollectorWriterWithMinifyString(b *testing.B) {
       -        b.ReportAllocs()
       -        v := viper.New()
       -        m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
       -        b.ResetTimer()
        
       -        for i := 0; i < b.N; i++ {
       -                var buf bytes.Buffer
       -                m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML))
       -                w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
       -                fmt.Fprint(w, buf.String())
                }
        }