Skip to content

Commit

Permalink
normalize: Improve ToCRLF/StringToCRLF performance
Browse files Browse the repository at this point in the history
The ToCRLF/StringToCRLF functions are not very performance critical, but
we call it for each mail, and the current implementation is very
inefficient (mainly because it goes one byte at a time).

This patch replaces it with a better implementation that goes line by line.

The new implementation of ToCRLF is ~40% faster, and StringToCRLF is ~60%
faster.

```
$ benchstat old.txt new.txt
goos: linux
goarch: amd64
pkg: blitiri.com.ar/go/chasquid/internal/normalize
cpu: 13th Gen Intel(R) Core(TM) i9-13900T
                │    old.txt    │               new.txt                │
                │    sec/op     │    sec/op     vs base                │
ToCRLF-32         162.96µ ±  6%   95.42µ ± 12%  -41.44% (p=0.000 n=10)
StringToCRLF-32   190.70µ ± 14%   76.51µ ±  6%  -59.88% (p=0.000 n=10)
geomean            176.3µ         85.44µ        -51.53%
```
  • Loading branch information
albertito committed Oct 31, 2024
1 parent 723c47d commit 41bb7b6
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 12 deletions.
62 changes: 51 additions & 11 deletions internal/normalize/normalize.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,23 +78,63 @@ func DomainToUnicode(addr string) (string, error) {
// preexisting CRLF, it leaves it be. It assumes that CR is never used on its
// own.
func ToCRLF(in []byte) []byte {
b := bytes.NewBuffer(nil)
b := bytes.Buffer{}
b.Grow(len(in))
for _, c := range in {
switch c {
case '\r':
// Ignore CR, we'll add it back later. It should never appear
// alone in the contexts where this function is used.
case '\n':
b.Write([]byte("\r\n"))
default:
b.WriteByte(c)

// We go line by line, but beware:
// Split("a\nb", "\n") -> ["a", "b"]
// Split("a\nb\n", "\n") -> ["a", "b", ""]
// So we handle the last line separately.
lines := bytes.Split(in, []byte("\n"))
for i, line := range lines {
b.Write(line)
if i == len(lines)-1 {
// Do not add newline to the last line:
// - If the string ends with a newline, we already added it in
// the previous-to-last line, and this line is "".
// - If the string does NOT end with a newline, this preserves
// that property.
break
}
if !bytes.HasSuffix(line, []byte("\r")) {
// Missing the CR.
b.WriteByte('\r')
}
b.WriteByte('\n')
}

return b.Bytes()
}

// StringToCRLF is like ToCRLF, but operates on strings.
func StringToCRLF(in string) string {
return string(ToCRLF([]byte(in)))
// We implement it the same way as ToCRLF, but with string versions.
// This is significantly faster than converting the string to a byte
// slice, calling ToCRLF, and converting it back.
b := strings.Builder{}
b.Grow(len(in))

// We go line by line, but beware:
// Split("a\nb", "\n") -> ["a", "b"]
// Split("a\nb\n", "\n") -> ["a", "b", ""]
// So we handle the last line separately.
lines := strings.Split(in, "\n")
for i, line := range lines {
b.WriteString(line)
if i == len(lines)-1 {
// Do not add newline to the last line:
// - If the string ends with a newline, we already added it in
// the previous-to-last line, and this line is "".
// - If the string does NOT end with a newline, this preserves
// that property.
break
}
if !strings.HasSuffix(line, "\r") {
// Missing the CR.
b.WriteByte('\r')
}
b.WriteByte('\n')
}

return b.String()
}
45 changes: 44 additions & 1 deletion internal/normalize/normalize_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
package normalize

import "testing"
import (
"bytes"
"strings"
"testing"
)

func TestUser(t *testing.T) {
valid := []struct{ user, norm string }{
Expand Down Expand Up @@ -134,8 +138,19 @@ func TestToCRLF(t *testing.T) {
in, out string
}{
{"", ""},
{"a", "a"},

// Does not end in newline.
{"a\n", "a\r\n"},
{"a\nb", "a\r\nb"},
{"a\r\nb", "a\r\nb"},

// Ends in newline.
{"a\nb\n", "a\r\nb\r\n"},
{"a\r\nb\n", "a\r\nb\r\n"},
{"a\r\nb\r\n", "a\r\nb\r\n"},
{"a\r\nb\n\n", "a\r\nb\r\n\r\n"},
{"a\r\nb\r\n\r\n", "a\r\nb\r\n\r\n"},
}
for _, c := range cases {
got := string(ToCRLF([]byte(c.in)))
Expand Down Expand Up @@ -173,3 +188,31 @@ func FuzzDomainToUnicode(f *testing.F) {
DomainToUnicode(addr)
})
}

func BenchmarkToCRLF(b *testing.B) {
// Generate a 1000-line message.
bb := bytes.Buffer{}
for i := 0; i < 1000; i++ {
bb.WriteString("this is a very pretty line 🐅\n")
}
buf := bb.Bytes()

b.ResetTimer()
for i := 0; i < b.N; i++ {
ToCRLF(buf)
}
}

func BenchmarkStringToCRLF(b *testing.B) {
// Generate a 1000-line message.
sb := strings.Builder{}
for i := 0; i < 1000; i++ {
sb.WriteString("this is a very pretty line 🐅\n")
}
s := sb.String()

b.ResetTimer()
for i := 0; i < b.N; i++ {
StringToCRLF(s)
}
}

0 comments on commit 41bb7b6

Please sign in to comment.