normalize: Improve ToCRLF/StringToCRLF performance

The ToCRLF/StringToCRLF functions are not very performance critical, but we call it for each mail, and the current implementation is very inefficient (mainly because it goes one byte at a time). This patch replaces it with a better implementation that goes line by line. The new implementation of ToCRLF is ~40% faster, and StringToCRLF is ~60% faster. ``` $ benchstat old.txt new.txt goos: linux goarch: amd64 pkg: blitiri.com.ar/go/chasquid/internal/normalize cpu: 13th Gen Intel(R) Core(TM) i9-13900T │ old.txt │ new.txt │ │ sec/op │ sec/op vs base │ ToCRLF-32 162.96µ ± 6% 95.42µ ± 12% -41.44% (p=0.000 n=10) StringToCRLF-32 190.70µ ± 14% 76.51µ ± 6% -59.88% (p=0.000 n=10) geomean 176.3µ 85.44µ -51.53% ```
albertito · Oct 31, 2024 · 41bb7b6 · 41bb7b6
1 parent 723c47d
commit 41bb7b6
Show file tree

Hide file tree

Showing 2 changed files with 95 additions and 12 deletions.
diff --git a/internal/normalize/normalize.go b/internal/normalize/normalize.go
@@ -78,23 +78,63 @@ func DomainToUnicode(addr string) (string, error) {
 // preexisting CRLF, it leaves it be. It assumes that CR is never used on its
 // own.
 func ToCRLF(in []byte) []byte {
-	b := bytes.NewBuffer(nil)
+	b := bytes.Buffer{}
 	b.Grow(len(in))
-	for _, c := range in {
-		switch c {
-		case '\r':
-			// Ignore CR, we'll add it back later. It should never appear
-			// alone in the contexts where this function is used.
-		case '\n':
-			b.Write([]byte("\r\n"))
-		default:
-			b.WriteByte(c)
+
+	// We go line by line, but beware:
+	//   Split("a\nb", "\n") -> ["a", "b"]
+	//   Split("a\nb\n", "\n") -> ["a", "b", ""]
+	// So we handle the last line separately.
+	lines := bytes.Split(in, []byte("\n"))
+	for i, line := range lines {
+		b.Write(line)
+		if i == len(lines)-1 {
+			// Do not add newline to the last line:
+			//  - If the string ends with a newline, we already added it in
+			//    the previous-to-last line, and this line is "".
+			//  - If the string does NOT end with a newline, this preserves
+			//    that property.
+			break
+		}
+		if !bytes.HasSuffix(line, []byte("\r")) {
+			// Missing the CR.
+			b.WriteByte('\r')
 		}
+		b.WriteByte('\n')
 	}
+
 	return b.Bytes()
 }
 
 // StringToCRLF is like ToCRLF, but operates on strings.
 func StringToCRLF(in string) string {
-	return string(ToCRLF([]byte(in)))
+	// We implement it the same way as ToCRLF, but with string versions.
+	// This is significantly faster than converting the string to a byte
+	// slice, calling ToCRLF, and converting it back.
+	b := strings.Builder{}
+	b.Grow(len(in))
+
+	// We go line by line, but beware:
+	//   Split("a\nb", "\n") -> ["a", "b"]
+	//   Split("a\nb\n", "\n") -> ["a", "b", ""]
+	// So we handle the last line separately.
+	lines := strings.Split(in, "\n")
+	for i, line := range lines {
+		b.WriteString(line)
+		if i == len(lines)-1 {
+			// Do not add newline to the last line:
+			//  - If the string ends with a newline, we already added it in
+			//    the previous-to-last line, and this line is "".
+			//  - If the string does NOT end with a newline, this preserves
+			//    that property.
+			break
+		}
+		if !strings.HasSuffix(line, "\r") {
+			// Missing the CR.
+			b.WriteByte('\r')
+		}
+		b.WriteByte('\n')
+	}
+
+	return b.String()
 }
diff --git a/internal/normalize/normalize_test.go b/internal/normalize/normalize_test.go
@@ -1,6 +1,10 @@
 package normalize
 
-import "testing"
+import (
+	"bytes"
+	"strings"
+	"testing"
+)
 
 func TestUser(t *testing.T) {
 	valid := []struct{ user, norm string }{
@@ -134,8 +138,19 @@ func TestToCRLF(t *testing.T) {
 		in, out string
 	}{
 		{"", ""},
+		{"a", "a"},
+
+		// Does not end in newline.
+		{"a\n", "a\r\n"},
 		{"a\nb", "a\r\nb"},
 		{"a\r\nb", "a\r\nb"},
+
+		// Ends in newline.
+		{"a\nb\n", "a\r\nb\r\n"},
+		{"a\r\nb\n", "a\r\nb\r\n"},
+		{"a\r\nb\r\n", "a\r\nb\r\n"},
+		{"a\r\nb\n\n", "a\r\nb\r\n\r\n"},
+		{"a\r\nb\r\n\r\n", "a\r\nb\r\n\r\n"},
 	}
 	for _, c := range cases {
 		got := string(ToCRLF([]byte(c.in)))
@@ -173,3 +188,31 @@ func FuzzDomainToUnicode(f *testing.F) {
 		DomainToUnicode(addr)
 	})
 }
+
+func BenchmarkToCRLF(b *testing.B) {
+	// Generate a 1000-line message.
+	bb := bytes.Buffer{}
+	for i := 0; i < 1000; i++ {
+		bb.WriteString("this is a very pretty line 🐅\n")
+	}
+	buf := bb.Bytes()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		ToCRLF(buf)
+	}
+}
+
+func BenchmarkStringToCRLF(b *testing.B) {
+	// Generate a 1000-line message.
+	sb := strings.Builder{}
+	for i := 0; i < 1000; i++ {
+		sb.WriteString("this is a very pretty line 🐅\n")
+	}
+	s := sb.String()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		StringToCRLF(s)
+	}
+}